# PART 2 

## BUILD A SAMPLE OF OWNERS

In the following Python script, I connect to my Google BigQuery (GBQ) instance, build a list of owners, take a random sample of those owners, extract all transaction records associated with the sampled owners, and write them to a local text file. 

In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
from pandas.io import gbq

from zipfile import ZipFile

In [2]:
def tbl_exists(client, table_ref):
    """Report whether a table can be fetched through ``client``.

    Args:
        client: Object exposing ``get_table`` (a ``bigquery.Client`` in
            this notebook).
        table_ref: Table identifier handed unchanged to
            ``client.get_table``.

    Returns:
        ``True`` when ``client.get_table(table_ref)`` succeeds, ``False``
        when it raises ``NotFound``. Any other exception propagates.
    """
    from google.cloud.exceptions import NotFound

    try:
        client.get_table(table_ref)
    except NotFound:
        # The lookup failed specifically because the table is missing.
        return False
    else:
        return True

In [3]:
# Connect to my GBQ instance.

# Where the service-account key lives, plus the project and dataset to use.
service_path = "/Users/austinsmith/Documents/Fall21/ADA/Wedge"
service_file = '/thewedge-austin-4c5ad634b17b.json'
gbq_proj_id = 'thewedge-austin'
gbq_dataset_id = 'the_wedge_A'

# Full path to the key file (directory followed by file name).
private_key = service_path + service_file

# Build the credentials object from that key file.
credentials = service_account.Credentials.from_service_account_file(private_key)

# Open the GBQ client for the project using those credentials.
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

In [4]:
# Name of the GBQ table that will hold the distinct card numbers.
my_table = "distinct_cards"

# Fully qualified table id: <project>.<dataset>.<table>
table_full_name = "{}.{}.{}".format(gbq_proj_id, gbq_dataset_id, my_table)

In [5]:
if tbl_exists(client, table_full_name):
    # The table is already there: fetch it, then empty it so the load
    # that follows starts from a clean table.
    table_ref = client.get_table(table_full_name)

    # GBQ requires a WHERE clause on DELETE, hence the always-true predicate.
    query_text = "DELETE FROM `{}` WHERE 1=1".format(table_full_name)

    job_config = bigquery.QueryJobConfig()

    # API request - starts the query.
    query_job = client.query(query_text, location="US", job_config=job_config)

    # Wait for the delete to finish.
    query_job.result()
else:
    # First run: create the (schema-less) table.
    table_ref = client.create_table(table=table_full_name)

In [6]:
# Re-fetch the table and report how many columns its schema currently has.
table = client.get_table(table_ref)
print(
    "Table {} contains {} columns".format(
        table_ref.table_id,
        len(table.schema),
    )
)

Table distinct_cards contains 0 columns


In [7]:
# Route the query output straight into the destination table.
job_config = bigquery.QueryJobConfig(destination=table_ref)

# Collect every distinct card number except card_no 3.
# NOTE(review): card_no 3 is presumably the non-member placeholder (the
# original note says only members are wanted) - confirm.
sql = """
    SELECT distinct(card_no)
    FROM `thewedge-austin.the_wedge_A.clean_transArchive_*` 
    WHERE card_no != 3
    """

# Submit the query with the destination config (API request), then block
# until the job has completed.
query_job = client.query(sql, job_config=job_config)
query_job.result()

print("Query results loaded to the table {}".format(table_ref.table_id))

Query results loaded to the table distinct_cards


In [16]:
# Next, build a table holding a random sample of owners. The query keeps a
# row when rand() < 0.0125, i.e. roughly 1.25% of the distinct cards.
# note: the sampling rate was landed on by trial/error to get close to
# 250MB of total data returned

my_table = "distinct_cards_sample"

table_full_name = "{}.{}.{}".format(gbq_proj_id, gbq_dataset_id, my_table)

if tbl_exists(client, table_full_name):
    # Table already exists: fetch it and clean it out. This cleanup only
    # runs in this branch, so it is skipped automatically when the table is
    # being created for the first time.
    table_ref = client.get_table(table_full_name)

    # GBQ requires a WHERE clause on DELETE, hence the always-true predicate.
    query_text = "DELETE FROM `{}` WHERE 1=1".format(table_full_name)

    job_config = bigquery.QueryJobConfig()

    # API request - starts the query.
    query_job = client.query(query_text, location="US", job_config=job_config)

    # Wait for the delete to finish.
    query_job.result()
else:
    table_ref = client.create_table(table=table_full_name)

# Write the sample straight into the destination table.
job_config = bigquery.QueryJobConfig(destination=table_ref)

sql = """
  SELECT
  *
FROM
  `thewedge-austin.the_wedge_A.distinct_cards` 
  where rand() < 0.0125
"""

# Submit the query (API request), then block until the job has completed.
query_job = client.query(sql, job_config=job_config)
query_job.result()

print("Query results loaded to the table {}".format(table_ref.table_id))

Query results loaded to the table distinct_cards_sample


In [17]:
# Finally, pull every transaction row whose card_no appears in the random
# sample table and store the result in its own table.

my_table = "owners_sample"

table_full_name = "{}.{}.{}".format(gbq_proj_id, gbq_dataset_id, my_table)

if tbl_exists(client, table_full_name):
    # Table already exists: fetch it and clean it out. This cleanup only
    # runs in this branch, so it is skipped automatically when the table is
    # being created for the first time.
    table_ref = client.get_table(table_full_name)

    # GBQ requires a WHERE clause on DELETE, hence the always-true predicate.
    query_text = "DELETE FROM `{}` WHERE 1=1".format(table_full_name)

    job_config = bigquery.QueryJobConfig()

    # API request - starts the query.
    query_job = client.query(query_text, location="US", job_config=job_config)

    # Wait for the delete to finish.
    query_job.result()
else:
    table_ref = client.create_table(table=table_full_name)

# Write the matched transactions straight into the destination table.
job_config = bigquery.QueryJobConfig(destination=table_ref)

sql = """
SELECT
  *
FROM
  `thewedge-austin.the_wedge_A.clean_transArchive_*`
WHERE
  card_no IN (
  SELECT
    *
  FROM
    `thewedge-austin.the_wedge_A.distinct_cards_sample`)
"""

# Submit the query (API request), then block until the job has completed.
query_job = client.query(sql, job_config=job_config)
query_job.result()

print("Query results loaded to the table {}".format(table_ref.table_id))

Query results loaded to the table owners_sample


In [18]:
# Put the sampled owners' transactions into a pandas dataframe.
#
# The query above already wrote its result into the destination table named
# by table_full_name, so read that stored table back instead of calling
# client.query(sql) again: re-running the query would repeat the full scan
# over every clean_transArchive_* table just to produce the same rows.
df = client.list_rows(table_full_name).to_dataframe()

In [19]:
# Export the dataframe to a local tab-delimited text file.
# index=False drops the dataframe's index column; without it pandas writes
# the index as an extra, unnamed first column.

df.to_csv('owners_sample.txt', sep='\t', index=False)