In [1]:
# Import modules

from google.cloud import bigquery
from google.oauth2 import service_account

import pandas as pd
import pandas_gbq

import os

In [2]:
# Establish credentials and connection
#
# The service-account key directory defaults to the original local location,
# but can be overridden with the GBQ_KEY_DIR environment variable so the
# notebook also runs on machines with a different directory layout.

service_path = os.environ.get(
    "GBQ_KEY_DIR",
    "/Users/Owner/Google Drive/MSBA/Classes/bmkt_670_applied_data_analytics/gbq_key/",
)
service_file = 'evident-catcher-327918-fa366e7e71f5.json'
gbq_proj_id = 'evident-catcher-327918'
gbq_dataset_id = 'wedge_master'

# Full path to the key file; os.path.join copes with a missing trailing slash
private_key = os.path.join(service_path, service_file)

# Build the credentials object from the key file, then a BigQuery client
# bound to the project
credentials = service_account.Credentials.from_service_account_file(private_key)

client = bigquery.Client(credentials=credentials, project=gbq_proj_id)

In [3]:
# SQL text: every distinct card number except card number 3

query_cards = " ".join([
    "SELECT distinct card_no",
    "FROM `evident-catcher-327918.wedge_master.transArchive*`",
    "WHERE card_no !=3",
])

In [4]:
# Run the card query and pull its result set into a pandas data frame
# Reference: https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas

_cards_job = client.query(query_cards)
_cards_rows = _cards_job.result()
df_cards = _cards_rows.to_dataframe(create_bqstorage_client=True)

In [7]:
# Sanity check: how many card numbers came back
# (swap in df_cards.head() to preview the rows instead)

df_cards.shape[0]

27208

In [8]:
# Take a reproducible random sample of 1% of cards and create new data frame

SAMPLE_FRAC = 0.01  # share of the distinct card numbers to keep
SAMPLE_SEED = 42    # fixed seed so a re-run draws the same cards

# The card query has no ORDER BY, so the incoming row order may differ between
# runs; sorting first makes the seeded draw depend only on the data itself.
card_sample = (
    df_cards
    .sort_values("card_no")
    .sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
)

In [9]:
# Upload card_sample table to GBQ, replacing any existing table of that name

table_full_name = ".".join([gbq_proj_id, gbq_dataset_id, "card_sample"])

# Call pandas_gbq directly (it is already imported at the top of the notebook):
# DataFrame.to_gbq is deprecated in recent pandas releases in favour of
# pandas_gbq.to_gbq, which takes the same keyword arguments used here.
pandas_gbq.to_gbq(
    card_sample,
    destination_table=table_full_name,
    project_id=gbq_proj_id,
    if_exists='replace',
    credentials=credentials,
)

1it [00:03,  3.37s/it]


In [10]:
# Select all transaction records for card_no's in sample table
#
# The subquery names card_no explicitly: an IN subquery must return exactly
# one column, so SELECT * would break as soon as card_sample gained a second
# column.

query_trans = (
    "SELECT * "
    "FROM `evident-catcher-327918.wedge_master.transArchive*` "
    "WHERE card_no IN "
    "(SELECT card_no FROM `evident-catcher-327918.wedge_master.card_sample`)"
)

In [11]:
# Run the transaction query and load every returned row into a pandas data frame
# Reference: https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas

_trans_job = client.query(query_trans)
_trans_rows = _trans_job.result()
df_trans = _trans_rows.to_dataframe(create_bqstorage_client=True)

In [13]:
# Sanity check: how many transaction rows came back
# (swap in df_trans.head() to preview the rows instead)

df_trans.shape[0]

710152

In [14]:
# Save the sampled transactions as comma-separated text in a .txt file,
# refusing to overwrite a file that is already on disk

file_name = 'trans_sample.txt'

if not os.path.isfile(file_name):
    df_trans.to_csv(file_name, index=False, header=True)
else:
    print("That file already exists. Delete it, or choose a new name.")
