In [None]:
from google.cloud import bigquery
import pandas as pd

# Deliverable 1: Connects to your GBQ instance
client = bigquery.Client()

# List of tables from umt-msba project (excluding inactive ones)
table_names = [
    'transArchive_201001_201003', 'transArchive_201004_201006', 'transArchive_201007_201009', 
    'transArchive_201010_201012', 'transArchive_201101_201103', 'transArchive_201104', 
    'transArchive_201105', 'transArchive_201106', 'transArchive_201107_201109', 
    'transArchive_201110_201112', 'transArchive_201201_201203', 'transArchive_201204_201206', 
    'transArchive_201207_201209', 'transArchive_201210_201212', 'transArchive_201301_201303', 
    'transArchive_201304_201306', 'transArchive_201307_201309', 'transArchive_201310_201312', 
    'transArchive_201401_201403', 'transArchive_201404_201406', 'transArchive_201407_201409', 
    'transArchive_201410_201412', 'transArchive_201501_201503', 'transArchive_201504_201506', 
    'transArchive_201507_201509', 'transArchive_201510', 'transArchive_201511', 'transArchive_201512', 
    'transArchive_201601', 'transArchive_201602', 'transArchive_201603', 'transArchive_201604', 
    'transArchive_201605', 'transArchive_201606', 'transArchive_201607', 'transArchive_201608', 
    'transArchive_201609', 'transArchive_201610', 'transArchive_201611', 'transArchive_201612', 
    'transArchive_201701'
]

# Deliverable 2: Builds a list of owners
# Construct a UNION query across all tables to gather data excluding non-owners (card_no != 3)
union_query = " UNION ALL ".join([
    f"SELECT * FROM `umt-msba.transactions.{table_name}` WHERE card_no != 3" 
    for table_name in table_names
])

# Full query to fetch all owner transactions
full_query = f"""
    SELECT * FROM ({union_query})
    ORDER BY RAND()  -- Randomize the data
    LIMIT 425000  -- Adjusting this number adjusts sample size
"""

# Execute the query and fetch the data
df = client.query(full_query).to_dataframe()

# Deliverable 3: Takes a sample of the owners
# The sampling is done by the query itself using LIMIT and ORDER BY RAND()

# Check the size of the sample
print(f"Data size: {df.memory_usage(deep=True).sum() / (1024 * 1024)} MB")

# Deliverable 4: Extracts all records associated with those owners and writes them to a local text file
# Write the sample to a CSV file
df.to_csv("owner_sample.csv", index=False)

print("Data exported to owner_sample.csv")

# This takes around 6 minutes to run

In [20]:
from google.cloud import bigquery
import pandas as pd

# Connect to your GBQ instance
client = bigquery.Client()

# Define the project and dataset
project_id = "umt-msba"
dataset_id = "transactions"

# Define the list of table names
table_names = [
    'transArchive_201001_201003', 'transArchive_201004_201006', 'transArchive_201007_201009',
    'transArchive_201010_201012', 'transArchive_201101_201103', 'transArchive_201104', 
    'transArchive_201105', 'transArchive_201106', 'transArchive_201107_201109', 
    'transArchive_201110_201112', 'transArchive_201201_201203', 'transArchive_201204_201206', 
    'transArchive_201207_201209', 'transArchive_201210_201212', 'transArchive_201301_201303', 
    'transArchive_201304_201306', 'transArchive_201307_201309', 'transArchive_201310_201312', 
    'transArchive_201401_201403', 'transArchive_201404_201406', 'transArchive_201407_201409', 
    'transArchive_201410_201412', 'transArchive_201501_201503', 'transArchive_201504_201506', 
    'transArchive_201507_201509', 'transArchive_201510', 'transArchive_201511', 'transArchive_201512', 
    'transArchive_201601', 'transArchive_201602', 'transArchive_201603', 'transArchive_201604', 
    'transArchive_201605', 'transArchive_201606', 'transArchive_201607', 'transArchive_201608', 
    'transArchive_201609', 'transArchive_201610', 'transArchive_201611', 'transArchive_201612', 
    'transArchive_201701'
]

# Control the sample size with a variable
sample_size = 300

# Sample owners directly within BigQuery
owner_query = f"""
    WITH unique_owners AS (
        SELECT DISTINCT card_no
        FROM `{project_id}.{dataset_id}.{table_names[0]}`
        WHERE card_no != 3
    )
    SELECT card_no
    FROM unique_owners
    ORDER BY RAND()
    LIMIT {sample_size}
"""
sampled_owners_df = client.query(owner_query).to_dataframe()

# Convert owners to a list
owner_list = sampled_owners_df['card_no'].tolist()

# Define the batch size for the IN clause
batch_size = 150

# Function to query transactions for a batch of owners
def fetch_transactions(owner_batch):
    owner_str = ','.join(map(str, owner_batch))
    union_all_query = " UNION ALL ".join([f"""
        SELECT * FROM `{project_id}.{dataset_id}.{table_name}`
        WHERE card_no IN ({owner_str})
    """ for table_name in table_names])

    return client.query(union_all_query).to_dataframe()

# Save results in batches to avoid memory overload
output_file = 'owner_transactions.csv'
first_write = True

with open(output_file, 'w') as f:
    for i in range(0, len(owner_list), batch_size):
        owner_batch = owner_list[i:i+batch_size]
        transaction_df = fetch_transactions(owner_batch)
        
        # Write to CSV
        transaction_df.to_csv(f, header=first_write, index=False, mode='a', lineterminator='\n')
        first_write = False  # Ensure header is only written once


print(f"Sampled transactions extracted and saved to {output_file}")
print(f"Data size: {transaction_df.memory_usage(deep=True).sum() / (1024 * 1024)} MB")

# This takes about 11 minutes to run



Sampled transactions extracted and saved to owner_transactions.csv
Data size: 370.1079978942871 MB
