In [5]:
from google.cloud import bigquery
import pandas as pd

# Target BigQuery project and dataset
project_id = "umt-msba"
dataset_id = "transactions"

# Number of distinct owners to sample (400 owners is intended to give
# a sample of approximately 250MB)
sample_size = 400

# Maximum number of owners placed in a single IN clause
batch_size = 150

# Client for your GBQ instance
client = bigquery.Client()

# Let BigQuery draw the random owners so that only the sampled card
# numbers come back to the notebook
owner_query = f"""
    WITH unique_owners AS (
        SELECT DISTINCT card_no
        FROM `{project_id}.{dataset_id}.transArchive_*`
        WHERE card_no != 3
    )
    SELECT card_no
    FROM unique_owners
    ORDER BY RAND()
    LIMIT {sample_size}
"""
sampled_owners_df = client.query(owner_query).to_dataframe()

# Plain Python list of the sampled card numbers
owner_list = sampled_owners_df["card_no"].tolist()

# Function to query transactions for a batch of owners
def fetch_transactions(owner_batch):
    """Return all transaction rows whose card_no is in ``owner_batch``.

    Args:
        owner_batch: Iterable of card_no values. Each value is rendered
            with ``str()`` and placed directly into the SQL ``IN`` list.

    Returns:
        A DataFrame holding every column of the matching rows from the
        ``transArchive_*`` tables. If ``owner_batch`` is empty, an empty
        DataFrame is returned and no query is sent.
    """
    # Materialize once so generators and other one-shot iterables work
    owner_values = list(owner_batch)
    if not owner_values:
        # An empty batch would render as `IN ()`, which is not valid SQL;
        # there is nothing to fetch, so skip the round-trip entirely.
        return pd.DataFrame()

    owner_str = ','.join(map(str, owner_values))
    transaction_query = f"""
        SELECT * FROM `{project_id}.{dataset_id}.transArchive_*`
        WHERE card_no IN ({owner_str})
    """
    return client.query(transaction_query).to_dataframe()

# Save results in batches to avoid memory overload: each batch is
# written to disk before the next one is fetched
output_file = 'owner_transactions.csv'
first_write = True

# newline='' disables newline translation so the explicit '\n' line
# terminator below is written as-is on every platform (pandas advises
# this for file objects passed to to_csv). utf-8 matches the encoding
# pandas uses by default when it is given a path instead of a handle.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    for i in range(0, len(owner_list), batch_size):
        owner_batch = owner_list[i:i + batch_size]
        transaction_df = fetch_transactions(owner_batch)

        # Append this batch to the already-open file
        transaction_df.to_csv(f, header=first_write, index=False, lineterminator='\n')
        first_write = False  # Ensure header is only written once

print(f"Sampled transactions extracted and saved to {output_file}")





Sampled transactions extracted and saved to owner_transactions.csv
Data size: 125.13076877593994 MB
