In [69]:
from google.cloud import bigquery
client = bigquery.Client(project="umt-msba")
import pandas as pd

In [70]:
query = """
    SELECT *
    FROM `umt-msba.transactions.transArchive_*`
    WHERE card_no = 25220 
"""

query_job = client.query(query)


In [71]:
df = query_job.to_dataframe()

In [72]:
print(len(df))

928


In [73]:
df

Unnamed: 0,datetime,register_no,emp_no,trans_no,upc,description,trans_type,trans_subtype,trans_status,department,...,batchHeaderID,local,organic,display,receipt,card_no,store,branch,match_id,trans_id
0,2014-05-01 14:53:41+00:00,7.0,72.0,13.0,0,Cash,T,CA,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,10.0
1,2014-06-01 14:24:51+00:00,6.0,2.0,128.0,0,Cash,T,CA,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,10.0
2,2014-06-01 14:24:51+00:00,6.0,2.0,128.0,TAX,Tax,A,,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,13.0
3,2014-05-11 10:13:08+00:00,7.0,64.0,85.0,DISCOUNT,Discount,I,,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,15.0
4,2014-05-27 13:06:07+00:00,5.0,6.0,64.0,DISCOUNT,Discount,I,,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
923,2010-07-18 20:35:13+00:00,4.0,63.0,86.0,0,Credit Card,T,CC,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,27.0
924,2010-08-14 14:04:40+00:00,7.0,8.0,24.0,0,Credit Card,T,CC,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,10.0
925,2010-08-30 18:24:15+00:00,6.0,24.0,57.0,0,Credit Card,T,CC,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,10.0
926,2010-08-06 20:19:04+00:00,4.0,35.0,39.0,0,Credit Card,T,CC,,0.0,...,,0.0,,,0.0,25220.0,1.0,0.0,0.0,50.0


In [74]:
# Save DataFrame to your current working directory
df.to_csv('query_results.csv', index=False)  # Saves to current directory


In [75]:
# Apply the filter to exclude returns and voids
df_filtered = df[~df['trans_status'].isin(['R', 'V'])]

# Save the filtered DataFrame to a new CSV file
df_filtered.to_csv('filtered_query_results.csv', index=False)


In [76]:
import os

# Path to your CSV file
file_path = 'filtered_query_results.csv'

# Get the file size in bytes
file_size_bytes = os.path.getsize(file_path)

# Convert to megabytes (MB)
file_size_mb = file_size_bytes / (1024 * 1024)

print(f"File size: {file_size_mb:.2f} MB")


File size: 0.21 MB


In [77]:
# Query to pull random sample of 615 owners with <= 99,000 transactions
query_sample = """
WITH owners_filtered AS (
    SELECT 
        card_no, 
        COUNT(trans_no) AS num_transactions
    FROM 
        `umt-msba.transactions.transArchive_*`
    WHERE 
        card_no != 3  -- Exclude non-owners
    GROUP BY 
        card_no
    HAVING 
        COUNT(trans_no) <= 99000  -- Keep owners with <= 99,000 transactions
)
SELECT 
    card_no
FROM 
    owners_filtered
ORDER BY 
    RAND()  -- Randomly order the owners
LIMIT 615;
"""

# Execute the query and store the results in a DataFrame
query_job_sample = client.query(query_sample)
owners_df = query_job_sample.to_dataframe()

# Print the sample to check it
print(owners_df.head())

   card_no
0  50582.0
1  19003.0
2  13562.0
3  20149.0
4  12732.0


In [78]:
# Convert card_no to integers in the DataFrame
owners_df['card_no'] = owners_df['card_no'].astype(int)
sampled_owners_list = owners_df['card_no'].tolist()  # Convert to a list

# SQL query to pull all transaction records for the sampled owners
query_records = """
SELECT *
FROM `umt-msba.transactions.transArchive_*`
WHERE CAST(card_no AS INT64) IN UNNEST(@sampled_owners)  -- Cast card_no to INT64
AND trans_status NOT IN ('R', 'V')  -- Exclude returns and voids
"""

# Use BigQuery's parameterized queries to pass the list of owners
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("sampled_owners", "INT64", sampled_owners_list)
    ]
)

# Execute the query and retrieve the records into a pandas DataFrame
query_job_records = client.query(query_records, job_config=job_config)
df_records = query_job_records.to_dataframe()

# Save the resulting DataFrame to a CSV file
df_records.to_csv('sampled_owners_records.csv', index=False)

print(f"Number of records pulled: {len(df_records)}")


Number of records pulled: 1249151


In [79]:
# Path to your CSV file
file_path = 'sampled_owners_records.csv'  # Make sure this matches the path where you saved the file

# Get the file size in bytes
file_size_bytes = os.path.getsize(file_path)

# Convert to megabytes (MB)
file_size_mb = file_size_bytes / (1024 * 1024)

print(f"File size: {file_size_mb:.2f} MB")

File size: 284.22 MB
