In [1]:
from google.cloud import bigquery
import pandas as pd

# Deliverable 1: Connects to your GBQ instance
client = bigquery.Client()

# List of tables from umt-msba project (excluding inactive ones)
table_names = [
    'transArchive_201001_201003', 'transArchive_201004_201006', 'transArchive_201007_201009', 
    'transArchive_201010_201012', 'transArchive_201101_201103', 'transArchive_201104', 
    'transArchive_201105', 'transArchive_201106', 'transArchive_201107_201109', 
    'transArchive_201110_201112', 'transArchive_201201_201203', 'transArchive_201204_201206', 
    'transArchive_201207_201209', 'transArchive_201210_201212', 'transArchive_201301_201303', 
    'transArchive_201304_201306', 'transArchive_201307_201309', 'transArchive_201310_201312', 
    'transArchive_201401_201403', 'transArchive_201404_201406', 'transArchive_201407_201409', 
    'transArchive_201410_201412', 'transArchive_201501_201503', 'transArchive_201504_201506', 
    'transArchive_201507_201509', 'transArchive_201510', 'transArchive_201511', 'transArchive_201512', 
    'transArchive_201601', 'transArchive_201602', 'transArchive_201603', 'transArchive_201604', 
    'transArchive_201605', 'transArchive_201606', 'transArchive_201607', 'transArchive_201608', 
    'transArchive_201609', 'transArchive_201610', 'transArchive_201611', 'transArchive_201612', 
    'transArchive_201701'
]

# Deliverable 2: Builds a list of owners
# Construct a UNION query across all tables to gather data excluding non-owners (card_no != 3)
union_query = " UNION ALL ".join([
    f"SELECT * FROM `umt-msba.transactions.{table_name}` WHERE card_no != 3" 
    for table_name in table_names
])

# Full query to fetch all owner transactions (sample will be created later)
full_query = f"""
    SELECT * FROM ({union_query})
    ORDER BY RAND()  -- Randomize the data
    LIMIT 425000  -- Adjusting this number adjusts sample size
"""

# Execute the query and fetch the data
df = client.query(full_query).to_dataframe()
df = df.groupby('card_no').first().reset_index()

# Deliverable 3: Takes a sample of the owners
# The sampling is done by the query itself using LIMIT and ORDER BY RAND()

# Check the size of the sample
print(f"Data size: {df.memory_usage(deep=True).sum() / (1024 * 1024)} MB")

# Deliverable 4: Extracts all records associated with those owners and writes them to a local text file
# Write the sample to a CSV file
df.to_csv("owner_sample.csv", index=False)

print("Data exported to owner_sample.csv")

# This takes around 6 minutes to run



Data size: 253.51870441436768 MB
Data exported to owner_sample.csv


In [38]:
from google.cloud import bigquery
import pandas as pd

# Deliverable 1: Connects to your GBQ instance
client = bigquery.Client()

# List of tables from umt-msba project (excluding inactive ones)
table_names = [
    'transArchive_201001_201003', 'transArchive_201004_201006', 'transArchive_201007_201009', 
    'transArchive_201010_201012', 'transArchive_201101_201103', 'transArchive_201104', 
    'transArchive_201105', 'transArchive_201106', 'transArchive_201107_201109', 
    'transArchive_201110_201112', 'transArchive_201201_201203', 'transArchive_201204_201206', 
    'transArchive_201207_201209', 'transArchive_201210_201212', 'transArchive_201301_201303', 
    'transArchive_201304_201306', 'transArchive_201307_201309', 'transArchive_201310_201312', 
    'transArchive_201401_201403', 'transArchive_201404_201406', 'transArchive_201407_201409', 
    'transArchive_201410_201412', 'transArchive_201501_201503', 'transArchive_201504_201506', 
    'transArchive_201507_201509', 'transArchive_201510', 'transArchive_201511', 'transArchive_201512', 
    'transArchive_201601', 'transArchive_201602', 'transArchive_201603', 'transArchive_201604', 
    'transArchive_201605', 'transArchive_201606', 'transArchive_201607', 'transArchive_201608', 
    'transArchive_201609', 'transArchive_201610', 'transArchive_201611', 'transArchive_201612', 
    'transArchive_201701'
]

# Deliverable 2: Builds a list of owners
# Construct a UNION query across all tables to gather data excluding non-owners (card_no != 3)
union_query = " UNION ALL ".join([
    f"SELECT * FROM `umt-msba.transactions.{table_name}` WHERE card_no != 3" 
    for table_name in table_names
])

# Full query to fetch all owner transactions (sample will be created later)
full_query = f"""
    SELECT * FROM ({union_query})
    ORDER BY RAND()  -- Randomize the data
    LIMIT 1000000  -- Adjusting this number adjusts sample size
"""

# Execute the query and fetch the data
df = client.query(full_query).to_dataframe()
df = df.groupby('card_no').first().reset_index()

# Deliverable 3: Takes a sample of the owners
# The sampling is done by the query itself using LIMIT and ORDER BY RAND()

# Check the size of the sample
print(f"Data size: {df.memory_usage(deep=True).sum() / (1024 * 1024)} MB")

# Deliverable 4: Extracts all records associated with those owners and writes them to a local text file
# Write the sample to a CSV file
# df.to_csv("owner_sample.csv", index=False)

# print("Data exported to owner_sample.csv")

# This takes around 6 minutes to run



Data size: 13.930037498474121 MB


In [26]:
from google.cloud import bigquery
import pandas as pd

# Deliverable 1: Connect to your GBQ instance
client = bigquery.Client()

# List of tables from umt-msba project (excluding inactive ones)
table_names = [
    'transArchive_201001_201003', 'transArchive_201004_201006', 'transArchive_201007_201009', 
    'transArchive_201010_201012', 'transArchive_201101_201103', 'transArchive_201104', 
    'transArchive_201105', 'transArchive_201106', 'transArchive_201107_201109', 
    'transArchive_201110_201112', 'transArchive_201201_201203', 'transArchive_201204_201206', 
    'transArchive_201207_201209', 'transArchive_201210_201212', 'transArchive_201301_201303', 
    'transArchive_201304_201306', 'transArchive_201307_201309', 'transArchive_201310_201312', 
    'transArchive_201401_201403', 'transArchive_201404_201406', 'transArchive_201407_201409', 
    'transArchive_201410_201412', 'transArchive_201501_201503', 'transArchive_201504_201506', 
    'transArchive_201507_201509', 'transArchive_201510', 'transArchive_201511', 'transArchive_201512', 
    'transArchive_201601', 'transArchive_201602', 'transArchive_201603', 'transArchive_201604', 
    'transArchive_201605', 'transArchive_201606', 'transArchive_201607', 'transArchive_201608', 
    'transArchive_201609', 'transArchive_201610', 'transArchive_201611', 'transArchive_201612', 
    'transArchive_201701'
]

# Step 1: Build a UNION query across all tables, excluding non-owners (card_no != 3)
union_query = " UNION ALL ".join([
    f"SELECT * FROM `umt-msba.transactions.{table_name}` WHERE card_no != 3" 
    for table_name in table_names
])

# Step 2: Sample distinct owners by card_no, ensuring no duplicates, and randomizing
distinct_owners_query = f"""
    SELECT card_no,
           ANY_VALUE(datetime) AS datetime,
           ANY_VALUE(register_no) AS register_no,
           ANY_VALUE(emp_no) AS emp_no,
           ANY_VALUE(trans_no) AS trans_no,
           ANY_VALUE(upc) AS upc,
           ANY_VALUE(description) AS description,
           ANY_VALUE(trans_type) AS trans_type,
           ANY_VALUE(trans_subtype) AS trans_subtype,
           ANY_VALUE(trans_status) AS trans_status,
           ANY_VALUE(department) AS department,
           ANY_VALUE(quantity) AS quantity,
           ANY_VALUE(Scale) AS Scale,
           ANY_VALUE(cost) AS cost,
           ANY_VALUE(unitPrice) AS unitPrice,
           ANY_VALUE(total) AS total,
           ANY_VALUE(regPrice) AS regPrice,
           ANY_VALUE(altPrice) AS altPrice,
           ANY_VALUE(tax) AS tax,
           ANY_VALUE(taxexempt) AS taxexempt,
           ANY_VALUE(foodstamp) AS foodstamp,
           ANY_VALUE(wicable) AS wicable,
           ANY_VALUE(discount) AS discount,
           ANY_VALUE(memDiscount) AS memDiscount,
           ANY_VALUE(discountable) AS discountable,
           ANY_VALUE(discounttype) AS discounttype,
           ANY_VALUE(voided) AS voided,
           ANY_VALUE(percentDiscount) AS percentDiscount,
           ANY_VALUE(ItemQtty) AS ItemQtty,
           ANY_VALUE(volDiscType) AS volDiscType,
           ANY_VALUE(volume) AS volume,
           ANY_VALUE(VolSpecial) AS VolSpecial,
           ANY_VALUE(mixMatch) AS mixMatch,
           ANY_VALUE(matched) AS matched,
           ANY_VALUE(memType) AS memType,
           ANY_VALUE(staff) AS staff,
           ANY_VALUE(numflag) AS numflag,
           ANY_VALUE(itemstatus) AS itemstatus,
           ANY_VALUE(tenderstatus) AS tenderstatus,
           ANY_VALUE(charflag) AS charflag,
           ANY_VALUE(varflag) AS varflag,
           ANY_VALUE(batchHeaderID) AS batchHeaderID,
           ANY_VALUE(local) AS local,
           ANY_VALUE(organic) AS organic,
           ANY_VALUE(display) AS display,
           ANY_VALUE(receipt) AS receipt,
           ANY_VALUE(store) AS store,
           ANY_VALUE(branch) AS branch,
           ANY_VALUE(match_id) AS match_id,
           ANY_VALUE(trans_id) AS trans_id
    FROM ({union_query})
    GROUP BY card_no
    ORDER BY RAND()
    LIMIT 400000  -- Adjust this value to control the number of unique owners
"""


# Step 3: Execute the query to fetch the sampled records
df_unique_owners = client.query(distinct_owners_query).to_dataframe()

# Step 4: Check the size of the sample
print(f"Data size: {df_unique_owners.memory_usage(deep=True).sum() / (1024 * 1024)} MB")
print(f"Number of unique owners (card_no): {df_unique_owners['card_no'].nunique()}")
print(f"Total rows (should match unique owners): {len(df_unique_owners)}")

# Step 5: Write the filtered sample (one record per owner) to a CSV file
# df_unique_owners.to_csv("owner_sample_one_per_owner.csv", index=False)

# print("Data exported to owner_sample_one_per_owner.csv")




Data size: 16.04214382171631 MB
Number of unique owners (card_no): 25941
Total rows (should match unique owners): 25941


In [30]:
from google.cloud import bigquery
import pandas as pd

# Deliverable 1: Connect to your GBQ instance
client = bigquery.Client()

# List of tables from umt-msba project (excluding inactive ones)
table_names = [
    'transArchive_201001_201003', 'transArchive_201004_201006', 'transArchive_201007_201009', 
    'transArchive_201010_201012', 'transArchive_201101_201103', 'transArchive_201104', 
    'transArchive_201105', 'transArchive_201106', 'transArchive_201107_201109', 
    'transArchive_201110_201112', 'transArchive_201201_201203', 'transArchive_201204_201206', 
    'transArchive_201207_201209', 'transArchive_201210_201212', 'transArchive_201301_201303', 
    'transArchive_201304_201306', 'transArchive_201307_201309', 'transArchive_201310_201312', 
    'transArchive_201401_201403', 'transArchive_201404_201406', 'transArchive_201407_201409', 
    'transArchive_201410_201412', 'transArchive_201501_201503', 'transArchive_201504_201506', 
    'transArchive_201507_201509', 'transArchive_201510', 'transArchive_201511', 'transArchive_201512', 
    'transArchive_201601', 'transArchive_201602', 'transArchive_201603', 'transArchive_201604', 
    'transArchive_201605', 'transArchive_201606', 'transArchive_201607', 'transArchive_201608', 
    'transArchive_201609', 'transArchive_201610', 'transArchive_201611', 'transArchive_201612', 
    'transArchive_201701'
]

# Step 1: Build a UNION query across all tables, excluding non-owners (card_no != 3)
union_query = " UNION ALL ".join([
    f"SELECT * FROM `umt-msba.transactions.{table_name}` WHERE card_no != 3" 
    for table_name in table_names
])

# Step 2: Sample distinct owners by card_no, ensuring no duplicates, and randomizing
distinct_owners_query = f"""
    SELECT card_no,
           ANY_VALUE(datetime) AS datetime,
           ANY_VALUE(register_no) AS register_no,
           ANY_VALUE(emp_no) AS emp_no,
           ANY_VALUE(trans_no) AS trans_no,
           ANY_VALUE(upc) AS upc,
           ANY_VALUE(description) AS description,
           ANY_VALUE(trans_type) AS trans_type,
           ANY_VALUE(trans_subtype) AS trans_subtype,
           ANY_VALUE(trans_status) AS trans_status,
           ANY_VALUE(department) AS department,
           ANY_VALUE(quantity) AS quantity,
           ANY_VALUE(Scale) AS Scale,
           ANY_VALUE(cost) AS cost,
           ANY_VALUE(unitPrice) AS unitPrice,
           ANY_VALUE(total) AS total,
           ANY_VALUE(regPrice) AS regPrice,
           ANY_VALUE(altPrice) AS altPrice,
           ANY_VALUE(tax) AS tax,
           ANY_VALUE(taxexempt) AS taxexempt,
           ANY_VALUE(foodstamp) AS foodstamp,
           ANY_VALUE(wicable) AS wicable,
           ANY_VALUE(discount) AS discount,
           ANY_VALUE(memDiscount) AS memDiscount,
           ANY_VALUE(discountable) AS discountable,
           ANY_VALUE(discounttype) AS discounttype,
           ANY_VALUE(voided) AS voided,
           ANY_VALUE(percentDiscount) AS percentDiscount,
           ANY_VALUE(ItemQtty) AS ItemQtty,
           ANY_VALUE(volDiscType) AS volDiscType,
           ANY_VALUE(volume) AS volume,
           ANY_VALUE(VolSpecial) AS VolSpecial,
           ANY_VALUE(mixMatch) AS mixMatch,
           ANY_VALUE(matched) AS matched,
           ANY_VALUE(memType) AS memType,
           ANY_VALUE(staff) AS staff,
           ANY_VALUE(numflag) AS numflag,
           ANY_VALUE(itemstatus) AS itemstatus,
           ANY_VALUE(tenderstatus) AS tenderstatus,
           ANY_VALUE(charflag) AS charflag,
           ANY_VALUE(varflag) AS varflag,
           ANY_VALUE(batchHeaderID) AS batchHeaderID,
           ANY_VALUE(local) AS local,
           ANY_VALUE(organic) AS organic,
           ANY_VALUE(display) AS display,
           ANY_VALUE(receipt) AS receipt,
           ANY_VALUE(store) AS store,
           ANY_VALUE(branch) AS branch,
           ANY_VALUE(match_id) AS match_id,
           ANY_VALUE(trans_id) AS trans_id
    FROM ({union_query})
    GROUP BY card_no
    ORDER BY RAND()

"""

# LIMIT 1000000  -- Adjust this value to control the number of unique owners
# Step 3: Execute the query to fetch the sampled records
df_unique_owners = client.query(distinct_owners_query).to_dataframe()

# Step 4: Check the size of the sample
print(f"Data size: {df_unique_owners.memory_usage(deep=True).sum() / (1024 * 1024)} MB")
print(f"Number of unique owners (card_no): {df_unique_owners['card_no'].nunique()}")
print(f"Total rows (should match unique owners): {len(df_unique_owners)}")

# Step 5: Write the filtered sample (one record per owner) to a CSV file
# df_unique_owners.to_csv("owner_sample_one_per_owner.csv", index=False)

# print("Data exported to owner_sample_one_per_owner.csv")




Data size: 15.85622501373291 MB
Number of unique owners (card_no): 25941
Total rows (should match unique owners): 25941


In [27]:
from google.cloud import bigquery
import pandas as pd

# Deliverable 1: Connect to your GBQ instance
client = bigquery.Client()

# List of tables from umt-msba project (excluding inactive ones)
table_names = [
    'transArchive_201001_201003', 'transArchive_201004_201006', 'transArchive_201007_201009', 
    'transArchive_201010_201012', 'transArchive_201101_201103', 'transArchive_201104', 
    'transArchive_201105', 'transArchive_201106', 'transArchive_201107_201109', 
    'transArchive_201110_201112', 'transArchive_201201_201203', 'transArchive_201204_201206', 
    'transArchive_201207_201209', 'transArchive_201210_201212', 'transArchive_201301_201303', 
    'transArchive_201304_201306', 'transArchive_201307_201309', 'transArchive_201310_201312', 
    'transArchive_201401_201403', 'transArchive_201404_201406', 'transArchive_201407_201409', 
    'transArchive_201410_201412', 'transArchive_201501_201503', 'transArchive_201504_201506', 
    'transArchive_201507_201509', 'transArchive_201510', 'transArchive_201511', 'transArchive_201512', 
    'transArchive_201601', 'transArchive_201602', 'transArchive_201603', 'transArchive_201604', 
    'transArchive_201605', 'transArchive_201606', 'transArchive_201607', 'transArchive_201608', 
    'transArchive_201609', 'transArchive_201610', 'transArchive_201611', 'transArchive_201612', 
    'transArchive_201701'
]

# Step 1: Build a UNION query across all tables, excluding non-owners (card_no != 3)
union_query = " UNION ALL ".join([
    f"SELECT * FROM `umt-msba.transactions.{table_name}` WHERE card_no != 3" 
    for table_name in table_names
])

# Step 2: Sample distinct owners by card_no, ensuring no duplicates, and randomizing
# Start with a small sample for estimation (e.g., LIMIT 1000)
sample_query = f"""
    SELECT card_no,
           ANY_VALUE(datetime) AS datetime,
           ANY_VALUE(register_no) AS register_no,
           ANY_VALUE(emp_no) AS emp_no,
           ANY_VALUE(trans_no) AS trans_no,
           ANY_VALUE(upc) AS upc,
           ANY_VALUE(description) AS description,
           ANY_VALUE(trans_type) AS trans_type,
           ANY_VALUE(trans_subtype) AS trans_subtype,
           ANY_VALUE(trans_status) AS trans_status,
           ANY_VALUE(department) AS department,
           ANY_VALUE(quantity) AS quantity,
           ANY_VALUE(Scale) AS Scale,
           ANY_VALUE(cost) AS cost,
           ANY_VALUE(unitPrice) AS unitPrice,
           ANY_VALUE(total) AS total,
           ANY_VALUE(regPrice) AS regPrice,
           ANY_VALUE(altPrice) AS altPrice,
           ANY_VALUE(tax) AS tax,
           ANY_VALUE(taxexempt) AS taxexempt,
           ANY_VALUE(foodstamp) AS foodstamp,
           ANY_VALUE(wicable) AS wicable,
           ANY_VALUE(discount) AS discount,
           ANY_VALUE(memDiscount) AS memDiscount,
           ANY_VALUE(discountable) AS discountable,
           ANY_VALUE(discounttype) AS discounttype,
           ANY_VALUE(voided) AS voided,
           ANY_VALUE(percentDiscount) AS percentDiscount,
           ANY_VALUE(ItemQtty) AS ItemQtty,
           ANY_VALUE(volDiscType) AS volDiscType,
           ANY_VALUE(volume) AS volume,
           ANY_VALUE(VolSpecial) AS VolSpecial,
           ANY_VALUE(mixMatch) AS mixMatch,
           ANY_VALUE(matched) AS matched,
           ANY_VALUE(memType) AS memType,
           ANY_VALUE(staff) AS staff,
           ANY_VALUE(numflag) AS numflag,
           ANY_VALUE(itemstatus) AS itemstatus,
           ANY_VALUE(tenderstatus) AS tenderstatus,
           ANY_VALUE(charflag) AS charflag,
           ANY_VALUE(varflag) AS varflag,
           ANY_VALUE(batchHeaderID) AS batchHeaderID,
           ANY_VALUE(local) AS local,
           ANY_VALUE(organic) AS organic,
           ANY_VALUE(display) AS display,
           ANY_VALUE(receipt) AS receipt,
           ANY_VALUE(store) AS store,
           ANY_VALUE(branch) AS branch,
           ANY_VALUE(match_id) AS match_id,
           ANY_VALUE(trans_id) AS trans_id
    FROM ({union_query})
    GROUP BY card_no
    ORDER BY RAND()
    LIMIT 1000  -- Take a small sample for estimation
"""

# Step 3: Execute the query to fetch the small sample
df_sample = client.query(sample_query).to_dataframe()

# Step 4: Check the size of the small sample
sample_size_mb = df_sample.memory_usage(deep=True).sum() / (1024 * 1024)
print(f"Sample size: {sample_size_mb} MB")

# Step 5: Estimate the number of rows needed for 250 MB
# If 1000 rows = sample_size_mb, then target_rows = (250 / sample_size_mb) * 1000
target_rows = int((250 / sample_size_mb) * 1000)
print(f"Estimated number of rows needed for ~250 MB: {target_rows}")



Sample size: 0.6115951538085938 MB
Estimated number of rows needed for ~250 MB: 408767
