# Task 1 (Part 2): Upload to Big Query

This second part of Task 1 takes every `*_processed.csv` file in the `processed_files` folder and uploads each one to its own table in Google BigQuery.

#### Import Libraries

In [1]:
import os
import pandas as pd
from pandas_gbq import to_gbq
from google.cloud import bigquery


import re


#### Configure Google Big Query Project

In [2]:
# Google Cloud project and BigQuery dataset that receive the uploaded tables.
gbq_project_ID = "wedge-project-np"
gbq_dataset_ID = "wedge"

# Client used for the load jobs below, bound to the project above.
client = bigquery.Client(project=gbq_project_ID)
# Credentials handed to pandas-gbq in the toolbox cell. `_credentials` is a private
# attribute of the client, so this line may need revisiting after a library upgrade.
credentials = client._credentials

#### Set Data Path

In [3]:
# Folder (relative to the notebook's working directory) that is scanned for the
# `*_processed.csv` files to upload.
data_directory = "Data/processed_files/"

#### Setup Big Query Schema

In [5]:
# Define the Big Query schema as (column name, BigQuery type) pairs.
# Keep the pairs in the same order as the columns of the processed CSV files: the load
# job skips the header row, so CSV columns are assigned to these fields by position.
# NOTE(review): `memType`, `staff`, `batchHeaderID` and `display` are BOOLEAN while the
# other numeric-looking columns are FLOAT -- confirm this matches the processed data.
schema_columns = [
    ("datetime", "TIMESTAMP"),
    ("register_no", "FLOAT"),
    ("emp_no", "FLOAT"),
    ("trans_no", "FLOAT"),
    ("upc", "STRING"),
    ("description", "STRING"),
    ("trans_type", "STRING"),
    ("trans_subtype", "STRING"),
    ("trans_status", "STRING"),
    ("department", "FLOAT"),
    ("quantity", "FLOAT"),
    ("Scale", "FLOAT"),
    ("cost", "FLOAT"),
    ("unitPrice", "FLOAT"),
    ("total", "FLOAT"),
    ("regPrice", "FLOAT"),
    ("altPrice", "FLOAT"),
    ("tax", "FLOAT"),
    ("taxexempt", "FLOAT"),
    ("foodstamp", "FLOAT"),
    ("wicable", "FLOAT"),
    ("discount", "FLOAT"),
    ("memDiscount", "FLOAT"),
    ("discountable", "FLOAT"),
    ("discounttype", "FLOAT"),
    ("voided", "FLOAT"),
    ("percentDiscount", "FLOAT"),
    ("ItemQtty", "FLOAT"),
    ("volDiscType", "FLOAT"),
    ("volume", "FLOAT"),
    ("VolSpecial", "FLOAT"),
    ("mixMatch", "FLOAT"),
    ("matched", "FLOAT"),
    ("memType", "BOOLEAN"),
    ("staff", "BOOLEAN"),
    ("numflag", "FLOAT"),
    ("itemstatus", "FLOAT"),
    ("tenderstatus", "FLOAT"),
    ("charflag", "STRING"),
    ("varflag", "FLOAT"),
    ("batchHeaderID", "BOOLEAN"),
    ("local", "FLOAT"),
    ("organic", "FLOAT"),
    ("display", "BOOLEAN"),
    ("receipt", "FLOAT"),
    ("card_no", "FLOAT"),
    ("store", "FLOAT"),
    ("branch", "FLOAT"),
    ("match_id", "FLOAT"),
    ("trans_id", "FLOAT"),
]

# Every column is NULLABLE, so the mode is applied once here instead of per line.
schema = [
    bigquery.SchemaField(column_name, column_type, mode="NULLABLE")
    for column_name, column_type in schema_columns
]

# The load-job configuration that consumes `schema` is built in the
# "Configure Big Query" cell that follows; a throw-away LoadJobConfig is no longer
# created here because that cell overwrote it immediately.

#### Configure Big Query

In [6]:
# Load-job settings shared by every file upload in the loop below.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # Skip header row
    # Overwrite the destination table on every run. With WRITE_APPEND, re-running the
    # notebook added a second copy of every row to tables that already existed.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    # schema_update_options is intentionally not set: BigQuery accepts it only with
    # WRITE_APPEND (or with WRITE_TRUNCATE on a table partition), and a truncating
    # load replaces the table schema with the explicit `schema` anyway.
)

#### Setup Data Directories

In [7]:
# Suffix shared by the files this notebook uploads; it is stripped to form table names.
file_ending = "_processed.csv"

# Names of the processed CSVs only, sorted so that "the first file" is deterministic.
# A bare os.listdir() returns every entry (hidden files, sub-folders, ...) in
# arbitrary order.
clean_files = sorted(
    entry for entry in os.listdir(data_directory) if entry.endswith(file_ending)
)

### Loop over to upload all Processed Files

This loop takes each `_processed.csv` file in the `processed_files` folder, strips that suffix from the file name to get the table name, uploads the file to Big Query using the configuration settings above, and prints an "uploaded successfully" message.

In [8]:
# Upload every processed CSV in `data_directory` to its own BigQuery table.
# sorted() makes the upload order reproducible; os.listdir() alone is unordered.
for file_name in sorted(os.listdir(data_directory)):
    # Ignore anything that is not a processed CSV.
    if not file_name.endswith(file_ending):
        continue

    # Construct full path and table name
    full_path = os.path.join(data_directory, file_name)
    # Slice off only the trailing suffix; str.replace() would also remove the same
    # text if it happened to appear earlier in the name.
    table_name = file_name[: -len(file_ending)]
    full_table_name = f"{gbq_dataset_ID}.{table_name}"  # Full table name

    # Guard against entries that are not regular files (e.g. a folder whose name
    # ends with the suffix) or that disappeared after the directory was listed.
    if not os.path.isfile(full_path):
        print(f"File not found: {file_name}")
        continue

    print(f"Uploading {file_name} to BigQuery...")

    # Open the CSV in binary mode and hand it to a BigQuery load job.
    with open(full_path, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            full_table_name,
            job_config=job_config,
        )
        # Block until the load job finishes; a failed job raises here and stops the
        # loop, so a bad file is never reported as uploaded.
        job.result()
        print(f"{file_name} uploaded successfully to {full_table_name}.")
        

Uploading transArchive_201001_201003_processed.csv to BigQuery...
transArchive_201001_201003_processed.csv uploaded successfully to wedge.transArchive_201001_201003.
Uploading transArchive_201004_201006_processed.csv to BigQuery...
transArchive_201004_201006_processed.csv uploaded successfully to wedge.transArchive_201004_201006.
Uploading transArchive_201007_201009_processed.csv to BigQuery...
transArchive_201007_201009_processed.csv uploaded successfully to wedge.transArchive_201007_201009.
Uploading transArchive_201010_201012_processed.csv to BigQuery...
transArchive_201010_201012_processed.csv uploaded successfully to wedge.transArchive_201010_201012.
Uploading transArchive_201101_201103_processed.csv to BigQuery...
transArchive_201101_201103_processed.csv uploaded successfully to wedge.transArchive_201101_201103.
Uploading transArchive_201104_processed.csv to BigQuery...
transArchive_201104_processed.csv uploaded successfully to wedge.transArchive_201104.
Uploading transArchive_20

#### Toolbox

This tool uploads just one file with `pandas_gbq` instead of the load-job configuration above. I used it several times when I could not get my configuration set up correctly and just wanted to test upload speed, etc.

In [None]:
###################################################################
####### Tool to Test 1 Table  #####################################
###################################################################

# Take the first processed CSV from `clean_files`; entries without the expected suffix
# are skipped so a stray file in the folder cannot be picked up by accident.
file_name = next((entry for entry in clean_files if entry.endswith(file_ending)), None)

if file_name is None:
    # Nothing to test with -- say so instead of silently doing nothing.
    print(f"No files ending in {file_ending} found in {data_directory}")
else:
    df = pd.read_csv(os.path.join(data_directory, file_name))
    table_name = file_name[: -len(file_ending)]  # strip only the trailing suffix

    full_table_name = f"{gbq_dataset_ID}.{table_name}"
    # WARNING: if_exists="replace" overwrites a table of the same name, and no
    # explicit schema is passed here, so the result may differ from the tables
    # created by the load-job loop.
    to_gbq(df, full_table_name, project_id=gbq_project_ID, if_exists="replace", credentials=credentials)

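This tool deletes the listed tables from BigQuery. I could not get my configuration to work with the truncate write disposition (`WRITE_TRUNCATE`), so any time a table with a matching name already existed I had to delete it manually. (A likely cause: BigQuery accepts `schema_update_options` only with `WRITE_APPEND`, or with `WRITE_TRUNCATE` when the destination is a table partition.)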

In [None]:
###################################################################
####### Tool to Delete GBQ Tables   ###############################
###################################################################

# Project, dataset and client are set up again here so this tool can be run
# without first running the configuration cell at the top of the notebook.
gbq_project_ID = "wedge-project-np"
gbq_dataset_ID = "wedge"

client = bigquery.Client(project=gbq_project_ID)
# `_credentials` is a private attribute of the client; kept for parity with the
# configuration cell at the top.
credentials = client._credentials

def delete_tables(gbq_dataset_ID, table_names):
    """
    Remove each named table from a single BigQuery dataset, printing progress.

    Relies on the module-level `client` and `gbq_project_ID`. A table that does
    not exist is not an error (`not_found_ok=True`). Any exception raised while
    deleting one table is printed and the remaining tables are still processed.

    Args:
        gbq_dataset_ID: ID of the dataset that contains the tables.
        table_names: Iterable of table names, without project or dataset prefix.

    Returns:
        None.
    """
    for current_name in table_names:
        # Fully qualified ID in the form "<project>.<dataset>.<table>".
        table_id = f"{gbq_project_ID}.{gbq_dataset_ID}.{current_name}"
        try:
            print(f"Deleting table: {table_id}...")
            client.delete_table(table_id, not_found_ok=True)
            print(f"Deleted table: {table_id}.")
        except Exception as error:
            # Best effort: report the failure and move on to the next table.
            print(f"Failed to delete {table_id}: {error}")

# Example usage: the tables to remove from the dataset.
# NOTE(review): this list is hard-coded; presumably it mirrors the table names produced
# by the upload loop (file name minus "_processed.csv") -- verify before running.

table_names = [
    "transArchive_201001_201003", 
    "transArchive_201004_201006", 
    "transArchive_201007_201009",
    "transArchive_201010_201012",
    "transArchive_201101_201103",
    "transArchive_201104",
    "transArchive_201105",
    "transArchive_201106",
    "transArchive_201107_201109",
    "transArchive_201110_201112",
    "transArchive_201201_201203",
    "transArchive_201201_201203_inactive",
    "transArchive_201204_201206",
    "transArchive_201204_201206_inactive",
    "transArchive_201207_201209",
    "transArchive_201207_201209_inactive",
    "transArchive_201210_201212",
    "transArchive_201210_201212_inactive",
    "transArchive_201301_201303",
    "transArchive_201301_201303_inactive",
    "transArchive_201304_201306",
    "transArchive_201304_201306_inactive",
    "transArchive_201307_201309",
    "transArchive_201307_201309_inactive",
    "transArchive_201310_201312",
    "transArchive_201310_201312_inactive",
    "transArchive_201401_201403",
    "transArchive_201401_201403_inactive",
    "transArchive_201404_201406",
    "transArchive_201404_201406_inactive",
    "transArchive_201407_201409",
    "transArchive_201407_201409_inactive",
    "transArchive_201410_201412",
    "transArchive_201410_201412_inactive",
    "transArchive_201501_201503",
    "transArchive_201504_201506",
    "transArchive_201507_201509",
    "transArchive_201510",
    "transArchive_201511",
    "transArchive_201512",
    "transArchive_201601",
    "transArchive_201602",
    "transArchive_201603",
    "transArchive_201604",
    "transArchive_201605",
    "transArchive_201606",
    "transArchive_201607",
    "transArchive_201608",
    "transArchive_201609",
    "transArchive_201610",
    "transArchive_201611",
    "transArchive_201612",
    "transArchive_201701"
] 

# Call the function to delete tables.
# This runs as soon as the cell is executed and removes every table listed above from
# the `gbq_dataset_ID` dataset.
delete_tables(gbq_dataset_ID, table_names)