In [None]:
from google.cloud import bigquery
import pandas as pd
import os
from pandas_gbq import to_gbq


In [None]:
# BigQuery destination: the Google Cloud project used for the client and the
# dataset that receives one table per uploaded file.
project_id, dataset_id = "juliehilley", "wedge_B"

In [None]:
# BigQuery client bound to the configured project.
client = bigquery.Client(project=project_id)

# Keep the client's credentials object so the same identity can be passed to
# the pandas-gbq upload.
# NOTE(review): `_credentials` is a private attribute of the client and may
# change between library versions — confirm there is no public accessor to use.
credentials = client._credentials


## Upload Plan

1. Get all of the file names
1. One at a time, read in each file
1. Define table name
1. Upload data


In [None]:
# Directory holding the cleaned CSV files to upload, relative to this notebook.
# The trailing slash matters: file names are appended to this string directly.
data_path = "../data/clean-files/"


In [None]:
# Names of the CSV files in the data directory, in a stable (sorted) order.
# os.listdir returns every directory entry in arbitrary order, so entries that
# are not CSV files (hidden files, checkpoint folders, ...) are filtered out
# here rather than being handed to pd.read_csv later on.
clean_files = sorted(
    entry for entry in os.listdir(data_path) if entry.endswith(".csv")
)

In [None]:
# Sanity-check a single file before processing everything: iterating over a
# one-element slice inspects only the first file and is a no-op when the
# listing is empty.
for file_name in clean_files[:1]:
    df = pd.read_csv(data_path + file_name)

    # Column dtypes as inferred by pandas
    print(f"Inspecting DataFrame for {file_name}:")
    print(df.dtypes)

    # Number of missing values in each column
    print(f"Missing values in {file_name}:")
    print(df.isna().sum())

    print("\n-------------------------------------\n")


In [None]:
# Work on the 'memType' column of the most recently loaded DataFrame.
mem_type = df['memType']

# Frequency of every distinct value, with missing values counted as well
print("Unique values in 'memType' and their counts:")
mem_type_counts = mem_type.value_counts(dropna=False)
print(mem_type_counts)

# Frequency of the Python type of each stored value (reveals mixed-type columns)
print("\nData types of the values in 'memType':")
mem_type_kinds = mem_type.apply(type).value_counts()
print(mem_type_kinds)


In [None]:
# Clean every file in `clean_files` and upload it to BigQuery as its own table
# named after the file (without the "_clean.csv" suffix).

# Columns that are not wanted in BigQuery; whichever of them a file contains
# is dropped.
# NOTE(review): 'Itemstatus' differs only by case from the 'itemstatus' column
# filled below — confirm which spelling the files actually use.
columns_to_drop = [
    'charflag', 'display', 'receipt', 'numflag', 'Itemstatus', 'tenderstatus',
    'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'batchHeaderID',
    'varflag', 'discountable', 'discounttype', 'percentDiscount', 'ItemQtty', 'staff',
]

# Numeric columns whose missing values are replaced with 0. 'tenderstatus' and
# 'batchHeaderID' are intentionally not listed: both are in columns_to_drop,
# so they can never be present by the time the fill runs.
numeric_columns = ['taxexempt', 'itemstatus']

# File-name suffix that is stripped to obtain the BigQuery table name.
clean_suffix = "_clean.csv"

for file_name in clean_files:
    # Only "<table>_clean.csv" names yield a usable table name; anything else
    # (stray files, folders) is reported and skipped instead of failing midway.
    if not file_name.endswith(clean_suffix):
        print(f"Skipping {file_name}: expected a name ending in {clean_suffix}")
        continue

    # os.path.join works whether or not data_path ends with a separator.
    df = pd.read_csv(os.path.join(data_path, file_name))

    # errors='ignore' skips listed columns that this file does not contain.
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Replace NaN and empty strings with 'Unknown' in 'memType', then store
    # every value as a string so the column has a single type.
    df['memType'] = df['memType'].fillna('Unknown').replace('', 'Unknown').astype(str)

    # Parse 'datetime'; errors='coerce' turns unparseable values into NaT
    # instead of raising.
    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

    # Fill missing values in the numeric columns that this file contains.
    for col in numeric_columns:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Strip the suffix from the end only (never from the middle of the name).
    table_name = file_name[:-len(clean_suffix)]
    full_table_name = f'{dataset_id}.{table_name}'

    # Upload to BigQuery; if_exists='replace' overwrites an existing table of
    # the same name.
    to_gbq(df, full_table_name, project_id=project_id, credentials=credentials, if_exists='replace')
