# Task One: To the Cloud!

In [1]:
#cleaning data
import pandas as pd
import numpy as np
from zipfile import ZipFile
import os
import io
import csv
import datetime

import data_functions as datafun

In [2]:
# Path to the outer archive; the processing cell expects its entries to be ZIP files.
outer = "Data/WedgeZipOfZips.zip"

# Empty dictionary set aside for delimiter bookkeeping (not populated in this cell).
delimiters = {}


In [3]:
# Column names, in file order, assigned to every loaded CSV. They act as the
# header for files that have none and replace the header of files that do.
schema_columns = """
    datetime register_no emp_no trans_no upc description
    trans_type trans_subtype trans_status department quantity
    Scale cost unitPrice total regPrice altPrice tax
    taxexempt foodstamp wicable discount memDiscount
    discountable discounttype voided percentDiscount ItemQtty
    volDiscType volume VolSpecial mixMatch matched memType
    staff numflag itemstatus tenderstatus charflag varflag
    batchHeaderID local organic display receipt card_no
    store branch match_id trans_id
""".split()



In [None]:
# Walk the nested archive: the outer ZIP holds inner ZIPs, and each inner ZIP
# holds CSV files. Every CSV is sniffed for its delimiter/header, loaded into a
# DataFrame with the schema column names, cleaned, and uploaded to BigQuery.

# Step 1: Open the outer ZIP file
with ZipFile(outer, 'r') as outer_zip:
    outer_files = outer_zip.namelist()

    # Only entries that are themselves ZIP archives get processed, so progress
    # is measured against these rather than against every outer entry.
    inner_archives = [name for name in outer_files if name.endswith('.zip')]

    processed_files = 0      # CSV files uploaded so far
    processed_archives = 0   # inner archives fully handled so far

    # Step 2: Loop through the inner ZIP files found in the outer ZIP
    for outer_file in inner_archives:
        print(f"Found inner ZIP file: {outer_file}")

        # Step 3: Open the inner ZIP entry and read it as a ZipFile object
        with outer_zip.open(outer_file) as inner_zip_file, ZipFile(inner_zip_file) as inner_zip:
            inner_files = inner_zip.namelist()

            for file in inner_files:
                if not file.endswith('.csv'):
                    continue
                print(f"Processing CSV file: {file}")

                # Step 4: Read the CSV file as UTF-8 text
                with inner_zip.open(file, 'r') as raw_csv:
                    csv_file = io.TextIOWrapper(raw_csv, encoding="utf-8")

                    # Read a sample for sniffing, then rewind so pandas
                    # parses the file from the beginning.
                    sample = csv_file.read(3030)
                    csv_file.seek(0)

                    # Use csv.Sniffer to detect the delimiter
                    sniffer = csv.Sniffer()
                    try:
                        dialect = sniffer.sniff(sample, delimiters=[',', ';', '\t', ':'])
                        delimiter = dialect.delimiter
                        print(f"Detected delimiter: {delimiter}")
                    except csv.Error:
                        delimiter = ','  # Default to comma if delimiter detection fails
                        print("Could not detect delimiter, using default ','")

                    # Check for headers. has_header() sniffs the sample again
                    # internally and can raise csv.Error itself, so it needs
                    # its own guard; otherwise the comma fallback above would
                    # be followed immediately by an unhandled exception.
                    try:
                        has_header = sniffer.has_header(sample)
                    except csv.Error:
                        # Fallback: treat the first row as a header only if its
                        # first field matches the first schema column name.
                        sample_lines = sample.splitlines()
                        first_field = sample_lines[0].split(delimiter)[0] if sample_lines else ""
                        has_header = first_field.strip().strip('"\'').lower() == schema_columns[0].lower()
                        print("Could not detect header, compared first field with schema instead")
                    print(f"Has header: {has_header}")

                    # Load the data; the schema column names replace any
                    # header row that exists in the file.
                    if not has_header:
                        df = pd.read_csv(csv_file, delimiter=delimiter, header=None, names=schema_columns)
                    else:
                        df = pd.read_csv(csv_file, delimiter=delimiter)
                        df.columns = schema_columns

                    # Step 5: Perform data cleaning
                    print("Cleaning data...")
                    df = datafun.clean_data(df)
                    print("Data cleaned.")

                    # Step 6: Upload to BigQuery; the table is named after the
                    # part of the CSV entry name before its first dot, lowercased.
                    table_name = "wedgeproject-438019.wedgeproject." + file.split(".")[0].lower()
                    datafun.upload_to_bigquery(df, table_name)

                    processed_files += 1

        # Report progress per completed inner archive so the figure ends at
        # exactly 100% regardless of how many CSVs each archive contains.
        processed_archives += 1
        progress = processed_archives / len(inner_archives) * 100
        print(f"Progress: {progress:.2f}%")


print("All files processed.")
