Step 1: Extract the Main Zip File

In [21]:
import zipfile
import os
import shutil

def extract_main_zip(main_zip_file, extract_to_folder):
    # Ensure the extraction folder exists
    os.makedirs(extract_to_folder, exist_ok=True)

    # Open the main zip file
    with zipfile.ZipFile(main_zip_file, 'r') as main_zip:
        # Loop through all files in the main zip file
        for zip_info in main_zip.infolist():
            # Skip files with "_inactive" in their names
            if '_inactive' in zip_info.filename:
                print(f"Skipping {zip_info.filename}, marked as inactive.")
                continue
            
            # Create the full output path
            output_file_path = os.path.join(extract_to_folder, zip_info.filename)
            
            # Check if the file already exists
            if not os.path.exists(output_file_path):
                # Create any necessary directories
                os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
                
                # Extract the file
                with main_zip.open(zip_info) as source, open(output_file_path, 'wb') as target:
                    shutil.copyfileobj(source, target)
                print(f"Extracted {zip_info.filename} to {output_file_path}")
            else:
                print(f"Skipping {zip_info.filename}, already exists.")

# Example usage
main_zip = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/WedgeZipOfZips(raw).zip'
extract_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_main_zip'

extract_main_zip(main_zip, extract_folder)




Skipping transArchive_201001_201003.zip, already exists.
Skipping transArchive_201004_201006.zip, already exists.
Skipping transArchive_201007_201009.zip, already exists.
Skipping transArchive_201010_201012.zip, already exists.
Skipping transArchive_201101_201103.zip, already exists.
Skipping transArchive_201104.zip, already exists.
Skipping transArchive_201105.zip, already exists.
Skipping transArchive_201106.zip, already exists.
Skipping transArchive_201107_201109.zip, already exists.
Skipping transArchive_201110_201112.zip, already exists.
Skipping transArchive_201201_201203.zip, already exists.
Skipping transArchive_201201_201203_inactive.zip, marked as inactive.
Skipping transArchive_201204_201206.zip, already exists.
Skipping transArchive_201204_201206_inactive.zip, marked as inactive.
Skipping transArchive_201207_201209.zip, already exists.
Skipping transArchive_201207_201209_inactive.zip, marked as inactive.
Skipping transArchive_201210_201212.zip, already exists.
Skipping tran

Step 2: Extract the Nested Zip Files

In [22]:
import zipfile
import os

def extract_all_csvs_to_one_folder(extract_folder, output_folder):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Walk through the extracted folder and look for zip files
    for root, dirs, files in os.walk(extract_folder):
        for file in files:
            if file.endswith('.zip'):
                nested_zip_path = os.path.join(root, file)
                
                # Check if the file is a valid zip file before proceeding
                try:
                    with zipfile.ZipFile(nested_zip_path, 'r') as nested_zip:
                        for zip_info in nested_zip.infolist():
                            if zip_info.filename.endswith('.csv'):
                                output_file_path = os.path.join(output_folder, zip_info.filename)
                                # Check if the CSV file already exists in the output folder
                                if not os.path.exists(output_file_path):
                                    # Extract the CSV if it doesn't already exist
                                    nested_zip.extract(zip_info, output_folder)
                                    print(f"Extracted {zip_info.filename} to {output_folder}")
                                else:
                                    print(f"Skipping {zip_info.filename}, already exists.")
                except zipfile.BadZipFile:
                    print(f"Skipping {nested_zip_path}, not a valid zip file.")

# Example usage
extract_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_main_zip'
output_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files'  

extract_all_csvs_to_one_folder(extract_folder, output_folder)




Skipping transArchive_201001_201003.csv, already exists.
Skipping transArchive_201004_201006.csv, already exists.
Skipping transArchive_201007_201009.csv, already exists.
Skipping transArchive_201010_201012.csv, already exists.
Skipping transArchive_201101_201103.csv, already exists.
Skipping transArchive_201104.csv, already exists.
Skipping transArchive_201105.csv, already exists.
Skipping transArchive_201106.csv, already exists.
Skipping transArchive_201107_201109.csv, already exists.
Skipping transArchive_201110_201112.csv, already exists.
Skipping transArchive_201201_201203.csv, already exists.
Skipping transArchive_201204_201206.csv, already exists.
Skipping transArchive_201207_201209.csv, already exists.
Skipping transArchive_201210_201212.csv, already exists.
Skipping transArchive_201301_201303.csv, already exists.
Skipping transArchive_201304_201306.csv, already exists.
Skipping transArchive_201307_201309.csv, already exists.
Skipping transArchive_201310_201312.csv, already exi

Step 3: Standardize the CSV Files

In [23]:
import pandas as pd
import glob
import os

def clean_and_standardize_file(input_file, output_file):
    try:
        print(f"Processing file: {input_file}")
        
        # Detect delimiter automatically and load the entire file
        df = pd.read_csv(input_file, sep=None, engine='python')  # Removed nrows parameter to load the full file
        print(f"Successfully read {input_file}")
        
        # Replace different forms of NULL values with None/NaN
        df.replace({"NULL": None, r"\\N": None, r"\N": None}, inplace=True)
        
        # Save the cleaned file with a standard delimiter (comma)
        df.to_csv(output_file, index=False, sep=",")
        print(f"Standardized and saved {input_file} to {output_file}")
    except pd.errors.EmptyDataError:
        print(f"Error: {input_file} is empty.")
    except FileNotFoundError:
        print(f"Error: {input_file} not found.")
    except Exception as e:
        print(f"Error processing {input_file}: {e}")

def process_extracted_csvs(extracted_folder, output_folder):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # Get all CSV files in the extracted folder
    csv_files = glob.glob(f"{extracted_folder}/**/*.csv", recursive=True)
    print(f"Found {len(csv_files)} CSV files to process.")
    
    # Process each file one by one (now processing the full file)
    for csv_file in csv_files:
        output_file = os.path.join(output_folder, os.path.basename(csv_file))
        clean_and_standardize_file(csv_file, output_file)
    
    # After processing, list all files in the output directory
    saved_files = glob.glob(f"{output_folder}/*.csv")
    print(f"\nSaved {len(saved_files)} files to {output_folder}:")
    for file in saved_files:
        print(file)

# Example usage
extracted_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files'  # Folder where your extracted CSVs are located
output_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files'  # Folder to save cleaned CSVs

# Process the full files instead of samples
process_extracted_csvs(extracted_folder, output_folder)


Found 41 CSV files to process.
Processing file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201001_201003.csv
Successfully read D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201001_201003.csv
Standardized and saved D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201001_201003.csv to D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201001_201003.csv
Processing file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201004_201006.csv
Successfully read D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201004_201006.csv
Standardized and saved D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201004_201006.csv to D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201004_201006.csv
Processing file: 

KeyboardInterrupt: 

Check delimiter and Nulls

In [15]:
# Delimiter check and null check
import pandas as pd
import glob

def check_delimiters_and_nulls(output_folder):
    # Get all CSV files in the output folder
    csv_files = glob.glob(f"{output_folder}/*.csv")
    
    for csv_file in csv_files:
        try:
            # Check if the file can be loaded with a comma as the delimiter
            print(f"\nChecking file: {csv_file}")
            df = pd.read_csv(csv_file, sep=",")
            print(f"File loaded successfully with comma delimiter.")
            
            # Check for unstandardized null values (e.g., "NULL", "\N", "\\N")
            unstandardized_nulls = df.isin(["NULL", r"\N", r"\\N"]).sum().sum()
            if unstandardized_nulls == 0:
                print("No unstandardized null values found.")
            else:
                print(f"Found {unstandardized_nulls} instances of unstandardized null values.")
        
        except Exception as e:
            print(f"Error processing {csv_file}: {e}")

# Example usage
output_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files'
check_delimiters_and_nulls(output_folder)



Checking file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201001_201003.csv
File loaded successfully with comma delimiter.
No unstandardized null values found.

Checking file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201004_201006.csv
File loaded successfully with comma delimiter.
No unstandardized null values found.

Checking file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201007_201009.csv
File loaded successfully with comma delimiter.
No unstandardized null values found.

Checking file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201010_201012.csv
File loaded successfully with comma delimiter.
No unstandardized null values found.

Checking file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201101_201103.csv
File loaded successfully with comma delimiter.
No unst

Cleaning Collumn Headers

In [16]:
import pandas as pd
import os
import glob

def get_reference_headers(reference_file):
    """Reads the reference file to get the correct headers."""
    try:
        df = pd.read_csv(reference_file, nrows=0)  # Only read the header
        return list(df.columns)
    except Exception as e:
        print(f"Error reading reference file: {e}")
        return []

def fix_headers_in_file(input_file, output_file, correct_headers):
    """Checks and fixes the headers of the given file."""
    try:
        df = pd.read_csv(input_file)

        # Compare current headers to reference headers
        current_headers = list(df.columns)

        if current_headers != correct_headers:
            print(f"Fixing headers for file: {input_file}")
            
            # Shift the data down by one row
            df = pd.read_csv(input_file, header=None)
            df.columns = correct_headers  # Set the correct headers

        # Save the corrected file
        df.to_csv(output_file, index=False)
        print(f"Saved fixed file: {output_file}")

    except Exception as e:
        print(f"Error processing {input_file}: {e}")

def process_csv_files(input_folder, output_folder, reference_file):
    """Iterates over all CSV files and fixes headers if needed."""
    # Get the correct headers from the reference file
    correct_headers = get_reference_headers(reference_file)

    if not correct_headers:
        print("No valid reference headers found, aborting.")
        return

    # Get all CSV files in the input folder
    csv_files = glob.glob(os.path.join(input_folder, '*.csv'))

    print(f"Found {len(csv_files)} CSV files to process.")

    # Iterate over each CSV file and fix the headers
    for csv_file in csv_files:
        output_file = os.path.join(output_folder, os.path.basename(csv_file))
        fix_headers_in_file(csv_file, output_file, correct_headers)

# Example usage
input_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files'
output_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files'
reference_file = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files/transArchive_201001_201003.csv'  # Path to the reference CSV

# Process the files
process_csv_files(input_folder, output_folder, reference_file)



Found 39 CSV files to process.
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201001_201003.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201004_201006.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201007_201009.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201010_201012.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201101_201103.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201104.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201105.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201106.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201107_201109.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201110_201112.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201201_201203.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201204_201206.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201207_201209.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201210_201212.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201301_201303.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201304_201306.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201307_201309.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201310_201312.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201401_201403.csv


  df = pd.read_csv(input_file)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201404_201406.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201407_201409.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201410_201412.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201504_201506.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201507_201509.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201511.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201511.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201512.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201512.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201601.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201601.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201602.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201602.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201603.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201603.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201604.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201604.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201605.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201605.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201606.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201606.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201607.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201607.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201608.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201608.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201609.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201609.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201610.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201610.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201611.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201611.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201612.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201612.csv
Fixing headers for file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/standardized_csv_files\transArchive_201701.csv


  df = pd.read_csv(input_file, header=None)


Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201701.csv


Check Column Headers

In [17]:
import pandas as pd
import os
import glob

def check_column_names(input_folder):
    # Get all CSV files in the input folder
    csv_files = glob.glob(os.path.join(input_folder, '*.csv'))
    
    # Dictionary to store columns for each file
    file_columns = {}
    
    print(f"Found {len(csv_files)} CSV files to check.")

    # Variable to store the reference column set (from the first file)
    reference_columns = None

    for csv_file in csv_files:
        try:
            # Load only the header (first row) to check column names
            df = pd.read_csv(csv_file, nrows=0)
            columns = list(df.columns)

            file_columns[csv_file] = columns

            # Compare columns with the reference (first CSV file)
            if reference_columns is None:
                reference_columns = columns  # Set the first file as the reference
                print(f"Setting reference columns from: {os.path.basename(csv_file)}")
            else:
                if columns != reference_columns:
                    print(f"WARNING: {os.path.basename(csv_file)} has different columns!")
                    print(f"Expected columns: {reference_columns}")
                    print(f"Found columns: {columns}")
                else:
                    print(f"{os.path.basename(csv_file)} has consistent columns.")
        
        except Exception as e:
            print(f"Error reading {csv_file}: {e}")

    return file_columns

# Example usage
input_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files'
file_columns = check_column_names(input_folder)


Found 39 CSV files to check.
Setting reference columns from: transArchive_201001_201003.csv
transArchive_201004_201006.csv has consistent columns.
transArchive_201007_201009.csv has consistent columns.
transArchive_201010_201012.csv has consistent columns.
transArchive_201101_201103.csv has consistent columns.
transArchive_201104.csv has consistent columns.
transArchive_201105.csv has consistent columns.
transArchive_201106.csv has consistent columns.
transArchive_201107_201109.csv has consistent columns.
transArchive_201110_201112.csv has consistent columns.
transArchive_201201_201203.csv has consistent columns.
transArchive_201204_201206.csv has consistent columns.
transArchive_201207_201209.csv has consistent columns.
transArchive_201210_201212.csv has consistent columns.
transArchive_201301_201303.csv has consistent columns.
transArchive_201304_201306.csv has consistent columns.
transArchive_201307_201309.csv has consistent columns.
transArchive_201310_201312.csv has consistent col

Fixing Mismatched Data Types

In [26]:
import os
import glob
import pandas as pd

# Define the expected data types for each column
expected_dtypes = {
    "datetime": "datetime64[ns]",
    "register_no": "Int64",
    "emp_no": "Int64",
    "trans_no": "Int64",
    "upc": "string",
    "description": "string",
    "trans_type": "string",
    "trans_subtype": "string",
    "trans_status": "string",
    "department": "Int64",
    "quantity": "float64",
    "Scale": "Int64",
    "cost": "float64",
    "unitPrice": "float64",
    "total": "float64",
    "regPrice": "float64",
    "altPrice": "float64",
    "tax": "Int64",
    "taxexempt": "Int64",
    "foodstamp": "Int64",
    "wicable": "Int64",
    "discount": "float64",
    "memDiscount": "float64",
    "discountable": "Int64",
    "discounttype": "Int64",
    "voided": "Int64",
    "percentDiscount": "float64",
    "ItemQtty": "float64",
    "volDiscType": "Int64",
    "volume": "Int64",
    "VolSpecial": "float64",
    "mixMatch": "Int64",
    "matched": "Int64",
    "memType": "string",
    "staff": "Int64",
    "numflag": "Int64",
    "itemstatus": "Int64",
    "tenderstatus": "Int64",
    "charflag": "string",
    "varflag": "Int64",
    "batchHeaderID": "string",
    "local": "Int64",
    "organic": "string",
    "display": "string",
    "receipt": "Int64",
    "card_no": "Int64",
    "store": "Int64",
    "branch": "Int64",
    "match_id": "Int64",
    "trans_id": "Int64"
}

# Function to fix data type mismatches
def fix_dtypes(input_file, output_file):
    try:
        print(f"Fixing file: {input_file}")
        
        # Read the file without specifying data types
        df = pd.read_csv(input_file, low_memory=False)
        
        # Iterate over the expected data types and try to convert columns
        for column, expected_type in expected_dtypes.items():
            if column in df.columns:
                try:
                    # Attempt to convert the column to the expected type
                    if expected_type == "datetime64[ns]":
                        df[column] = pd.to_datetime(df[column], errors='coerce')  # Convert to datetime
                    else:
                        df[column] = df[column].astype(expected_type, errors='ignore')  # Convert to expected type
                except Exception as e:
                    print(f"Error converting {column} in {input_file}: {e}")
            else:
                print(f"Column '{column}' is missing from {input_file}, skipping conversion.")

        # Save the fixed file
        df.to_csv(output_file, index=False)
        print(f"Saved fixed file: {output_file}")

    except pd.errors.EmptyDataError:
        print(f"Error: {input_file} is empty.")
    except FileNotFoundError:
        print(f"Error: {input_file} not found.")
    except Exception as e:
        print(f"Error processing {input_file}: {e}")

# Process all CSVs to fix data type mismatches
def process_and_fix_all_csvs(input_folder, output_folder):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # Get all CSV files in the input folder
    csv_files = glob.glob(os.path.join(input_folder, '*.csv'))
    print(f"Found {len(csv_files)} CSV files to process.")
    
    # Fix each file and save the corrected version
    for csv_file in csv_files:
        # Construct output file path
        output_file = os.path.join(output_folder, os.path.basename(csv_file))
        
        # Fix data types and save the corrected file
        fix_dtypes(csv_file, output_file)

# Example usage
input_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files'  # Folder with standardized CSVs
output_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleanedV2_csv_files'  # Folder for cleaned CSVs

# Fix all CSVs
process_and_fix_all_csvs(input_folder, output_folder)


Found 39 CSV files to process.
Fixing file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201001_201003.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleanedV2_csv_files\transArchive_201001_201003.csv
Fixing file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201004_201006.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleanedV2_csv_files\transArchive_201004_201006.csv
Fixing file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201007_201009.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleanedV2_csv_files\transArchive_201007_201009.csv
Fixing file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201010_201012.csv
Saved fixed file: D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleanedV2_csv_files\transArchive_201010_201012.csv
Fixing file: D:/WedgeProject/

Step 4: Uploading to Google BigQuery

In [27]:
import os
import glob
from google.cloud import bigquery

def table_exists(client, dataset_id, table_name):
    """Check if a table already exists in BigQuery."""
    try:
        client.get_table(f'{dataset_id}.{table_name}')
        return True
    except Exception:
        # Table does not exist
        return False

def upload_csv_to_bigquery(client, dataset_id, table_name, csv_file, schema):
    try:
        # Configure the load job with schema
        job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.CSV,
            skip_leading_rows=1,  # Skipping header row if the CSV contains headers
            schema=schema  # Use the predefined schema
        )

        # Load data from CSV into BigQuery
        with open(csv_file, "rb") as source_file:
            load_job = client.load_table_from_file(source_file, f'{dataset_id}.{table_name}', job_config=job_config)

        # Wait for the load job to complete
        load_job.result()

        print(f"Loaded {load_job.output_rows} rows into {dataset_id}.{table_name}")
    
    except Exception as e:
        print(f"Error uploading {csv_file} to BigQuery: {e}")

def upload_all_csvs(input_folder, dataset_id, schema):
    # Initialize the BigQuery client
    client = bigquery.Client()

    # Get all CSV files in the input folder
    csv_files = glob.glob(os.path.join(input_folder, '*.csv'))
    
    print(f"Found {len(csv_files)} CSV files to upload.")
    
    # Iterate through each file and upload it
    for csv_file in csv_files:
        # Extract the base name of the CSV file to use as the table name
        table_name = os.path.basename(csv_file).replace('.csv', '')
        
        # Check if the table already exists
        if table_exists(client, dataset_id, table_name):
            print(f"Skipping {csv_file}, table {table_name} already exists in BigQuery.")
            continue
        
        # Upload the CSV to BigQuery if the table does not exist
        upload_csv_to_bigquery(client, dataset_id, table_name, csv_file, schema)

# Define your schema
schema = [
    bigquery.SchemaField("datetime", "TIMESTAMP"),
    bigquery.SchemaField("register_no", "INTEGER"),
    bigquery.SchemaField("emp_no", "INTEGER"),
    bigquery.SchemaField("trans_no", "INTEGER"),
    bigquery.SchemaField("upc", "STRING"),
    bigquery.SchemaField("description", "STRING"),
    bigquery.SchemaField("trans_type", "STRING"),
    bigquery.SchemaField("trans_subtype", "STRING"),
    bigquery.SchemaField("trans_status", "STRING"),
    bigquery.SchemaField("department", "INTEGER"),
    bigquery.SchemaField("quantity", "FLOAT"),
    bigquery.SchemaField("Scale", "INTEGER"),
    bigquery.SchemaField("cost", "FLOAT"),
    bigquery.SchemaField("unitPrice", "FLOAT"),
    bigquery.SchemaField("total", "FLOAT"),
    bigquery.SchemaField("regPrice", "FLOAT"),
    bigquery.SchemaField("altPrice", "FLOAT"),
    bigquery.SchemaField("tax", "INTEGER"),
    bigquery.SchemaField("taxexempt", "INTEGER"),  
    bigquery.SchemaField("foodstamp", "INTEGER"),
    bigquery.SchemaField("wicable", "INTEGER"),
    bigquery.SchemaField("discount", "FLOAT"),
    bigquery.SchemaField("memDiscount", "FLOAT"),
    bigquery.SchemaField("discountable", "INTEGER"),
    bigquery.SchemaField("discounttype", "INTEGER"),
    bigquery.SchemaField("voided", "INTEGER"),
    bigquery.SchemaField("percentDiscount", "FLOAT"),
    bigquery.SchemaField("ItemQtty", "FLOAT"),
    bigquery.SchemaField("volDiscType", "INTEGER"),
    bigquery.SchemaField("volume", "INTEGER"),
    bigquery.SchemaField("VolSpecial", "FLOAT"),
    bigquery.SchemaField("mixMatch", "INTEGER"),
    bigquery.SchemaField("matched", "INTEGER"),
    bigquery.SchemaField("memType", "STRING"),
    bigquery.SchemaField("staff", "INTEGER"),
    bigquery.SchemaField("numflag", "INTEGER"),
    bigquery.SchemaField("itemstatus", "INTEGER"),
    bigquery.SchemaField("tenderstatus", "INTEGER"),
    bigquery.SchemaField("charflag", "STRING"),
    bigquery.SchemaField("varflag", "INTEGER"),
    bigquery.SchemaField("batchHeaderID", "STRING"),
    bigquery.SchemaField("local", "INTEGER"),
    bigquery.SchemaField("organic", "STRING"),
    bigquery.SchemaField("display", "STRING"),
    bigquery.SchemaField("receipt", "INTEGER"),
    bigquery.SchemaField("card_no", "INTEGER"),
    bigquery.SchemaField("store", "INTEGER"),
    bigquery.SchemaField("branch", "INTEGER"),
    bigquery.SchemaField("match_id", "INTEGER"),
    bigquery.SchemaField("trans_id", "INTEGER"),
]


# Example usage
input_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files'  # Folder with cleaned CSVs
dataset_id = 'wedgeproject-rileyororke.transaction_tables'  # Your BigQuery dataset

# Upload all CSVs to BigQuery
upload_all_csvs(input_folder, dataset_id, schema)



Found 39 CSV files to upload.
Skipping D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201001_201003.csv, table transArchive_201001_201003 already exists in BigQuery.
Skipping D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201004_201006.csv, table transArchive_201004_201006 already exists in BigQuery.
Skipping D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201007_201009.csv, table transArchive_201007_201009 already exists in BigQuery.
Error uploading D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/cleaned_csv_files\transArchive_201010_201012.csv to BigQuery: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 100; errors: 100. Please look into the errors[] collection for more details.; reason: invalid, message: Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 100; errors: 100. Plea

KeyboardInterrupt: 