
# Task 1 Clean and Process the Zip Files


In [1]:
import os 
import io
from zipfile import ZipFile
import pandas as pd
import csv
import numpy as np
#from google.cloud import bigquery
#from pandas_gbq import to_gbq
#from google.oauth2 import service_account

### Questions:
1. Extra quotes on some headers???  
1. Loading with or without a header???
1. I did not find any duplicates, and I did not remove any null values.
1. When importing to GBQ I cannot get datetime to come in as TIMESTAMP vs string.

### Data Source Setup

In [2]:
# Folder that holds the raw ZIP archives.
data_directory = "Data/LargeZips/"

# Imported here because the notebook's import cell only brings in ZipFile.
from zipfile import is_zipfile

# Keep only real ZIP archives (sorted for a reproducible processing order) so
# stray entries such as .DS_Store or .ipynb_checkpoints never reach ZipFile.
zip_files = sorted(
    entry for entry in os.listdir(data_directory)
    if is_zipfile(os.path.join(data_directory, entry))
)

### Sniffing out the Delimiter 
This section identifies the delimiter for each csv file and stores it in a dictionary called **delimiters** with the file_name as the key. 

In [None]:
# Maps each CSV member name to the delimiter detected from its first line.
delimiters = {}

for current_zf in zip_files:
    # Open the current archive
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:

        # Iterate over each file inside the current zip file
        for file_name in zf.namelist():
            if file_name.endswith('/'):
                continue  # directory entry: nothing to sniff

            # Wrap the binary member so it reads as text; the context manager
            # closes the handle even if reading fails.
            with io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8") as input_file:
                sample = input_file.readline()

            try:
                dialect = csv.Sniffer().sniff(sample, delimiters=",;\t")
            except csv.Error as e:
                # One unsniffable file should not abort the whole pass; later
                # cells fall back to ',' for names missing from the dictionary.
                print(f"Could not determine the delimiter for {file_name}: {e}")
                continue

            delimiters[file_name] = dialect.delimiter

            print(" ".join(["For",
                            file_name,
                            "the delimiter is",
                            dialect.delimiter
                            ]))

### Checking for Headers

This section iterates over each CSV file to determine whether the file contains a header row. The results are stored in a dictionary called **headers** with file_name as the key.

In [None]:
# Maps each CSV member name to True/False: does the file start with a header?
headers = dict()


def _looks_numeric(value):
    """Return True if *value* contains a digit and parses as a number."""
    try:
        text = value.strip().strip('"\'')
        float(text)
    except (ValueError, AttributeError):
        return False
    # Require a digit so that words such as "nan" or "inf" do not count.
    return any(ch.isdigit() for ch in text)


def is_header_row(first_row, second_row):
    """Guess whether *first_row* is a header row.

    Args:
        first_row: list of string fields from the first line of a file.
        second_row: list of string fields from the second line of a file.

    Returns:
        True when every field of ``first_row`` contains a letter, or when
        ``first_row`` holds no numeric field while ``second_row`` holds at
        least one (labels followed by data). False otherwise, including for
        an empty ``first_row``.
    """
    # An empty row cannot be a header (all() over nothing would say True).
    if not first_row:
        return False

    # Check if every element in the first row contains alphabetic characters
    if all(any(c.isalpha() for c in value) for value in first_row):
        return True

    # Fallback: compare the kind of content in the two rows. Both rows are
    # lists of str, so comparing Python types can never differ; compare
    # whether the fields parse as numbers instead.
    first_has_numbers = any(_looks_numeric(value) for value in first_row)
    second_has_numbers = any(_looks_numeric(value) for value in second_row)
    return not first_has_numbers and second_has_numbers


# Decide, for every CSV in every archive, whether its first row is a header.
for current_zf in zip_files:
    # Open the current archive
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:

        # Loop through each file in the zip
        for file_name in zf.namelist():
            if file_name.endswith('/'):
                continue  # directory entry: nothing to inspect

            this_delimiter = delimiters.get(file_name, ',')  # Use delimiter or default to ','

            # newline='' lets the csv module handle line endings itself; the
            # context manager closes the handle exactly once.
            with io.TextIOWrapper(zf.open(file_name, 'r'),
                                  encoding="utf-8", newline='') as input_file:
                # csv.reader honours quoting, so quoted header names and
                # quoted fields that contain the delimiter are split correctly.
                reader = csv.reader(input_file, delimiter=this_delimiter)
                first_line = [value.strip() for value in next(reader, [])]
                second_line = [value.strip() for value in next(reader, [])]

            # An empty first line can never be a header
            has_header = bool(first_line) and is_header_row(first_line, second_line)
            print(f"File '{file_name}' has header: {has_header}")

            # Print first two lines for verification
            print("First line:", first_line)
            print("Second line:", second_line)

            headers[file_name] = has_header
    

In [None]:
# Sanity check: show what the header-detection pass recorded
if headers:
    print("The 'headers' dictionary contains:")
    print(headers)
else:
    print("The 'headers' dictionary is empty.")

### Removing Duplicates

In [None]:

# Count fully duplicated rows in every CSV of every archive.
for current_zf in zip_files:
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:
        zipped_files = zf.namelist()

        # Iterate over each file in the zip
        for file_name in zipped_files:
            # Assuming the file is a CSV, we will load it into a DataFrame
            with zf.open(file_name) as file:
                try:
                    df = pd.read_csv(file, quotechar='"')
                except Exception as e:
                    print(f"Error reading {file_name}: {e}")
                    # Skip this member: otherwise the counts below would use
                    # the previous file's DataFrame (or raise NameError on
                    # the very first file).
                    continue
            print(f"Loaded {file_name} into DataFrame.")

            initial_row_count = len(df)

            # Remove duplicates
            df_cleaned = df.drop_duplicates()

            # Get the number of rows after removing duplicates
            final_row_count = len(df_cleaned)

            # Calculate the number of duplicates removed
            duplicates_removed = initial_row_count - final_row_count

            print(f"Number of duplicates removed: {duplicates_removed}")

### Handle Missing Data

Null Values

trans_subtype       389327 string
trans_status        673703 string
percentDiscount    1094857 float
memType            2998330 boolean
charflag           2321518 string
batchHeaderID      2998330 boolean
organic            2998330 float
display            2998330 boolean

In [None]:
# Here's a function to transform the date column in a dataframe to
# the YYYYMM01 format we'd like to use for subsetting.

def reformat_date(date_string):
    """Return the first day of *date_string*'s month as a 'YYYYMM01' string.

    Args:
        date_string: a date formatted as 'YYYY-MM-DD'.

    Returns:
        The year and month of the date followed by '01', e.g. '20220901'.

    Raises:
        ValueError: if *date_string* does not match 'YYYY-MM-DD'.
    """
    # Imported locally: the notebook's import cell never imports datetime,
    # so the module-level name used previously raised NameError.
    from datetime import datetime

    parsed = datetime.strptime(date_string, "%Y-%m-%d")
    return parsed.strftime("%Y%m") + "01"

assert reformat_date("2022-09-20") == "20220901"
assert reformat_date("2000-10-20") == "20001001"

#### Cleaning and Processing
-- same loop as above but with processing.

In [63]:
def missing_values(df, schema):
    """Fill or coerce missing values column by column, driven by a schema.

    Args:
        df: DataFrame to clean; it is modified in place.
        schema: iterable of objects exposing ``name`` and ``field_type``
            attributes (presumably BigQuery SchemaField objects — confirm
            against the caller).

    Returns:
        The same DataFrame, with FLOAT columns filled with 0.0, STRING
        columns filled with '', BOOLEAN columns filled with False, and
        TIMESTAMP columns parsed with ``pd.to_datetime`` (unparseable values
        become NaT). Schema fields with no matching column are skipped.
    """
    float_types = {"FLOAT", "FLOAT64"}
    bool_types = {"BOOLEAN", "BOOL"}

    for field in schema:
        field_name = field.name
        # A schema may describe columns this particular file lacks; skipping
        # them avoids a KeyError that would abort the whole clean-up.
        if field_name not in df.columns:
            continue

        # Accept both spellings of each type name, in any letter case
        field_type = str(field.field_type).upper()

        # Handle missing values based on field type
        if field_type in float_types:
            df[field_name] = df[field_name].fillna(0.0)
        elif field_type == "STRING":
            df[field_name] = df[field_name].fillna('')
        elif field_type == "TIMESTAMP":
            df[field_name] = pd.to_datetime(df[field_name], errors='coerce')
        elif field_type in bool_types:
            df[field_name] = df[field_name].fillna(False)

    return df

### Processing Files
This section iterates over all the zip files in the raw data directory. It then processes them and saves them as CSV files in a new processed-files directory.

1. Check for delimiter
1. Check for header
1. Add header row if missing
1. Handle Missing Values
1. Correct datatypes

In [7]:
def assign_expected_types(df):
    """Leniently convert columns to the dtypes listed in ``expected_types``.

    Values that cannot be converted become missing (NaT / NaN / <NA>) rather
    than raising. If a conversion still fails, the error is printed and the
    column falls back to ``object``.

    Args:
        df: DataFrame to convert; it is modified in place.

    Returns:
        The same DataFrame.
    """
    # Text forms accepted as booleans. '1.0'/'0.0' cover flag columns that
    # pandas loaded as floats because they also contained missing values.
    bool_tokens = {
        'true': True, 'false': False,
        '1': True, '0': False,
        '1.0': True, '0.0': False,
    }

    for col, dtype in expected_types.items():
        if col not in df.columns:
            continue

        print(f"Converting column '{col}' to {dtype}...")
        try:
            if dtype == "datetime64[ns]":
                df[col] = pd.to_datetime(df[col], errors='coerce')
            elif dtype == "float64":
                df[col] = pd.to_numeric(df[col], errors='coerce')
            elif dtype == "Int64":
                df[col] = pd.to_numeric(df[col], errors='coerce').astype("Int64")
            elif dtype == "boolean":
                # Anything not in bool_tokens (including NaN) maps to missing
                df[col] = (
                    df[col].astype(str).str.strip().str.lower()
                    .map(bool_tokens)
                    .astype("boolean")
                )
            elif dtype == "object":
                # Record what is missing BEFORE stringifying, so None/NaN do
                # not turn into the literal text "None"/"nan".
                as_text = df[col].astype(str)
                missing = df[col].isna() | (as_text == "nan")
                df[col] = as_text.mask(missing, pd.NA)
        except Exception as e:
            print(f"Error converting column '{col}': {e}")
            df[col] = df[col].astype("object")
            print(f"Fallback: Converted '{col}' to object type.")
    return df

In [None]:
##########################################################
## Working datatype column conversion and header/delimiter
##########################################################
# Output folder for the processed CSV files written by the loop below.
processed_files_directory = "data/processed_files/"

# Column names, in file order, assigned to CSV files that have no header row.
common_headers = [
    'datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 
    'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 
    'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 
    'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 
    'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 
    'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 
    'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 
    'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 
    'store', 'branch', 'match_id', 'trans_id'
]

# Target pandas dtype for each column, keyed by column name. The strings are
# the dtype labels the conversion functions branch on ("datetime64[ns]",
# "float64", "boolean", "object"); load_file also passes every entry except
# "datetime" to read_csv as its dtype mapping.
expected_types = {
    "datetime": "datetime64[ns]",
    "register_no": "float64",
    "emp_no": "float64",
    "trans_no": "float64",
    "upc": "object",
    "description": "object",
    "trans_type": "object",
    "trans_subtype": "object",
    "trans_status": "object",
    "department": "float64",
    "quantity": "float64",
    "Scale": "float64",
    "cost": "float64",
    "unitPrice": "float64",
    "total": "float64",
    "regPrice": "float64",
    "altPrice": "float64",
    "tax": "float64",
    "taxexempt": "float64",
    "foodstamp": "float64",
    "wicable": "float64",
    "discount": "float64",
    "memDiscount": "float64",
    "discountable": "float64",
    "discounttype": "float64",
    "voided": "float64",
    "percentDiscount": "float64",
    "ItemQtty": "float64",
    "volDiscType": "object",
    "volume": "float64",
    "VolSpecial": "float64",
    "mixMatch": "float64",
    "matched": "float64",
    "memType": "boolean",
    "staff": "boolean",
    "numflag": "float64",
    "itemstatus": "float64",
    "tenderstatus": "float64",
    "charflag": "object",
    "varflag": "object",
    "batchHeaderID": "boolean",
    "local": "float64",
    "organic": "float64",
    "display": "boolean",
    "receipt": "float64",
    "card_no": "float64",
    "store": "float64",
    "branch": "float64",
    "match_id": "float64",
    "trans_id": "float64"
}

# Set the future option to silence the warning (optional)
# NOTE(review): presumably this targets the downcasting FutureWarning raised
# by replace()/fillna() on object columns — confirm against the pandas version
# in use.
pd.set_option('future.no_silent_downcasting', True)

def handle_null_values(df):
    """Normalise null-like placeholders, one column at a time.

    For each column: the literal values 'nan', 'None', '\\\\N' and '\\N' are
    replaced with NaN, the dtype is re-inferred, and missing entries are then
    swapped for None through ``where``. A failure on one column is printed
    and that column is left as it was at the point of failure.

    Args:
        df: DataFrame to clean; it is modified in place.

    Returns:
        The same DataFrame.
    """
    # Raw strings keep the backslashes literal (no unicode-escape errors)
    placeholder_values = [r'nan', r'None', r'\\N', r'\N']

    for column_name in df.columns:
        print(f"Handling null values for column: '{column_name}'...")
        try:
            # Step 1: placeholders -> NaN
            without_placeholders = df[column_name].replace(placeholder_values, np.nan)
            df[column_name] = without_placeholders

            # Step 2: let pandas pick the tightest dtype again
            reinferred = df[column_name].infer_objects(copy=False)
            df[column_name] = reinferred

            # Step 3: keep present values, put None where data is missing
            has_value = ~df[column_name].isna()
            df[column_name] = df[column_name].where(has_value, None)
        except Exception as err:
            print(f"Error handling null values for column '{column_name}': {err}")

    return df

# Function to sanitize and strictly convert to boolean
def clean_boolean_column(df, col):
    """Clean, validate, and convert one column to the nullable boolean dtype.

    Values are compared as stripped, lower-cased text. 'true'/'1'/'1.0'
    become True and 'false'/'0'/'0.0' become False. Empty and missing values
    become <NA>. Any other value is reported in a warning and also becomes
    <NA>.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col: name of the column to convert.

    Returns:
        None.

    Raises:
        ValueError: if the conversion fails; the original error is chained.
    """
    print(f"Cleaning boolean column '{col}'...")

    # '1.0'/'0.0' cover flag columns that pandas loaded as floats because
    # they also contained missing values.
    bool_tokens = {
        'true': True, 'false': False,
        '1': True, '0': False,
        '1.0': True, '0.0': False,
    }
    # Text that NaN / None / pd.NA turn into after astype(str), plus blanks
    null_tokens = ['', 'nan', 'none', '<na>']

    try:
        normalized = df[col].astype(str).str.strip().str.lower()

        # Find invalid values BEFORE mapping: map() turns everything it does
        # not know into NaN, after which the offending values are gone.
        invalid_mask = ~normalized.isin(list(bool_tokens) + null_tokens)
        if invalid_mask.any():
            print(f"Warning: Invalid boolean values found in '{col}': {df.loc[invalid_mask, col].unique()}")

        # Unmapped values (null tokens and invalid entries) become <NA>
        df[col] = normalized.map(bool_tokens).astype('boolean')
        print(f"Successfully converted '{col}' to boolean.")

    except Exception as e:
        print(f"Error processing column '{col}': {e}")
        # Log invalid entries for debugging
        print(f"Invalid entries: {df[col].dropna().unique()}")
        raise ValueError(f"Failed to convert column '{col}' to boolean.") from e


# Function to assign expected data types

                # Immediately raise the error to prevent bad data from continuing
                #except Exception as e:
                #print(f"Error converting column '{col}': {e}. Falling back to 'object'.")
                #df[col] = df[col].astype("object")   

def assign_data_types(df):
    """Strictly convert DataFrame columns to the dtypes in ``expected_types``.

    Unlike a lenient conversion, any value that cannot be converted stops the
    run: the error is printed and re-raised so bad data never reaches the
    output file.

    Args:
        df: DataFrame to convert; it is modified in place.

    Returns:
        The same DataFrame.

    Raises:
        Exception: whatever the failing conversion raised.
    """
    for col, dtype in expected_types.items():
        if col not in df.columns:
            continue

        print(f"Converting column '{col}' to {dtype}...")
        try:
            if dtype == "datetime64[ns]":
                df[col] = pd.to_datetime(df[col], errors='raise', format="%Y-%m-%d %H:%M:%S")
            elif dtype == "float64":
                df[col] = pd.to_numeric(df[col], errors='raise')
            elif dtype == "Int64":
                df[col] = pd.to_numeric(df[col], errors='raise').astype("Int64")
            elif dtype == "boolean":
                clean_boolean_column(df, col)  # Use new boolean conversion logic
            elif dtype == "object":
                # handle_null_values leaves None in object columns, and
                # astype(str) alone would turn it into the literal text
                # "None". Record what is missing first, then restore it.
                as_text = df[col].astype(str)
                missing = df[col].isna() | (as_text == "nan")
                df[col] = as_text.mask(missing, pd.NA)
        except Exception as e:
            print(f"Error converting column '{col}': {e}")
            raise  # Stop the process if conversion fails

    return df


def load_file(file_name, working_file):
    """Read one CSV member into a DataFrame using the sniffed metadata.

    Args:
        file_name: member name; used to look up its delimiter and header flag.
        working_file: open binary file object for the member.

    Returns:
        The loaded DataFrame, or None if loading failed (the error is
        printed).
    """
    delimiter = delimiters.get(file_name, ',')  # Default to ','
    has_header = headers.get(file_name, True)  # Check if the file has a header
    header = 0 if has_header else None  # 0 for header row, None if no header

    # 'datetime' is parsed separately below, so it is left out of the mapping
    column_types = {col: dtype for col, dtype in expected_types.items() if col != "datetime"}

    try:
        # Load the CSV without parsing dates initially
        df = pd.read_csv(
            working_file,
            delimiter=delimiter,
            header=header,
            dtype=column_types,
            # Treat the '\N' null markers as missing at parse time. Otherwise
            # a single marker in a typed column makes read_csv raise and the
            # whole file is skipped before handle_null_values can run.
            na_values=[r'\N', r'\\N'],
            low_memory=False,
        )

        # If the file doesn't have headers, assign them now
        if not has_header:
            df.columns = common_headers
            print(f"Assigned common headers to {file_name}.")

        # Now parse the datetime column, if it exists
        if 'datetime' in df.columns:
            print(f"Parsing 'datetime' column for {file_name}...")
            df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

        print(f"Loaded {file_name} successfully!")
        return df

    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        return None

# Process ZIP files
zip_files = os.listdir(data_directory)

# to_csv does not create missing folders, so make sure the target exists
os.makedirs(processed_files_directory, exist_ok=True)

for current_zf in zip_files:
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            if file_name.endswith('/'):
                continue  # directory entry: nothing to load

            with zf.open(file_name) as working_file:
                df = load_file(file_name, working_file)

            if df is None:
                continue  # load_file already reported the problem

            try:
                # Clean and prepare the DataFrame
                df.columns = df.columns.str.strip()
                df = handle_null_values(df)
                df = assign_data_types(df)

                # Members may sit in sub-folders of the archive; keep only the
                # base name so the output lands directly in the target folder.
                base_name = os.path.splitext(os.path.basename(file_name))[0]
                processed_filename = os.path.join(
                    processed_files_directory, f"{base_name}_processed.csv"
                )

                # Save the processed DataFrame
                df.to_csv(processed_filename, index=False)
                print(f"Processed and saved {file_name}.")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

    print(f"Completed processing ZIP file: {current_zf}")



In [None]:
# Load one CSV member using the delimiter/header metadata gathered earlier
def load_file(file_name, working_file):
    """Read *working_file* into a DataFrame.

    The delimiter and header flag are looked up by *file_name* (defaulting to
    ',' and "has a header"). Header-less files receive ``common_headers`` as
    column names. Returns the DataFrame, or None after printing the error if
    loading fails.
    """
    sep_char = delimiters.get(file_name, ',')
    header_present = headers.get(file_name, True)
    # pandas wants the header row index, or None when there is no header
    header_row = None if not header_present else 0

    try:
        frame = pd.read_csv(working_file, delimiter=sep_char, header=header_row)

        if not header_present:
            frame.columns = common_headers

        print(f"Loaded {file_name} successfully!")
        return frame
    except Exception as err:
        print(f"Error loading {file_name}: {err}")
        return None

# Process ZIP files
zip_files = os.listdir(data_directory)

for current_zf in zip_files:
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            with zf.open(file_name) as working_file:
                df = load_file(file_name, working_file)

                if df is not None:
                    try:
                        # Clean and process the DataFrame
                        df.columns = df.columns.str.strip()
                    except Exception as e:
                        # The cell previously ended right after the `try`
                        # body, which is a SyntaxError; report the failure
                        # the same way the other processing cells do.
                        print(f"Error processing {file_name}: {e}")

Process the data one file at a time to test.

In [None]:
# Output folder for the processed CSV files
processed_files_directory = "data/processed_files/"

# Define common headers for files without headers
common_headers = [
    'datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description',
    'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity',
    'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax',
    'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount',
    'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty',
    'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType',
    'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag',
    'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no',
    'store', 'branch', 'match_id', 'trans_id'
]

# exist_ok=True makes this a single call with no window between checking for
# the folder and creating it.
os.makedirs(processed_files_directory, exist_ok=True)

# Read a single CSV member, applying its recorded delimiter and header flag
def load_file(file_name, working_file):
    """Return *working_file* parsed as a DataFrame, or None on failure.

    Looks up the member's delimiter (default ',') and header flag (default
    True) by *file_name*. When the file has no header row the columns are
    named from ``common_headers``. Any error is printed, not raised.
    """
    field_separator = delimiters.get(file_name, ',')
    file_has_header = headers.get(file_name, True)

    if file_has_header:
        header_argument = 0      # first row holds the column names
    else:
        header_argument = None   # no header row in the file

    try:
        loaded = pd.read_csv(
            working_file,
            delimiter=field_separator,
            header=header_argument,
        )

        if not file_has_header:
            # Name the columns ourselves
            loaded.columns = common_headers

        print(f"Loaded {file_name} successfully!")
        return loaded
    except Exception as problem:
        print(f"Error loading {file_name}: {problem}")
        return None

# Process ZIP files
zip_files = os.listdir(data_directory)

# Columns to store as floats
float_columns = [
    'register_no', 'emp_no', 'trans_no', 'department',
    'Scale', 'tax', 'taxexempt', 'foodstamp', 'wicable',
    'discountable', 'discounttype', 'voided', 'local',
    'receipt', 'card_no', 'store', 'branch', 'match_id',
    'trans_id'
]

# Columns to store as nullable booleans
bool_columns = ['memType', 'staff', 'batchHeaderID', 'display']

# Text forms accepted as booleans; everything else (including NaN) becomes
# <NA>. A plain astype(bool) would turn NaN and any non-empty text — even
# "false" or "0" — into True.
bool_tokens = {
    'true': True, 'false': False,
    '1': True, '0': False,
    '1.0': True, '0.0': False,
}

for current_zf in zip_files:
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            with zf.open(file_name) as working_file:
                df = load_file(file_name, working_file)

            if df is None:
                continue  # load_file already reported the problem

            try:
                # Clean and process the DataFrame
                df.columns = df.columns.str.strip()

                # Safely handle 'register_no' and 'quantity' columns if they exist
                for col in ('register_no', 'quantity'):
                    if col in df.columns:
                        df[col] = df[col].fillna(0)

                # Convert datetime if the column exists
                if 'datetime' in df.columns:
                    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

                # Convert only the columns this file actually has, so one
                # missing column no longer prevents the file from being saved.
                for col in float_columns:
                    if col in df.columns:
                        df[col] = df[col].astype(float, errors='ignore')

                for col in bool_columns:
                    if col in df.columns:
                        df[col] = (
                            df[col].astype(str).str.strip().str.lower()
                            .map(bool_tokens)
                            .astype('boolean')
                        )

                # Stringify present values only, so missing entries stay
                # missing instead of becoming the text "nan".
                if 'volDiscType' in df.columns:
                    vol_disc = df['volDiscType']
                    df['volDiscType'] = vol_disc.astype(str).where(vol_disc.notna())

                # Extract the base name without folders or extension before
                # adding _processed.csv
                file_name_no_ext = os.path.splitext(os.path.basename(file_name))[0]

                # Save the processed file
                processed_filename = os.path.join(
                    processed_files_directory, f"{file_name_no_ext}_processed.csv"
                )
                df.to_csv(processed_filename, index=False)

                print(f"Processed and saved {file_name}.")
            except KeyError as e:
                print(f"Missing expected column: {e}")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

        print(f"Completed processing ZIP file: {current_zf}\n")
    
    

#### DOUBLE CHECK CSV headers 

In [None]:
# Collect the names of the processed CSV files
csv_files = [
    entry
    for entry in os.listdir(processed_files_directory)
    if entry.endswith('.csv')
]

# Show the column names of one CSV file
def print_csv_headers(file_name, file_path):
    """Print the header names of the CSV at *file_path*.

    Only the first five data rows are read. *file_name* is used purely as the
    label in the printed message. Read errors are printed, not raised.
    """
    try:
        preview = pd.read_csv(file_path, nrows=5)
        column_names = list(preview.columns)
        print(f"Headers for {file_name}: {column_names}")
    except Exception as err:
        print(f"Error reading {file_name}: {err}")

# Print the headers of every processed CSV file
for processed_name in csv_files:
    print_csv_headers(
        processed_name,
        os.path.join(processed_files_directory, processed_name),
    )

### Iterate Over Zip Files and Save to Pandas Dataframe

In [None]:
# Iterate over all zip files in the directory
for current_zf in zip_files:
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:
        zipped_files = zf.namelist()

        # Iterate over each file in the zip
        for file_name in zipped_files:
            # Assuming the file is a CSV, we will load it into a DataFrame
            with zf.open(file_name) as file:
                try:
                    df = pd.read_csv(file, quotechar='"')
                except Exception as e:
                    print(f"Error reading {file_name}: {e}")
                    # Skip this member: otherwise the conversions below would
                    # run on the previous file's DataFrame (or fail with
                    # NameError on the very first file).
                    continue

            print(f"Loaded {file_name} into DataFrame.")
            print(df.head())  # Check the first few rows of the DataFrame

            # Ensure datetime column is properly formatted
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
            else:
                print(f"Warning: 'datetime' column not found in {file_name}.")

            # Convert integer columns to float where needed (only if they exist)
            float_columns = [
                'register_no', 'emp_no', 'trans_no', 'department', 'Scale', 'tax',
                'taxexempt', 'foodstamp', 'wicable', 'discountable', 'discounttype',
                'voided', 'local', 'receipt', 'card_no', 'store', 'branch', 'match_id',
                'trans_id'
            ]
            for col in float_columns:
                if col in df.columns:
                    df[col] = df[col].astype(float, errors='ignore')

            # Convert specific columns to nullable booleans. A plain
            # astype(bool) would turn NaN and any non-empty text — even
            # "false" — into True; unknown values become <NA> here.
            bool_tokens = {
                'true': True, 'false': False,
                '1': True, '0': False,
                '1.0': True, '0.0': False,
            }
            bool_columns = ['memType', 'staff', 'batchHeaderID', 'display']
            for col in bool_columns:
                if col in df.columns:
                    df[col] = (
                        df[col].astype(str).str.strip().str.lower()
                        .map(bool_tokens)
                        .astype('boolean')
                    )

            # Stringify present values only, so missing entries stay missing
            if 'volDiscType' in df.columns:
                vol_disc = df['volDiscType']
                df['volDiscType'] = vol_disc.astype(str).where(vol_disc.notna())

            # Verify the new schema
            print(df.dtypes)

        print("\n")

In [None]:
# Show the column names of the most recently loaded DataFrame
column_names = df.columns.tolist()
print(column_names)

In [None]:
# NOTE(review): `job` is not defined in any visible cell of this notebook —
# presumably it was a BigQuery load job created in a cell that has since been
# removed (the bigquery imports at the top are commented out). Confirm before
# running; as written this raises NameError.
job.result()

In [None]:
# Iterate over all zip files in the directory
for current_zf in zip_files:
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:
        zipped_files = zf.namelist()

        # Iterate over each file in the zip
        for file_name in zipped_files:
            # Assuming the file is a CSV, we will load it into a DataFrame
            with zf.open(file_name) as file:
                try:
                    # Read file as CSV
                    df = pd.read_csv(file, quotechar='"')
                    print(f"Loaded {file_name} into DataFrame.")
                    print(df.head())  # Check the first few rows
                except Exception as e:
                    print(f"Error reading {file_name}: {e}")
                    continue  # Skip to the next file on error

                # Handle 'datetime' column if it exists
                if 'datetime' in df.columns:
                    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
                else:
                    print(f"Warning: 'datetime' column not found in {file_name}.")

                # Convert integer columns to float where needed (only if they exist)
                float_columns = [
                    'register_no', 'emp_no', 'trans_no', 'department', 'Scale', 'tax',
                    'taxexempt', 'foodstamp', 'wicable', 'discountable', 'discounttype',
                    'voided', 'local', 'receipt', 'card_no', 'store', 'branch', 'match_id',
                    'trans_id'
                ]
                for col in float_columns:
                    if col in df.columns:
                        df[col] = df[col].astype(float, errors='ignore')

                # Convert specific columns to nullable booleans (only if they
                # exist). A plain astype(bool) would turn NaN and any
                # non-empty text — even "false" — into True; values outside
                # bool_tokens become <NA> instead.
                bool_tokens = {
                    'true': True, 'false': False,
                    '1': True, '0': False,
                    '1.0': True, '0.0': False,
                }
                bool_columns = ['memType', 'staff', 'batchHeaderID', 'display']
                for col in bool_columns:
                    if col in df.columns:
                        df[col] = (
                            df[col].astype(str).str.strip().str.lower()
                            .map(bool_tokens)
                            .astype('boolean')
                        )

                # Convert 'volDiscType' to string if it exists, keeping
                # missing entries missing rather than the text "nan"
                if 'volDiscType' in df.columns:
                    vol_disc = df['volDiscType']
                    df['volDiscType'] = vol_disc.astype(str).where(vol_disc.notna())

                # Verify the new schema
                print("Updated DataFrame types:")
                print(df.dtypes)

        print("\n")

#### Print out first line test (to be removed)

In [None]:
# Print the first line of every CSV member, split on its delimiter and with
# double quotes removed.
for this_zf in zip_files:
    with ZipFile(os.path.join(data_directory, this_zf), 'r') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            # Fall back to ',' (as the other cells do) instead of raising
            # KeyError for a file the sniffing pass did not record.
            this_delimiter = delimiters.get(file_name, ',')

            # The context manager closes the handle even if reading fails
            with io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8") as input_file:
                first_line = next(input_file, None)

            if first_line is None:
                continue  # empty member: nothing to print

            # Split the line using the delimiter and remove quotes
            cleaned_line = [
                item.replace('"', '').strip()
                for item in first_line.strip().split(this_delimiter)
            ]

            # Print the cleaned line
            print(cleaned_line)

### Code to Align to Schema

In [None]:
# Ensure datetime column is properly formatted
if 'datetime' in df.columns:
    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# Convert integer columns to float where needed (only those that exist, so a
# file missing one column does not abort the whole cell)
float_columns = [
    'register_no', 'emp_no', 'trans_no', 'department', 'Scale', 'tax',
    'taxexempt', 'foodstamp', 'wicable', 'discountable', 'discounttype',
    'voided', 'local', 'receipt', 'card_no', 'store', 'branch', 'match_id',
    'trans_id'
]
for col in float_columns:
    if col in df.columns:
        df[col] = df[col].astype(float)

# Convert specific columns to nullable booleans. A plain astype(bool) would
# turn NaN and any non-empty text — even "false" — into True; values outside
# bool_tokens become <NA> instead.
bool_tokens = {
    'true': True, 'false': False,
    '1': True, '0': False,
    '1.0': True, '0.0': False,
}
bool_columns = ['memType', 'staff', 'batchHeaderID', 'display']
for col in bool_columns:
    if col in df.columns:
        df[col] = (
            df[col].astype(str).str.strip().str.lower()
            .map(bool_tokens)
            .astype('boolean')
        )

# Convert columns to string if needed, keeping missing entries missing rather
# than the text "nan"
if 'volDiscType' in df.columns:
    vol_disc = df['volDiscType']
    df['volDiscType'] = vol_disc.astype(str).where(vol_disc.notna())

# Verify the new schema
print(df.dtypes)