
# Task 1 Clean and Upload Data


In [1]:
import os 
import io
from zipfile import ZipFile
import pandas as pd
import csv
#from google.cloud import bigquery
#from pandas_gbq import to_gbq
#from google.oauth2 import service_account

### Questions:
1. Extra quotes on some headers???  
1. Loading with or without a header???
1. I did not find any duplicates or didn't remove any null values
1. When importing to GBQ I cannot get datetime to come in as TIMESTAMP vs string.

#### Data Source Setup

In [2]:
data_directory = "Data/LargeZips/"
zip_files = os.listdir("Data/LargeZips")

### Iterate Over Zip Files and Save to Pandas Dataframe

In [None]:
# Iterate over all zip files in the directory
for current_zf in zip_files:
    with ZipFile(data_directory + current_zf, 'r') as zf:
        zipped_files = zf.namelist()

        # Iterate over each file in the zip
        for file_name in zipped_files:
            # Assuming the file is a CSV, we will load it into a DataFrame
            with zf.open(file_name) as file:
                # Read file as CSV
                try:
                    df = pd.read_csv(file, quotechar='"') 
                    print(f"Loaded {file_name} into DataFrame.")
                    print(df.head())  # Check the first few rows of the DataFrame
                except Exception as e:
                    print(f"Error reading {file_name}: {e}")
                # Ensure datetime column is properly formatted
                df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

                # Convert integer columns to float where needed
                float_columns = [
                    'register_no', 'emp_no', 'trans_no', 'department', 'Scale', 'tax', 
                    'taxexempt', 'foodstamp', 'wicable', 'discountable', 'discounttype', 
                    'voided', 'local', 'receipt', 'card_no', 'store', 'branch', 'match_id', 
                    'trans_id'
                ]
                df[float_columns] = df[float_columns].astype(float)

                # Convert specific columns to boolean
                bool_columns = ['memType', 'staff', 'batchHeaderID', 'display']
                df[bool_columns] = df[bool_columns].astype(bool)

                # Convert columns to string if needed
                df['volDiscType'] = df['volDiscType'].astype(str)

                # Verify the new schema
                print(df.dtypes)
                    
        print("\n")

In [None]:
# Iterate over all zip files in the directory
for current_zf in zip_files:
    with ZipFile(data_directory + current_zf, 'r') as zf:
        zipped_files = zf.namelist()

        # Iterate over each file in the zip
        for file_name in zipped_files:
            # Assuming the file is a CSV, we will load it into a DataFrame
            with zf.open(file_name) as file:
                try:
                    # Read file as CSV
                    df = pd.read_csv(file, quotechar='"')
                    print(f"Loaded {file_name} into DataFrame.")
                    print(df.head())  # Check the first few rows
                except Exception as e:
                    print(f"Error reading {file_name}: {e}")
                    continue  # Skip to the next file on error

                # Handle 'datetime' column if it exists
                if 'datetime' in df.columns:
                    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
                else:
                    print(f"Warning: 'datetime' column not found in {file_name}.")

                # Convert integer columns to float where needed (only if they exist)
                float_columns = [
                    'register_no', 'emp_no', 'trans_no', 'department', 'Scale', 'tax', 
                    'taxexempt', 'foodstamp', 'wicable', 'discountable', 'discounttype', 
                    'voided', 'local', 'receipt', 'card_no', 'store', 'branch', 'match_id', 
                    'trans_id'
                ]
                for col in float_columns:
                    if col in df.columns:
                        df[col] = df[col].astype(float, errors='ignore')

                # Convert specific columns to boolean (only if they exist)
                bool_columns = ['memType', 'staff', 'batchHeaderID', 'display']
                for col in bool_columns:
                    if col in df.columns:
                        df[col] = df[col].astype(bool, errors='ignore')

                # Convert 'volDiscType' to string if it exists
                if 'volDiscType' in df.columns:
                    df['volDiscType'] = df['volDiscType'].astype(str)

                # Verify the new schema
                print("Updated DataFrame types:")
                print(df.dtypes)

        print("\n")

In [None]:
"""#current_zf = zip_files[0]
for current_zf in zip_files :
# Open the current zipfile
    with ZipFile(data_directory + current_zf,'r') as zf :
        zipped_files = zf.namelist()
        
        # Iteraate over each file inside the current zip file
        for file_name in zipped_files :
            # Open and wrap it to read as text
            input_file = io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8")

            for idx, line in enumerate(input_file) :
                print(line)
                if idx > 4 :
                    break
            input_file.close()
        print("\n")"""


### Sniffing out the Delimiter 
This section identifies the delimiter for each csv file and stores it in a dictionary called **delimiters** with the file_name as the key. 

In [3]:
delimiters = dict() 

# Start by reading in all the files again.

#current_zf = zip_files[0]
for current_zf in zip_files :
# Open the current zf
    with ZipFile(data_directory + current_zf,'r') as zf :
        zipped_files = zf.namelist()
        
        # Iteraate over each file inside the current zip file
        for file_name in zipped_files :
            # Open and wrap it to read as text
            input_file = io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8")
            
            dialect = csv.Sniffer().sniff(sample=input_file.readline(),
                                      delimiters=[",",";","\t"])
            
            delimiters[file_name] = dialect.delimiter
            
            print(" ".join(["For",
                           file_name,
                           "the delimiter is",
                           dialect.delimiter
                           ]))

            input_file.close() # tidy up

For transArchive_201001_201003.csv the delimiter is ,
For transArchive_201004_201006.csv the delimiter is ,
For transArchive_201007_201009.csv the delimiter is ,
For transArchive_201010_201012.csv the delimiter is ,
For transArchive_201101_201103.csv the delimiter is ,
For transArchive_201104.csv the delimiter is ,
For transArchive_201105.csv the delimiter is ,
For transArchive_201106.csv the delimiter is ,
For transArchive_201107_201109.csv the delimiter is ,
For transArchive_201110_201112.csv the delimiter is ,
For transArchive_201201_201203.csv the delimiter is ,
For transArchive_201201_201203_inactive.csv the delimiter is ;
For transArchive_201204_201206.csv the delimiter is ,
For transArchive_201204_201206_inactive.csv the delimiter is ;
For transArchive_201207_201209.csv the delimiter is ,
For transArchive_201207_201209_inactive.csv the delimiter is ;
For transArchive_201210_201212.csv the delimiter is ,
For transArchive_201210_201212_inactive.csv the delimiter is ;
For transArch

#### Print out first line test (to be removed)

In [4]:
for this_zf in zip_files :
    with ZipFile(data_directory + this_zf,'r') as zf :
        zipped_files = zf.namelist()

        for file_name in zipped_files :
            input_file = zf.open(file_name,'r')
            input_file = io.TextIOWrapper(input_file,encoding="utf-8")
            
            this_delimiter = delimiters[file_name]
            
            #for line in input_file :
                #print(line.strip().split(this_delimiter))
                #break


            for line in input_file:
                #Split the line using the delimiter and remove quotes
                cleaned_line = [item.replace('"', '').strip() for item in line.strip().split(this_delimiter)]
                
                # Print the cleaned line
                print(cleaned_line)
                break      
            input_file.close() # tidy up

['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']
['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial'

### Checking for Headers

This sections iterates over each csv file to determine if the file contains a header row. The results are stored in a dictionary titled **Headers** with file_name as the key.

In [5]:
headers = dict()

def is_header_row(first_row, second_row):

    # Check if most elements in the first row contain non-numeric characters
    if all(any(c.isalpha() for c in value) for value in first_row):
        return True
    
    # Optionally: Check if types of first and second rows differ
    if set(map(type, first_row)) != set(map(type, second_row)):
        return True
    
    return False


#current_zf = zip_files[0]
for current_zf in zip_files :
# Open the current zf
    with ZipFile(data_directory + current_zf,'r') as zf :
        zipped_files = zf.namelist()
        
        # Loop through each file in the zip
        for file_name in zipped_files:
            with zf.open(file_name, 'r') as input_file:
                input_file = io.TextIOWrapper(input_file, encoding="utf-8")

                this_delimiter = delimiters.get(file_name, ',')  # Use delimiter or default to ','

                # Read the first two lines
                first_line = input_file.readline().strip().split(this_delimiter)
                second_line = input_file.readline().strip().split(this_delimiter)

                # Check for header presence using improved logic
                has_header = is_header_row(first_line, second_line)
                print(f"File '{file_name}' has header: {has_header}")

                # Print first two lines for verification
                print("First line:", first_line)
                print("Second line:", second_line)


                headers[file_name] = has_header


                input_file.close()  # Close the file properly
    

File 'transArchive_201001_201003.csv' has header: True
First line: ['"datetime"', '"register_no"', '"emp_no"', '"trans_no"', '"upc"', '"description"', '"trans_type"', '"trans_subtype"', '"trans_status"', '"department"', '"quantity"', '"Scale"', '"cost"', '"unitPrice"', '"total"', '"regPrice"', '"altPrice"', '"tax"', '"taxexempt"', '"foodstamp"', '"wicable"', '"discount"', '"memDiscount"', '"discountable"', '"discounttype"', '"voided"', '"percentDiscount"', '"ItemQtty"', '"volDiscType"', '"volume"', '"VolSpecial"', '"mixMatch"', '"matched"', '"memType"', '"staff"', '"numflag"', '"itemstatus"', '"tenderstatus"', '"charflag"', '"varflag"', '"batchHeaderID"', '"local"', '"organic"', '"display"', '"receipt"', '"card_no"', '"store"', '"branch"', '"match_id"', '"trans_id"']
Second line: ['"2010-01-01 09:04:09"', '"5"', '"17"', '"2"', '"0005385200400"', '"Medium Salsa 16oz GMG"', '"I"', '" "', '" "', '"1"', '"1"', '"0"', '"2.6480"', '"2.9900"', '"2.9900"', '"3.9900"', '"0.0000"', '"0"', '"0"',

In [6]:
# testing the dictionary
if not headers:
    print("The 'headers' dictionary is empty.")
else:
    print("The 'headers' dictionary contains:")
    print(headers)

The 'headers' dictionary contains:
{'transArchive_201001_201003.csv': True, 'transArchive_201004_201006.csv': True, 'transArchive_201007_201009.csv': True, 'transArchive_201010_201012.csv': True, 'transArchive_201101_201103.csv': True, 'transArchive_201104.csv': True, 'transArchive_201105.csv': True, 'transArchive_201106.csv': True, 'transArchive_201107_201109.csv': True, 'transArchive_201110_201112.csv': True, 'transArchive_201201_201203.csv': True, 'transArchive_201201_201203_inactive.csv': True, 'transArchive_201204_201206.csv': True, 'transArchive_201204_201206_inactive.csv': True, 'transArchive_201207_201209.csv': True, 'transArchive_201207_201209_inactive.csv': True, 'transArchive_201210_201212.csv': True, 'transArchive_201210_201212_inactive.csv': True, 'transArchive_201301_201303.csv': True, 'transArchive_201301_201303_inactive.csv': True, 'transArchive_201304_201306.csv': True, 'transArchive_201304_201306_inactive.csv': True, 'transArchive_201307_201309.csv': True, 'transArchi

### Removing Duplicates

In [None]:

for current_zf in zip_files:
    with ZipFile(data_directory + current_zf, 'r') as zf:
        zipped_files = zf.namelist()

        # Iterate over each file in the zip
        for file_name in zipped_files:
            # Assuming the file is a CSV, we will load it into a DataFrame
            with zf.open(file_name) as file:
                # Read file as CSV
                try:
                    df = pd.read_csv(file, quotechar='"') 
                    print(f"Loaded {file_name} into DataFrame.")
                   # print(df.head())  # Check the first few rows of the DataFrame
                except Exception as e:
                    print(f"Error reading {file_name}: {e}")
                    
            initial_row_count = len(df)

            # Remove duplicates
            df_cleaned = df.drop_duplicates()

            # Get the number of rows after removing duplicates
            final_row_count = len(df_cleaned)

            # Calculate the number of duplicates removed
            duplicates_removed = initial_row_count - final_row_count

            print(f"Number of duplicates removed: {duplicates_removed}")

### Handle Missing Data

### Code to Align to Schema

In [None]:
# Ensure datetime column is properly formatted
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# Convert integer columns to float where needed
float_columns = [
    'register_no', 'emp_no', 'trans_no', 'department', 'Scale', 'tax', 
    'taxexempt', 'foodstamp', 'wicable', 'discountable', 'discounttype', 
    'voided', 'local', 'receipt', 'card_no', 'store', 'branch', 'match_id', 
    'trans_id'
]
df[float_columns] = df[float_columns].astype(float)

# Convert specific columns to boolean
bool_columns = ['memType', 'staff', 'batchHeaderID', 'display']
df[bool_columns] = df[bool_columns].astype(bool)

# Convert columns to string if needed
df['volDiscType'] = df['volDiscType'].astype(str)

# Verify the new schema
print(df.dtypes)

In [None]:
# Here's a function to transform the date column in a dataframe to 
# the YYYYMM01 format we'd like to use for subsetting.

def reformat_date(date_string) :
    date_string = datetime.datetime.strptime(date_string,"%Y-%m-%d")
    return(datetime.date.strftime(date_string,"%Y%m")+"01")

assert(reformat_date("2022-09-20")=="20220901")
assert(reformat_date("2000-10-20")=="20001001")

#### Cleaning and Processing
-- same loop as above but with processing.

In [63]:
def missing_values(df, schema):
    for field in schema:
        field_name = field.name
        field_type = field.field_type
        
        # Handle missing values based on field type
        if field_type == "FLOAT":
            df[field_name] = df[field_name].fillna(0.0)
        elif field_type == "STRING":
            df[field_name] = df[field_name].fillna('')
        elif field_type == "TIMESTAMP":
            df[field_name] = pd.to_datetime(df[field_name], errors='coerce')
        elif field_type == "BOOLEAN":
            df[field_name] = df[field_name].fillna(False)
    
    return df

### Processing Files
This sections iterates over all the zip files in the raw data directory. It then processes them and saves them in a new processed files directory as csv files. 

1. Check for delimiter
1. Check for header
1. Add header column if missing
1. Handle Missing Values
1. Correct datatypes

process data one file at time to test

In [8]:
processed_files_directory = "data/processed_files/"

# Define common headers for files without headers
common_headers = [
    'datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 
    'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 
    'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 
    'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 
    'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 
    'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 
    'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 
    'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 
    'store', 'branch', 'match_id', 'trans_id'
]

if not os.path.exists(processed_files_directory):
    os.makedirs(processed_files_directory)

# Function to load a file with its metadata
def load_file(file_name, working_file):
    delimiter = delimiters.get(file_name, ',')  # Default to ','
    has_header = headers.get(file_name, True)  # Check if the file has a header
    header = 0 if has_header else None  # 0 for header row, None if no header

    try:
        # Load the CSV with the appropriate delimiter and header handling
        df = pd.read_csv(working_file, delimiter=delimiter, header=header)

        # Assign common headers if the file has no header
        if not has_header:
            df.columns = common_headers

        print(f"Loaded {file_name} successfully!")
        return df
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        return None

# Process ZIP files
zip_files = os.listdir(data_directory)

for current_zf in zip_files:
    with ZipFile(os.path.join(data_directory, current_zf), 'r') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            with zf.open(file_name) as working_file:
                df = load_file(file_name, working_file)

                if df is not None:
                    try:
                        # Clean and process the DataFrame
                        df.columns = df.columns.str.strip()

                        # Safely handle 'register_no' and 'quantity' columns if they exist
                        if 'register_no' in df.columns:
                            df['register_no'] = df['register_no'].fillna(0)
                        if 'quantity' in df.columns:
                            df['quantity'] = df['quantity'].fillna(0)

                        # Convert datetime if the column exists
                        if 'datetime' in df.columns:
                            df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')


                        # Convert columns to appropriate data types
                        float_columns = [
                            'register_no', 'emp_no', 'trans_no', 'department',
                            'Scale', 'tax', 'taxexempt', 'foodstamp', 'wicable',
                            'discountable', 'discounttype', 'voided', 'local',
                            'receipt', 'card_no', 'store', 'branch', 'match_id',
                            'trans_id'
                        ]
                        df[float_columns] = df[float_columns].astype(float, errors='ignore')

                        bool_columns = ['memType', 'staff', 'batchHeaderID', 'display']
                        df[bool_columns] = df[bool_columns].astype(bool, errors='ignore')

                        df['volDiscType'] = df.get('volDiscType', "").astype(str)

                        # Extract the base name without extensions before adding _processed.csv
                        file_name_no_ext = os.path.splitext(file_name)[0]

                        # Save the processed file
                        processed_filename = os.path.join(
                            processed_files_directory, f"{file_name_no_ext}_processed.csv"
                        )
                        df.to_csv(processed_filename, index=False)

                        print(f"Processed and saved {file_name}.")
                    except KeyError as e:
                        print(f"Missing expected column: {e}")
                    except Exception as e:
                        print(f"Error processing {file_name}: {e}")

        print(f"Completed processing ZIP file: {current_zf}\n")
    
    

Loaded transArchive_201001_201003.csv successfully!
Processed and saved transArchive_201001_201003.csv.
Completed processing ZIP file: transArchive_201001_201003.zip

Loaded transArchive_201004_201006.csv successfully!
Processed and saved transArchive_201004_201006.csv.
Completed processing ZIP file: transArchive_201004_201006.zip

Loaded transArchive_201007_201009.csv successfully!
Processed and saved transArchive_201007_201009.csv.
Completed processing ZIP file: transArchive_201007_201009.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201010_201012.csv successfully!
Processed and saved transArchive_201010_201012.csv.
Completed processing ZIP file: transArchive_201010_201012.zip

Loaded transArchive_201101_201103.csv successfully!
Processed and saved transArchive_201101_201103.csv.
Completed processing ZIP file: transArchive_201101_201103.zip

Loaded transArchive_201104.csv successfully!
Processed and saved transArchive_201104.csv.
Completed processing ZIP file: transArchive_201104.zip

Loaded transArchive_201105.csv successfully!
Processed and saved transArchive_201105.csv.
Completed processing ZIP file: transArchive_201105.zip

Loaded transArchive_201106.csv successfully!
Processed and saved transArchive_201106.csv.
Completed processing ZIP file: transArchive_201106.zip

Loaded transArchive_201107_201109.csv successfully!
Processed and saved transArchive_201107_201109.csv.
Completed processing ZIP file: transArchive_201107_201109.zip

Loaded transArchive_201110_201112.csv successfully!
Processed

  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201201_201203_inactive.csv successfully!
Processed and saved transArchive_201201_201203_inactive.csv.
Completed processing ZIP file: transArchive_201201_201203_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201204_201206.csv successfully!
Processed and saved transArchive_201204_201206.csv.
Completed processing ZIP file: transArchive_201204_201206.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201204_201206_inactive.csv successfully!
Processed and saved transArchive_201204_201206_inactive.csv.
Completed processing ZIP file: transArchive_201204_201206_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201207_201209.csv successfully!
Processed and saved transArchive_201207_201209.csv.
Completed processing ZIP file: transArchive_201207_201209.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201207_201209_inactive.csv successfully!
Processed and saved transArchive_201207_201209_inactive.csv.
Completed processing ZIP file: transArchive_201207_201209_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201210_201212.csv successfully!
Processed and saved transArchive_201210_201212.csv.
Completed processing ZIP file: transArchive_201210_201212.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201210_201212_inactive.csv successfully!
Processed and saved transArchive_201210_201212_inactive.csv.
Completed processing ZIP file: transArchive_201210_201212_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201301_201303.csv successfully!
Processed and saved transArchive_201301_201303.csv.
Completed processing ZIP file: transArchive_201301_201303.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201301_201303_inactive.csv successfully!
Processed and saved transArchive_201301_201303_inactive.csv.
Completed processing ZIP file: transArchive_201301_201303_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201304_201306.csv successfully!
Processed and saved transArchive_201304_201306.csv.
Completed processing ZIP file: transArchive_201304_201306.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201304_201306_inactive.csv successfully!
Processed and saved transArchive_201304_201306_inactive.csv.
Completed processing ZIP file: transArchive_201304_201306_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201307_201309.csv successfully!
Processed and saved transArchive_201307_201309.csv.
Completed processing ZIP file: transArchive_201307_201309.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201307_201309_inactive.csv successfully!
Processed and saved transArchive_201307_201309_inactive.csv.
Completed processing ZIP file: transArchive_201307_201309_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201310_201312.csv successfully!
Processed and saved transArchive_201310_201312.csv.
Completed processing ZIP file: transArchive_201310_201312.zip

Loaded transArchive_201310_201312_inactive.csv successfully!
Processed and saved transArchive_201310_201312_inactive.csv.
Completed processing ZIP file: transArchive_201310_201312_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201401_201403.csv successfully!
Processed and saved transArchive_201401_201403.csv.
Completed processing ZIP file: transArchive_201401_201403.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201401_201403_inactive.csv successfully!
Processed and saved transArchive_201401_201403_inactive.csv.
Completed processing ZIP file: transArchive_201401_201403_inactive.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201404_201406.csv successfully!
Processed and saved transArchive_201404_201406.csv.
Completed processing ZIP file: transArchive_201404_201406.zip

Loaded transArchive_201404_201406_inactive.csv successfully!
Processed and saved transArchive_201404_201406_inactive.csv.
Completed processing ZIP file: transArchive_201404_201406_inactive.zip

Loaded transArchive_201407_201409.csv successfully!
Processed and saved transArchive_201407_201409.csv.
Completed processing ZIP file: transArchive_201407_201409.zip

Loaded transArchive_201407_201409_inactive.csv successfully!
Processed and saved transArchive_201407_201409_inactive.csv.
Completed processing ZIP file: transArchive_201407_201409_inactive.zip

Loaded transArchive_201410_201412.csv successfully!
Processed and saved transArchive_201410_201412.csv.
Completed processing ZIP file: transArchive_201410_201412.zip

Loaded transArchive_201410_201412_inactive.csv successfully!
Processed and saved transArchive_201410_201412_ina

  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201511.csv successfully!
Processed and saved transArchive_201511.csv.
Completed processing ZIP file: transArchive_201511.zip

Loaded transArchive_201512.csv successfully!
Processed and saved transArchive_201512.csv.
Completed processing ZIP file: transArchive_201512.zip

Loaded transArchive_201601.csv successfully!
Processed and saved transArchive_201601.csv.
Completed processing ZIP file: transArchive_201601.zip

Loaded transArchive_201602.csv successfully!
Processed and saved transArchive_201602.csv.
Completed processing ZIP file: transArchive_201602.zip

Loaded transArchive_201603.csv successfully!
Processed and saved transArchive_201603.csv.
Completed processing ZIP file: transArchive_201603.zip

Loaded transArchive_201604.csv successfully!
Processed and saved transArchive_201604.csv.
Completed processing ZIP file: transArchive_201604.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201605.csv successfully!
Processed and saved transArchive_201605.csv.
Completed processing ZIP file: transArchive_201605.zip

Loaded transArchive_201606.csv successfully!
Processed and saved transArchive_201606.csv.
Completed processing ZIP file: transArchive_201606.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201607.csv successfully!
Processed and saved transArchive_201607.csv.
Completed processing ZIP file: transArchive_201607.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201608.csv successfully!
Processed and saved transArchive_201608.csv.
Completed processing ZIP file: transArchive_201608.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201609.csv successfully!
Processed and saved transArchive_201609.csv.
Completed processing ZIP file: transArchive_201609.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201610.csv successfully!
Processed and saved transArchive_201610.csv.
Completed processing ZIP file: transArchive_201610.zip

Loaded transArchive_201611.csv successfully!
Processed and saved transArchive_201611.csv.
Completed processing ZIP file: transArchive_201611.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201612.csv successfully!
Processed and saved transArchive_201612.csv.
Completed processing ZIP file: transArchive_201612.zip



  df = pd.read_csv(working_file, delimiter=delimiter, header=header)


Loaded transArchive_201701.csv successfully!
Processed and saved transArchive_201701.csv.
Completed processing ZIP file: transArchive_201701.zip



#### DOUBLE CHECK CSV headers 

In [None]:
# Loop through all files in the directory
csv_files = [f for f in os.listdir(processed_files_directory) if f.endswith('.csv')]

# Function to print headers of each CSV
def print_csv_headers(file_name, file_path):
    try:
        # Read only the first 5 rows to print headers
        df = pd.read_csv(file_path, nrows=5)
        print(f"Headers for {file_name}: {list(df.columns)}")
    except Exception as e:
        print(f"Error reading {file_name}: {e}")

# Loop through and print headers for each CSV file
for csv_file in csv_files:
    file_path = os.path.join(processed_files_directory, csv_file)
    print_csv_headers(csv_file, file_path)

In [None]:
print(df.columns.tolist()) 

In [None]:
job.result()