Step 1: Extract the Main Zip File

In [1]:
import zipfile
import os
import shutil

def extract_main_zip(main_zip_file, extract_to_folder):
    """Extract the active members of the main archive into a folder.

    Members whose name contains ``_inactive`` are skipped, and so are members
    that already exist on disk, which makes repeated runs cheap.

    Compared with a plain "open and copy" loop this also:

    * creates directory entries (names ending in ``/``) as folders instead of
      trying to open them as files;
    * refuses members whose resolved path would fall outside
      ``extract_to_folder`` (for example ``../x`` or an absolute name);
    * writes each file under a temporary ``.part`` name and renames it once
      the copy is complete, so an interrupted run cannot leave a truncated
      file that a later run would treat as "already exists".

    Args:
        main_zip_file: Path to the outer zip archive.
        extract_to_folder: Destination folder; created if it does not exist.

    Raises:
        FileNotFoundError: If ``main_zip_file`` does not exist.
        zipfile.BadZipFile: If ``main_zip_file`` is not a valid zip archive.
    """
    # Ensure the extraction folder exists
    os.makedirs(extract_to_folder, exist_ok=True)

    # Resolved once; every member path is compared against this root below.
    destination_root = os.path.realpath(extract_to_folder)
    if destination_root.endswith(os.sep):
        root_prefix = destination_root
    else:
        root_prefix = destination_root + os.sep

    with zipfile.ZipFile(main_zip_file, 'r') as main_zip:
        for zip_info in main_zip.infolist():
            # Skip files with "_inactive" in their names
            if '_inactive' in zip_info.filename:
                print(f"Skipping {zip_info.filename}, marked as inactive.")
                continue

            output_file_path = os.path.join(extract_to_folder, zip_info.filename)

            # Member names come from the archive, so make sure the resolved
            # target stays inside the destination folder.
            resolved_path = os.path.realpath(output_file_path)
            if resolved_path != destination_root and not resolved_path.startswith(root_prefix):
                print(f"Skipping {zip_info.filename}, path escapes {extract_to_folder}.")
                continue

            # Directory entries carry no data; just make sure the folder exists.
            if zip_info.is_dir():
                os.makedirs(output_file_path, exist_ok=True)
                continue

            if os.path.exists(output_file_path):
                print(f"Skipping {zip_info.filename}, already exists.")
                continue

            # Create any necessary parent directories
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            # Copy to a temporary name first, then rename, so the final path
            # only ever appears once the member is fully written.
            partial_path = output_file_path + '.part'
            with main_zip.open(zip_info) as source, open(partial_path, 'wb') as target:
                shutil.copyfileobj(source, target)
            os.replace(partial_path, output_file_path)
            print(f"Extracted {zip_info.filename} to {output_file_path}")

# Example usage
# Source archive and destination folder for step 1.
# NOTE(review): both are absolute paths on a local D: drive, so this cell
# only runs unchanged on a machine with the same directory layout.
main_zip = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/WedgeZipOfZips(raw).zip'
extract_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_main_zip'

# Copies every member of main_zip into extract_folder, except members whose
# name contains "_inactive" or that already exist there.
extract_main_zip(main_zip, extract_folder)




Skipping transArchive_201001_201003.zip, already exists.
Skipping transArchive_201004_201006.zip, already exists.
Skipping transArchive_201007_201009.zip, already exists.
Skipping transArchive_201010_201012.zip, already exists.
Skipping transArchive_201101_201103.zip, already exists.
Skipping transArchive_201104.zip, already exists.
Skipping transArchive_201105.zip, already exists.
Skipping transArchive_201106.zip, already exists.
Skipping transArchive_201107_201109.zip, already exists.
Skipping transArchive_201110_201112.zip, already exists.
Skipping transArchive_201201_201203.zip, already exists.
Skipping transArchive_201201_201203_inactive.zip, marked as inactive.
Skipping transArchive_201204_201206.zip, already exists.
Skipping transArchive_201204_201206_inactive.zip, marked as inactive.
Skipping transArchive_201207_201209.zip, already exists.
Skipping transArchive_201207_201209_inactive.zip, marked as inactive.
Skipping transArchive_201210_201212.zip, already exists.
Skipping tran

Step 2: Extract the Nested Zip Files

In [2]:
import zipfile
import os

def extract_all_csvs_to_one_folder(extract_folder, output_folder):
    """Pull the CSV members out of every zip found under ``extract_folder``.

    The folder tree is walked recursively. Each file whose name ends in
    ``.zip`` is opened, and every member whose name ends in ``.csv`` is
    extracted beneath ``output_folder`` unless a file already exists at that
    location. A file that cannot be read as a zip archive is reported and
    skipped.

    Args:
        extract_folder: Folder tree containing the nested zip files.
        output_folder: Destination for the CSV files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    for current_dir, _subdirs, filenames in os.walk(extract_folder):
        for archive_name in filenames:
            # Only zip archives are of interest here.
            if not archive_name.endswith('.zip'):
                continue

            nested_zip_path = os.path.join(current_dir, archive_name)

            # Opening and extracting share one guard so an invalid archive is
            # reported once and the walk carries on with the next file.
            try:
                with zipfile.ZipFile(nested_zip_path, 'r') as archive:
                    for zip_info in archive.infolist():
                        if not zip_info.filename.endswith('.csv'):
                            continue

                        destination = os.path.join(output_folder, zip_info.filename)
                        if os.path.exists(destination):
                            print(f"Skipping {zip_info.filename}, already exists.")
                            continue

                        archive.extract(zip_info, output_folder)
                        print(f"Extracted {zip_info.filename} to {output_folder}")
            except zipfile.BadZipFile:
                print(f"Skipping {nested_zip_path}, not a valid zip file.")

# Example usage
# extract_folder is the folder step 1 wrote the nested zip files into;
# output_folder receives the CSV files found inside those zips.
# NOTE(review): absolute local paths - adjust when running on another machine.
extract_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_main_zip'
output_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files'  # destination for the CSVs

# Walks extract_folder, opens each .zip and extracts the .csv members that are
# not already present in output_folder.
extract_all_csvs_to_one_folder(extract_folder, output_folder)




Skipping transArchive_201001_201003.csv, already exists.
Skipping transArchive_201004_201006.csv, already exists.
Skipping transArchive_201007_201009.csv, already exists.
Skipping transArchive_201010_201012.csv, already exists.
Skipping transArchive_201101_201103.csv, already exists.
Skipping transArchive_201104.csv, already exists.
Skipping transArchive_201105.csv, already exists.
Skipping transArchive_201106.csv, already exists.
Skipping transArchive_201107_201109.csv, already exists.
Skipping transArchive_201110_201112.csv, already exists.
Skipping transArchive_201201_201203.csv, already exists.
Skipping transArchive_201204_201206.csv, already exists.
Skipping transArchive_201207_201209.csv, already exists.
Skipping transArchive_201210_201212.csv, already exists.
Skipping transArchive_201301_201303.csv, already exists.
Skipping transArchive_201304_201306.csv, already exists.
Skipping transArchive_201307_201309.csv, already exists.
Skipping transArchive_201310_201312.csv, already exi

Step 3: Clean and Standardize the CSV Files

In [3]:
import pandas as pd
import glob
import os

def clean_and_standardize_file(input_file, null_markers=("NULL", r"\N", r"\\N")):
    r"""Load one CSV file and normalise its NULL markers to missing values.

    The delimiter is detected automatically. The strings in ``null_markers``
    are declared as missing values at parse time rather than replaced
    afterwards. That way a numeric column containing a marker such as ``\N``
    is still parsed as numeric (with NaN for the marker) instead of coming
    back as an object column of strings.

    Args:
        input_file: Path of the CSV file to read.
        null_markers: Cell values to treat as missing, in addition to the
            markers pandas recognises by default. Defaults to ``NULL``,
            ``\N`` and ``\\N``.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if the file could not be
        read (the error is printed, not raised, so a batch run can continue).
    """
    try:
        # sep=None asks the python engine to detect the delimiter
        # automatically.
        df = pd.read_csv(
            input_file,
            sep=None,
            engine='python',
            na_values=list(null_markers),
        )
    except Exception as e:
        # Best effort by design: report the failing file and let the caller
        # move on to the next one.
        print(f"Error processing {input_file}: {e}")
        return None

    print(f"Cleaned {input_file} in memory")
    return df

def process_extracted_csvs_in_memory(extracted_folder):
    """Clean every CSV under a folder and return the resulting dataframes.

    The folder is searched recursively for ``*.csv`` files. Each file is
    passed to ``clean_and_standardize_file``; files for which that returns
    ``None`` are left out of the result.

    Args:
        extracted_folder: Folder containing the extracted CSV files.

    Returns:
        A list of cleaned dataframes, ordered by file path so repeated runs
        produce the same order regardless of the filesystem's listing order.
    """
    # Escape the folder part so characters such as '[' , '*' or '?' in the
    # path are matched literally instead of being read as glob wildcards.
    pattern = f"{glob.escape(str(extracted_folder))}/**/*.csv"
    csv_files = sorted(glob.glob(pattern, recursive=True))

    # Clean each file in memory, keeping only the ones that loaded.
    cleaned_dfs = []
    for csv_file in csv_files:
        df = clean_and_standardize_file(csv_file)
        if df is not None:
            cleaned_dfs.append(df)

    return cleaned_dfs

# Example usage
# NOTE(review): absolute local path - adjust when running on another machine.
extracted_folder = 'D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files'  # Folder where your extracted CSVs are located

# Process and clean all CSVs in memory; the result is a list with one
# dataframe per CSV file that loaded successfully.
cleaned_dataframes = process_extracted_csvs_in_memory(extracted_folder)


Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201001_201003.csv in memory
Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201004_201006.csv in memory
Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201007_201009.csv in memory
Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201010_201012.csv in memory
Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201101_201103.csv in memory
Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201104.csv in memory
Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201105.csv in memory
Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORorke/data/extracted_csv_files\transArchive_201106.csv in memory
Cleaned D:/WedgeProject/Wedge-Project-ADA-Riley-ORork