In [1]:
# Mount Google Drive inside the Colab runtime. The data paths configured
# further down live under /content/drive/, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import os
import glob

# ==========================================
# 1. CONFIGURATION (Drive Edition)
# ==========================================
CONFIG = dict(
    # Root folder that is scanned for per-year sub-folders.
    # '/content/drive/MyDrive/' is where Colab exposes a mounted Drive;
    # edit the remainder to match your own folder layout.
    BASE_DIR="/content/drive/MyDrive/Lab/Lab_Data_Analysis/",

    # File names expected inside every year folder.
    RAW_FILENAME="vehicle.csv",
    AUX_FILENAME="VEH_AUX.csv",
    ACC_FILENAME="accident.csv",

    # Code lists used to flag rows: LARGE_TRUCK is matched against the
    # A_BODY column and CURVES against the V_ALIGN column.
    CODES=dict(
        LARGE_TRUCK=[4],
        CURVES=[2, 3, 4],
    ),
)

# ==========================================
# 2. HELPER: Auto-Detect Years
# ==========================================

def find_file_insensitive(folder, target_name):
    """
    Locate a regular file in 'folder' whose name equals 'target_name', ignoring case.
    Example: Finds 'veh_aux.csv' even if you asked for 'VEH_AUX.CSV'.

    Only the folder itself is inspected (no recursion), and directories that
    happen to carry a matching name are ignored. If several files differ only
    by case, the exact-case name wins; otherwise the alphabetically first
    candidate is returned so the result does not depend on listing order.

    Args:
        folder: Directory to search.
        target_name: File name to look for (case-insensitive).

    Returns:
        The joined path 'folder/<actual name>' of the match, or None when the
        folder cannot be listed or contains no matching file.
    """
    try:
        # Sorted so the outcome is reproducible; os.listdir order is arbitrary.
        entries = sorted(os.listdir(folder))
    except OSError:
        # Missing / unreadable folder is treated the same as "not found".
        return None

    wanted = target_name.lower()
    fallback = None

    for entry in entries:
        if entry.lower() != wanted:
            continue

        candidate = os.path.join(folder, entry)
        # os.listdir also returns sub-directories; those cannot be read as CSV.
        if not os.path.isfile(candidate):
            continue

        if entry == target_name:
            return candidate  # Exact-case hit always takes precedence
        if fallback is None:
            fallback = candidate

    return fallback

def find_data_folders():
    """
    Scans the BASE_DIR tree and finds all year folders containing the raw file.

    A folder qualifies when it directly contains a file named
    CONFIG["RAW_FILENAME"] (compared case-insensitively, the same way the
    loader later resolves the file) and the folder's own name is a 4-digit
    number, which is taken as the year.

    Folders are visited in sorted, top-down order. If the same year shows up
    more than once, the first folder visited is kept and the others are
    reported and ignored, so the result is reproducible between runs.
    Hidden directories (names starting with '.') are not entered.

    Returns:
        dict mapping year (int) to folder path (str),
        e.g. { 2021: 'path/to/2021', 2022: 'path/to/2022' }.
        Empty when nothing qualifies or BASE_DIR does not exist.
    """
    valid_years = {}
    wanted = CONFIG["RAW_FILENAME"].lower()

    print(f"Scanning {CONFIG['BASE_DIR']}...")

    # normpath removes a trailing separator so basename() of the root folder
    # is its real name rather than an empty string.
    root_dir = os.path.normpath(CONFIG["BASE_DIR"])

    # followlinks=True: a recursive '**' glob also descends into directory symlinks.
    for folder_path, subdirs, filenames in os.walk(root_dir, followlinks=True):
        # Editing 'subdirs' in place steers os.walk: skip hidden folders
        # (a '**' glob skips them too) and fix the traversal order.
        subdirs[:] = sorted(d for d in subdirs if not d.startswith('.'))

        if not any(name.lower() == wanted for name in filenames):
            continue

        # The year is read from the folder immediately containing the file
        # (e.g., ".../2021/vehicle.csv").
        folder_name = os.path.basename(folder_path)

        # isdecimal() only accepts characters that int() can parse.
        if not (folder_name.isdecimal() and len(folder_name) == 4):
            print(f"  [-] Found file in '{folder_name}' but it doesn't look like a year. Skipping.")
            continue

        year = int(folder_name)
        if year in valid_years:
            print(f"  [!] Duplicate folder for Year {year}: keeping '{valid_years[year]}', ignoring '{folder_path}'")
            continue

        valid_years[year] = folder_path
        print(f"  [+] Found data for Year: {year}")

    return valid_years

# ==========================================
# 3. PROCESSING PIPELINE
# ==========================================
def _read_csv_upper(path):
    """Read one CSV (latin1) and return it with stripped, upper-cased column names."""
    # low_memory=False lets pandas infer each dtype from the whole file,
    # which prevents mixed-dtype warnings on large inputs.
    frame = pd.read_csv(path, encoding='latin1', low_memory=False)
    frame.columns = [str(col).strip().upper() for col in frame.columns]
    return frame


def load_and_process_year(year, folder_path):
    """
    Load the three CSV files of one year folder, merge them and add flag columns.

    Steps:
      1. Resolve the raw, aux and accident files (case-insensitive names).
      2. Read them and normalise column names (upper case, no padding).
      3. Inner-join raw + aux on ST_CASE/VEH_NO, then left-join the accident
         file on ST_CASE. Both joins validate key uniqueness so that
         duplicated keys cannot silently multiply rows.
      4. Drop the duplicate columns created by the joins.
      5. Add YEAR, is_large_truck, is_curve and is_truck_on_curve.

    Args:
        year: Value written to the YEAR column (also used in messages).
        folder_path: Folder that holds the three files.

    Returns:
        The merged DataFrame, or None when a file is missing, a required
        column is absent, or any error occurs while reading/merging. Problems
        are reported with print(); nothing is raised to the caller.
    """
    print(f"  [?] Checking folder: {folder_path}")

    # Use the case-insensitive finder for all three files
    path_raw = find_file_insensitive(folder_path, CONFIG["RAW_FILENAME"])
    path_aux = find_file_insensitive(folder_path, CONFIG["AUX_FILENAME"])
    path_acc = find_file_insensitive(folder_path, CONFIG["ACC_FILENAME"])

    # Validation: stop at the first missing input
    for config_key, resolved in (("RAW_FILENAME", path_raw),
                                 ("AUX_FILENAME", path_aux),
                                 ("ACC_FILENAME", path_acc)):
        if not resolved:
            print(f"  [!] CRITICAL: Missing '{CONFIG[config_key]}'")
            return None

    try:
        print(f"      Loading: {os.path.basename(path_raw)} + {os.path.basename(path_aux)} + {os.path.basename(path_acc)}")

        # Load data with standardised column names
        df_raw = _read_csv_upper(path_raw)
        df_aux = _read_csv_upper(path_aux)
        df_acc = _read_csv_upper(path_acc)

        # Rename the legacy column, unless the current name is already present
        # (renaming then would create two V_ALIGN columns).
        if 'VALIGN' in df_raw.columns and 'V_ALIGN' not in df_raw.columns:
            df_raw = df_raw.rename(columns={'VALIGN': 'V_ALIGN'})

        # MERGE STEP 1: Vehicle + Aux (Inner Join on ST_CASE + VEH_NO).
        # validate= raises MergeError if the key is duplicated on either side.
        df_merged = pd.merge(df_raw, df_aux, on=['ST_CASE', 'VEH_NO'], how='inner',
                             suffixes=('', '_AUX_DROP'), validate='one_to_one')

        # An inner join discards unmatched rows; make that loss visible.
        unmatched = len(df_raw) - len(df_merged)
        if unmatched:
            print(f"  [!] Warning: {unmatched} row(s) of '{os.path.basename(path_raw)}' "
                  f"had no match in '{os.path.basename(path_aux)}' and were dropped")

        # MERGE STEP 2: Result + Accident (Left Join on ST_CASE only).
        # Suffixes prevent duplicate column names like STATE_x, STATE_y;
        # many_to_one guarantees the row count of the left side is preserved.
        df_final = pd.merge(df_merged, df_acc, on='ST_CASE', how='left',
                            suffixes=('', '_ACC_DROP'), validate='many_to_one')

        # Clean up: remove exactly the duplicate columns tagged by the merges
        merge_suffixes = ('_AUX_DROP', '_ACC_DROP')
        duplicate_cols = [col for col in df_final.columns if col.endswith(merge_suffixes)]
        df_final = df_final.drop(columns=duplicate_cols)

        # The flags below need these columns; fail with a readable message.
        missing_cols = [col for col in ('A_BODY', 'V_ALIGN') if col not in df_final.columns]
        if missing_cols:
            print(f"  [!] Error processing {year}: missing required column(s): {missing_cols}")
            return None

        # Apply Logic / Calculations
        is_large_truck = df_final['A_BODY'].isin(CONFIG["CODES"]["LARGE_TRUCK"])
        is_curve = df_final['V_ALIGN'].isin(CONFIG["CODES"]["CURVES"])

        return df_final.assign(
            YEAR=year,
            is_large_truck=is_large_truck,
            is_curve=is_curve,
            is_truck_on_curve=is_large_truck & is_curve,
        )

    except Exception as e:
        # Best-effort per year: report and let the caller continue with other years.
        print(f"  [!] Error processing {year}: {e}")
        return None

# ==========================================
# 4. MAIN EXECUTION
# ==========================================
# Discover the year folders first, then load them one at a time.
detected_years = find_data_folders()
all_data = []

if detected_years:
    # Ascending year order keeps the log and the combined frame chronological.
    for year in sorted(detected_years):
        print(f"Processing {year}...")
        df_year = load_and_process_year(year, detected_years[year])

        # A year that failed to load comes back as None and is left out.
        if df_year is None:
            continue
        all_data.append(df_year)

    if not all_data:
        print("Processing complete, but no data was generated.")
    else:
        # Stack every successfully loaded year into a single frame.
        final_df = pd.concat(all_data, ignore_index=True)
        print(f"\nSuccessfully imported {len(final_df)} total records from {len(all_data)} years.")
        print(final_df.shape)

        # Quick sanity check: number of flagged rows per year.
        # NOTE(review): the per-year frames are joined on ST_CASE + VEH_NO, so
        # this sum counts flagged vehicle rows; confirm that is what the
        # "crashes" label below is meant to report.
        summary = final_df.groupby('YEAR')['is_truck_on_curve'].sum()
        print("\nTruck Crashes on Curves per Year:")
        print(summary)
else:
    print("No valid data folders found! Check your BASE_DIR path.")

Scanning /content/drive/MyDrive/Lab/Lab_Data_Analysis/...
  [+] Found data for Year: 2023
  [+] Found data for Year: 2022
  [+] Found data for Year: 2021
  [+] Found data for Year: 2020
Processing 2020...
  [?] Checking folder: /content/drive/MyDrive/Lab/Lab_Data_Analysis/2020
      Loading: vehicle.csv + VEH_AUX.CSV + accident.csv
Processing 2021...
  [?] Checking folder: /content/drive/MyDrive/Lab/Lab_Data_Analysis/2021
      Loading: vehicle.csv + VEH_AUX.CSV + accident.csv
Processing 2022...
  [?] Checking folder: /content/drive/MyDrive/Lab/Lab_Data_Analysis/2022
      Loading: vehicle.csv + veh_aux.csv + accident.csv
Processing 2023...
  [?] Checking folder: /content/drive/MyDrive/Lab/Lab_Data_Analysis/2023
      Loading: vehicle.csv + veh_aux.csv + accident.csv

Successfully imported 235438 total records from 4 years.
(235438, 289)

Truck Crashes on Curves per Year:
YEAR
2020    261
2021    334
2022    294
2023    293
Name: is_truck_on_curve, dtype: int64
