In [83]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
# Define the file path
file_path = "C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/MI/Michigan_CrashDataset_2022.csv"

try:
    # Assuming CSV for now based on the path extension in the code
    df_mi = pd.read_csv(file_path, low_memory=False)
    print("CSV file loaded successfully.")
except UnicodeDecodeError:
    print(f"UTF-8 decoding failed for {file_path}. Trying different encodings...")
    try:
        # Try latin1 encoding, common for files with special characters
        df_mi = pd.read_csv(file_path, low_memory=False, encoding='latin1')
        print("CSV file loaded successfully with latin1 encoding.")
    except Exception as e:
        print(f"Error loading CSV file with latin1 encoding: {e}")
        # If this fails, consider if it's actually an Excel file -> use pd.read_excel
        # Or investigate other potential encodings like 'cp1252'
        df_mi = pd.DataFrame() # Create empty df to avoid downstream errors
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    df_mi = pd.DataFrame() # Create empty df
except Exception as e:
    print(f"An unexpected error occurred while loading the file: {e}")
    df_mi = pd.DataFrame() # Create empty df

CSV file loaded successfully.


In [85]:
# Proceed only if data loading was successful
if not df_mi.empty:

    # --- Create AccidentID and PersonID ---

    # Define columns to group by to identify unique crash events
    grouping_cols = [
        'Crash Year', 'Crash Month', 'Crash Day', 'Time of Day',
        'County', 'City or Township'
    ]

    # Verify grouping columns exist
    actual_grouping_cols = [col for col in grouping_cols if col in df_mi.columns]
    if len(actual_grouping_cols) != len(grouping_cols):
        print(f"Warning: Not all intended grouping columns found for AccidentID. Using: {actual_grouping_cols}")

    if actual_grouping_cols:
        print("Creating AccidentID based on grouping...")
        df_mi['AccidentID'] = df_mi.groupby(actual_grouping_cols).ngroup()
        print("AccidentID created.")
    else:
        print("Error: Cannot create AccidentID because essential grouping columns are missing.")
        # Handle error: Maybe create a dummy AccidentID or stop execution
        df_mi['AccidentID'] = -1 # Example: Dummy ID

    # Create PersonID as a unique row identifier (using reset_index approach)
    print("Creating PersonID...")
    df_mi.reset_index(drop=True, inplace=True) # Ensure index is sequential first
    df_mi['PersonID'] = df_mi.index
    print("PersonID created.")

    # Display the first few rows with the new IDs
    print("\nDataFrame head with new IDs:")
    # Add check if AccidentID was created before trying to display it
    display_cols_ids = ['PersonID']
    if 'AccidentID' in df_mi.columns:
        display_cols_ids.append('AccidentID')
    print(df_mi[display_cols_ids + actual_grouping_cols].head())

Creating AccidentID based on grouping...
AccidentID created.
Creating PersonID...
PersonID created.

DataFrame head with new IDs:
   PersonID  AccidentID  Crash Year Crash Month  Crash Day        Time of Day  \
0         0     66579.0      2022.0     January        1.0  4:00 PM - 4:59 PM   
1         1     66649.0      2022.0     January        1.0  5:00 PM - 5:59 PM   
2         2     66757.0      2022.0     January        1.0  6:00 PM - 6:59 PM   
3         3     66780.0      2022.0     January        1.0  6:00 PM - 6:59 PM   
4         4     66780.0      2022.0     January        1.0  6:00 PM - 6:59 PM   

      County                  City or Township  
0  Kalamazoo         Kalamazoo County: Portage  
1       Kent         Kent County: Cascade Twp.  
2    Oakland  Oakland County: Farmington Hills  
3      Wayne             Wayne County: Romulus  
4      Wayne             Wayne County: Romulus  


In [86]:
    # --- Preparation for Analysis ---
    # Define the columns we decided to keep
    columns_to_keep = [
        # === Newly Created IDs ===
        'AccidentID',         # Generated unique ID for each crash event
        'PersonID',           # Generated unique ID for each person record (row)

        # === Crash Event Identifiers & Time ===
        'Crash Year',
        'Crash Month',
        'Crash Day',
        'Time of Day',
        'Day of Week',

        # === Crash Location ===
        'County',
        'City or Township',
        'Rural/Urban Area (2016+)',
        'Highway Class',        # Added for context
        # 'Highway Number',     # Optional: Keep if route-level analysis is needed

        # === Crash Severity & Type ===
        'Worst Injury in Crash',  # Crash-level overall severity
        'Crash Type',
        'Crash: Fatal Crash',
        'Crash: Injury Crash',
        'Crash: Property Damage',

        # === Person/Party Information ===
        'Party Type',           # **Crucial** to identify drivers vs. others
        'Person Age',
        'Person Gender',
        'Person Race (2021+)',  # Use most recent Race/Ethnicity field
        'Person Degree of Injury', # Person-level injury outcome

        # === Driver Condition / Contributing Factors ===
        'Driver Condition: Emotional (2016+)',
        'Driver Condition: Fatigued or Asleep (2016+)',
        'Driver Condition: Medication',
        'Driver Condition: Normal',
        'Driver Condition: Other (2016+)',
        'Driver Condition: Physically Disabled (2016+)',
        'Driver Condition: Sick',
        # 'Driver Condition: Unknown (2016+)', # Optional: Often less informative
        'Driver Contributing Factor - Alcohol Use (2016+)', # Cited factor
        'Driver Contributing Factor - Drug Use (2016+)', # **Cited factor - Key**
        'Crash: Drinking',      # Crash-level flag
        'Crash: Drug Use',      # **Crash-level flag - Key**
        # 'Driver Drinking',    # Likely redundant with Contributing Factor, dropped for simplicity
        'Drugs Suspected',      # **Officer suspicion - Key**
        'Contributing Circumstance 1 (2016+)', # Context
        'Contributing Circumstance 2 (2016+)', # Context

        # === Substance Testing & Results ===
        'Test Offered - Alcohol',
        'Test Offered - Drug (2016+)',
        'Refusal Information - Alcohol',
        'Refusal Information - Drug (2016+)',
        'Test Result - Alcohol',
        'Test Result - Cannabinoid Drug (2021+)', # Specific drug class result
        'Test Result - Drug 1', # **Key drug result**
        'Test Result - Drug 2', # **Key drug result**
        'Test Result - Drug 3', # **Key drug result**
        'Test Result Pending - Alcohol (2016+)',
        'Test Result Pending - Drug (2016+)',

        # === Crash Environment Context ===
        'Lighting Conditions',
        'Road Conditions',
        'Weather Conditions (2016+)',
        'Traffic Control',
        'Posted Speed Limit',
        # 'Number of Traffic Lanes', # Optional context

        # === Other Relevant Person/Unit Attributes ===
        'Person Position',
        'Person Trapped',
        'Person Ejection',
        'Person Restraint (2016+)', # Use most recent restraint field
        'Traffic Unit Type',    # Vehicle/unit type context
        'Vehicle Model Year',   # Vehicle context
        'Vehicle Type',         # Vehicle context
        'Extent of Damage (2015+)' # Use most recent damage field
    ]

    # Verify which of these columns actually exist in the loaded dataframe
    existing_columns_to_keep = [col for col in columns_to_keep if col in df_mi.columns]
    missing_expected_columns = [col for col in columns_to_keep if col not in df_mi.columns]

    if missing_expected_columns:
        print(f"\nWarning: The following columns specified in 'columns_to_keep' were NOT found in the DataFrame and will be ignored: {missing_expected_columns}")

    # Create a dataframe with only the selected available columns for analysis
    print(f"\nCreating analysis DataFrame with {len(existing_columns_to_keep)} columns.")
    df_analysis = df_mi[existing_columns_to_keep].copy()
    
    # Find the actual column name for Race/Ethnicity
    # Updated to search for the specific column name identified
    race_col_name = None
    # Prioritize the exact name from the refined list
    if 'Person Race (2021+)' in df_analysis.columns:
        race_col_name = 'Person Race (2021+)'
        print(f"Found Race/Ethnicity column: '{race_col_name}'")
    else:
        # Fallback search if the exact name isn't present (less likely now)
        possible_race_cols = ['Race / Ethnicity', 'Race/Ethnicity', 'RACE', 'ETHNICITY']
        for col in possible_race_cols:
            if col in df_analysis.columns:
                race_col_name = col
                print(f"Found Race/Ethnicity column using fallback search: '{race_col_name}'")
                break
    if not race_col_name:
        print("Warning: Race/Ethnicity column ('Person Race (2021+)' or fallbacks) not found.")
    
    # Find the actual column name for Party Type (for identifying drivers)
    # Updated to search for the specific column name identified
    party_type_col = None
    # Prioritize 'Party Type' as identified in the refined list
    if 'Party Type' in df_analysis.columns:
        party_type_col = 'Party Type'
        print(f"Found Party Type column: '{party_type_col}'")
    # Fallback to 'Traffic Unit Type' if 'Party Type' is missing
    elif 'Traffic Unit Type' in df_analysis.columns:
         party_type_col = 'Traffic Unit Type'
         print(f"Found Party Type column (using fallback): '{party_type_col}'")
    else:
        # Fallback to original broader search if specific names fail
        possible_party_cols = ['Unit Type', 'UNITTYPE', 'Person Type']
        for col in possible_party_cols:
            if col in df_analysis.columns:
                party_type_col = col
                print(f"Found Party Type column using broader fallback search: '{party_type_col}'")
                break
    if not party_type_col:
        print("Error: Party Type column ('Party Type', 'Traffic Unit Type', or fallbacks) not found. Cannot reliably filter for drivers.")
        # Consider stopping or proceeding with caution, df_drivers will be a copy of df_analysis
    
    
    # --- Filter for Drivers (Crucial Step as per request) ---
    df_drivers = df_analysis.copy() # Initialize df_drivers as a copy first
    driver_count = len(df_drivers) # Default to all rows if filtering fails
    original_person_count = len(df_analysis)
    
#    if party_type_col:
#        print(f"\nUnique values in '{party_type_col}' (used for driver filtering):")
#        try:
#            # Display value counts to help identify the correct driver string
#            print(df_analysis[party_type_col].value_counts())
#    
#            # **Action Required:** Confirm the exact string used for drivers from the output above.
#            # Common values might be 'Motor Vehicle Driver', 'Driver', 'DRIVER'. Adjust below.
#            driver_identifier = 'Motor Vehicle Driver' # <-- **VERIFY THIS VALUE**
#    
#            # Perform the filtering
#            df_drivers = df_analysis[df_analysis[party_type_col] == driver_identifier].copy()
#            driver_count = len(df_drivers)
#            print(f"\nFiltered for drivers using '{driver_identifier}'. Original persons in selection: {original_person_count}, Drivers identified: {driver_count}")
#            if driver_count == 0:
#                print(f"WARNING: No rows matched the driver identifier '{driver_identifier}'. Check the value counts above and update the identifier.")
#                # Reset df_drivers to the unfiltered data to allow subsequent analysis steps to run
#                df_drivers = df_analysis.copy()
#                print("Proceeding analysis on the unfiltered data due to filtering issue.")
#    
#        except KeyError:
#            print(f"Error accessing Party Type column '{party_type_col}' during filtering. Skipping driver filter.")
#        except Exception as e:
#            print(f"An unexpected error occurred during driver filtering: {e}. Skipping driver filter.")
#    else:
#         print("\nSkipping driver filter as Party Type column was not identified.")
#    
#    print(f"\n--- Starting Detailed Analysis on {driver_count} records (filtered drivers if successful) ---")

    # --- Analysis of Kept Attributes (on df_drivers) ---
    
    for col in df_drivers.columns:
        # Skip analysis for high-cardinality IDs if they dominate output
        if col in ['AccidentID', 'PersonID'] and df_drivers[col].nunique() > 1000:
            print(f"\n--- Skipping detailed value counts for high-cardinality ID: {col} ---")
            print(f"Data Type: {df_drivers[col].dtype}")
            missing_count = df_drivers[col].isnull().sum()
            missing_percent = (missing_count / len(df_drivers)) * 100 if len(df_drivers) > 0 else 0
            print(f"Missing Values: {missing_count} ({missing_percent:.2f}%)")
            continue # Move to next column
    
        print(f"\n--- Analyzing Column: {col} ---")
        print(f"Data Type: {df_drivers[col].dtype}")
    
        # Missing Values
        missing_count = df_drivers[col].isnull().sum()
        missing_percent = (missing_count / len(df_drivers)) * 100 if len(df_drivers) > 0 else 0
        print(f"Missing Values: {missing_count} ({missing_percent:.2f}%)")
    
        # Unique Values & Potential Errors (Value Counts) - Handle potential division by zero if df_drivers is empty
        if len(df_drivers) > 0:
            if df_drivers[col].dtype == 'object' or df_drivers[col].nunique() < 50: # Show value counts for categorical or low-cardinality numeric
                print("Value Counts (Top 20):")
                print(df_drivers[col].value_counts(dropna=False).head(20)) # Show top 20 + NaN count
                if df_drivers[col].nunique() > 20: print("... (truncated)")
                # Look for 'Unknown', 'Not Reported', 'Uncoded & Errors', '99', '', whitespace, typos here.
    
            # Specific Checks based on column type
            if pd.api.types.is_numeric_dtype(df_drivers[col]): # More robust check for numeric types
                # Numeric Analysis
                print("Basic Statistics:")
                print(df_drivers[col].describe())
                skewness = df_drivers[col].skew()
                print(f"Skewness: {skewness:.2f}")
                # Outliers (Visual check recommended in later steps)
                # Plan: Use IQR or Z-score during cleaning if appropriate (e.g., for Person Age).
    
                # Potential Fixes/Transformations:
                if missing_count > 0:
                    print(f"FIX REQUIRED: Impute missing numeric values (e.g., using mean, median, mode, or model-based). Strategy depends on variable (e.g., median for skewed 'Person Age').")
                if abs(skewness) > 1:
                    print(f"TRANSFORMATION NEEDED?: High skewness detected. Consider log, square root, or Box-Cox transformation if required by modeling techniques.")
                # Check for placeholder numerics (e.g., 99, 999 meaning 'Unknown') based on data dictionary or value counts. Needs inspection of value_counts output.
    
            elif pd.api.types.is_object_dtype(df_drivers[col]): # Check for object type (includes strings)
                # Categorical/Text Analysis
                unique_count = df_drivers[col].nunique()
                print(f"Unique Values Count: {unique_count}")
                # Adjusted exclusion list for high cardinality warning
                if unique_count > 100 and col not in ['AccidentID', 'PersonID', 'City or Township']:
                     print(f"WARNING: High cardinality ({unique_count}). May need grouping or feature engineering.")
    
                # Potential Fixes/Transformations:
                if missing_count > 0:
                    print(f"FIX REQUIRED: Impute missing categorical/text values (e.g., using mode or 'Unknown' category).")
                # Check for inconsistencies (e.g., 'Male', 'male', 'M'; 'Yes', 'No', 'Y', 'N') in value_counts output. Needs inspection.
                print(f"FIX REQUIRED: Standardize values (e.g., lowercase, mapping variations like Yes/No to 1/0, using consistent codes for 'Unknown').")
                print(f"TRANSFORMATION NEEDED: Apply encoding (e.g., One-Hot Encoding, Target Encoding) for modeling.")
                # Check for text errors like leading/trailing spaces:
                # Need to handle potential non-string data in object columns before using .str accessor
                if pd.api.types.is_string_dtype(df_drivers[col]): # Check if specifically string type
                     if df_drivers[col].str.strip().nunique() != unique_count:
                          print("FIX REQUIRED: Trim leading/trailing whitespace.")
                elif df_drivers[col].apply(type).eq(str).all(): # Check if all elements are strings
                     if df_drivers[col].str.strip().nunique() != unique_count:
                         print("FIX REQUIRED: Trim leading/trailing whitespace.")
    
            # Date/Time Check (Separate Columns - handled during combination/sorting phase)
            if col in ['Crash Year', 'Crash Month', 'Crash Day', 'Time of Day']:
                 pass # Analysis/combination done in data preparation step before this deep dive ideally.
    
        else: # Handle case where df_drivers is empty
            print("DataFrame is empty, skipping detailed analysis for this column.")
    
    
        # --- Specific Known Issues & Checks (Updated Columns) ---
        
        # A. Opioid Undercounting (Alcohol Focus): Compare relevant alcohol/drug flags
        print("\n--- Checking Indicators Related to Known Issue A (Alcohol Focus Bias) ---")
        # Using contributing factors as primary indicators
        alcohol_factor_col = 'Driver Contributing Factor - Alcohol Use (2016+)'
        drug_factor_col = 'Driver Contributing Factor - Drug Use (2016+)'
        # Also consider crash-level flags or suspicion
        drug_suspected_col = 'Drugs Suspected'
        
        if alcohol_factor_col in df_drivers.columns and drug_factor_col in df_drivers.columns:
            # Cross-tabulation of Alcohol vs. Drug Contributing Factors
            factor_crosstab = pd.crosstab(df_drivers[alcohol_factor_col], df_drivers[drug_factor_col], dropna=False)
            print(f"\nCross-tabulation of '{alcohol_factor_col}' vs. '{drug_factor_col}':")
            print(factor_crosstab)
            # Look for cases where Alcohol Factor=Yes and Drug Factor=No/Unknown.
        elif drug_suspected_col in df_drivers.columns: # Use suspicion if factors aren't available
             print(f"Analyzing '{drug_suspected_col}' counts:")
             print(df_drivers[drug_suspected_col].value_counts(dropna=False))
        else:
             print("Relevant columns for Alcohol/Drug focus check not found in the filtered data.")
        
        print("PLAN: Acknowledge potential alcohol focus bias. Analyze available drug flags/factors/suspicion cautiously.")
        
        
        # B. Opioid Undercounting (Coroner Data): Check testing for fatal injuries
        print("\n--- Checking Indicators Related to Known Issue B (Coroner Data Gap) ---")
        injury_col = 'Person Degree of Injury'
        test_offered_col = 'Test Offered - Drug (2016+)'
        test_result_col = 'Test Result - Drug 1' # Use Drug 1 as primary indicator
        
        if injury_col in df_drivers.columns and test_offered_col in df_drivers.columns and test_result_col in df_drivers.columns:
            # **Action Required:** Identify the code for fatal injuries from value_counts output for 'Person Degree of Injury'.
            fatal_code = 'K - Killed' # <-- **VERIFY THIS VALUE** (Common code, but check MI specifics)
        
            fatal_drivers = df_drivers[df_drivers[injury_col] == fatal_code]
            print(f"\nAnalysis of Drug Testing for Fatal Injuries ('{injury_col}' == '{fatal_code}'):")
        
            if not fatal_drivers.empty:
                print(f"\n'{test_offered_col}' counts for fatalities:")
                print(fatal_drivers[test_offered_col].value_counts(dropna=False))
        
                # Analyze results among those offered a test (assuming 'Yes' means offered)
                offered_fatal = fatal_drivers[fatal_drivers[test_offered_col] == 'Yes'] #<-- **VERIFY 'Yes' VALUE**
                if not offered_fatal.empty:
                     print(f"\n'{test_result_col}' counts among fatalities offered a drug test:")
                     print(offered_fatal[test_result_col].value_counts(dropna=False))
                     # How many offered tests have 'Not Reported', 'Unknown', or specific positive results?
                else:
                     print("No fatalities recorded as offered a drug test.")
            else:
                print(f"No drivers found with fatal injury code '{fatal_code}'.")
        else:
            print(f"Relevant columns ('{injury_col}', '{test_offered_col}', '{test_result_col}') not found for coroner data gap check.")
        
        print("PLAN: Acknowledge limitation regarding coroner data. Analyze available test offer/result data for fatalities, noting potential undercounting.")
        
        
        # --- Tuple Duplicates Check (Using Generated IDs) ---
        print("\n--- Checking for Duplicate Driver Records ---")
        if 'AccidentID' in df_drivers.columns and 'PersonID' in df_drivers.columns:
            # Check for duplicate PersonID within the same AccidentID (shouldn't happen if PersonID is unique row ID)
            # More relevant check: Is there more than one DRIVER per ACCIDENT?
            # This requires grouping by AccidentID and checking Party Type counts within groups.
            # Simplified check here: Are there duplicate PersonID rows overall in df_drivers? (Should be 0 if created correctly)
            person_duplicates = df_drivers.duplicated(subset=['PersonID'], keep=False).sum()
            if person_duplicates > 0:
                 print(f"WARNING: Found {person_duplicates} duplicate PersonID entries in the driver data. This indicates an issue with PersonID generation or prior filtering.")
            else:
                 print("No duplicate PersonID entries found in the filtered driver data.")
        
            # Check for multiple drivers per crash (more complex check, requires grouping)
            # Example concept (run only if needed and party_type_col is valid):
            # if party_type_col:
            #    multi_driver_crashes = df_drivers.groupby('AccidentID')[party_type_col].transform('count') > 1
            #    if multi_driver_crashes.any():
            #        print(f"WARNING: Found {multi_driver_crashes.sum()} driver records associated with crashes having potentially more than one driver listed.")
        else:
            print("Cannot check duplicates accurately without reliable AccidentID and PersonID.")
        
        
        # --- Initial Scale Checks (Updated Columns) ---
        print("\n--- Initial Scale Checks (Within MI Data) ---")
        # Example: Injury Severity codes
        if injury_col in df_drivers.columns:
            print(f"\n'{injury_col}' Codes & Counts:")
            print(df_drivers[injury_col].value_counts(dropna=False))
            print("PLAN: Document these codes (e.g., O, C, B, A, K). Ensure consistency when merging with FARS/PA data later (mapping likely required).")
        else:
            print(f"'{injury_col}' column not found for scale check.")
        
        # Example: Road Conditions
        road_cond_col = 'Road Conditions'
        if road_cond_col in df_drivers.columns:
            print(f"\n'{road_cond_col}' Codes & Counts:")
            print(df_drivers[road_cond_col].value_counts(dropna=False))
            print("PLAN: Document codes (e.g., 'Dry', 'Wet', 'Snowy'). Map to standardized categories during integration with other datasets.")
        else:
            print(f"'{road_cond_col}' column not found for scale check.")


Creating analysis DataFrame with 59 columns.
Found Race/Ethnicity column: 'Person Race (2021+)'
Found Party Type column: 'Party Type'

--- Skipping detailed value counts for high-cardinality ID: AccidentID ---
Data Type: float64
Missing Values: 1 (0.00%)

--- Skipping detailed value counts for high-cardinality ID: PersonID ---
Data Type: int64
Missing Values: 0 (0.00%)

--- Analyzing Column: Crash Year ---
Data Type: float64
Missing Values: 1 (0.00%)
Value Counts (Top 20):
Crash Year
2022.0    364104
NaN            1
Name: count, dtype: int64
Basic Statistics:
count    364104.0
mean       2022.0
std           0.0
min        2022.0
25%        2022.0
50%        2022.0
75%        2022.0
max        2022.0
Name: Crash Year, dtype: float64
Skewness: 0.00
FIX REQUIRED: Impute missing numeric values (e.g., using mean, median, mode, or model-based). Strategy depends on variable (e.g., median for skewed 'Person Age').

--- Checking Indicators Related to Known Issue A (Alcohol Focus Bias) ---

C

In [87]:
# --- Code for Tables/Figures Before Cleaning ---
# Run this *after* initial load into df_mi, *before* cleaning code.

print("\n--- Descriptive Statistics Before Cleaning (Raw Data Sample) ---")
# Display info for raw loaded data
print("Raw Data Info:")
df_mi.info()

# Display summary stats for numeric columns (raw)
print("\nRaw Numeric Summary:")
# Added error handling for describe in case no numeric columns exist
try:
    print(df_mi.describe(include=np.number))
except ValueError:
    print("No numeric columns to describe.")


# Display summary stats for object columns (raw)
print("\nRaw Object/Categorical Summary:")
try:
    print(df_mi.describe(include='object'))
except ValueError:
    print("No object columns to describe.")


# --- Example: Value Counts for Key Columns Before Cleaning ---

print("\n--- Value Counts Before Cleaning (Using Refined Key Columns) ---")

# Updated list to reflect key columns identified in the refined analysis
# These are the raw versions of columns we intend to keep or use for filtering/analysis
key_cols_pre_clean = [
    'Crash Year',                       # Time
    'County',                           # Location
    'Worst Injury in Crash',            # Severity (Crash Level)
    'Party Type',                       # Person Role (Crucial for Driver ID)
    'Person Age',                       # Demographic
    'Person Gender',                    # Demographic
    'Person Race (2021+)',              # Demographic (Latest available)
    'Person Degree of Injury',          # Severity (Person Level)
    'Driver Contributing Factor - Drug Use (2016+)', # Impairment Indicator
    'Drugs Suspected',                  # Impairment Indicator
    'Test Offered - Drug (2016+)',      # Testing Indicator
    'Test Result - Drug 1',             # Testing Indicator (Example Result)
    'Road Conditions',                  # Context
    'Weather Conditions (2016+)',       # Context (Latest available)
    # Add other key raw columns you want initial counts for, if desired
]

for col in key_cols_pre_clean:
    if col in df_mi.columns:
        print(f"\nValue Counts for '{col}' (Before Cleaning):")
        try:
            # Display top 20 unique values and their counts, including NaNs
            print(df_mi[col].value_counts(dropna=False).head(20))
            if df_mi[col].nunique() > 20: print("... (truncated)")
        except Exception as e:
            print(f"Could not get value counts for column '{col}'. Error: {e}")
    else:
        # This message is important given the focus on correct column names
        print(f"\nColumn '{col}' (from key_cols_pre_clean) not found in raw DataFrame 'df_mi'.")

# --- Example: Missing Value Counts Before Cleaning ---
print("\n--- Missing Value Counts Before Cleaning (Top 30) ---")
if not df_mi.empty:
    missing_raw = df_mi.isnull().sum()
    missing_raw = missing_raw[missing_raw > 0].sort_values(ascending=False)
    print(missing_raw.head(30)) # Show top 30 columns with missing values
    if len(missing_raw) > 30: print("... (truncated)")
else:
    print("DataFrame is empty, cannot calculate missing values.")

# Code for a plot before cleaning (e.g., Raw Age Distribution)
# Updated to use 'Person Age' and includes basic error handling/check
# Uncomment the following lines to generate the plot:

age_col_plot = 'Person Age'
if age_col_plot in df_mi.columns:
    plt.figure(figsize=(10, 6))
    # Coerce errors to NaN for plotting, as raw age might have non-numeric entries
    sns.histplot(pd.to_numeric(df_mi[age_col_plot], errors='coerce').dropna(), kde=False, bins=50)
    plt.title(f'Raw Distribution of {age_col_plot} (All Persons, Before Cleaning)')
    plt.xlabel(f'{age_col_plot} (Potential Errors/Placeholders Present, NaNs Dropped)')
    plt.ylabel('Frequency')
    try:
        plt.savefig('raw_person_age_distribution.png')
        print("\nGenerated raw_person_age_distribution.png (Illustrative)")
    except Exception as e:
        print(f"\nFailed to save plot: {e}")
    plt.close() # Close the plot figure to free memory
else:
     print(f"\nColumn '{age_col_plot}' not found, skipping raw age distribution plot.")


--- Descriptive Statistics Before Cleaning (Raw Data Sample) ---
Raw Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364105 entries, 0 to 364104
Columns: 156 entries, Area of Road at Crash to PersonID
dtypes: float64(9), int64(1), object(146)
memory usage: 433.4+ MB

Raw Numeric Summary:
           Crash Day  Crash Year  Total Motor Vehicles  \
count  364104.000000    364104.0         364104.000000   
mean       15.738306      2022.0              1.801568   
std         8.791865         0.0              0.965369   
min         1.000000      2022.0              1.000000   
25%         8.000000      2022.0              1.000000   
50%        16.000000      2022.0              2.000000   
75%        23.000000      2022.0              2.000000   
max        31.000000      2022.0             71.000000   

       Total Non-Motor Vehicles (2016+)  Total Units Reported  \
count                     364104.000000         364104.000000   
mean                           0.010060     

In [88]:
print(f"\nValue counts for '{party_type_col}' (lowercase, pre-filtering):")
print(df_cleaned[party_type_col].value_counts())


Value counts for 'Party Type' (lowercase, pre-filtering):
Party Type
motor vehicle driver    305786
Name: count, dtype: int64


In [89]:
print("\n--- Starting Data Cleaning Process ---")

try:
    # Step 1: Select Relevant Columns (using the verified list)
    # Ensure AccidentID and PersonID are in existing_columns_to_keep if they were successfully created
    if 'AccidentID' not in existing_columns_to_keep and 'AccidentID' in df_mi.columns:
        existing_columns_to_keep.append('AccidentID')
    if 'PersonID' not in existing_columns_to_keep and 'PersonID' in df_mi.columns:
        existing_columns_to_keep.append('PersonID')

    # Ensure the list contains unique columns before selection
    existing_columns_to_keep = list(dict.fromkeys(existing_columns_to_keep)) # Simple way to deduplicate while preserving order

    df_cleaned = df_mi[existing_columns_to_keep].copy()
    print(f"Step 1: Selected {len(existing_columns_to_keep)} relevant columns. Shape: {df_cleaned.shape}")

    # --- Convert object columns to lowercase ---
    print("\nStep 1.5: Converting text columns to lowercase...")
    object_columns = df_cleaned.select_dtypes(include=['object']).columns
    for col in object_columns:
        try:
            # Use .astype(str) first to handle potential mixed types before .str accessor
            df_cleaned[col] = df_cleaned[col].astype(str).str.lower()
        except Exception as e:
            # Indentation corrected here
            print(f"  - Could not convert column '{col}' to lowercase. Error: {e}")
    print("Step 1.5: Finished converting text columns to lowercase.")

    # Find the actual column name for Race/Ethnicity AFTER lowercase conversion
    race_col_name = None
    # Check existence using original casing first, then try lowercase if needed
    if 'Person Race (2021+)' in df_cleaned.columns:
        # Check if column name itself became lowercase due to source file inconsistency
        if 'person race (2021+)' in df_cleaned.columns and 'Person Race (2021+)' not in df_cleaned.columns:
             df_cleaned.rename(columns={'person race (2021+)': 'Person Race (2021+)'}, inplace=True)
        race_col_name = 'Person Race (2021+)' # Use consistent capitalized name
        print(f"Found Race/Ethnicity column: '{race_col_name}'.")
    # Add fallbacks if needed, checking other potential names (also check lowercase versions if necessary)
    
    if not race_col_name:
        print("Warning: Race/Ethnicity column ('Person Race (2021+)') not found.")

    # Find the actual column name for Party Type AFTER lowercase conversion
    party_type_col = None
    # Check existence using original casing first
    if 'Party Type' in df_cleaned.columns:
        # Check if column name itself became lowercase
        if 'party type' in df_cleaned.columns and 'Party Type' not in df_cleaned.columns:
            df_cleaned.rename(columns={'party type': 'Party Type'}, inplace=True)
        party_type_col = 'Party Type'
        print(f"Found Party Type column: '{party_type_col}'.")
    elif 'Traffic Unit Type' in df_cleaned.columns:
         # Check if column name itself became lowercase
        if 'traffic unit type' in df_cleaned.columns and 'Traffic Unit Type' not in df_cleaned.columns:
             df_cleaned.rename(columns={'traffic unit type': 'Traffic Unit Type'}, inplace=True)
        party_type_col = 'Traffic Unit Type' # Using this as the consistent name now
        print(f"Found Party Type column (using fallback): '{party_type_col}'.")
    # Add fallbacks if needed

    if not party_type_col:
        print("Error: Party Type column ('Party Type' or 'Traffic Unit Type') not found.")

    # Step 2: Filter for Drivers (using identified party_type_col)
    # IMPORTANT: Verify 'driver_identifier' based on actual data *after* lowercase conversion
    if party_type_col: # Check if party_type_col was successfully identified
        initial_rows = len(df_cleaned)
        if party_type_col in df_cleaned.columns: # Double check column exists in df_cleaned
            # Get value counts *after* lowercase to confirm identifier
            print(f"\nValue counts for '{party_type_col}' (lowercase, pre-filtering):")
            # Use try-except for value_counts as it might fail on unexpected data types
            try:
                print(df_cleaned[party_type_col].value_counts())
            except Exception as e:
                print(f"Could not get value counts for {party_type_col}: {e}")

            # **Action Required:** Confirm the exact LOWERCASE string for drivers from the output above.
            driver_identifier_lc = 'motor vehicle driver' # <-- **VERIFY THIS LOWERCASE VALUE**

            # Ensure the column is string type before comparison
            df_cleaned[party_type_col] = df_cleaned[party_type_col].astype(str)
            # Perform the filtering
            df_cleaned = df_cleaned[df_cleaned[party_type_col] == driver_identifier_lc].copy()
            print(f"Step 2: Filtered for drivers using '{party_type_col}' == '{driver_identifier_lc}'. Rows reduced from {initial_rows} to {len(df_cleaned)}.")
            if initial_rows > 0 and len(df_cleaned) == 0:
                print(f"WARNING Step 2: Filtering resulted in an empty DataFrame. Check identifier: '{driver_identifier_lc}' and column: '{party_type_col}'.")
        else:
            # This case should ideally not be reached if party_type_col is identified correctly
            print(f"Step 2: Column '{party_type_col}' not found in df_cleaned. Skipping driver filtering.")
    else:
        print("Step 2: Skipped driver filtering (Party Type column issue or identifier not confirmed/available).")


    # Step 3: Handle Potential Duplicates (Revised approach)
    # Check for duplicates based on PersonID within an AccidentID after filtering for drivers.
    if 'AccidentID' in df_cleaned.columns and 'PersonID' in df_cleaned.columns:
        initial_rows = len(df_cleaned)
        # Keep the first record found for a given person within a given accident
        df_cleaned.drop_duplicates(subset=['AccidentID', 'PersonID'], keep='first', inplace=True)
        if len(df_cleaned) < initial_rows:
            print(f"Step 3: Removed {initial_rows - len(df_cleaned)} duplicate person entries within the same crash.")
        else:
            print("Step 3: No duplicate person entries found within the same crash (based on AccidentID, PersonID).")

        # Optional: Check if any AccidentID still has multiple rows after filtering for drivers
        # if 'AccidentID' in df_cleaned.columns:
        #     multi_driver_check = df_cleaned.duplicated(subset=['AccidentID'], keep=False)
        #     if multi_driver_check.any():
        #         print(f"WARNING Step 3: {multi_driver_check.sum()} records belong to crashes possibly listing multiple drivers after filtering.")

    else:
         print("Step 3: Skipping duplicate checks due to missing AccidentID or PersonID.")


    # Step 4: Date/Time Handling & Create 'CrashDateTime'
    month_map = {
    'january': 1, 'february': 2, 'march': 3,
    'april': 4,   'may': 5,      'june': 6,
    'july': 7,    'august': 8,   'september': 9,
    'october': 10,'november': 11,'december': 12
    }
    
    # Only do this for 'Crash Month'
    df_cleaned['Crash Month'] = (
        df_cleaned['Crash Month']
        .astype(str)
        .str.lower()
        .map(month_map)
    )
    date_cols = ['Crash Year', 'Crash Month', 'Crash Day']
    # Use original casing for time_col check as renaming wasn't done for this one
    time_col = 'Time of Day'
    datetime_created = False
    # Check existence using original casing
    if all(col in df_cleaned.columns for col in date_cols) and time_col in df_cleaned.columns:
        print("Step 4: Processing Date/Time columns...")
        # Convert date parts to numeric, coercing errors
        for col in date_cols:
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

        # --- Handle Time of Day (already lowercased if object type) ---
        try:
            # Extract hour (handle potential errors)
            # Convert to string robustly before applying string methods
            time_str = df_cleaned[time_col].astype(str)
            extracted_hour = time_str.str.split(':').str[0].str.extract('(\d+)').astype(float)
            df_cleaned['Crash Hour'] = extracted_hour
            # Impute NaN hours using the median of successfully extracted hours
            if df_cleaned['Crash Hour'].isnull().any():
                 median_hour = df_cleaned['Crash Hour'].median()
                 if pd.notna(median_hour): # Check if median calculation was successful
                      df_cleaned['Crash Hour'].fillna(median_hour, inplace=True)
                      print(f"Step 4a: Extracted and imputed 'Crash Hour'. Median Hour used for imputation: {median_hour}")
                 else: # Handle case where median is NaN (e.g., all parsing failed)
                      df_cleaned['Crash Hour'].fillna(12, inplace=True) # Assign default hour
                      print(f"Step 4a: Extracted 'Crash Hour', but median calculation failed. Imputed NaNs with 12.")
            else:
                 print(f"Step 4a: Extracted 'Crash Hour'. No imputation needed.")
            # Convert final hour to integer
            df_cleaned['Crash Hour'] = df_cleaned['Crash Hour'].astype(int)
        except Exception as e:
            print(f"Step 4a: Could not reliably parse 'Crash Hour' from '{time_col}'. Error: {e}. Assigning default 12.")
            df_cleaned['Crash Hour'] = 12 # Assign a default hour

        # Drop rows where core date components are missing AFTER numeric conversion
        initial_rows = len(df_cleaned)
        df_cleaned.dropna(subset=date_cols, inplace=True)
        if len(df_cleaned) < initial_rows:
            print(f"Step 4b: Dropped {initial_rows - len(df_cleaned)} rows with missing date components (Year, Month, or Day).")

        # Create the CrashDateTime column (using imputed/extracted hour)
        if 'Crash Hour' in df_cleaned.columns:
            try:
                # Ensure date parts are integer before assembling
                datetime_comps = df_cleaned[['Crash Year', 'Crash Month', 'Crash Day', 'Crash Hour']].astype(int)
                datetime_comps.columns = ['year', 'month', 'day', 'hour'] # Rename for to_datetime
                df_cleaned['CrashDateTime'] = pd.to_datetime(datetime_comps, errors='coerce')
                datetime_created = True
            except Exception as e:
                print(f"Step 4c: Failed to create 'CrashDateTime' with hour component. Error: {e}")
        # Fallback to creating date only if hour failed or wasn't processed
        if not datetime_created:
            try:
                # Ensure date parts are integer
                datetime_comps = df_cleaned[['Crash Year', 'Crash Month', 'Crash Day']].astype(int)
                datetime_comps.columns = ['year', 'month', 'day']
                df_cleaned['CrashDateTime'] = pd.to_datetime(datetime_comps, errors='coerce')
                datetime_created = True
                print("Step 4c: Created 'CrashDateTime' (Date component only).")
            except Exception as e:
                print(f"Step 4c: Failed to create 'CrashDateTime' (Date only). Error: {e}")

        # Drop rows where final DateTime conversion failed
        if datetime_created:
            initial_rows = len(df_cleaned)
            df_cleaned.dropna(subset=['CrashDateTime'], inplace=True)
            if len(df_cleaned) < initial_rows:
                  print(f"Step 4d: Dropped {initial_rows - len(df_cleaned)} rows with invalid date/time combinations.")
            print(f"Step 4e: Final 'CrashDateTime' column created. Type: {df_cleaned['CrashDateTime'].dtype}")
        else:
            print("Step 4: Failed to create CrashDateTime column.")
    else:
        print("Step 4: Skipped CrashDateTime creation (missing essential Year/Month/Day/Time columns).")


    # Step 5: Data Type Conversion & Standardization (Person Age)
    age_col = 'Person Age' # Use the correct column name
    if age_col in df_cleaned.columns:
        original_dtype = df_cleaned[age_col].dtype
        # Convert Age to numeric, coercing errors (like non-numeric entries) to NaN
        df_cleaned[age_col] = pd.to_numeric(df_cleaned[age_col], errors='coerce')
        print(f"Step 5: Converted '{age_col}' from {original_dtype} to numeric. New Dtype: {df_cleaned[age_col].dtype}")
        # Handle outliers/placeholders after conversion if needed
    else:
        print(f"Step 5: Column '{age_col}' not found. Skipping Age cleaning.")


    # Step 6: Standardize Categorical Values (Person Gender - now lowercase)
    gender_col = 'Person Gender' # Use original casing
    if gender_col in df_cleaned.columns:
        print(f"Step 6a: Standardizing '{gender_col}'. Original unique values sample (lowercase): {df_cleaned[gender_col].unique()[:5]}")
        # Adjust keys to lowercase based on Step 1.5
        gender_map = {'male': 'm', 'female': 'f', 'unknown': 'u', 'nan': 'u'} # handle 'nan' string
        # Apply map (column should be string/object after lowercase step)
        df_cleaned[gender_col] = df_cleaned[gender_col].map(gender_map).fillna('u')
        print(f"Step 6b: Standardized '{gender_col}'. New unique values: {df_cleaned[gender_col].unique()}")
    else:
        print(f"Step 6: Column '{gender_col}' not found.")


    # Step 7: Handle Missing Values (Targeted Imputation)
    print("Step 7: Handling missing values...")
    # Impute Person Age with Median
    if age_col in df_cleaned.columns and df_cleaned[age_col].isnull().any():
        median_age = df_cleaned[age_col].median()
        # Check if median is valid before imputing
        if pd.notna(median_age):
             df_cleaned[age_col].fillna(median_age, inplace=True)
             print(f"Step 7a: Imputed missing '{age_col}' values with median ({median_age:.0f}).")
        else:
             print(f"Step 7a: Could not calculate median for '{age_col}'. Skipping imputation.")

    # Impute Race/Ethnicity with 'unknown' (lowercase)
    # Use race_col_name variable which holds the correctly cased name
    if race_col_name and race_col_name in df_cleaned.columns:
         # Check for NaNs first before imputation
        if df_cleaned[race_col_name].isnull().any():
            df_cleaned[race_col_name].fillna('unknown', inplace=True) # Use lowercase unknown
            print(f"Step 7b: Imputed missing '{race_col_name}' values with 'unknown'.")
        # Ensure the entire column is lowercase string
        df_cleaned[race_col_name] = df_cleaned[race_col_name].astype(str).str.lower()
        print(f"Step 7b: Ensured '{race_col_name}' column is lowercase.")
    # else: (handled by previous warning)


    # Impute 'Drugs Suspected' with 'unknown' (lowercase)
    drugs_suspected_col = 'Drugs Suspected' # Use original casing
    if drugs_suspected_col in df_cleaned.columns:
        # Check for NaNs first
        if df_cleaned[drugs_suspected_col].isnull().any():
            df_cleaned[drugs_suspected_col].fillna('unknown', inplace=True) # Use lowercase unknown
            print(f"Step 7c: Imputed missing '{drugs_suspected_col}' values with 'unknown'.")
        # Ensure lowercase (should already be done by Step 1.5, but double-check)
        df_cleaned[drugs_suspected_col] = df_cleaned[drugs_suspected_col].astype(str).str.lower()

    # Impute contributing factors (e.g., fill NaN with 'no' - lowercase)
    factor_cols = ['Driver Contributing Factor - Alcohol Use (2016+)', 'Driver Contributing Factor - Drug Use (2016+)']
    fill_value_factor = 'no' # Use lowercase 'no'
    for col in factor_cols:
        if col in df_cleaned.columns:
             if df_cleaned[col].isnull().any():
                  df_cleaned[col].fillna(fill_value_factor, inplace=True)
                  print(f"Step 7d: Imputed missing '{col}' values with '{fill_value_factor}'.")
             # Ensure lowercase
             df_cleaned[col] = df_cleaned[col].astype(str).str.lower()

    # Handle missing values in Testing columns - Treat NaN as 'not reported' (lowercase)
    test_cols_to_impute = [
        'Test Offered - Alcohol', 'Test Offered - Drug (2016+)',
        'Refusal Information - Alcohol', 'Refusal Information - Drug (2016+)',
        'Test Result - Alcohol', 'Test Result - Cannabinoid Drug (2021+)',
        'Test Result - Drug 1', 'Test Result - Drug 2', 'Test Result - Drug 3',
        'Test Result Pending - Alcohol (2016+)', 'Test Result Pending - Drug (2016+)'
    ]
    fill_value_test = 'not reported' # Use lowercase
    imputed_test_cols_count = 0
    for col in test_cols_to_impute:
        if col in df_cleaned.columns:
             if df_cleaned[col].isnull().any():
                 df_cleaned[col].fillna(fill_value_test, inplace=True)
                 imputed_test_cols_count += 1
             # Ensure lowercase
             df_cleaned[col] = df_cleaned[col].astype(str).str.lower()
    if imputed_test_cols_count > 0:
        print(f"Step 7e: Imputed missing values and ensured lowercase for {imputed_test_cols_count} testing/result columns using '{fill_value_test}'.")
    else:
        print("Step 7e: No missing values found in specified testing/result columns. Ensured lowercase.")


    # --- Split Drug Test Result Columns ---
    print("\nStep 7.5: Splitting drug test result columns...")
    drug_result_cols = ['Test Result - Drug 1', 'Test Result - Drug 2', 'Test Result - Drug 3']
    # Define special values (now lowercase)
    special_vals = ['reported as unknown if tested for drugs', 'other drug', 'not reported']

    for i, col in enumerate(drug_result_cols):
        if col in df_cleaned.columns:
            print(f"  - Processing '{col}'...")
            class_col = f'Drug {i+1} Class'
            name_col = f'Drug {i+1} Name'

            # Initialize new columns
            df_cleaned[class_col] = 'unknown' # Default to unknown string
            df_cleaned[name_col] = 'unknown' # Default to unknown string

            # Define masks based on the content (already lowercase)
            mask_special = df_cleaned[col].isin(special_vals)
            mask_split = df_cleaned[col].str.contains(': ') & ~mask_special

            # Apply splitting where possible
            # Use .loc with the mask for efficient assignment
            split_values = df_cleaned.loc[mask_split, col].str.split(': ', expand=True, n=1)

            # Assign class (part 0)
            if 0 in split_values.columns:
                 df_cleaned.loc[mask_split, class_col] = split_values[0].str.strip()

            # Assign name (part 1)
            if 1 in split_values.columns:
                 df_cleaned.loc[mask_split, name_col] = split_values[1].str.strip()
            # else: name remains 'unknown' if only one part after split

            # Assign special values to both columns
            df_cleaned.loc[mask_special, class_col] = df_cleaned.loc[mask_special, col]
            df_cleaned.loc[mask_special, name_col] = df_cleaned.loc[mask_special, col]

            # Handle other cases (not special, no ':') - they remain 'unknown' by default init

        else:
            print(f"  - Column '{col}' not found, skipping split.")
    print("Step 7.5: Finished splitting drug test result columns.")
    # --- END OF NEW STEP ---


    # Step 8: Further Standardization (Mapping flags to numeric)
    print("Step 8: Standardizing indicator columns to numeric...")
    # Standardize 'Drugs Suspected' (already lowercased and imputed)
    drugs_suspected_col = 'Drugs Suspected' # Use original casing
    if drugs_suspected_col in df_cleaned.columns:
        # Adjust keys to lowercase
        suspected_map_lc = {'drugs suspected': 1, 'no drugs suspected': 0, 'unknown': -1}
        # Map values, fill any remaining NAs (e.g., from original non-string values) with -1
        df_cleaned[drugs_suspected_col] = df_cleaned[drugs_suspected_col].map(suspected_map_lc).fillna(-1).astype(int)
        print(f"Step 8a: Standardized '{drugs_suspected_col}' to numeric map (1=drugs suspected, 0=no drugs suspected, -1=unknown).")

    # Standardize 'Crash: Drug Use' flag (already lowercased)
    crash_drug_flag = 'Crash: Drug Use' # Use original casing
    if crash_drug_flag in df_cleaned.columns:
        # Adjust keys to lowercase, **VERIFY these values from data**
        flag_map_lc_drug = {'drugs involved': 1, 'no drugs involved': 0}
        # Map and fillna with 0 (assuming NaN/unmapped means 'No Drugs Involved')
        df_cleaned[crash_drug_flag] = df_cleaned[crash_drug_flag].map(flag_map_lc_drug).fillna(0).astype(int)
        print(f"Step 8b: Standardized '{crash_drug_flag}' to numeric map (1=involved, 0=not involved). Filled unmapped/NaN with 0.")

    # Standardize 'Crash: Drinking' flag
    crash_alc_flag = 'Crash: Drinking' # Use original casing
    if crash_alc_flag in df_cleaned.columns:
        # **VERIFY values**
        flag_map_lc_alc = {'drinking involved': 1, 'no drinking involved': 0}
        df_cleaned[crash_alc_flag] = df_cleaned[crash_alc_flag].map(flag_map_lc_alc).fillna(0).astype(int)
        print(f"Step 8c: Standardized '{crash_alc_flag}' to numeric map (1=involved, 0=not involved). Filled unmapped/NaN with 0.")

    # Standardize 'Test Offered - Drug (2016+)' (already lowercased and imputed)
    test_offered_drug = 'Test Offered - Drug (2016+)' # Use original casing
    if test_offered_drug in df_cleaned.columns:
        # Adjust keys to lowercase, **VERIFY 'yes'/'no' values**
        offered_map_lc = {'yes': 1, 'no': 0, 'not reported': -1}
        df_cleaned[test_offered_drug] = df_cleaned[test_offered_drug].map(offered_map_lc).fillna(-1).astype(int)
        print(f"Step 8d: Standardized '{test_offered_drug}' to numeric map (1=yes, 0=no, -1=not reported).")
    # Repeat for 'Test Offered - Alcohol', 'Refusal Information - Alcohol', 'Refusal Information - Drug (2016+)' if desired


    # --- Cleaning Complete ---
    print("\n--- Data Cleaning Process Finished ---")
    print("Final Cleaned Data Info:")
    if not df_cleaned.empty:
        df_cleaned.info()
        print("\nFinal Cleaned Data Shape:")
        print(df_cleaned.shape)
        print("\nExample of Cleaned Data (first 5 rows with new drug columns):")
        # Select some relevant cols including the new ones to display
        display_cols_final = ['AccidentID', 'PersonID', 'Party Type', 'Drugs Suspected',
                              'Test Result - Drug 1', 'Drug 1 Class', 'Drug 1 Name',
                              'Test Result - Drug 2', 'Drug 2 Class', 'Drug 2 Name']
        # Ensure columns exist before displaying
        display_cols_final_exist = [col for col in display_cols_final if col in df_cleaned.columns]
        print(df_cleaned[display_cols_final_exist].head())
    else:
        print("Cleaning finished, but the resulting DataFrame is empty.")

except Exception as e:
    print(f"\n--- An error occurred during the cleaning process: {e} ---")
    # Print detailed traceback for debugging
    traceback.print_exc()


--- Starting Data Cleaning Process ---
Step 1: Selected 59 relevant columns. Shape: (364105, 59)

Step 1.5: Converting text columns to lowercase...
Step 1.5: Finished converting text columns to lowercase.
Found Race/Ethnicity column: 'Person Race (2021+)'.
Found Party Type column: 'Party Type'.

Value counts for 'Party Type' (lowercase, pre-filtering):
Party Type
motor vehicle driver    305786
uninjured passenger      46034
injured passenger        11352
bicyclist                  782
pedestrian                  91
train engineer              59
nan                          1
Name: count, dtype: int64
Step 2: Filtered for drivers using 'Party Type' == 'motor vehicle driver'. Rows reduced from 364105 to 305786.
Step 3: No duplicate person entries found within the same crash (based on AccidentID, PersonID).
Step 4: Processing Date/Time columns...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Crash Hour'].fillna(median_hour, inplace=True)


Step 4a: Extracted and imputed 'Crash Hour'. Median Hour used for imputation: 6.0
Step 4e: Final 'CrashDateTime' column created. Type: datetime64[ns]
Step 5: Converted 'Person Age' from object to numeric. New Dtype: float64
Step 6a: Standardizing 'Person Gender'. Original unique values sample (lowercase): ['male' 'female' 'uncoded & errors' 'non-binary']
Step 6b: Standardized 'Person Gender'. New unique values: ['m' 'f' 'u']
Step 7: Handling missing values...
Step 7a: Imputed missing 'Person Age' values with median (39).
Step 7b: Ensured 'Person Race (2021+)' column is lowercase.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[age_col].fillna(median_age, inplace=True)


Step 7e: No missing values found in specified testing/result columns. Ensured lowercase.

Step 7.5: Splitting drug test result columns...
  - Processing 'Test Result - Drug 1'...
  - Processing 'Test Result - Drug 2'...
  - Processing 'Test Result - Drug 3'...
Step 7.5: Finished splitting drug test result columns.
Step 8: Standardizing indicator columns to numeric...
Step 8a: Standardized 'Drugs Suspected' to numeric map (1=drugs suspected, 0=no drugs suspected, -1=unknown).
Step 8b: Standardized 'Crash: Drug Use' to numeric map (1=involved, 0=not involved). Filled unmapped/NaN with 0.
Step 8c: Standardized 'Crash: Drinking' to numeric map (1=involved, 0=not involved). Filled unmapped/NaN with 0.
Step 8d: Standardized 'Test Offered - Drug (2016+)' to numeric map (1=yes, 0=no, -1=not reported).

--- Data Cleaning Process Finished ---
Final Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 305786 entries, 0 to 364103
Data columns (total 67 columns):
 #   Column             

In [90]:
    # --- Visualizations (Run only if df_cleaned is not empty) ---
    if not df_cleaned.empty:
        print("\n--- Generating Visualizations ---")

        # --- Visualization 1: Driver Age Distribution ---
        # (Code omitted for brevity, assume it uses 'Person Age' correctly)
        age_col_plot = 'Person Age'
        if age_col_plot in df_cleaned.columns:
            plt.figure(figsize=(10, 6))
            sns.histplot(df_cleaned[age_col_plot], kde=True, bins=30) # Assumes age is cleaned numeric
            plt.title('Distribution of Driver Age (2022)')
            plt.xlabel('Person Age')
            plt.ylabel('Frequency')
            plt.grid(axis='y', alpha=0.5)
            plt.savefig('driver_age_distribution.png')
            plt.close()
            print("Generated driver_age_distribution.png")
        else: print("Skipping Vis 1: Age column missing.")


--- Generating Visualizations ---
Generated driver_age_distribution.png


In [91]:
print("Rows in df_cleaned right before Visualization 2:", len(df_cleaned))
print("Unique counties in df_cleaned:", df_cleaned['County'].unique() if 'County' in df_cleaned.columns else "No county column")

Rows in df_cleaned right before Visualization 2: 305786
Unique counties in df_cleaned: ['kalamazoo' 'kent' 'oakland' 'wayne' 'livingston' 'bay' 'berrien'
 'branch' 'genesee' 'ingham' 'jackson' 'macomb' 'oceana' 'st. clair'
 'washtenaw' 'calhoun' 'charlevoix' 'clinton' 'eaton' 'emmet' 'ionia'
 'keweenaw' 'lapeer' 'monroe' 'montcalm' 'muskegon' 'ontonagon' 'ottawa'
 'saginaw' 'schoolcraft' 'van buren' 'hillsdale' 'allegan'
 'grand traverse' 'roscommon' 'alger' 'arenac' 'baraga' 'cass' 'cheboygan'
 'chippewa' 'crawford' 'delta' 'gladwin' 'gogebic' 'gratiot' 'huron'
 'isabella' 'leelanau' 'lenawee' 'mason' 'mecosta' 'midland' 'missaukee'
 'newaygo' 'ogemaw' 'osceola' 'otsego' 'shiawassee' 'tuscola' 'wexford'
 'antrim' 'montmorency' 'lake' 'marquette' 'houghton' 'alpena' 'barry'
 'clare' 'kalkaska' 'st. joseph' 'sanilac' 'benzie' 'mackinac' 'manistee'
 'oscoda' 'presque isle' 'iosco' 'menominee' 'dickinson' 'alcona' 'iron'
 'luce']


In [92]:
# --- Visualization 2: Crashes per County ---
# Only show top N counties for readability
top_n = 20
county_counts = df_cleaned['County'].value_counts()

plt.figure(figsize=(12, 8))
sns.barplot(x=county_counts.head(top_n).values, y=county_counts.head(top_n).index, palette='viridis')
plt.title(f'Top {top_n} Counties by Crash Frequency (Drivers, Jan-Mar 2022)')
plt.xlabel('Number of Crashes (Drivers Involved)')
plt.ylabel('County')
plt.tight_layout()
# plt.show()
plt.savefig('crashes_per_county.png')
plt.close()

print("Generated crashes_per_county.png")

# Also print the raw counts for reference
print("\nCrash Counts per County (Top 10):")
print(county_counts.head(10))


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=county_counts.head(top_n).values, y=county_counts.head(top_n).index, palette='viridis')


Generated crashes_per_county.png

Crash Counts per County (Top 10):
County
wayne        58141
oakland      34419
macomb       23726
kent         20560
genesee      11569
kalamazoo     9721
ingham        9020
washtenaw     8925
ottawa        7858
saginaw       5903
Name: count, dtype: int64


In [93]:
# --- Visualization 3: Driver Sex Distribution ---
plt.figure(figsize=(6, 6))
sex_counts = df_cleaned['Person Gender'].value_counts()
plt.pie(sex_counts, labels=sex_counts.index, autopct='%1.1f%%', startangle=90, colors=sns.color_palette('pastel'))
plt.title('Distribution of Driver Sex (2022)')
# plt.show()
plt.savefig('driver_sex_distribution.png')
plt.close()

print("Generated driver_sex_distribution.png")

# Print counts
print("\nDriver Sex Counts:")
print(sex_counts)

Generated driver_sex_distribution.png

Driver Sex Counts:
Person Gender
m    166353
f    123166
u     16267
Name: count, dtype: int64


In [94]:
# --- Visualization 4: Suspected/Factor Substance Use (Updated Columns) ---

# Define the columns to compare - using refined names
# Assumes these columns exist in df_cleaned and were standardized numerically (e.g., 1/0/-1)
alcohol_indicator_col = 'Driver Contributing Factor - Alcohol Use (2016+)'
drug_indicator_col = 'Drugs Suspected'
cols_to_plot = []

# Check if the columns exist before trying to use them
if alcohol_indicator_col in df_cleaned.columns:
    cols_to_plot.append(alcohol_indicator_col)
else:
    print(f"Warning: Column '{alcohol_indicator_col}' not found for plotting.")

if drug_indicator_col in df_cleaned.columns:
    cols_to_plot.append(drug_indicator_col)
else:
    print(f"Warning: Column '{drug_indicator_col}' not found for plotting.")


if cols_to_plot: # Proceed only if at least one column was found
    try:
        # Get value counts for each indicator column
        substance_counts = df_cleaned[cols_to_plot].apply(pd.value_counts).fillna(0).astype(int)

        # Map the numeric index (1, 0, -1) back to meaningful labels for the plot axis
        # Ensure the mapping covers all expected numeric values in the index
        label_map = {1: 'Yes / Factor Cited', 0: 'No / Factor Not Cited', -1: 'Unknown / Not Reported'}
        # Apply the mapping, keeping unmapped indices as they are (shouldn't happen if cleaning is correct)
        substance_counts.index = substance_counts.index.map(lambda x: label_map.get(x, f'Unknown Code: {x}'))
        # Sort index for consistent plot order
        substance_counts = substance_counts.reindex(['Yes / Factor Cited', 'No / Factor Not Cited', 'Unknown / Not Reported'], fill_value=0)

        # Create the plot
        substance_counts.plot(kind='bar', figsize=(10, 7)) # Adjusted size slightly
        plt.title('Alcohol Factor vs. Drug Suspicion Among Drivers') # Updated title
        plt.xlabel('Status') # Updated x-axis label
        plt.ylabel('Number of Drivers')
        plt.xticks(rotation=0) # Keep x-ticks horizontal
        plt.legend(title='Indicator Type') # Updated legend title
        plt.tight_layout()
        # plt.show() # Uncomment to display interactively
        plt.savefig('alcohol_factor_vs_drug_suspicion.png') # Updated filename
        plt.close() # Close the plot figure

        print("Generated alcohol_factor_vs_drug_suspicion.png")

        # Print the counts used in the plot
        print("\nCounts for Alcohol Factor vs. Drug Suspicion:")
        print(substance_counts)

    except Exception as e:
        print(f"Error generating Visualization 4: {e}")
        # Print info about the columns to help debug
        if alcohol_indicator_col in df_cleaned.columns:
             print(f"\nValue counts for {alcohol_indicator_col}:\n", df_cleaned[alcohol_indicator_col].value_counts(dropna=False))
        if drug_indicator_col in df_cleaned.columns:
             print(f"\nValue counts for {drug_indicator_col}:\n", df_cleaned[drug_indicator_col].value_counts(dropna=False))

else:
    print("Skipping Visualization 4 as required columns were not found in df_cleaned.")

Error generating Visualization 4: '<' not supported between instances of 'int' and 'str'

Value counts for Driver Contributing Factor - Alcohol Use (2016+):
 Driver Contributing Factor - Alcohol Use (2016+)
alcohol use was not a contributing factor    299388
alcohol use was a contributing factor          6029
uncoded & errors                                369
Name: count, dtype: int64

Value counts for Drugs Suspected:
 Drugs Suspected
 0    303629
 1      1763
-1       394
Name: count, dtype: int64


  substance_counts = df_cleaned[cols_to_plot].apply(pd.value_counts).fillna(0).astype(int)
  substance_counts = df_cleaned[cols_to_plot].apply(pd.value_counts).fillna(0).astype(int)


In [95]:
# --- Visualization 5: Person Degree of Injury Distribution (Updated) ---
# Define the correct column name based on the refined list
severity_col = 'Person Degree of Injury'

# Check if the column exists in the cleaned DataFrame
if severity_col in df_cleaned.columns:
    try:
        plt.figure(figsize=(10, 6))
        # Get value counts for the severity column
        severity_counts = df_cleaned[severity_col].value_counts()

        # Create the bar plot using the identified severity column
        sns.barplot(x=severity_counts.index, y=severity_counts.values, order=severity_counts.index, palette='magma')

        plt.title('Distribution of Person Degree of Injury (Drivers, Jan-Mar 2022)') # Updated Title
        plt.xlabel('Person Degree of Injury Code') # Updated Label
        plt.ylabel('Number of Drivers')
        plt.xticks(rotation=45, ha='right') # Keep rotation for potentially longer codes
        plt.tight_layout()
        # plt.show() # Uncomment to display interactively
        plt.savefig('person_degree_of_injury_distribution.png') # Updated filename
        plt.close() # Close the plot figure

        print("Generated person_degree_of_injury_distribution.png")

        # Print the counts used in the plot
        print(f"\nCounts for '{severity_col}':") # Updated print statement
        print(severity_counts)

    except Exception as e:
        print(f"Error generating Visualization 5: {e}")
        print(f"\nValue counts for {severity_col}:\n", df_cleaned[severity_col].value_counts(dropna=False))

else:
    print(f"Skipping Visualization 5 as the required column '{severity_col}' was not found in df_cleaned.")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=severity_counts.index, y=severity_counts.values, order=severity_counts.index, palette='magma')


Generated person_degree_of_injury_distribution.png

Counts for 'Person Degree of Injury':
Person Degree of Injury
no injury (o)                   251351
possible injury (c)              21491
uncoded & errors                 17897
suspected minor injury (b)       11404
suspected serious injury (a)      3080
fatal injury (k)                   563
Name: count, dtype: int64


In [96]:
# --- Visualization 6: Age Distribution vs. Drug Suspicion (Updated) ---
# Define the columns needed for the plot
drug_suspicion_col = 'Drugs Suspected'
age_col = 'Person Age' # Use the correct age column

# Check if the necessary columns exist in the cleaned DataFrame
if drug_suspicion_col in df_cleaned.columns and age_col in df_cleaned.columns:
    try:
        # Filter out unknowns (-1) for a clearer comparison between Yes (1) and No (0)
        # Assumes 'Drugs Suspected' is cleaned to 1/0/-1
        df_plot = df_cleaned[df_cleaned[drug_suspicion_col].isin([0, 1])].copy()

        # Check if filtering resulted in data to plot
        if not df_plot.empty:
            # Map numeric codes back to meaningful labels for the plot legend/axis
            df_plot['Drugs Suspected Label'] = df_plot[drug_suspicion_col].map({1: 'Yes', 0: 'No'})

            plt.figure(figsize=(10, 7))
            # Updated to use 'Person Age' on the y-axis
            sns.boxplot(x='Drugs Suspected Label', y=age_col, data=df_plot, palette='coolwarm')

            # Optional: Add swarmplot for individual points (can be slow for large data)
            # sns.swarmplot(x='Drugs Suspected Label', y=age_col, data=df_plot, color=".25", size=3, alpha=0.5)

            plt.title('Driver Age Distribution by Drug Suspicion Status (Jan-Mar 2022)')
            plt.xlabel('Drugs Suspected by Officer')
            plt.ylabel('Person Age') # Updated y-axis label
            # plt.show() # Uncomment to display interactively
            plt.savefig('person_age_vs_drug_suspicion.png') # Updated filename
            plt.close() # Close the plot figure

            print("Generated person_age_vs_drug_suspicion.png")

        else:
            print("Skipping Visualization 6: No data remaining after filtering for 'Drugs Suspected' == Yes or No.")

    except Exception as e:
        print(f"Error generating Visualization 6: {e}")
        # Print info about the columns to help debug
        print(f"\nValue counts for {drug_suspicion_col}:\n", df_cleaned[drug_suspicion_col].value_counts(dropna=False))
        print(f"\nBasic stats for {age_col}:\n", df_cleaned[age_col].describe())

else:
    print(f"Skipping Visualization 6 as required columns ('{drug_suspicion_col}', '{age_col}') were not found in df_cleaned.")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Drugs Suspected Label', y=age_col, data=df_plot, palette='coolwarm')


Generated person_age_vs_drug_suspicion.png
