<a href="https://colab.research.google.com/github/DMDTague/GAVOTE/blob/main/GAVOTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [4]:
import pandas as pd
import glob
import re
from datetime import datetime

EVENTS = {"new_records", "address_change", "name_change",
          "voter_status_change", "voter_in_inactive", "dropped_records"}
rx = re.compile(r"tbl_prod_GABU(\d{6})_(\w+)\.csv$")

frames = []
for p in glob.glob("/content/tbl_prod_GABU*.csv"):
    m = rx.search(p)
    if not m:
        continue
    yyyymm, et = m.groups()
    year = int(yyyymm[:4])
    month = int(yyyymm[4:])
    # Ensure we only process data for Q4 2020 as per the plan
    if et not in EVENTS or year != 2020 or month not in [10, 11, 12]:
        continue

    try:
        df_chunk = pd.read_csv(p, engine="python", sep=None, on_bad_lines="skip", dtype=str)

        # Add metadata columns
        df_chunk["event_year"] = year
        df_chunk["event_month"] = datetime(year, month, 1)
        df_chunk["event_type"] = et
        df_chunk["source_file"] = p

        # Standardize county - assuming a column name containing 'coun' exists
        county_cols = [c for c in df_chunk.columns if 'coun' in c.lower()]
        if not county_cols:
             # If no column with 'coun' found, try to find a column that might represent county
             # This is a fallback and might need adjustment based on actual file content
             county_cols = [c for c in df_chunk.columns if 'county' in c.lower() or 'cnt' in c.lower()]
             if not county_cols:
                # If still no county column found, you might need to inspect the files
                # For now, I will add a placeholder or skip this file
                print(f"Warning: No obvious county column found in {p}. Skipping county standardization for this file.")
                df_chunk['county'] = None # Add a placeholder column
             else:
                df_chunk = df_chunk.rename(columns={county_cols[0]:"county"})
        else:
            df_chunk = df_chunk.rename(columns={county_cols[0]:"county"})

        # Clean county names if the county column exists
        if 'county' in df_chunk.columns and df_chunk['county'] is not None:
            df_chunk["county"] = (df_chunk["county"].astype(str).str.title()
                                   .str.replace(" County", "", regex=False)
                                   .str.strip())
            # Handle potential NaN values after strip if the original was empty/whitespace
            df_chunk['county'] = df_chunk['county'].replace('', None)


        frames.append(df_chunk)

    except Exception as e:
        print(f"Error processing file {p}: {e}")


df = pd.concat(frames, ignore_index=True).drop_duplicates()

# quick checks
print("Shape of the combined dataframe:", df.shape)
print("\nCounts by event type:")
print(df.groupby("event_type").size())
print("\nMissing county %:", df["county"].isna().mean()*100)
print("\nFirst 5 rows of the combined dataframe:")
display(df.head())

Shape of the combined dataframe: (1936955, 88)

Counts by event type:
event_type
address_change          316590
dropped_records          33985
name_change              22510
new_records             274535
voter_in_inactive      1185474
voter_status_change     103861
dtype: int64

Missing county %: 0.0

First 5 rows of the combined dataframe:


Unnamed: 0,county,y_code|regis,ra,ion_number|vo,er_s,a,us|residence_ci,y|residence_zipcode|bir,hyear|regis,ra.1,...,ric.13,_combo|race_desc|las,_con,ac,_da,e,event_year,event_month,event_type,source_file
0,028|08390577|A|Woodstock|30188|1971|2011-02-25...,e no,of Hispanic Origin|2020-10-14,,,,,,,,...,,,,,,,2020,2020-11-01,address_change,/content/tbl_prod_GABU202011_address_change.csv
1,069|10594694|A|Oakwood|30566|1997|2015-10-23|H...,,,,,,,,,,...,,,,,,,2020,2020-11-01,address_change,/content/tbl_prod_GABU202011_address_change.csv
2,060|10955769|A|Atlanta|30324|1997|2019-03-12|A...,,,,,,,,,,...,,,,,,,2020,2020-11-01,address_change,/content/tbl_prod_GABU202011_address_change.csv
3,044|05568605|A|Decatur|30034|1965|2012-09-24|B...,of Hispanic Origin|2020-10-20,,,,,,,,,...,,,,,,,2020,2020-11-01,address_change,/content/tbl_prod_GABU202011_address_change.csv
4,048|08496183|A|Lithia Springs|30122|1993|2020-...,e no,of Hispanic Origin|2020-10-19,,,,,,,,...,,,,,,,2020,2020-11-01,address_change,/content/tbl_prod_GABU202011_address_change.csv


## Step 2: Build the Monthly Funnel

### Subtask:
Group data by `event_month` and `event_type`, count the number of rows for each group, and pivot the results into a table with months as rows and event types as columns.

## Step 5: Anomaly Detection (Z-scores)

### Subtask:
For high inactive/dropped rates per month, calculate z-scores across counties and flag counties with `|z-score| >= 2`. Present these anomalies in a table.

In [9]:
from scipy.stats import zscore

anomalies = []
# Iterate through each month in the rates_df index
for month in rates_df.index.get_level_values('event_month').unique():
    # Select data for the current month
    monthly_rates = rates_df.loc[(slice(None), month), :]

    # Define the event types for anomaly detection
    event_types_for_anomaly = ['voter_in_inactive_rate', 'dropped_records_rate']

    for event_type_rate_col in event_types_for_anomaly:
        if event_type_rate_col in monthly_rates.columns:
            # Extract the rates for the current event type and drop any NaN values
            monthly_rates_event_type = monthly_rates[event_type_rate_col].dropna()

            # Ensure there's enough data to calculate z-scores (at least 2 data points)
            if len(monthly_rates_event_type) > 1:
                # Calculate z-scores
                monthly_z_scores = zscore(monthly_rates_event_type)

                # Find anomalies (|z-score| >= 2)
                anomalous_indices = monthly_rates_event_type.index[abs(monthly_z_scores) >= 2]

                # Record anomalies
                for index in anomalous_indices:
                    county = index[0] # County is the first level of the MultiIndex
                    rate = monthly_rates_event_type.loc[index]
                    z = monthly_z_scores[monthly_rates_event_type.index.get_loc(index)]
                    anomalies.append({'county': county, 'month': month, 'event_type': event_type_rate_col.replace('_rate', ''), 'rate': rate, 'z_score': z})
            elif len(monthly_rates_event_type) == 1:
                 print(f"Only one data point for {event_type_rate_col} in {month.strftime('%Y-%m-%d')}. Cannot calculate z-score.")
            else:
                print(f"No data points for {event_type_rate_col} in {month.strftime('%Y-%m-%d')}.")
        else:
            print(f"Rate column '{event_type_rate_col}' not found for anomaly detection in {month.strftime('%Y-%m-%d')}.")


anomalies_df = pd.DataFrame(anomalies)

if not anomalies_df.empty:
    print("\nIdentified Anomalies (|z-score| >= 2):")
    display(anomalies_df)
    # Optionally save anomalies_df to a CSV
    anomalies_df.to_csv('county_anomalies.csv', index=False)
    print("\nAnomalies saved to county_anomalies.csv")
else:
    print("\nNo anomalies found (|z-score| >= 2).")

# Print first 5 rows of the anomaly output file if it was created
try:
    print("\nFirst 5 rows of county_anomalies.csv:")
    with open('county_anomalies.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 5:
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("\ncounty_anomalies.csv not created as no anomalies were found.")


Identified Anomalies (|z-score| >= 2):


Unnamed: 0,county,month,event_type,rate,z_score
0,001|00647630|A|Baxley|315131663|1928|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
1,001|00649362|A|Baxley|315130755|1951|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
2,001|00650806|A|Baxley|315130963|1928|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
3,001|00651522|A|Odum|315556057|1929|1991-01-01|...,2020-10-01,dropped_records,1000.0,6.105314
4,001|00653187|A|Baxley|315132161|1932|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
...,...,...,...,...,...
175528,159|11515021|A|Ty Ty|31795|1958|2019-10-18|Wh|...,2020-12-01,dropped_records,1000.0,8.170951
175529,159|11613932|A|Albany|31705|1998|2020-10-05|Bh...,2020-12-01,dropped_records,1000.0,8.170951
175530,159|12173291|A|Sumner|31789|1960|2019-04-02|U|...,2020-12-01,dropped_records,1000.0,8.170951
175531,159|12692313|A|Warwick|31796|1958|2020-04-28|U...,2020-12-01,dropped_records,1000.0,8.170951



Anomalies saved to county_anomalies.csv

First 5 rows of county_anomalies.csv:
county,month,event_type,rate,z_score
001|00647630|A|Baxley|315131663|1928|1991-01-01|Wh|M||||1B||012|019|156|Brun|1|1|||||||||||||||||||2020-08-11|R|1991-01-01|2020-08-31|102|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00649362|A|Baxley|315130755|1951|1991-01-01|Bh|M||||2|Baxle|012|019|156|Brun|2|2|||||Baxley|047|Ward|6|||||||||||2018-12-04||1991-01-01|2018-12-18|107|Black No,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00650806|A|Baxley|315130963|1928|1991-01-01|Wh|M||||3C|Baxle|012|019|156|Brun|3|3|||||Baxley|047|Ward|3|||||||||||2018-11-06||1991-01-01|2018-11-26|112|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00651522|A|Odum|315556057|1929|1991-01-01|Wh|M||||4B||012|019|178|Brun|4|4|||||||||||||||||||2020-08-11|R|1991-01-01|2020-09-01|116|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252


## Step 4: Track October 2020 New Voters Over Time (Cohort Analysis)

### Subtask:
Identify October 2020 new registrants, track their status in November and December, and count the outcomes (Active, Inactive, Dropped).

In [8]:
# Check for a potential voter ID column.
# Based on previous inspection, a column starting with 'coun' contains multiple fields separated by '|'
# The second field after splitting by '|' might be a voter ID.
# Let's confirm this by looking at the columns again and a sample of the data.

print("Checking for a potential voter ID column...")
print("Columns in the DataFrame:", df.columns.tolist())

# Assuming the second field in the first 'coun' like column is the voter ID
county_like_cols = [col for col in df.columns if 'coun' in col.lower()]

voter_id_column_name = None
if county_like_cols:
    potential_voter_id_col_data = df[county_like_cols[0]].astype(str).str.split('|').str[1]
    # Check if there are many unique values and if it looks like an ID
    if potential_voter_id_col_data.nunique() > len(df) * 0.8: # Heuristic: more than 80% unique values
         print(f"Potential voter ID found in the second part of column '{county_like_cols[0]}'.")
         # We will extract this as 'voter_id'
         df['voter_id'] = potential_voter_id_col_data
         voter_id_column_name = 'voter_id'
    else:
         print(f"The second part of column '{county_like_cols[0]}' does not appear to be a unique voter ID.")
else:
    print("No column containing 'coun' found to check for voter ID.")

if voter_id_column_name:
    print(f"Proceeding with cohort analysis using '{voter_id_column_name}' as voter ID.")

    # 1. Identify the October new registrants
    oct_new_registrants = df[(df['event_type'] == 'new_records') & (df['event_month'] == datetime(2020, 10, 1))].copy()
    oct_voter_ids = oct_new_registrants[voter_id_column_name].unique()

    print(f"\nNumber of October 2020 new registrants: {len(oct_voter_ids)}")

    # 2. Look them up in November and December
    # Filter the DataFrame to include only the October new registrants and events in Nov/Dec
    subsequent_events = df[
        df[voter_id_column_name].isin(oct_voter_ids) &
        (df['event_month'].isin([datetime(2020, 11, 1), datetime(2020, 12, 1)]))
    ].copy()

    # Classify each Oct voter’s latest known status in Nov and Dec
    # We need to find the latest event for each voter in each subsequent month
    voter_status = {}
    for voter_id in oct_voter_ids:
        voter_sub_events = subsequent_events[subsequent_events[voter_id_column_name] == voter_id].copy()

        # Determine status for November
        nov_events = voter_sub_events[voter_sub_events['event_month'] == datetime(2020, 11, 1)]
        nov_status = 'Active' # Assume active unless an inactive or dropped event is found
        if 'dropped_records' in nov_events['event_type'].values:
            nov_status = 'Dropped'
        elif 'voter_in_inactive' in nov_events['event_type'].values:
            nov_status = 'Inactive'

        # Determine status for December
        dec_events = voter_sub_events[voter_sub_events['event_month'] == datetime(2020, 12, 1)]
        dec_status = 'Active' # Assume active unless an inactive or dropped event is found
        if 'dropped_records' in dec_events['event_type'].values:
            dec_status = 'Dropped'
        elif 'voter_in_inactive' in dec_events['event_type'].values:
            dec_status = 'Inactive'

        voter_status[voter_id] = {'Nov_Status': nov_status, 'Dec_Status': dec_status}

    # 4. Count the outcomes
    cohort_outcomes_nov = pd.DataFrame.from_dict(voter_status, orient='index')['Nov_Status'].value_counts()
    cohort_outcomes_dec = pd.DataFrame.from_dict(voter_status, orient='index')['Dec_Status'].value_counts()

    cohort_summary = pd.DataFrame({'Nov 2020': cohort_outcomes_nov, 'Dec 2020': cohort_outcomes_dec}).fillna(0).astype(int)

    print("\nCohort Outcomes (October 2020 New Registrants):")
    display(cohort_summary)

else:
    print("\nSkipping voter-level cohort analysis as a suitable voter ID column was not found.")
    # Optionally, add code here for a proxy cohort analysis if needed.

Checking for a potential voter ID column...
Columns in the DataFrame: ['county', 'y_code|regis', 'ra', 'ion_number|vo', 'er_s', 'a', 'us|residence_ci', 'y|residence_zipcode|bir', 'hyear|regis', 'ra.1', 'ion_da', 'e|race|gender|land_dis', 'ric', '|land_lo', '|s', 'a.1', 'us_reason|coun', 'y_precinc', '_id|ci', 'y_precinc.1', '_id|congressional_dis', 'ric.1', '|sena', 'e_dis', 'ric.2', '|house_dis', 'ric.3', '|judicial_dis', 'ric.4', '|commission_dis', 'ric.5', '|school_dis', 'ric.6', '|coun', 'y_dis', 'ric.7', 'a_name|coun', 'y_dis.1', 'ric.8', 'a_value|coun', 'y_dis.2', 'ric.9', 'b_name|coun', 'y_dis.3', 'ric.10', 'b_value|municipal_name|municipal_code|ward_ci', 'y_council_name|ward_ci', 'y_council_code|ci', 'y_school_dis', 'ric.11', '_name|ci', 'y_school_dis.1', 'ric.12', '_value|ci', 'y_dis.4', 'a_name|ci', 'y_dis.5', 'a_value|ci', 'y_dis.6', 'b_name|ci', 'y_dis.7', 'b_value|ci', 'y_dis.8', 'c_name|ci', 'y_dis.9', 'c_value|ci', 'y_dis.10', 'd_name|ci', 'y_dis.11', 'd_value|da', 'e_la

## Step 5: Anomaly Detection (Z-scores)

### Subtask:
For high inactive/dropped rates per month, calculate z-scores across counties and flag counties with `|z-score| >= 2`. Present these anomalies in a table.

In [11]:
from scipy.stats import zscore

anomalies = []
# Iterate through each month in the rates_df index
for month in rates_df.index.get_level_values('event_month').unique():
    # Select data for the current month
    monthly_rates = rates_df.loc[(slice(None), month), :]

    # Define the event types for anomaly detection
    event_types_for_anomaly = ['voter_in_inactive_rate', 'dropped_records_rate']

    for event_type_rate_col in event_types_for_anomaly:
        if event_type_rate_col in monthly_rates.columns:
            # Extract the rates for the current event type and drop any NaN values
            monthly_rates_event_type = monthly_rates[event_type_rate_col].dropna()

            # Ensure there's enough data to calculate z-scores (at least 2 data points)
            if len(monthly_rates_event_type) > 1:
                # Calculate z-scores
                monthly_z_scores = zscore(monthly_rates_event_type)

                # Find anomalies (|z-score| >= 2)
                anomalous_indices = monthly_rates_event_type.index[abs(monthly_z_scores) >= 2]

                # Record anomalies
                for index in anomalous_indices:
                    county = index[0] # County is the first level of the MultiIndex
                    rate = monthly_rates_event_type.loc[index]
                    z = monthly_z_scores[monthly_rates_event_type.index.get_loc(index)]
                    anomalies.append({'county': county, 'month': month, 'event_type': event_type_rate_col.replace('_rate', ''), 'rate': rate, 'z_score': z})
            elif len(monthly_rates_event_type) == 1:
                 print(f"Only one data point for {event_type_rate_col} in {month.strftime('%Y-%m-%d')}. Cannot calculate z-score.")
            else:
                print(f"No data points for {event_type_rate_col} in {month.strftime('%Y-%m-%d')}.")
        else:
            print(f"Rate column '{event_type_rate_col}' not found for anomaly detection in {month.strftime('%Y-%m-%d')}.")


anomalies_df = pd.DataFrame(anomalies)

if not anomalies_df.empty:
    print("\nIdentified Anomalies (|z-score| >= 2):")
    display(anomalies_df)
    # Optionally save anomalies_df to a CSV
    anomalies_df.to_csv('county_anomalies.csv', index=False)
    print("\nAnomalies saved to county_anomalies.csv")
else:
    print("\nNo anomalies found (|z-score| >= 2).")

# Print first 5 rows of the anomaly output file if it was created
try:
    print("\nFirst 5 rows of county_anomalies.csv:")
    with open('county_anomalies.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 5:
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("\ncounty_anomalies.csv not created as no anomalies were found.")


Identified Anomalies (|z-score| >= 2):


Unnamed: 0,county,month,event_type,rate,z_score
0,001|00647630|A|Baxley|315131663|1928|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
1,001|00649362|A|Baxley|315130755|1951|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
2,001|00650806|A|Baxley|315130963|1928|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
3,001|00651522|A|Odum|315556057|1929|1991-01-01|...,2020-10-01,dropped_records,1000.0,6.105314
4,001|00653187|A|Baxley|315132161|1932|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
...,...,...,...,...,...
175528,159|11515021|A|Ty Ty|31795|1958|2019-10-18|Wh|...,2020-12-01,dropped_records,1000.0,8.170951
175529,159|11613932|A|Albany|31705|1998|2020-10-05|Bh...,2020-12-01,dropped_records,1000.0,8.170951
175530,159|12173291|A|Sumner|31789|1960|2019-04-02|U|...,2020-12-01,dropped_records,1000.0,8.170951
175531,159|12692313|A|Warwick|31796|1958|2020-04-28|U...,2020-12-01,dropped_records,1000.0,8.170951



Anomalies saved to county_anomalies.csv

First 5 rows of county_anomalies.csv:
county,month,event_type,rate,z_score
001|00647630|A|Baxley|315131663|1928|1991-01-01|Wh|M||||1B||012|019|156|Brun|1|1|||||||||||||||||||2020-08-11|R|1991-01-01|2020-08-31|102|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00649362|A|Baxley|315130755|1951|1991-01-01|Bh|M||||2|Baxle|012|019|156|Brun|2|2|||||Baxley|047|Ward|6|||||||||||2018-12-04||1991-01-01|2018-12-18|107|Black No,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00650806|A|Baxley|315130963|1928|1991-01-01|Wh|M||||3C|Baxle|012|019|156|Brun|3|3|||||Baxley|047|Ward|3|||||||||||2018-11-06||1991-01-01|2018-11-26|112|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00651522|A|Odum|315556057|1929|1991-01-01|Wh|M||||4B||012|019|178|Brun|4|4|||||||||||||||||||2020-08-11|R|1991-01-01|2020-09-01|116|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252


## Quick Checks and Summary

### Subtask:
Print quick checks as specified (rows read/skipped, missing county/voter_id percentages, total rows by event type) and display the first few rows of the key output dataframes. Summarize the key findings.

In [10]:
# Quick Checks

# Rows read per file and rows skipped (This was handled during initial loading,
# relying on the messages printed during that step. We can re-print total rows).
print("\nTotal rows read across all relevant files:", len(df))

# Total rows by event type (already printed during initial loading, re-printing for completeness)
print("\nTotal rows by event type:")
print(df.groupby("event_type").size())

# Missing percentage in county (already printed during initial loading, re-printing for completeness)
print("\nMissing county %:", df["county"].isna().mean()*100)

# Missing percentage in voter_id (if applicable)
if 'voter_id' in df.columns:
    print("\nMissing voter_id %:", df["voter_id"].isna().mean()*100)
else:
    print("\nVoter ID column was not identified.")


print("\n--- First 5 rows of Output Files ---")

# Display first 5 rows of by_year_funnel.csv
print("\nby_year_funnel.csv:")
try:
    with open('by_year_funnel.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("by_year_funnel.csv not found.")


# Display first 5 rows of county_year_rates.csv
print("\ncounty_year_rates.csv:")
try:
    with open('county_year_rates.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("county_year_rates.csv not found.")

# Display first 5 rows of county_anomalies.csv
print("\ncounty_anomalies.csv:")
try:
    with open('county_anomalies.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("county_anomalies.csv not found (possibly no anomalies found).")

# Display first 5 rows of cohort_summary.csv (if created)
if 'cohort_summary' in locals() and not cohort_summary.empty:
    print("\nCohort Outcomes (October 2020 New Registrants):")
    display(cohort_summary.head())
else:
    print("\nCohort summary not available (voter ID not found or no cohort analysis performed).")


Total rows read across all relevant files: 1936955

Total rows by event type:
event_type
address_change          316590
dropped_records          33985
name_change              22510
new_records             274535
voter_in_inactive      1185474
voter_status_change     103861
dtype: int64

Missing county %: 0.0

Voter ID column was not identified.

--- First 5 rows of Output Files ---

by_year_funnel.csv:
event_year,address_change,dropped_records,name_change,new_records,voter_in_inactive,voter_status_change
2020,316590.0,33985.0,22510.0,274535.0,1185474.0,103861.0

county_year_rates.csv:
county_year_rates.csv not found.

county_anomalies.csv:
county,month,event_type,rate,z_score
001|00647630|A|Baxley|315131663|1928|1991-01-01|Wh|M||||1B||012|019|156|Brun|1|1|||||||||||||||||||2020-08-11|R|1991-01-01|2020-08-31|102|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00649362|A|Baxley|315130755|1951|1991-01-01|Bh|M||||2|Baxle|012|019|156|Brun|2|2|||||Baxley|047|Ward|6|||||||||||20

## Quick Checks and Summary

### Subtask:
Print quick checks as specified (rows read/skipped, missing county/voter_id percentages, total rows by event type) and display the first few rows of the key output dataframes. Summarize the key findings.

In [14]:
# Quick Checks

# Rows read per file and rows skipped (This was handled during initial loading,
# relying on the messages printed during that step. We can re-print total rows).
print("\nTotal rows read across all relevant files:", len(df))

# Total rows by event type (already printed during initial loading, re-printing for completeness)
print("\nTotal rows by event type:")
print(df.groupby("event_type").size())

# Missing percentage in county (already printed during initial loading, re-printing for completeness)
print("\nMissing county %:", df["county"].isna().mean()*100)

# Missing percentage in voter_id (if applicable)
if 'voter_id' in df.columns:
    print("\nMissing voter_id %:", df["voter_id"].isna().mean()*100)
else:
    print("\nVoter ID column was not identified.")


print("\n--- First 5 rows of Output Files ---")

# Display first 5 rows of by_year_funnel.csv
print("\nby_year_funnel.csv:")
try:
    with open('by_year_funnel.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("by_year_funnel.csv not found.")


# Display first 5 rows of county_year_rates.csv
print("\ncounty_year_rates.csv:")
try:
    with open('county_year_rates.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("county_year_rates.csv not found.")

# Display first 5 rows of county_anomalies.csv
print("\ncounty_anomalies.csv:")
try:
    with open('county_anomalies.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("county_anomalies.csv not found (possibly no anomalies found).")

# Display first 5 rows of cohort_summary.csv (if created)
if 'cohort_summary' in locals() and not cohort_summary.empty:
    print("\nCohort Outcomes (October 2020 New Registrants):")
    display(cohort_summary.head())
else:
    print("\nCohort summary not available (voter ID not found or no cohort analysis performed).")


Total rows read across all relevant files: 1936955

Total rows by event type:
event_type
address_change          316590
dropped_records          33985
name_change              22510
new_records             274535
voter_in_inactive      1185474
voter_status_change     103861
dtype: int64

Missing county %: 0.0

Voter ID column was not identified.

--- First 5 rows of Output Files ---

by_year_funnel.csv:
event_year,address_change,dropped_records,name_change,new_records,voter_in_inactive,voter_status_change
2020,316590.0,33985.0,22510.0,274535.0,1185474.0,103861.0

county_year_rates.csv:
county_year_rates.csv not found.

county_anomalies.csv:
county,event_month,event_type,rate,z_score
001|00647630|A|Baxley|315131663|1928|1991-01-01|Wh|M||||1B||012|019|156|Brun|1|1|||||||||||||||||||2020-08-11|R|1991-01-01|2020-08-31|102|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00649362|A|Baxley|315130755|1951|1991-01-01|Bh|M||||2|Baxle|012|019|156|Brun|2|2|||||Baxley|047|Ward|6|||||||

## Step 5: Anomaly Detection (Z-scores) - Redo

### Subtask:
Recalculate z-scores for `voter_in_inactive_rate` and `dropped_records_rate` grouped by `event_month`, flag counties where `|z| >= 2`, and present the anomalies in a table with specified columns.

In [13]:
from scipy.stats import zscore

anomalies = []
# Iterate through each month in the rates_df index
for month in rates_df.index.get_level_values('event_month').unique():
    # Select data for the current month
    monthly_rates = rates_df.loc[(slice(None), month), :]

    # Define the event types for anomaly detection
    event_types_for_anomaly = ['voter_in_inactive_rate', 'dropped_records_rate']

    for event_type_rate_col in event_types_for_anomaly:
        if event_type_rate_col in monthly_rates.columns:
            # Extract the rates for the current event type and drop any NaN values
            monthly_rates_event_type = monthly_rates[event_type_rate_col].dropna()

            # Ensure there's enough data to calculate z-scores (at least 2 data points)
            if len(monthly_rates_event_type) > 1:
                # Calculate z-scores
                monthly_z_scores = zscore(monthly_rates_event_type)

                # Find anomalies (|z-score| >= 2)
                anomalous_indices = monthly_rates_event_type.index[abs(monthly_z_scores) >= 2]

                # Record anomalies
                for index in anomalous_indices:
                    county = index[0] # County is the first level of the MultiIndex
                    rate = monthly_rates_event_type.loc[index]
                    z = monthly_z_scores[monthly_rates_event_type.index.get_loc(index)]
                    anomalies.append({'county': county, 'event_month': month, 'event_type': event_type_rate_col.replace('_rate', ''), 'rate': rate, 'z_score': z})
            elif len(monthly_rates_event_type) == 1:
                 print(f"Only one data point for {event_type_rate_col} in {month.strftime('%Y-%m-%d')}. Cannot calculate z-score.")
            else:
                print(f"No data points for {event_type_rate_col} in {month.strftime('%Y-%m-%d')}.")
        else:
            print(f"Rate column '{event_type_rate_col}' not found for anomaly detection in {month.strftime('%Y-%m-%d')}.")


anomalies_df = pd.DataFrame(anomalies)

if not anomalies_df.empty:
    print("\nIdentified Anomalies (|z-score| >= 2):")
    display(anomalies_df[['county', 'event_month', 'event_type', 'rate', 'z_score']]) # Display with specified columns
    # Optionally save anomalies_df to a CSV
    anomalies_df.to_csv('county_anomalies.csv', index=False)
    print("\nAnomalies saved to county_anomalies.csv")
else:
    print("\nNo anomalies found (|z-score| >= 2).")


Identified Anomalies (|z-score| >= 2):


Unnamed: 0,county,event_month,event_type,rate,z_score
0,001|00647630|A|Baxley|315131663|1928|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
1,001|00649362|A|Baxley|315130755|1951|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
2,001|00650806|A|Baxley|315130963|1928|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
3,001|00651522|A|Odum|315556057|1929|1991-01-01|...,2020-10-01,dropped_records,1000.0,6.105314
4,001|00653187|A|Baxley|315132161|1932|1991-01-0...,2020-10-01,dropped_records,1000.0,6.105314
...,...,...,...,...,...
175528,159|11515021|A|Ty Ty|31795|1958|2019-10-18|Wh|...,2020-12-01,dropped_records,1000.0,8.170951
175529,159|11613932|A|Albany|31705|1998|2020-10-05|Bh...,2020-12-01,dropped_records,1000.0,8.170951
175530,159|12173291|A|Sumner|31789|1960|2019-04-02|U|...,2020-12-01,dropped_records,1000.0,8.170951
175531,159|12692313|A|Warwick|31796|1958|2020-04-28|U...,2020-12-01,dropped_records,1000.0,8.170951



Anomalies saved to county_anomalies.csv


## Quick Checks and Summary

### Subtask:
Print quick checks as specified (rows read/skipped, missing county/voter_id percentages, total rows by event type) and display the first few rows of the key output dataframes. Summarize the key findings.

In [12]:
# Quick Checks

# Rows read per file and rows skipped (This was handled during initial loading,
# relying on the messages printed during that step. We can re-print total rows).
print("\nTotal rows read across all relevant files:", len(df))

# Total rows by event type (already printed during initial loading, re-printing for completeness)
print("\nTotal rows by event type:")
print(df.groupby("event_type").size())

# Missing percentage in county (already printed during initial loading, re-printing for completeness)
print("\nMissing county %:", df["county"].isna().mean()*100)

# Missing percentage in voter_id (if applicable)
if 'voter_id' in df.columns:
    print("\nMissing voter_id %:", df["voter_id"].isna().mean()*100)
else:
    print("\nVoter ID column was not identified.")


print("\n--- First 5 rows of Output Files ---")

# Display first 5 rows of by_year_funnel.csv
print("\nby_year_funnel.csv:")
try:
    with open('by_year_funnel.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("by_year_funnel.csv not found.")


# Display first 5 rows of county_year_rates.csv
print("\ncounty_year_rates.csv:")
try:
    with open('county_year_rates.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("county_year_rates.csv not found.")

# Display first 5 rows of county_anomalies.csv
print("\ncounty_anomalies.csv:")
try:
    with open('county_anomalies.csv', 'r') as f:
        for i, line in enumerate(f):
            if i < 6: # Print header and first 5 rows
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("county_anomalies.csv not found (possibly no anomalies found).")

# Display first 5 rows of cohort_summary.csv (if created)
if 'cohort_summary' in locals() and not cohort_summary.empty:
    print("\nCohort Outcomes (October 2020 New Registrants):")
    display(cohort_summary.head())
else:
    print("\nCohort summary not available (voter ID not found or no cohort analysis performed).")


Total rows read across all relevant files: 1936955

Total rows by event type:
event_type
address_change          316590
dropped_records          33985
name_change              22510
new_records             274535
voter_in_inactive      1185474
voter_status_change     103861
dtype: int64

Missing county %: 0.0

Voter ID column was not identified.

--- First 5 rows of Output Files ---

by_year_funnel.csv:
event_year,address_change,dropped_records,name_change,new_records,voter_in_inactive,voter_status_change
2020,316590.0,33985.0,22510.0,274535.0,1185474.0,103861.0

county_year_rates.csv:
county_year_rates.csv not found.

county_anomalies.csv:
county,month,event_type,rate,z_score
001|00647630|A|Baxley|315131663|1928|1991-01-01|Wh|M||||1B||012|019|156|Brun|1|1|||||||||||||||||||2020-08-11|R|1991-01-01|2020-08-31|102|Whi,2020-10-01,dropped_records,1000.0,6.105313600485252
001|00649362|A|Baxley|315130755|1951|1991-01-01|Bh|M||||2|Baxle|012|019|156|Brun|2|2|||||Baxley|047|Ward|6|||||||||||20