In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load cleaned data
print("Loading data for modeling...")
df = pd.read_csv('cleaned_infectious_disease.csv')
print(f"Dataset shape: {df.shape}")

# Filter for total population (not gender-specific)
df_total = df[df['Sex'] == 'Total'].copy()
print(f"Total records (excluding gender breakdown): {df_total.shape[0]}")

# 1. PROBLEM FORMULATION
print("\n 1. PROBLEM FORMULATION")
print("="*30)

"""
We will explore three modeling approaches:

1. REGRESSION: Predict incidence rate (Rate) for each county-year combination
2. CLASSIFICATION: Predict high-risk counties (above median rate)
3. TIME SERIES: Forecast future incidence rates

Primary focus: Regression problem to predict incidence rates
"""

Loading data for modeling...
Dataset shape: (141777, 14)
Total records (excluding gender breakdown): 47259

 1. PROBLEM FORMULATION


'\nWe will explore three modeling approaches:\n\n1. REGRESSION: Predict incidence rate (Rate) for each county-year combination\n2. CLASSIFICATION: Predict high-risk counties (above median rate)\n3. TIME SERIES: Forecast future incidence rates\n\nPrimary focus: Regression problem to predict incidence rates\n'

In [6]:
# 2. FEATURE ENGINEERING
print("\n2. FEATURE ENGINEERING")
print("="*30)

# Create features at county-year level
features_df = df_total.copy()

# Basic temporal features
features_df['Year_Since_2000'] = features_df['Year'] - 2000
features_df['Year_Squared'] = features_df['Year_Since_2000'] ** 2

# Lag features (previous year's rate)
features_df = features_df.sort_values(['County', 'Year'])

# Create lagged rate for each county
features_df['Rate_Lag1'] = features_df.groupby('County')['Rate'].shift(1)
features_df['Rate_Lag2'] = features_df.groupby('County')['Rate'].shift(2)
features_df['Rate_Lag3'] = features_df.groupby('County')['Rate'].shift(3)

# Moving averages
features_df['Rate_MA2'] = features_df.groupby('County')['Rate'].rolling(window=2).mean().reset_index(level=0, drop=True)
features_df['Rate_MA3'] = features_df.groupby('County')['Rate'].rolling(window=3).mean().reset_index(level=0, drop=True)

# Rate change features
features_df['Rate_Change_Lag1'] = features_df['Rate'] - features_df['Rate_Lag1']
features_df['Rate_Pct_Change_Lag1'] = (features_df['Rate_Change_Lag1'] / features_df['Rate_Lag1']) * 100

# County-level statistics (using only past data to avoid data leakage)
county_stats = features_df.groupby('County').agg({
    'Rate': ['mean', 'std', 'min', 'max']
}).round(3)
county_stats.columns = ['County_Rate_Mean', 'County_Rate_Std', 'County_Rate_Min', 'County_Rate_Max']

# Merge county statistics
features_df = features_df.merge(county_stats, left_on='County', right_index=True, how='left')

# Create rate z-score relative to county history
features_df['Rate_Z_Score'] = (features_df['Rate'] - features_df['County_Rate_Mean']) / features_df['County_Rate_Std']

# Population density proxy (if we had area data, we'd use actual density)
# Using population percentiles instead
population_percentiles = features_df['Population'].rank(pct=True)
features_df['Population_Percentile'] = population_percentiles

# Create interaction features
features_df['Population_Year_Interaction'] = features_df['Population'] * features_df['Year_Since_2000']
features_df['Rate_Lag1_Population'] = features_df['Rate_Lag1'] * features_df['Population']

# Regional features (group counties by rate patterns)
# First, identify county clusters from EDA
from sklearn.cluster import KMeans
county_features = county_stats.values
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
county_clusters = kmeans.fit_predict(county_features)
county_cluster_map = dict(zip(county_stats.index, county_clusters))
features_df['County_Cluster'] = features_df['County'].map(county_cluster_map)

# Create dummy variables for categorical features
county_dummies = pd.get_dummies(features_df['County'], prefix='County', drop_first=True)
cluster_dummies = pd.get_dummies(features_df['County_Cluster'], prefix='Cluster')

# Combine all features
X_full = pd.concat([
    features_df[[
        'Year_Since_2000', 'Year_Squared', 'Population', 'Population_Percentile',
        'Rate_Lag1', 'Rate_Lag2', 'Rate_Lag3', 'Rate_MA2', 'Rate_MA3',
        'Rate_Change_Lag1', 'Rate_Pct_Change_Lag1',
        'County_Rate_Mean', 'County_Rate_Std', 'Rate_Z_Score',
        'Population_Year_Interaction', 'Rate_Lag1_Population'
    ]],
    county_dummies,
    cluster_dummies
], axis=1)

# Target variable
y = features_df['Rate']

# Remove rows with missing values (from lag features)
valid_mask = ~X_full.isnull().any(axis=1) & ~y.isnull()
X_full = X_full[valid_mask]
y = y[valid_mask]
features_df = features_df[valid_mask]

print(f"Final dataset shape: {X_full.shape}")
print(f"Feature columns: {list(X_full.columns)}")


2. FEATURE ENGINEERING
Final dataset shape: (23856, 78)
Feature columns: ['Year_Since_2000', 'Year_Squared', 'Population', 'Population_Percentile', 'Rate_Lag1', 'Rate_Lag2', 'Rate_Lag3', 'Rate_MA2', 'Rate_MA3', 'Rate_Change_Lag1', 'Rate_Pct_Change_Lag1', 'County_Rate_Mean', 'County_Rate_Std', 'Rate_Z_Score', 'Population_Year_Interaction', 'Rate_Lag1_Population', 'County_Alpine', 'County_Amador', 'County_Butte', 'County_Calaveras', 'County_California', 'County_Colusa', 'County_Contra Costa', 'County_Del Norte', 'County_El Dorado', 'County_Fresno', 'County_Glenn', 'County_Humboldt', 'County_Imperial', 'County_Inyo', 'County_Kern', 'County_Kings', 'County_Lake', 'County_Lassen', 'County_Los Angeles', 'County_Madera', 'County_Marin', 'County_Mariposa', 'County_Mendocino', 'County_Merced', 'County_Modoc', 'County_Mono', 'County_Monterey', 'County_Napa', 'County_Nevada', 'County_Orange', 'County_Placer', 'County_Plumas', 'County_Riverside', 'County_Sacramento', 'County_San Benito', 'County_

In [8]:
# 3. FEATURE SELECTION
print("\n3. FEATURE SELECTION")
print("="*30)

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Split data for feature selection
from sklearn.model_selection import train_test_split
X_temp, X_test, y_temp, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=42, shuffle=False
)

# Method 1: Correlation with target
correlations = X_temp.corrwith(y_temp).abs().sort_values(ascending=False)
print("Top 10 features by absolute correlation:")
print(correlations.head(10))

# Method 2: SelectKBest using f_regression
selector_kbest = SelectKBest(score_func=f_regression, k=20)
X_kbest = selector_kbest.fit_transform(X_temp, y_temp)
selected_features_kbest = X_temp.columns[selector_kbest.get_support()].tolist()
print(f"\nSelected {len(selected_features_kbest)} features using SelectKBest:")

# Method 3: Recursive Feature Elimination
estimator = LinearRegression()
selector_rfe = RFE(estimator, n_features_to_select=15, step=1)
selector_rfe.fit(X_temp, y_temp)
selected_features_rfe = X_temp.columns[selector_rfe.get_support()].tolist()
print(f"\nSelected {len(selected_features_rfe)} features using RFE:")

# Combine selection methods
selected_features = list(set(selected_features_kbest[:15] + selected_features_rfe))
print(f"\nTotal unique selected features: {len(selected_features)}")
print("Selected features:")
for i, feat in enumerate(sorted(selected_features), 1):
    print(f"  {i:2d}. {feat}")

# Use selected features
X_selected = X_full[selected_features]



3. FEATURE SELECTION
Top 10 features by absolute correlation:
Rate_Z_Score            0.898879
Rate_Change_Lag1        0.707160
Rate_MA2                0.707052
Rate_MA3                0.667718
County_Rate_Mean        0.050621
Cluster_3               0.049963
County_Rate_Std         0.047817
County_San Francisco    0.037481
County_Alpine           0.035517
County_Kern             0.031082
dtype: float64


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# 4. TRAIN/VALIDATION/TEST SPLITS
print("\n4. DATA SPLITTING STRATEGY")
print("="*30)

# Time-based splitting (since this is time series data)
# Use first 80% of years for training, next 10% for validation, last 10% for testing

# Get unique years
unique_years = sorted(features_df['Year'].unique())
train_years = unique_years[:int(len(unique_years)*0.8)]  # 2001-2010
val_years = unique_years[int(len(unique_years)*0.8):int(len(unique_years)*0.9)]  # 2011-2012
test_years = unique_years[int(len(unique_years)*0.9):]  # 2013-2014

print(f"Training years: {train_years}")
print(f"Validation years: {val_years}")
print(f"Test years: {test_years}")

# Create masks
train_mask = features_df['Year'].isin(train_years)
val_mask = features_df['Year'].isin(val_years)
test_mask = features_df['Year'].isin(test_years)

# Split data
X_train = X_selected[train_mask]
X_val = X_selected[val_mask]
X_test = X_selected[test_mask]

y_train = y[train_mask]
y_val = y[val_mask]
y_test = y[test_mask]

print(f"\nData split sizes:")
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")