# Regularized regression: US county-level sociodemographic and health resource data (2018-2019)

For this project I recommend attempting to model the prevalence of morbidity at the county level as a function of socioeconomic, demographic and health care related features. The label will be defined as the number of people with any reported medical condition per 100 people in the county.

An initial round of manual feature selection will be used to discard clearly redundant, confounding or unnecessary features before EDA.

Linear regression will be used to model the data, including regularization.

## Notebook set-up

Handle imports of necessary modules up-front.

In [None]:
# Standard library imports
from pathlib import Path

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.exceptions import ConvergenceWarning

# Local imports
import helper_functions as funcs

# Let pandas show up to 100 rows before truncating displayed output
pd.options.display.max_rows = 100

## 1. Data loading

### 1.1. Load

In [None]:
# Read the county-level CSV straight from the course repository
data_url = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'
data_df = pd.read_csv(data_url)  # comma is already read_csv's default separator

### 1.2. Save local copy

In [None]:
# Make sure the raw-data directory exists, then store the download there as parquet
raw_data_dir = Path('../data/raw')
raw_data_dir.mkdir(parents=True, exist_ok=True)
data_df.to_parquet(raw_data_dir / 'demographic_health_data.parquet', index=False)

### 1.3. Inspect

In [None]:
# Show the first rows transposed, so each feature becomes a row of the display
data_df.head().T

In [None]:
# Print the full per-column summary (dtype and non-null count) without truncation
data_df.info(verbose=True, show_counts=True)

## 2. Initial feature selection

In [None]:
# Label: people with any reported condition per 100 county residents.
# Computed before feature selection, while both raw columns are still available.
label = data_df['anycondition_number'].div(data_df['TOT_POP'] / 100)

### 2.1. Age features

In [None]:
# Select age-related features (positional slice of the raw column order)
age_features = data_df.columns[2:20]
age_df = data_df[age_features].copy()

# Remove age count features, keeping only percentages. The pattern is a raw
# string so that `\d` reaches the regex engine instead of being parsed as an
# invalid string escape (a SyntaxWarning on recent Python versions).
feature_drops = age_df.filter(regex=r'^\d+-\d+$').columns

# '19-Oct' and '80+' do not fit the digits-dash-digits pattern above, so they
# are dropped by name.
# NOTE(review): '19-Oct' looks like the '10-19' bracket after spreadsheet date
# auto-formatting - confirm against the raw CSV.
age_df = age_df.drop(columns=[*feature_drops, '19-Oct', '80+'])

age_df.info()

### 2.2. Ethnicity features

In [None]:
# Select ethnicity-related features (positional slice of the raw column order)
ethnicity_features = data_df.columns[20:32]
ethnicity_df = data_df[ethnicity_features].copy()

# Remove count features, keeping only the columns whose name starts with '%'.
# '%' has no special meaning in a regex, so it needs no backslash; the original
# '\%' was an invalid string escape that triggers a warning on recent Pythons.
features = ethnicity_df.filter(regex=r'^%').columns
ethnicity_df = ethnicity_df[features]

ethnicity_df.info()

### 2.3. Population features

In [None]:
# Population size, change and vital-rate columns, picked by name
population_features = [
    'TOT_POP',
    'POP_ESTIMATE_2018',
    'N_POP_CHG_2018',
    'R_birth_2018',
    'R_death_2018',
]
population_df = data_df.loc[:, population_features].copy()
population_df.info()

### 2.4. Education features

In [None]:
# Education columns: positions 45-48 of the raw frame
education_df = data_df.iloc[:, 45:49].copy()
education_features = education_df.columns
education_df.info()

### 2.5. Employment features

In [None]:
# Employment and income columns: positions 56-61 of the raw frame
employment_df = data_df.iloc[:, 56:62].copy()
employment_features = employment_df.columns
employment_df.info()

### 2.6. Healthcare access features

In [None]:
# Healthcare access columns: positions 62-74 of the raw frame
healthcare_df = data_df.iloc[:, 62:75].copy()
healthcare_features = healthcare_df.columns
healthcare_df.info()

### 2.7. Other

In [None]:
# Remaining selected columns: positions 78-81 of the raw frame
other_df = data_df.iloc[:, 78:82].copy()
other_features = other_df.columns
other_df.info()

### 2.8. Re-combine selected features

In [None]:
# Short, plot-friendly aliases for the verbose source column names
column_renames = {
    # Age groups (share of total population)
    '0-9 y/o % of total pop': 'pct_0-9',
    '10-19 y/o % of total pop': 'pct_10-19',
    '20-29 y/o % of total pop': 'pct_20-29',
    '30-39 y/o % of total pop': 'pct_30-39',
    '40-49 y/o % of total pop': 'pct_40-49',
    '50-59 y/o % of total pop': 'pct_50-59',
    '60-69 y/o % of total pop': 'pct_60-69',
    '70-79 y/o % of total pop': 'pct_70-79',
    '80+ y/o % of total pop': 'pct_80+',

    # Ethnicity
    '% Not Hispanic or Latino': 'pct_non_hispanic',
    '% Hispanic or Latino': 'pct_hispanic',
    '% American Indian and Alaska Native Alone': 'pct_native_american',
    '% Asian Alone': 'pct_asian',
    '% Black or African American Alone': 'pct_black',
    '% Native Hawaiian and Other Pacific Islander Alone': 'pct_pacific_islander',
    '% Two or More Races': 'pct_multiracial',
    '% White Alone': 'pct_white',

    # Population
    'TOT_POP': 'total_population',
    'POP_ESTIMATE_2018': 'pop_estimate_2018',
    'N_POP_CHG_2018': 'pop_change_2018',
    'R_birth_2018': 'birth_rate_2018',
    'R_death_2018': 'death_rate_2018',

    # Education
    'Percent of adults with less than a high school diploma 2014-18': 'pct_no_hs_diploma',
    'Percent of adults with a high school diploma only 2014-18': 'pct_hs_only',
    'Percent of adults completing some college or associate\'s degree 2014-18': 'pct_some_college',
    'Percent of adults with a bachelor\'s degree or higher 2014-18': 'pct_bachelor_plus',

    # Employment and income
    'Civilian_labor_force_2018': 'civilian_labor_force',
    'Employed_2018': 'employed_2018',
    'Unemployed_2018': 'unemployed_2018',
    'Unemployment_rate_2018': 'unemployment_rate',
    'Median_Household_Income_2018': 'median_household_income',
    'Med_HH_Income_Percent_of_State_Total_2018': 'income_pct_of_state',

    # Healthcare access
    'Active Physicians per 100000 Population 2018 (AAMC)': 'active_physicians_per_100k',
    'Active Patient Care Primary Care Physicians per 100000 Population 2018 (AAMC)': 'primary_physicians_per_100k',
    'Active Primary Care Physicians per 100000 Population 2018 (AAMC)': 'active_primary_care_per_100k',
    'Active General Surgeons per 100000 Population 2018 (AAMC)': 'general_surgeons_per_100k',
    'Active Patient Care General Surgeons per 100000 Population 2018 (AAMC)': 'surgeons_per_100k',
    'Total Active Patient Care Physicians per 100000 Population 2018 (AAMC)': 'total_physicians_per_100k',
    'Total nurse practitioners (2019)': 'total_nurse_practitioners',
    'Total physician assistants (2019)': 'total_physician_assistants',
    'Total Hospitals (2019)': 'total_hospitals',
    'Internal Medicine Primary Care (2019)': 'internal_medicine_physicians',
    'Family Medicine/General Practice Primary Care (2019)': 'family_medicine_physicians',
    'Total Specialist Physicians (2019)': 'total_specialist_physicians',
}

# Put the per-topic frames back side by side, attach the label as 'morbidity',
# then apply the shorter names (columns without an alias keep their raw name)
selected_frames = [
    age_df,
    ethnicity_df,
    population_df,
    education_df,
    employment_df,
    healthcare_df,
    other_df,
]
data_df = pd.concat(selected_frames, axis=1)
data_df = data_df.assign(morbidity=label)
data_df = data_df.rename(columns=column_renames)
data_df.info()

## 3. EDA
### 3.1. Data composition

#### 3.1.1. Categorical features

In [None]:
# Investigate the distribution of factor levels in any categorical features

#### 3.1.2. Ratio features

In [None]:
# Take a look at the descriptive statistics of the numerical features

In [None]:
# Plot the distributions of numerical features - what do you see?

#### 3.1.3. Label

In [None]:
# Plot the distribution of the label - what do you see?

### 3.2. Feature-label correlations

In [None]:
# Draw scatter plots showing the relationship between the label and each numerical feature

## 4. Data preparation

### 4.1. Train-test split

In [None]:
# Do the train test split

### 4.2. Feature encoding

In [None]:
# Encode any categorical features

### 4.3. Interaction features

This step is left in so that you get overfitting on purpose - adding polynomial interaction features on top of so many starting features is a recipe for overfitting. In this assignment, that's what we want!

In [None]:
# Degree-2 polynomial expansion of the features, without the constant bias column
poly_transformer = PolynomialFeatures(degree=2, include_bias=False)

# Separate the features from the label once for each split
poly_input_training = training_df.drop(columns=['morbidity'])
poly_input_testing = testing_df.drop(columns=['morbidity'])

# Fit on the training features only, then apply the same expansion to the test set
poly_training_features = poly_transformer.fit_transform(poly_input_training)
poly_testing_features = poly_transformer.transform(poly_input_testing)

# Wrap the expanded arrays back into data frames with the generated column names
poly_feature_names = poly_transformer.get_feature_names_out()
poly_training_df = pd.DataFrame(poly_training_features, columns=poly_feature_names)
poly_testing_df = pd.DataFrame(poly_testing_features, columns=poly_feature_names)

# Re-attach the label by position: the new frames carry a fresh default index
poly_training_df['morbidity'] = training_df['morbidity'].values
poly_testing_df['morbidity'] = testing_df['morbidity'].values

## 5. Linear model training

### 5.1. Baseline

In [None]:
# Define a relevant baseline model for a standard of comparison

### 5.2. Linear regression model

In [None]:
# Train a linear regression model

In [None]:
# Evaluate the R-squared and the RMSE on the training and testing sets - what do you notice?

In [None]:
# Residual plots for training and testing sets, drawn by the local helper module.
# training_predictions, training_labels, testing_predictions and testing_labels
# are not defined in this template - create them in the cells above first.
funcs.residual_plot(training_predictions, training_labels, testing_predictions, testing_labels)

## 4. Model regularization

### 4.1. Feature scaling

This aids in model convergence during repeated training runs.

In [None]:
# Use sklearn's StandardScaler to standardize the features so that the model will converge faster/better

### 4.2. Lasso penalty optimization

Now train the lasso model in a loop with different values for the L1 penalty. This should get you started:

In [None]:
%%time

# Candidate L1 penalties: each value is double the previous one
penalties = [0.000025, 0.00005, 0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032, 0.0064, 0.0128, 0.0256]

# One list per metric, filled in the same order as `penalties`.
# NOTE: the key 'Testing R-Squared' has a capital 'S', unlike 'Training R-squared';
# the plotting cell reads these exact keys, so keep the spelling as is.
lasso_results = {
    'Training R-squared': [],
    'Testing R-Squared': [],
    'Training RMSE': [],
    'Testing RMSE': []
}

for penalty in penalties:

    # TODO: Create and fit the Lasso model using this iteration's penalty

    # TODO: Evaluate the model on training data -> assign train_rsq and train_rmse

    # TODO: Evaluate the model on testing data -> assign test_rsq and test_rmse

    # Store the results for this iteration (fails with NameError until the
    # four score variables above have been assigned)
    lasso_results['Training R-squared'].append(train_rsq)
    lasso_results['Training RMSE'].append(train_rmse)
    lasso_results['Testing R-Squared'].append(test_rsq)
    lasso_results['Testing RMSE'].append(test_rmse)

    print(f'Penalty: {penalty}, prediction R-squared - training: {train_rsq:.4f}, testing: {test_rsq:.4f}')

# Blank line to separate the per-penalty log from the %%time report
print()

In [None]:
# Side-by-side panels: R-squared on the left, RMSE on the right
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
fig.suptitle('Lasso regression results')

# (training key, testing key, y-axis label) for each panel, left to right
lasso_panels = [
    ('Training R-squared', 'Testing R-Squared', 'R-squared'),
    ('Training RMSE', 'Testing RMSE', 'RMSE'),
]

for ax, (train_key, test_key, y_label) in zip(axs, lasso_panels):
    ax.plot(penalties, lasso_results[train_key], color='black', label='Training')
    ax.plot(penalties, lasso_results[test_key], color='red', label='Testing')
    ax.set_xlabel('L1 penalty')
    ax.set_xscale('log', base=2)  # penalties double each step, so space them evenly
    ax.set_ylabel(y_label)
    ax.legend(loc='best')

plt.tight_layout()
plt.show()

### 4.3. Ridge penalty optimization

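Do the same thing for a ridge regression model: train it in a loop over the L2 penalties below and store the scores in a `ridge_results` dictionary with the same keys as `lasso_results`, since the plotting cell that follows reads them.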

In [None]:
%%time

# Candidate L2 penalties: powers of two from 2**0 (1) up to 2**14 (16384)
penalties = [2 ** exponent for exponent in range(15)]

In [None]:
# Side-by-side panels: R-squared on the left, RMSE on the right
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
fig.suptitle('Ridge regression results')

# (training key, testing key, y-axis label) for each panel, left to right
ridge_panels = [
    ('Training R-squared', 'Testing R-Squared', 'R-squared'),
    ('Training RMSE', 'Testing RMSE', 'RMSE'),
]

for ax, (train_key, test_key, y_label) in zip(axs, ridge_panels):
    ax.plot(penalties, ridge_results[train_key], color='black', label='Training')
    ax.plot(penalties, ridge_results[test_key], color='red', label='Testing')
    ax.set_xlabel('L2 penalty')
    ax.set_xscale('log', base=2)  # penalties are powers of two, so space them evenly
    ax.set_ylabel(y_label)
    ax.legend(loc='best')

plt.tight_layout()
plt.show()

## 6. Final model evaluation

In [None]:
# Use the Lasso and Ridge result plots to choose the best type of model and penalty value
# Then train the final model with the chosen penalty value
# Evaluate the final model's R-squared and RMSE on the training and testing sets

In [None]:
# Residual plots for the final model on the training and testing sets.
# The four arguments must be re-assigned from the final model in the cell above;
# otherwise this call uses whatever values an earlier cell left behind.
funcs.residual_plot(training_predictions, training_labels, testing_predictions, testing_labels)