# Regularized regression: US county-level sociodemographic and health resource data (2018-2019)

## 1. Data loading

In [None]:
# Handle imports upfront
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso

# Let pandas print up to 500 rows before truncating displayed output
pd.options.display.max_rows = 500


### 1.1. Load

In [None]:
# Location of the raw CSV file to download
data_url = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'

# Read the comma-delimited file straight from the URL into a DataFrame
data_df = pd.read_csv(
    data_url,
    sep=',',
)

### 1.2. Inspect

In [None]:
# Show the first five rows, transposed so each column name reads as a row
data_df.head(n=5).T

In [None]:
# Full per-column summary: force every column to be listed along with its
# non-null count, even when the frame is wide
data_df.info(
    verbose=True,
    show_counts=True,
)

### 1.3. Train-test split

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run
training_df, testing_df = train_test_split(data_df, test_size=0.25, random_state=315)

## 2. EDA

### 2.1. Baseline model performance

In [None]:
# Per-row rate: the 'anycondition_number' column divided by 'TOT_POP'
anycondition_rates = training_df['anycondition_number'] / training_df['TOT_POP']

# Baseline "model": always predict the mean training rate
mean_anycondition_rate = np.mean(anycondition_rates)
print(f'Mean any condition rate: {mean_anycondition_rate:.2f}')

# Score the constant prediction against the training rates themselves
constant_predictions = np.full(len(training_df), mean_anycondition_rate)
rmse = root_mean_squared_error(anycondition_rates, constant_predictions)
print(f'Mean any condition rate model RMSE: {rmse:.2f}')

# NOTE(review): this baseline is computed on the rate scale, while the models
# in sections 3 and 4 are fitted on the raw 'anycondition_number' column, so
# the RMSE values are not directly comparable — confirm which target is intended.

### 2.2. Initial feature selection

#### 2.2.1. Drop pathology-related features

In [None]:
# Extract the regression target before the frames are narrowed to a column
# subset below.
training_labels = training_df['anycondition_number']
testing_labels = testing_df['anycondition_number']

# Keep only the first 83 columns as candidate features.
# NOTE(review): the hard-coded 83 presumably marks where the pathology-related
# columns begin — confirm against the dataset's column order, and confirm that
# 'anycondition_number' is not among the retained columns.
path_features = training_df.columns[:83]

# Take explicit copies: these frames are written to later (string columns are
# encoded in place), and assigning into a column subset of another DataFrame
# triggers pandas' SettingWithCopyWarning.
training_df = training_df[path_features].copy()
testing_df = testing_df[path_features].copy()

#### 2.2.2. String feature encoding and recursive feature elimination

In [None]:
# Text columns that must be converted to numbers before fitting a linear model
string_features = ['COUNTY_NAME', 'STATE_NAME']

# Fit the encoder on the training split only, then apply the same mapping to
# the testing split so both frames share one numeric encoding. Categories that
# appear only in the testing split are mapped to -1 instead of raising.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
training_df[string_features] = ordinal_encoder.fit_transform(training_df[string_features])
testing_df[string_features] = ordinal_encoder.transform(testing_df[string_features])

# Recursive feature elimination: repeatedly fit a linear regression and drop
# 5 features per round until 20 remain
linear_model = LinearRegression()
selector = RFE(linear_model, n_features_to_select=20, step=5)
selector = selector.fit(training_df, training_labels)

# Boolean mask over the columns marking the features RFE kept; computed once
# and applied to both splits so they end up with identical columns
selected_mask = selector.get_support()
training_features = training_df.loc[:, selected_mask]
testing_features = testing_df.loc[:, selected_mask]

training_features.head().transpose()

### 2.3. Feature distributions and cleaning

### 2.4. Feature interactions & selection

In [None]:
# Your code here...

### 2.5. Feature encoding & scaling

In [None]:
# Your code here...

## 3. Linear model training

In [None]:
# Ordinary least squares fit on the selected training features
linear_model = LinearRegression()
result = linear_model.fit(training_features, training_labels)

# Predict on both splits with the fitted model
train_predictions, test_predictions = (
    linear_model.predict(feature_frame)
    for feature_frame in (training_features, testing_features)
)

# RMSE on each split; a large gap between the two suggests overfitting
train_rmse = root_mean_squared_error(training_labels, train_predictions)
test_rmse = root_mean_squared_error(testing_labels, test_predictions)

print(f'Prediction RMSE: training: {train_rmse:.0f}, testing: {test_rmse:.0f}')

## 4. Model regularization

In [None]:
# L1 penalty strengths (Lasso's alpha) to compare
penalties = [0.0001, 0.001, 0.01, 0.1]

for penalty in penalties:
    # Fit a fresh Lasso model for this penalty strength
    lasso_model = Lasso(alpha=penalty, max_iter=1000)
    result = lasso_model.fit(training_features, training_labels)

    train_predictions = lasso_model.predict(training_features)
    train_rmse = root_mean_squared_error(training_labels, train_predictions)

    test_predictions = lasso_model.predict(testing_features)
    test_rmse = root_mean_squared_error(testing_labels, test_predictions)

    # Label each line with its alpha so the rows of output can be told apart
    print(f'Lasso alpha={penalty}: Prediction RMSE: training: {train_rmse:.0f}, testing: {test_rmse:.0f}')

## 5. Hyperparameter optimization

In [None]:
# Your code here...

## 6. Final model evaluation

In [None]:
# Your code here...