# Regularized regression: US county-level sociodemographic and health resource data (2018-2019)

## 1. Import modules 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
# Let pandas print up to 250 rows so long column listings are not truncated.
pd.options.display.max_rows=250

### 1.1. Data download

In [2]:
# Remote CSV holding the raw county-level table; it is read straight into a DataFrame.
data_url='https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'
# A comma is already the default delimiter for read_csv, so no separator is passed.
data_df=pd.read_csv(data_url)

### 1.2. Data inspection

In [3]:
# Show the first five rows transposed, so the many columns read top-to-bottom.
data_df.head().transpose()

Unnamed: 0,0,1,2,3,4
fips,1001,1003,1005,1007,1009
TOT_POP,55601,218022,24881,22400,57840
0-9,6787,24757,2732,2456,7095
0-9 y/o % of total pop,12.206615,11.355276,10.980266,10.964286,12.266598
19-Oct,7637,26913,2960,2596,7570
10-19 y/o % of total pop,13.735364,12.344167,11.896628,11.589286,13.087828
20-29,6878,23579,3268,3029,6742
20-29 y/o % of total pop,12.370281,10.814964,13.13452,13.522321,11.656293
30-39,7089,25213,3201,3113,6884
30-39 y/o % of total pop,12.749771,11.564429,12.865239,13.897321,11.901798


In [4]:
# Full per-column listing with non-null counts and dtypes.
data_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 108 columns):
 #    Column                                                                         Non-Null Count  Dtype  
---   ------                                                                         --------------  -----  
 0    fips                                                                           3140 non-null   int64  
 1    TOT_POP                                                                        3140 non-null   int64  
 2    0-9                                                                            3140 non-null   int64  
 3    0-9 y/o % of total pop                                                         3140 non-null   float64
 4    19-Oct                                                                         3140 non-null   int64  
 5    10-19 y/o % of total pop                                                       3140 non-null   float64
 6    20-29         

### 1.3. Train-test split 

In [5]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
training_df, testing_df=train_test_split(
    data_df,
    test_size=0.25,
    random_state=315,
)

## 2. EDA
### 2.1. Baseline model performance 

In [6]:
# Naive baseline: predict the training-set mean any-condition rate for every row.
# NOTE: despite its name, `accuracy` holds the per-row rate
# (anycondition_number / TOT_POP), not a classification accuracy.
accuracy=training_df['anycondition_number']/training_df['TOT_POP']
accuracy_mean=np.mean(accuracy)
print(f"Accuracy of any condition rate: {accuracy_mean:.2f}")

# Constant prediction vector with one entry per training row, scored against the observed rates.
baseline_predictions=np.full(len(training_df), accuracy_mean)
rmse=root_mean_squared_error(accuracy, baseline_predictions)
print(f"RMSE of the any condition rate model: {rmse:.2f}")


Accuracy of any condition rate: 0.36
RMSE of the any condition rate model: 0.05


### 2.2. Feature selection 

#### 2.2.1. Excluding features related to pathology data

In [7]:
# Labels: any-condition count divided by total population, per row.
# They are computed before the frames are narrowed to their leading columns below.
train_labels, test_labels=(
    split_df['anycondition_number']/split_df['TOT_POP']
    for split_df in (training_df, testing_df)
)

# Keep only the first 83 columns of each split.
# NOTE(review): 83 is a hard-coded positional cutoff - confirm it still marks where
# the pathology-related columns begin if the source CSV layout changes.
no_path_features=training_df.columns[:83]
training_df, testing_df=(
    split_df[no_path_features]
    for split_df in (training_df, testing_df)
)
training_df.head().transpose()

Unnamed: 0,340,1171,2303,967,872
fips,12047,22121,42123,20163,19171
TOT_POP,14310,26427,39498,5013,16904
0-9,1484,3621,4072,614,2155
0-9 y/o % of total pop,10.37037,13.701896,10.309383,12.248155,12.748462
19-Oct,1560,3371,4328,605,2324
10-19 y/o % of total pop,10.901468,12.755894,10.957517,12.068622,13.748225
20-29,2422,3542,4078,519,1813
20-29 y/o % of total pop,16.925227,13.402959,10.324573,10.353082,10.725272
30-39,1678,4098,4139,551,1787
30-39 y/o % of total pop,11.726066,15.506868,10.479012,10.991422,10.571462


In [8]:
# Frequency of each county name in the training split.
training_df.loc[:, 'COUNTY_NAME'].value_counts()

COUNTY_NAME
Washington        23
Jackson           18
Lincoln           18
Jefferson         17
Madison           16
                  ..
Utah               1
Virginia Beach     1
Goochland          1
Wyandotte          1
Alcona             1
Name: count, Length: 1480, dtype: int64

#### 2.2.2. Filtering features 

In [11]:
#1. Encoding the categorical features
categorical_features=['COUNTY_NAME', 'STATE_NAME']
# Categories seen only in the test split are encoded as -1 instead of raising on transform.
ordinal_encoder=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# Work on an explicit copy so the column assignment below targets a standalone frame
# rather than the selection created in the previous step.
training_df=training_df.copy()
training_df[categorical_features]=ordinal_encoder.fit_transform(training_df[categorical_features])

# Encode the test split with the encoder fitted on the training data only, so that any
# selected categorical column is numeric in both feature sets. The encoding goes into a
# copy; testing_df itself is left untouched.
testing_encoded_df=testing_df.copy()
testing_encoded_df[categorical_features]=ordinal_encoder.transform(testing_df[categorical_features])

#2. Recursive Feature Elimination - selects top 21 features
#removes the least important features one step at a time
linear_reg_model=LinearRegression()
selector=RFE(linear_reg_model, n_features_to_select=21, step=1)
selector=selector.fit(training_df, train_labels)

#3. Select important features
#.get_support() returns boolean mask on which features were selected by RFE
selected_mask=selector.get_support()
training_features=training_df.loc[:, selected_mask].copy()
testing_features=testing_encoded_df.loc[:, selected_mask].copy()

training_features.head().transpose()

Unnamed: 0,340,1171,2303,967,872
0-9 y/o % of total pop,10.37037,13.701896,10.309383,12.248155,12.748462
10-19 y/o % of total pop,10.901468,12.755894,10.957517,12.068622,13.748225
20-29 y/o % of total pop,16.925227,13.402959,10.324573,10.353082,10.725272
30-39 y/o % of total pop,11.726066,15.506868,10.479012,10.991422,10.571462
40-49 y/o % of total pop,12.096436,11.10985,11.11955,10.572511,10.955987
50-59 y/o % of total pop,13.214535,13.497559,15.448883,13.764213,13.961193
60-69 y/o % of total pop,12.550664,11.174178,15.524837,14.103331,13.541174
70-79 y/o % of total pop,8.266946,5.687365,9.89164,9.136246,7.856129
80+ y/o % of total pop,3.948288,3.163431,5.944605,6.762418,5.892097
% White-alone,62.948987,57.944526,97.713808,95.93058,87.245622


In [12]:
# Transposed preview of the selected test-set features, for comparison with the training preview.
testing_features.head().transpose()

Unnamed: 0,2808,2126,2043,525,644
0-9 y/o % of total pop,9.568,11.865558,11.904363,12.933718,10.525085
10-19 y/o % of total pop,9.84,13.161613,14.125965,13.978925,11.182538
20-29 y/o % of total pop,8.784,11.504184,13.085868,14.171688,11.875055
30-39 y/o % of total pop,9.056,11.729703,10.96288,12.929434,12.514975
40-49 y/o % of total pop,11.6,11.642756,11.163829,12.092698,12.26076
50-59 y/o % of total pop,16.112,13.935985,12.981673,12.97941,14.706484
60-69 y/o % of total pop,17.296,13.109988,12.935157,10.968958,13.452941
70-79 y/o % of total pop,11.984,8.148571,8.103079,6.471143,8.225462
80+ y/o % of total pop,5.76,4.901641,4.737185,3.474027,5.256699
% White-alone,96.336,96.614498,96.749465,58.954222,91.660579
