# Regularized regression: US county-level sociodemographic and health resource data (2018-2019)

## 1. Import modules 

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
# Let pandas render up to 500 rows before truncating displayed output.
pd.options.display.max_rows = 500

### 1.1. Data download

In [7]:
# Remote CSV holding the dataset used throughout this notebook.
data_url = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'

# The file is comma-delimited, which is the read_csv default separator.
data_df = pd.read_csv(data_url)

### 1.2. Data inspection

In [8]:
# Preview the first five rows, transposed so each column name reads as a row.
data_df.head().transpose()

Unnamed: 0,0,1,2,3,4
fips,1001,1003,1005,1007,1009
TOT_POP,55601,218022,24881,22400,57840
0-9,6787,24757,2732,2456,7095
0-9 y/o % of total pop,12.206615,11.355276,10.980266,10.964286,12.266598
19-Oct,7637,26913,2960,2596,7570
10-19 y/o % of total pop,13.735364,12.344167,11.896628,11.589286,13.087828
20-29,6878,23579,3268,3029,6742
20-29 y/o % of total pop,12.370281,10.814964,13.13452,13.522321,11.656293
30-39,7089,25213,3201,3113,6884
30-39 y/o % of total pop,12.749771,11.564429,12.865239,13.897321,11.901798


In [13]:
# Full per-column summary: list every column together with its non-null count.
data_df.info(
    verbose=True,
    show_counts=True,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 108 columns):
 #    Column                                                                         Non-Null Count  Dtype  
---   ------                                                                         --------------  -----  
 0    fips                                                                           3140 non-null   int64  
 1    TOT_POP                                                                        3140 non-null   int64  
 2    0-9                                                                            3140 non-null   int64  
 3    0-9 y/o % of total pop                                                         3140 non-null   float64
 4    19-Oct                                                                         3140 non-null   int64  
 5    10-19 y/o % of total pop                                                       3140 non-null   float64
 6    20-29         

### 1.3. Train-test split 

In [15]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
training_df, testing_df = train_test_split(
    data_df,
    test_size=0.25,
    random_state=315,
)

## 2. Exploratory data analysis (EDA)
### 2.1. Baseline model performance 

In [23]:
# Row-wise ratio of 'anycondition_number' to 'TOT_POP' in the training split.
# NOTE: despite its name, `accuracy` holds this ratio, not a classification
# accuracy score.
accuracy = training_df['anycondition_number'].div(training_df['TOT_POP'])
accuracy_mean = accuracy.mean()
print(f"Accuracy of any condition rate: {accuracy_mean:.2f}")

# Constant baseline: predict the training mean for every row, then measure
# how far the observed ratios fall from that constant using RMSE.
constant_prediction = np.full(len(training_df), accuracy_mean)
rmse = root_mean_squared_error(accuracy, constant_prediction)
print(f"RMSE of the any condition rate model: {rmse:.2f}")


Accuracy of any condition rate: 0.36
RMSE of the any condition rate model: 0.05


In [20]:
# Dimensions of the training split as a (rows, columns) tuple.
training_df.shape

(2355, 108)

In [25]:
# Inspect the 'TOT_POP' column of the training split (the denominator used
# for the any-condition ratio).
training_df.loc[:, 'TOT_POP']

340      14310
1171     26427
2303     39498
967       5013
872      16904
         ...  
1770    130090
2242    102811
1591     13122
777      15479
611     104143
Name: TOT_POP, Length: 2355, dtype: int64