In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import datetime
import warnings

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler
 
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore")

In [2]:
df = pd.read_csv(r'C:\Users\agrae\Documents\AAA_Work\Coding\Springboard\HW\000_Capstone_2\Data\DF_end.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)

In [3]:
df.head()

Unnamed: 0,Week,State,Republican,perc_pop_obese,pop_size,perc_pop_urban,GDP_norm,Deaths_norm,total_deaths_norm
0,2021-04-17,Alabama,1,36.1,4903,59.0,46.531231,0.00204,2.112176
1,2021-04-17,Alaska,1,30.5,731,66.0,74.398803,0.0,0.262654
2,2021-04-17,Arizona,1,31.4,7278,89.8,50.854507,0.00371,2.125309
3,2021-04-17,Arkansas,1,37.4,3017,56.2,43.405403,0.0,1.938349
4,2021-04-17,California,0,26.2,39512,95.0,79.28732,0.000911,1.584911


In [4]:
X = df.drop(['Week', 'State', 'Deaths_norm', 'total_deaths_norm'], axis=1)
y = df['total_deaths_norm']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [5]:
mod_LR = LinearRegression()
mod_LR.fit(X_train, y_train)
y_pred = mod_LR.predict(X_test)
r2 = r2_score(y_test, y_pred), 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'r2 is {r2}.')
print(f'MAE is {mae :.5f}.')
print(f'MSE is {mse :.5f}.')

r2 is (-0.6538972887266461,).
MAE is 0.35288.
MSE is 0.24255.


In [6]:
mod_RF = RandomForestRegressor()
mod_RF.fit(X_train, y_train)
y_pred = mod_RF.predict(X_test)
r2 = r2_score(y_test, y_pred), 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'r2 is {r2}.')
print(f'MAE is {mae :.5f}.')
print(f'MSE is {mse :.5f}.')

r2 is (-1.578464440741906,).
MAE is 0.38777.
MSE is 0.37814.


In [7]:
mod_DR = DummyRegressor()
mod_DR.fit(X_train, y_train)
y_pred = mod_DR.predict(X_test)
r2 = r2_score(y_test, y_pred), 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'r2 is {r2}.')
print(f'MAE is {mae :.5f}.')
print(f'MSE is {mse :.5f}.')

r2 is (-0.10500012568602779,).
MAE is 0.30116.
MSE is 0.16205.


First off, we can see that none of our metrics are looking great. It does look like the RandomForest is the best so far though. Let's scale our data.

Scale data, dummy regressor, gridsearchCV, mean squared error, root mean squared

In [8]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
mod_LR = LinearRegression()
mod_LR.fit(X_train_scaled, y_train)
y_pred = mod_LR.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred), 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'r2 is {r2}.')
print(f'MAE is {mae :.5f}.')
print(f'MSE is {mse :.5f}.')

r2 is (-0.6538972887266445,).
MAE is 0.35288.
MSE is 0.24255.


In [10]:
mod_RF = RandomForestRegressor()
mod_RF.fit(X_train_scaled, y_train)
y_pred = mod_RF.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred), 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'r2 is {r2}.')
print(f'MAE is {mae :.5f}.')
print(f'MSE is {mse :.5f}.')

r2 is (-1.4878759599826559,).
MAE is 0.39363.
MSE is 0.36486.
