In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

from sklearn.preprocessing import StandardScaler

In [2]:
# fetch data 

covid_data = pd.read_csv('covid_data.csv')

us_country = pd.read_csv('us_county.csv')

land_area = pd.read_excel('LND01_land_area_columnH.xls')

land_area.rename(columns = {'STCOU' : 'fips'})
land_area.isna().any()
us_country.isna().any()
covid_data.isna().any()
us_country = us_country.dropna()
covid_data = covid_data.dropna()
land_area.isna().any()
us_country.isna().any()
covid_data.isna().any()
covid_data.head()
us_country.head()
land_area.head()

Unnamed: 0,Areaname,fips,LND010200D
0,UNITED STATES,0,3794083.06
1,ALABAMA,1000,52419.02
2,"Autauga, AL",1001,604.45
3,"Baldwin, AL",1003,2026.93
4,"Barbour, AL",1005,904.52
...,...,...,...
3193,"Sweetwater, WY",56037,10491.17
3194,"Teton, WY",56039,4221.80
3195,"Uinta, WY",56041,2087.56
3196,"Washakie, WY",56043,2242.75


Areaname      False
STCOU         False
LND010200D    False
dtype: bool

fips                 False
county               False
state                False
state_code            True
male                 False
female               False
median_age           False
population           False
female_percentage    False
lat                  False
long                 False
dtype: bool

fips           True
county         True
state         False
lat           False
long          False
date          False
cases         False
state_code     True
deaths        False
dtype: bool

Areaname      False
STCOU         False
LND010200D    False
dtype: bool

fips                 False
county               False
state                False
state_code           False
male                 False
female               False
median_age           False
population           False
female_percentage    False
lat                  False
long                 False
dtype: bool

fips          False
county        False
state         False
lat           False
long          False
date          False
cases         False
state_code    False
deaths        False
dtype: bool

Unnamed: 0,fips,county,state,lat,long,date,cases,state_code,deaths
0,1001.0,Autauga,Alabama,32.54,-86.64,2021-02-14,6023,AL,84
1,1003.0,Baldwin,Alabama,30.73,-87.72,2021-02-14,19105,AL,252
2,1005.0,Barbour,Alabama,31.87,-85.39,2021-02-14,2042,AL,48
3,1007.0,Bibb,Alabama,33.0,-87.13,2021-02-14,2395,AL,57
4,1009.0,Blount,Alabama,33.98,-86.57,2021-02-14,5961,AL,121


Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long
0,1001,Autauga County,Alabama,AL,26874,28326,37.8,55200,51.32,32.53,-86.64
1,1003,Baldwin County,Alabama,AL,101188,106919,42.8,208107,51.38,30.73,-87.72
2,1005,Barbour County,Alabama,AL,13697,12085,39.9,25782,46.87,31.87,-85.39
3,1007,Bibb County,Alabama,AL,12152,10375,39.9,22527,46.06,33.0,-87.13
4,1009,Blount County,Alabama,AL,28434,29211,40.8,57645,50.67,33.98,-86.57


Unnamed: 0,Areaname,STCOU,LND010200D
0,UNITED STATES,0,3794083.06
1,ALABAMA,1000,52419.02
2,"Autauga, AL",1001,604.45
3,"Baldwin, AL",1003,2026.93
4,"Barbour, AL",1005,904.52


In [3]:
#land_area.rename(columns = {'STCOU' : 'fips'})
joint_data = pd.merge(us_country,land_area.rename(columns = {'STCOU' : 'fips'}), on=['fips'], how='inner')

In [4]:
print(joint_data)
dataframe = pd.merge(joint_data, covid_data, on='fips')

       fips             county    state state_code    male  female  \
0      1001     Autauga County  Alabama         AL   26874   28326   
1      1003     Baldwin County  Alabama         AL  101188  106919   
2      1005     Barbour County  Alabama         AL   13697   12085   
3      1007        Bibb County  Alabama         AL   12152   10375   
4      1009      Blount County  Alabama         AL   28434   29211   
...     ...                ...      ...        ...     ...     ...   
3134  56037  Sweetwater County  Wyoming         WY   22882   21235   
3135  56039       Teton County  Wyoming         WY   11911   11148   
3136  56041       Uinta County  Wyoming         WY   10505   10104   
3137  56043    Washakie County  Wyoming         WY    4137    3992   
3138  56045      Weston County  Wyoming         WY    3768    3332   

      median_age  population  female_percentage   lat    long        Areaname  \
0          37.80       55200              51.32 32.53  -86.64     Autauga, AL 

In [5]:
#dataframe.drop(dataframe[dataframe['LND010200D'] == 0].index)
#dataframe.drop(dataframe[dataframe['population'] >= 1000000].index)
subset_one = dataframe[dataframe['LND010200D'] != 0]
subset_end = subset_one[subset_one['population'] < 1000000]
#subset_end.loc[:,'population-density'] = subset_end['population']/subset_end['LND010200D']
#subset_end.loc[:,'case-ratio'] = subset_end['cases']/subset_end['population']
subset_end = subset_end.assign(population_density=lambda x: x.population / x.LND010200D)
subset_end = subset_end.assign(case_ratio=lambda x: x.cases / x.population)
print(subset_end)
subset_end = subset_end[subset_end['cases'] != 0]
print(subset_end)

       fips           county_x  state_x state_code_x    male  female  \
0      1001     Autauga County  Alabama           AL   26874   28326   
1      1003     Baldwin County  Alabama           AL  101188  106919   
2      1005     Barbour County  Alabama           AL   13697   12085   
3      1007        Bibb County  Alabama           AL   12152   10375   
4      1009      Blount County  Alabama           AL   28434   29211   
...     ...                ...      ...          ...     ...     ...   
3134  56037  Sweetwater County  Wyoming           WY   22882   21235   
3135  56039       Teton County  Wyoming           WY   11911   11148   
3136  56041       Uinta County  Wyoming           WY   10505   10104   
3137  56043    Washakie County  Wyoming           WY    4137    3992   
3138  56045      Weston County  Wyoming           WY    3768    3332   

      median_age  population  female_percentage  lat_x  ...    county_y  \
0          37.80       55200              51.32  32.53  ... 

In [6]:
subset_end['cases'] = np.log(subset_end['cases'])
subset_end['male'] = np.log(subset_end['male'])
subset_end['female'] = np.log(subset_end['female'])
subset_end['population'] = np.log(subset_end['population'])
#subset_end = subset_end.drop(columns = ['cases','male'],axis = 1)
cols = subset_end.columns
print(cols)
subset_end = subset_end.assign(log_popl_density=lambda x: x.population - np.log(x.LND010200D))
print(subset_end)

Index(['fips', 'county_x', 'state_x', 'state_code_x', 'male', 'female',
       'median_age', 'population', 'female_percentage', 'lat_x', 'long_x',
       'Areaname', 'LND010200D', 'county_y', 'state_y', 'lat_y', 'long_y',
       'date', 'cases', 'state_code_y', 'deaths', 'population_density',
       'case_ratio'],
      dtype='object')
       fips           county_x  state_x state_code_x  male  female  \
0      1001     Autauga County  Alabama           AL 10.20   10.25   
1      1003     Baldwin County  Alabama           AL 11.52   11.58   
2      1005     Barbour County  Alabama           AL  9.52    9.40   
3      1007        Bibb County  Alabama           AL  9.41    9.25   
4      1009      Blount County  Alabama           AL 10.26   10.28   
...     ...                ...      ...          ...   ...     ...   
3134  56037  Sweetwater County  Wyoming           WY 10.04    9.96   
3135  56039       Teton County  Wyoming           WY  9.39    9.32   
3136  56041       Uinta County  

In [7]:
# fips:discrete  contry-x:categorical state_x: categorical state_code_x:categorical male: discrete famale: discrete
# median_age: discrete population: discrete female_percentage:discrete lat_x:continuous long_x:continuous Areaname: categorical
# LND010200D: continuous county_y:categorical state_y: categorical lat_y: continuous Long_y: continuous date: categorical
# cases: discrete stste_code_y: categotical deaths: discrete populatin-density: discrete case-ratio: discrete

"""subset_end[['male','female','median_age','population','female_percentage','cases','deaths','population_density','case_ratio']].mean()
subset_end[['male','female','median_age','population','female_percentage','cases','deaths','population_density','case_ratio']].std()
subset_end[['male','female','median_age','population','female_percentage','cases','deaths','population_density','case_ratio']].hist()

cols = ['male','female','median_age','population','female_percentage','cases','deaths','population_density','case_ratio']
for col in cols: 
    subset_end[col].hist()
    print(col)
    plt.show()"""
hosp = pd.read_csv("Hospital_Beds_per_County_and_per_capita.csv")
hosp.head()
hosp = hosp[['CoSt','St','ICUBeds','BedsPC','StaffPC','FoodInsc']]
data_final = pd.merge(subset_end, hosp, left_on=['county_x','state_code_x'], right_on = ['CoSt','St'], how='left')

data_final.isna().any()
data_final = data_final.dropna()
print(data_final)

"subset_end[['male','female','median_age','population','female_percentage','cases','deaths','population_density','case_ratio']].mean()\nsubset_end[['male','female','median_age','population','female_percentage','cases','deaths','population_density','case_ratio']].std()\nsubset_end[['male','female','median_age','population','female_percentage','cases','deaths','population_density','case_ratio']].hist()\n\ncols = ['male','female','median_age','population','female_percentage','cases','deaths','population_density','case_ratio']\nfor col in cols: \n    subset_end[col].hist()\n    print(col)\n    plt.show()"

Unnamed: 0,FID,GEOID,NAME,Co,St,Pop18,CoSt,UnwelPct,pct65pls,Staffed,...,F_ICUBeds,F_BedsPC,F_ICUPC,F_StaffPC,BedsPC,ICUPC,StaffPC,FoodInsc,SHAPE_Length,SHAPE_Area
0,1,1059,Franklin,Franklin,AL,31844,Franklin County,22.41,16.47,74,...,7,254.75,4549.14,430.32,254.75,4549.14,430.32,13.0,1.78,0.16
1,2,13111,Fannin,Fannin,GA,26644,Fannin County,15.89,28.73,50,...,5,532.88,5328.8,532.88,532.88,5328.8,532.88,11.4,1.77,0.1
2,3,19109,Kossuth,Kossuth,IA,15201,Kossuth County,11.88,23.35,24,...,0,608.04,0.0,633.37,608.04,0.0,633.37,9.8,2.13,0.28
3,4,40115,Ottawa,Ottawa,OK,31795,Ottawa County,22.96,18.12,94,...,9,271.75,3532.78,338.24,271.75,3532.78,338.24,17.2,1.48,0.13
4,5,42115,Susquehanna,Susquehanna,PA,42315,Susquehanna County,15.15,22.92,50,...,4,846.3,10578.75,846.3,846.3,10578.75,846.3,11.4,2.03,0.23


fips                  False
county_x              False
state_x               False
state_code_x          False
male                  False
female                False
median_age            False
population            False
female_percentage     False
lat_x                 False
long_x                False
Areaname              False
LND010200D            False
county_y              False
state_y               False
lat_y                 False
long_y                False
date                  False
cases                 False
state_code_y          False
deaths                False
population_density    False
case_ratio            False
log_popl_density      False
CoSt                   True
St                     True
ICUBeds                True
BedsPC                 True
StaffPC                True
FoodInsc               True
dtype: bool

       fips           county_x  state_x state_code_x  male  female  \
0      1001     Autauga County  Alabama           AL 10.20   10.25   
1      1003     Baldwin County  Alabama           AL 11.52   11.58   
2      1005     Barbour County  Alabama           AL  9.52    9.40   
3      1007        Bibb County  Alabama           AL  9.41    9.25   
4      1009      Blount County  Alabama           AL 10.26   10.28   
...     ...                ...      ...          ...   ...     ...   
3060  56037  Sweetwater County  Wyoming           WY 10.04    9.96   
3061  56039       Teton County  Wyoming           WY  9.39    9.32   
3062  56041       Uinta County  Wyoming           WY  9.26    9.22   
3063  56043    Washakie County  Wyoming           WY  8.33    8.29   
3064  56045      Weston County  Wyoming           WY  8.23    8.11   

      median_age  population  female_percentage  lat_x  ...  deaths  \
0          37.80       10.92              51.32  32.53  ...      84   
1          42.80 

In [8]:
data_final['BedsPC'] = data_final['BedsPC']+1
data_final['StaffPC'] = data_final['StaffPC']+1

data_final['BedsPC'] = np.log(data_final['BedsPC'])
data_final['StaffPC'] = np.log(data_final['StaffPC'])
data_final['ICUBeds'] = data_final['ICUBeds']**0.25

data_final['ICUBeds'].mean()
data_final['ICUBeds'].std()

data_final['BedsPC'].mean()
data_final['BedsPC'].std()

data_final['StaffPC'].mean()
data_final['StaffPC'].std()

data_final['FoodInsc'].mean()
data_final['FoodInsc'].std()

print(data_final)



0.9346091086156869

1.1159659349791922

4.786711750126151

2.5715724516043483

4.925338494944696

2.6466470337194723

13.6958250497018

4.199838507068791

       fips           county_x  state_x state_code_x  male  female  \
0      1001     Autauga County  Alabama           AL 10.20   10.25   
1      1003     Baldwin County  Alabama           AL 11.52   11.58   
2      1005     Barbour County  Alabama           AL  9.52    9.40   
3      1007        Bibb County  Alabama           AL  9.41    9.25   
4      1009      Blount County  Alabama           AL 10.26   10.28   
...     ...                ...      ...          ...   ...     ...   
3060  56037  Sweetwater County  Wyoming           WY 10.04    9.96   
3061  56039       Teton County  Wyoming           WY  9.39    9.32   
3062  56041       Uinta County  Wyoming           WY  9.26    9.22   
3063  56043    Washakie County  Wyoming           WY  8.33    8.29   
3064  56045      Weston County  Wyoming           WY  8.23    8.11   

      median_age  population  female_percentage  lat_x  ...  deaths  \
0          37.80       10.92              51.32  32.53  ...      84   
1          42.80 

In [9]:
# death, cases numbers are corelated to male and female numbers, also pupolations. Not very suprising
data_final_select = data_final[['lat_x','long_x','male','female','median_age','population','female_percentage','LND010200D','log_popl_density','ICUBeds','BedsPC','StaffPC','FoodInsc','cases']]
data_final_select.head()

data_final_select.isna().sum()
data_final_select.dropna(inplace=True)

print(np.isinf(data_final_select).all())


Unnamed: 0,lat_x,long_x,male,female,median_age,population,female_percentage,LND010200D,log_popl_density,ICUBeds,BedsPC,StaffPC,FoodInsc,cases
0,32.53,-86.64,10.2,10.25,37.8,10.92,51.32,604.45,4.51,1.57,6.51,6.94,13.4,8.7
1,30.73,-87.72,11.52,11.58,42.8,12.25,51.38,2026.93,4.63,2.58,6.44,6.59,12.3,9.86
2,31.87,-85.39,9.52,9.4,39.9,10.16,46.87,904.52,3.35,1.5,5.89,6.79,23.2,7.62
3,33.0,-87.13,9.41,9.25,39.9,10.02,46.06,626.16,3.58,0.0,6.49,6.83,15.8,7.78
4,33.98,-86.57,10.26,10.28,40.8,10.96,50.67,650.6,4.48,1.57,7.28,7.75,11.0,8.69


lat_x                0
long_x               0
male                 0
female               0
median_age           0
population           0
female_percentage    0
LND010200D           0
log_popl_density     0
ICUBeds              0
BedsPC               0
StaffPC              0
FoodInsc             0
cases                0
dtype: int64

lat_x                False
long_x               False
male                 False
female               False
median_age           False
population           False
female_percentage    False
LND010200D           False
log_popl_density     False
ICUBeds              False
BedsPC               False
StaffPC              False
FoodInsc             False
cases                False
dtype: bool


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final_select.dropna(inplace=True)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(data_final_select.drop(columns = ['cases'],axis = 1), data_final_select['cases'], test_size=0.20, random_state = 2)

X_train
X_test
y_train
y_test



data_final_select = StandardScaler()
X_train = pd.DataFrame(data_final_select.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(data_final_select.transform(X_test), columns = X_test.columns)

Unnamed: 0,lat_x,long_x,male,female,median_age,population,female_percentage,LND010200D,log_popl_density,ICUBeds,BedsPC,StaffPC,FoodInsc
797,42.73,-93.26,8.54,8.54,42.40,9.23,50.05,583.01,2.87,0.00,6.08,6.08,10.40
2509,34.53,-102.26,8.27,8.26,35.30,8.96,49.85,899.32,2.16,0.00,6.14,6.14,8.40
2583,32.43,-97.83,10.24,10.27,46.70,10.95,50.78,436.80,4.87,1.86,6.70,7.12,15.90
1196,42.34,-72.66,11.23,11.36,36.30,11.99,53.20,545.44,5.69,1.82,7.08,7.25,9.90
1030,38.07,-82.73,8.97,8.97,41.10,9.67,49.96,420.12,3.63,1.57,5.18,5.51,17.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2557,29.39,-94.96,11.99,12.02,37.50,12.70,50.70,872.93,5.93,0.00,6.38,6.38,17.00
2390,36.29,-82.13,10.23,10.27,44.90,10.94,51.02,347.62,5.09,1.68,6.15,6.26,15.20
1647,41.58,-96.65,9.81,9.83,39.00,10.51,50.59,543.92,4.21,0.00,6.29,6.62,11.20
2584,33.15,-95.56,9.79,9.82,39.50,10.50,50.88,792.74,3.82,1.93,5.98,6.52,18.20


Unnamed: 0,lat_x,long_x,male,female,median_age,population,female_percentage,LND010200D,log_popl_density,ICUBeds,BedsPC,StaffPC,FoodInsc
2813,37.55,-77.92,9.63,9.50,45.10,10.26,46.63,262.40,4.69,0.00,0.00,0.00,6.50
1801,42.49,-75.61,10.10,10.09,44.70,10.79,49.89,898.70,3.99,1.63,6.77,6.77,11.00
1028,37.55,-85.70,8.86,8.87,40.80,9.56,50.36,263.72,3.98,0.00,0.00,0.00,13.20
2839,38.82,-77.09,11.23,11.30,36.50,11.96,51.86,15.41,9.23,2.51,6.14,6.14,10.00
2443,36.50,-87.38,11.49,11.50,30.60,12.19,50.04,543.84,5.89,2.14,6.47,6.59,14.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,34.27,-91.93,10.46,10.48,38.80,11.16,50.43,913.70,4.34,2.41,5.09,5.64,25.80
2575,35.84,-102.60,8.13,7.76,39.10,8.66,40.85,1463.20,1.37,0.00,5.63,5.63,13.30
1348,48.78,-95.81,8.99,8.92,41.40,9.65,48.24,1678.33,2.22,0.00,6.47,6.47,8.90
990,36.89,-87.49,10.56,10.43,28.40,11.19,46.85,724.01,4.60,2.06,5.96,6.34,18.70


797     6.99
2509    6.80
2583    8.70
1196    8.82
1030    7.08
        ... 
2557   10.41
2390    8.67
1647    8.43
2584    7.95
2618    8.80
Name: cases, Length: 2414, dtype: float64

2813   7.35
1801   7.75
1028   7.04
2839   9.21
2443   9.74
       ... 
138    9.04
2575   6.02
1348   7.45
990    8.71
314    7.27
Name: cases, Length: 604, dtype: float64

In [11]:



model = LinearRegression(fit_intercept = True)

model.fit(X_train, y_train)

model.score(X_train, y_train)

model.coef_ # this is beta 1, the slope of the regression function

model.intercept_ # this is beta 0

LinearRegression()

0.9391591162380319

array([-6.83031130e-02,  3.43904753e-02, -4.86762705e-01, -3.07177361e+00,
       -1.54185039e-01,  4.81658447e+00,  7.06590732e-02, -5.54255991e-04,
        1.64675973e-03,  9.05175113e-03,  4.00189835e-02,  4.98134071e-03,
       -3.96813762e-02])

7.729457667845861

In [16]:
test_output = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['predict cases'])
test_output.head()

test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()

mean_absolute_error = abs(test_output['predict cases'] - test_output['cases']).mean()

ratio = mean_absolute_error / test_output['cases'].mean()

print(ratio)

Unnamed: 0,predict cases
0,7.71
1,8.2
2,7.07
3,9.56
4,9.91


Unnamed: 0,predict cases,cases
4,9.91,8.69
14,4.5,7.2
15,8.82,8.55
18,7.64,6.73
23,8.08,8.1


0.21820710961251982


In [20]:



model = Lasso(fit_intercept = True, alpha = 0.2)

model.fit(X_train, y_train)

model.score(X_train, y_train)

model.coef_ # this is beta 1, the slope of the regression function

model.intercept_ # this is beta 0

Lasso(alpha=0.2)

0.9079748938651118

array([-0.        ,  0.        ,  1.17156364,  0.        , -0.00993591,
        0.00671842,  0.        , -0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ])

7.729457667845855

In [21]:
test_output = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['predict cases'])
test_output.head()

test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()

mean_absolute_error = abs(test_output['predict cases'] - test_output['cases']).mean()

ratio = mean_absolute_error / test_output['cases'].mean()

print(ratio)

Unnamed: 0,predict cases
0,7.81
1,8.2
2,7.16
3,9.18
4,9.41


Unnamed: 0,predict cases,cases
4,9.41,8.69
14,5.1,7.2
15,8.77,8.55
18,7.61,6.73
23,7.83,8.1


0.20356140554782343
