In [1]:
import sys
import sklearn
import numpy as np
import os
import tarfile
import urllib
import pandas as pd                       #provides a dataframe
import urllib.request
# NEW (From Covid19_DataSet_Visualisation)
import seaborn as sns  
import folium                             #For plotting Choropleth Map
# NEW (From Covid19_MachineLearning_Model)
import json


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

In [2]:
# For graph plotting
# IPython magic: render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Optional global font-size settings for axis and tick labels (currently disabled).
# mpl.rc('axes', labelsize=14)
# mpl.rc('xtick', labelsize=12)
# mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
# Only warnings whose message starts with "internal gelsd" are silenced; all others still show.
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Get Data

In [3]:
# Folder that holds the project's data files.
# Defaults to the original author's location so existing runs behave exactly as before;
# set the C19_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get(
    "C19_DATA_DIR",
    "/Users/emmanueljumbo/dataScience/python/Dissertation/experimentation/data",
)

# County-level COVID-19 dataset used by every later cell (one row per county).
C19_View = pd.read_csv(os.path.join(DATA_DIR, "ML_Ultimate_Covid19_DataSet.csv"))


In [4]:
# "Unnamed: 0" holds 0, 1, 2, ... in the preview output, i.e. it looks like a row index
# that was saved into the CSV rather than a real attribute, so it is removed.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a second time
# raises KeyError because the column has already been dropped.
C19_View = C19_View.drop(columns="Unnamed: 0", errors="ignore")
# C19_View['Date'] = pd.to_datetime(C19_View['Date'])

pd.set_option("display.max_columns", None)   # Lets you see ALL dataframe COLUMNS (no "..." truncation).

In [31]:
# EMMYTABLE = ['State', 'County', 'Metro_Status',
#                         'FIPS','Lat','Long_', 
#                         'Population_Density', 'Cases_pct', 'Death_pct',
#                        'EP_POV', 'EP_UNEMP', 'EP_PCI',
#                        'EP_MINRTY', 'EP_UNINSUR', 
#                        'Vaxx_pct', 'Dose1_pct', 'Vaxx_Booster_pct',
#                        'Beds_utilization', 'Bed_covid_pct',
#                        'Bed_covid_utilization', 'ICU_bed_covid_Adult_UTIL',
#                        'ICU_bed_Adult_UTIL'] 

# C19_View[EMMYTABLE].head()

Unnamed: 0,State,County,Metro_Status,FIPS,Lat,Long_,Population_Density,Cases_pct,Death_pct,EP_POV,EP_UNEMP,EP_PCI,EP_MINRTY,EP_UNINSUR,Vaxx_pct,Dose1_pct,Vaxx_Booster_pct,Beds_utilization,Bed_covid_pct,Bed_covid_utilization,ICU_bed_covid_Adult_UTIL,ICU_bed_Adult_UTIL
0,Alabama,Autauga,1,1001.0,32.539527,-86.644082,93.985389,20310.010918,288.174122,15.4,4.2,29372.0,25.0,7.1,38.6,48.5,26.9,73.252732,11.459204,8.384716,23.277799,85.995844
1,Alabama,Baldwin,1,1003.0,30.72775,-87.722071,140.417022,18293.808291,266.088499,10.6,4.4,31203.0,17.0,10.2,47.8,60.6,28.2,73.252732,11.459204,8.384716,23.277799,85.995844
2,Alabama,Barbour,0,1005.0,31.868263,-85.387129,27.893734,16199.465284,328.121202,28.9,9.5,18461.0,53.9,11.2,42.1,51.8,23.7,73.252732,11.459204,8.384716,23.277799,85.995844
3,Alabama,Bibb,1,1007.0,32.996421,-87.125115,35.976546,20612.664106,424.220773,14.0,7.5,20199.0,25.4,7.9,32.9,40.2,26.1,73.252732,11.459204,8.384716,23.277799,85.995844
4,Alabama,Blount,1,1009.0,33.982109,-86.567906,89.676285,19766.195137,342.40653,14.4,4.1,22656.0,12.9,11.0,29.7,35.9,26.2,73.252732,11.459204,8.384716,23.277799,85.995844


#### XXXX XXX
1. XXXX XXX
2. XXXX XXX
3. XXXX XXX

# Specify Attributes

In [5]:
# Key column + single statistic plotted by the choropleth map.
Test_Map = ['FIPS', 'EP_UNEMP']

# Building blocks shared by the attribute sets below.
# Location and object variables are deliberately absent from every set:
#   Metro_Status, Cases_pct, State_Abr, State, County, FIPS, Lat, Long_
_outcome = ['Death_pct']
_density = ['Population_Density']
_socio_economic = ['EP_POV', 'EP_UNEMP', 'EP_PCI', 'EP_MINRTY', 'EP_UNINSUR']
_vaccination = ['Vaxx_pct', 'Dose1_pct', 'Vaxx_Booster_pct']
_state_level_beds = ['Beds_utilization', 'Bed_covid_pct',
                     'Bed_covid_utilization', 'ICU_bed_covid_Adult_UTIL',
                     'ICU_bed_Adult_UTIL']

# USED FOR: All_Variables cluster
# Every non-location attribute.
Relavant_Attributes = (_outcome + _density + _socio_economic
                       + _vaccination + _state_level_beds)

# USED FOR: No_Population_Density cluster
# Population_Density and location-based variables removed.
No_Population_Density = (_outcome + _socio_economic
                         + _vaccination + _state_level_beds)

# USED FOR: No_Bed_Attributes cluster
# State-level bed attributes and location-based variables removed.
No_Bed_Attributes = _outcome + _density + _socio_economic + _vaccination

# USED FOR: No_Bed_Population cluster
# No 'Population_Density', no state-level 'Bed' attributes and no attributes
# which could be used by the ML model to identify location.
No_Bed_Population = _outcome + _socio_economic + _vaccination


# Choropleth Maps (Folium)

#### READ/ MODIFY/ WRITE JSON File

In [7]:
# JsonFile = r'/Users/emmanueljumbo/dataScience/python/Dissertation/experimentation/data/Original_County_Boundary_files/us-county-boundaries.geojson'
# #JsonFile = r'/Users/emmanueljumbo/dataScience/python/Dissertation/experimentation/data/Original_County_Boundary_files/us-county-boundaries.json'

# #--READ JSON File--
# with open(JsonFile) as file:    #When Using .Geojson file
#   data = json.load(file)
# # with open(JsonFile) as file:   #When Using .json file
# #   data = json.load(file)

# #--MODIFY File--
# for id in data['features']:     #When Using .Geojson file
#     id['properties']['geoid'] = float(id['properties']['geoid']) #Float
# #    id['properties']['geoid'] = int(id['properties']['geoid'])   #int
# #for id in data:                 #When Using .json file
# #    id['fields']['geoid'] = float(id['fields']['geoid'])

# #--WRITE JSON File--
# with open('New_us-county-boundaries.json', 'w') as file:
#   json.dump(data, file, indent=2)





# #                     What And Why:
# # Firstly, only ONE of the above read/modify variants (.geojson or .json) should be run at a time.
# # Each variant was created to read a JSON/GeoJSON file, modify it
# # (change the "geoid" attribute from a string to a float) and then write a new
# # JSON file. This is needed because it lets me match (via "key_on") my dataset key (FIPS) to
# # the .json county boundary dataset key (GEOID).
# #
# # IDEA FROM= 
# #        ---JSON Datasets FOR US County & State Boundaries---
# # Opendatasoft= https://public.opendatasoft.com/explore/dataset/us-county-boundaries/information/?disjunctive.statefp&disjunctive.countyfp&disjunctive.name&disjunctive.namelsad&disjunctive.stusab&disjunctive.state_name&sort=stusab&dataChart=eyJxdWVyaWVzIjpbeyJjb25maWciOnsiZGF0YXNldCI6InVzLWNvdW50eS1ib3VuZGFyaWVzIiwib3B0aW9ucyI6eyJkaXNqdW5jdGl2ZS5zdGF0ZWZwIjp0cnVlLCJkaXNqdW5jdGl2ZS5jb3VudHlmcCI6dHJ1ZSwiZGlzanVuY3RpdmUubmFtZSI6dHJ1ZSwiZGlzanVuY3RpdmUubmFtZWxzYWQiOnRydWUsImRpc2p1bmN0aXZlLnN0dXNhYiI6dHJ1ZSwiZGlzanVuY3RpdmUuc3RhdGVfbmFtZSI6dHJ1ZX19LCJjaGFydHMiOlt7ImFsaWduTW9udGgiOnRydWUsInR5cGUiOiJjb2x1bW4iLCJmdW5jIjoiQVZHIiwieUF4aXMiOiJhbGFuZCIsInNjaWVudGlmaWNEaXNwbGF5Ijp0cnVlLCJjb2xvciI6IiNGRjUxNUEifV0sInhBeGlzIjoic3RhdGVmcCIsIm1heHBvaW50cyI6NTAsInNvcnQiOiIifV0sInRpbWVzY2FsZSI6IiIsImRpc3BsYXlMZWdlbmQiOnRydWUsImFsaWduTW9udGgiOnRydWV9&location=4,43.96119,-79.36523&basemap=jawg.light
# # HIFLD=        https://hifld-geoplatform.opendata.arcgis.com/datasets/geoplatform::us-county-boundaries/about
# #        ---Modify JSON File---
# # https://www.geeksforgeeks.org/reading-and-writing-json-to-a-file-in-python/


#### PLOT Choropleth Maps

In [8]:
# #latitude and Longitude the map is to focus on when loaded
# ChoroplethMap = folium.Map(location = [40, -95], zoom_start = 4) 

# # Import data on county boundaries (CountyBoundaries_df)
# # Geo_Data = r'/Users/emmanueljumbo/dataScience/python/Dissertation/experimentation/data/us-county-boundaries.geojson'
# Geo_Data = r'/Users/emmanueljumbo/dataScience/python/Dissertation/experimentation/data/New_us-county-boundaries.json'

# # Variable from NON_TimeSeries_df to be plotted
# Attribute_data_For_Plotting = C19_View[Test_Map]

# # Plot Choropleth Map
# folium.Choropleth(
#     geo_data = Geo_Data,                  #Boundaries of a geographic area                   
#     name = "choropleth",
#     data = Attribute_data_For_Plotting,   #Stats to be displayed                          
#     columns = ["FIPS", "EP_UNEMP"],       #Matching Key and a single stat from my CSV File
#     fill_color = "YlGn",     
#     fill_opacity = 0.7,
#     line_opacity = .1,
#     key_on = "feature.properties.geoid",  #Matching Key from geo location JSON file
#     #key_on = "fields.geoid",
#     legend_name = "CHANGE NAME Rate (%)",
# ).add_to(ChoroplethMap)                                 
  
# ChoroplethMap.save('/Users/emmanueljumbo/dataScience/python/Dissertation/experimentation/maps/final_map.html') #Save newly created map as a .html file
# # ChoroplethMap  #This will render the map inside this Python notebook (P.S. it may crash the notebook)



# #                     What And Why:
# # Made to visualise data on a map. Black spots on the map represent counties I no longer have data on.
# # Rows for these missing counties were removed during the handling of null values and outliers,
# # e.g. counties in Puerto Rico (which had all null values) or Rio Arriba in New Mexico.
# #
# # Began by installing Folium => [conda install -c conda-forge folium]
# # then import Folium => [import folium]
# #
# # IDEA FROM= 
# #        ---ChoroplethMap---
# # ChoroplethMap=  https://www.geeksforgeeks.org/visualizing-geospatial-data-using-folium-in-python/?ref=gcse
# #        ---JSON Datasets FOR US County & State Boundaries---
# # Opendatasoft=  https://public.opendatasoft.com/explore/dataset/us-county-boundaries/information/?disjunctive.statefp&disjunctive.countyfp&disjunctive.name&disjunctive.namelsad&disjunctive.stusab&disjunctive.state_name&sort=stusab&dataChart=eyJxdWVyaWVzIjpbeyJjb25maWciOnsiZGF0YXNldCI6InVzLWNvdW50eS1ib3VuZGFyaWVzIiwib3B0aW9ucyI6eyJkaXNqdW5jdGl2ZS5zdGF0ZWZwIjp0cnVlLCJkaXNqdW5jdGl2ZS5jb3VudHlmcCI6dHJ1ZSwiZGlzanVuY3RpdmUubmFtZSI6dHJ1ZSwiZGlzanVuY3RpdmUubmFtZWxzYWQiOnRydWUsImRpc2p1bmN0aXZlLnN0dXNhYiI6dHJ1ZSwiZGlzanVuY3RpdmUuc3RhdGVfbmFtZSI6dHJ1ZX19LCJjaGFydHMiOlt7ImFsaWduTW9udGgiOnRydWUsInR5cGUiOiJjb2x1bW4iLCJmdW5jIjoiQVZHIiwieUF4aXMiOiJhbGFuZCIsInNjaWVudGlmaWNEaXNwbGF5Ijp0cnVlLCJjb2xvciI6IiNGRjUxNUEifV0sInhBeGlzIjoic3RhdGVmcCIsIm1heHBvaW50cyI6NTAsInNvcnQiOiIifV0sInRpbWVzY2FsZSI6IiIsImRpc3BsYXlMZWdlbmQiOnRydWUsImFsaWduTW9udGgiOnRydWV9&location=4,43.96119,-79.36523&basemap=jawg.light
# # HIFLD=         https://hifld-geoplatform.opendata.arcgis.com/datasets/geoplatform::us-county-boundaries/about

# MachineLearning Model

In [9]:
## Creating Independent(X) & Dependent(Y) Variables sets

# X: feature matrix (one row per county) built from the chosen attribute set;
#    swap No_Bed_Population for another attribute list to cluster on different variables.
# Y: each county's Metro_Status, taken from the same frame in the same row order.
X = C19_View.loc[:, No_Bed_Population].values
Y = C19_View.loc[:, 'Metro_Status'].values

In [10]:
## Instance Normalization/ Scaling

# Standardise every feature column so that attributes measured on large scales
# (e.g. EP_PCI) do not dominate the distance calculations used further down.
# NOTE: X is overwritten, so this cell expects the raw X produced by the previous cell.
scale = StandardScaler()
X = scale.fit(X).transform(X)

#### PCA (Principal Component Analysis)

In [11]:
# PCA = dimension reduction: the dataset is compressed and only the most relevant
# information is kept, in the form of "principal components"
# (from a large number of correlated variables to a smaller number of uncorrelated ones).
#
# Alternative kept for reference (not used):
# from sklearn.decomposition import IncrementalPCA
# pca = IncrementalPCA(n_components = 2)
# X = pca.fit_transform(X)
from sklearn.decomposition import PCA

# Number of clusters for K-Means (read again by the clustering cell).
# The same value is also used here as the number of principal components to keep.
k = 4

# NOTE: X is overwritten, so this cell expects the scaled X from the previous cell;
# re-running it on its own applies PCA to data that has already been reduced.
pca = PCA(n_components=k)
X = pca.fit_transform(X)

#### K-Means Clustering (Unsupervised Machine Learning)

In [12]:
from sklearn.cluster import KMeans #Training Cluster Model

# k (number of clusters) is defined in the PCA cell.
# random_state is fixed so that the cluster assignment is reproducible between runs.
kMeans = KMeans(n_clusters=k, random_state=42)

# fit_predict() trains the model AND returns the cluster index of every row, so the model
# is fitted exactly once here (previously it was fitted with .fit(X) and then fitted again
# by .fit_predict(X); with a fixed random_state the second fit only repeated the same work).
pred_cluster = kMeans.fit_predict(X)
pred_cluster

# This is an example of hard clustering, wherein each instance is assigned to a single cluster.

array([0, 0, 2, ..., 0, 1, 0], dtype=int32)

## Plot Kmeans Results

In [13]:
# Store Clusters in new column
C19_View['Clusters'] = kMeans.labels_

# C19_View.head()
# C19_View['Clusters'].max()

In [14]:
# Columns sent to the map: the matching key (FIPS) and the value to colour by (Clusters).
Test_Map = ['FIPS', 'Clusters']

# Latitude and longitude the map focuses on when loaded.
ChoroplethMap = folium.Map(location=[40, -95], zoom_start=4)

# County boundary file (the version whose "geoid" was rewritten as a float so it matches FIPS).
# The folder defaults to the original author's location; set the C19_DATA_DIR environment
# variable to run the notebook on another machine.
Geo_Data = os.path.join(
    os.environ.get(
        "C19_DATA_DIR",
        "/Users/emmanueljumbo/dataScience/python/Dissertation/experimentation/data",
    ),
    "New_us-county-boundaries.json",
)

Attribute_data_For_Plotting = C19_View[Test_Map]

folium.Choropleth(
    geo_data=Geo_Data,                    # Boundaries of a geographic area
    name="choropleth",
    data=Attribute_data_For_Plotting,     # Stats to be displayed
    columns=["FIPS", "Clusters"],         # Matching key and a single stat from my CSV file
    fill_color="YlGnBu",
    fill_opacity=0.7,
    line_opacity=.1,
    key_on="feature.properties.geoid",    # Matching key from geo location JSON file
    legend_name="Clusters",
).add_to(ChoroplethMap)


# ChoroplethMap.save('/Users/emmanueljumbo/dataScience/python/Dissertation/experimentation/maps/Name_Not_given.html') #This will create map file in map folder(Change names)


<folium.features.Choropleth at 0x7fd9039cfac0>

In [15]:
##                              OBSERVATION



# All_Variables:
# This cluster was created using all attributes except those which could be used by the ML model to identify location, such as FIPS codes.
# - Most coastal regions appear to all be of the same colour/cluster (dark blue).
# - Certain clusters (light blue) form the shape of states, e.g. Texas, Nevada, Washington. Is that because of the state-level bed data?
# - (Light yellow) clusters are concentrated at the bottom of the US mainland. Is that because of latitude (vitamin D)?
# - (Light green) clusters are concentrated in the upper and middle regions of the US mainland.


# No_Population_Density:
# This cluster was created using all attributes except 'Population_Density' and attributes which could be used by the ML model to identify location.
# - Most coastal regions appear to all be of the same colour/cluster (dark blue).
# - Certain clusters (light blue) form the shape of states, e.g. Texas, Nevada, Washington. Is that because of the state-level bed data?
# - (Light green) clusters are concentrated at the bottom of the US mainland. Is that because of latitude (vitamin D)?
# - (Light yellow) clusters are concentrated in the upper and middle regions of the US mainland.


# No_Bed_Attributes:
# This cluster was created using all attributes except state-level 'Bed' attributes and attributes which could be used by the ML model to identify location.
# - Most coastal regions appear to all be of the same colour/cluster (light yellow).
# - The shapes of states can no longer be seen, because the state-level data has been removed.
# - (Light green) clusters are concentrated at the bottom of the US mainland. Is that because of latitude (vitamin D)?
# - (Dark blue) clusters are scattered across the upper and middle regions of the US mainland, but still remain in clumps, side by side with each other.

# No_Bed_Population:
# This cluster was created using all attributes except 'Population_Density', state-level 'Bed' attributes and attributes which could be used by the ML model to identify location.
# - (Dark blue) clusters mostly appear in coastal regions.
# - The shapes of states can no longer be seen, because the state-level data has been removed.
# - (Light green) clusters are concentrated at the bottom of the US mainland. Is that because of latitude (vitamin D)?
# - (Light yellow) clusters are scattered across the upper and middle regions of the US mainland, but still remain in clumps, side by side with each other.


# List of Clusters Types:
# (dark blue)
# (light blue)
# (Light yellow) 
# (Light green) 



# SUMMARY
# Even after taking out variables which indicate location, state-level variables and the population density variable (a proxy for metro vs. non-metro),
# positive spatial autocorrelation (clusters gathered in clumps) remained present across all 4 maps,
# rather than turning into negative spatial autocorrelation (dissimilar values next to each other, like a chessboard).

# Probable reasons behind this pattern may be:
# - individual state policies,
# - the population's access to vaccines and health insurance,
# - the population's willingness to get vaccinated or follow the government's COVID guidelines.

# SEE NOTE BOOK (31st March) For “Future Work” Suggestion.





## CHECK Kmeans Results

#### Predicted labels vs. labels stored on the model

In [16]:
# kMeans.labels_ is the array where the model stores its predictions.
# Confirm it agrees with the array returned by fit_predict(). The comparison is done by
# value: an `is` check only tests whether both names point at the same array object, which
# depends on scikit-learn internals and says nothing about whether the labels are equal.
np.array_equal(pred_cluster, kMeans.labels_)


True

#### Inertia

In [19]:
# Inertia of the fitted model (the number displayed below this cell).
# scikit-learn documents inertia_ as the sum of squared distances from each sample to its
# closest cluster centre, so for a given k a lower value means tighter clusters.
kMeans.inertia_

# Inertia
# used to calculate the performance of the clusters created by K-means.

11564.697980192966

In [20]:
# Negative Inertia
# score(X) is evaluated on the same X the model was fitted on; the value displayed below
# this cell is kMeans.inertia_ with its sign flipped.
kMeans.score(X)

-11564.697980192966