# import packages

In [None]:
# for dataset manipulation
import math
import pandas as pd
import numpy as np
import datetime
import json
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Show every row when a DataFrame is displayed (no row truncation).
pd.set_option('display.max_rows', None)
# Show up to 100 characters of each cell value (the default is 50).
# The fully qualified option name is used, like the two options above:
# abbreviated names are resolved by pattern matching and can become
# ambiguous if pandas later adds an option with a similar name.
pd.set_option('display.max_colwidth', 100)


# for model
import joblib
from sklearn.cluster import KMeans
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression


from sklearn import linear_model

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score,cross_val_predict, ShuffleSplit
from sklearn.metrics import r2_score
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

from sklearn.model_selection import StratifiedKFold # cross-validation

# for model visualization
from IPython.display import Image
import pydotplus
import matplotlib.pyplot as plt
import seaborn as sns

# import data

In [None]:
# Daily report file 07-25-2020.csv from the CSSEGISandData/COVID-19
# GitHub repository, read directly over HTTPS.
case_data_path = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-25-2020.csv'
df_case = pd.read_csv(case_data_path)

# Keep only the rows of the selected state, discard every row that has a
# missing value in any column, and renumber the remaining rows from 0.
state = 'Florida'
case_df = (
    df_case[df_case['Province_State'] == state]
    .dropna()
    .reset_index(drop=True)
)

# Give the columns the names used by the rest of the notebook.
case_df = case_df.rename(
    columns={
        'Lat': 'Latitude',
        'Long_': 'Longitude',
        'Admin2': 'County',
        'Case-Fatality_Ratio': 'Lethality',
    }
)

# Per-county case figures that are merged with the other tables later on.
case_data = case_df.loc[:, ['County', 'Confirmed', 'Deaths', 'Lethality']]


In [None]:
# Previously prepared model table, read from a local CSV file.
transformed_data_path = '/Users/chenhaoyi/JupyterNotebook/SummerProject/data/model_transformed_data.csv'
df_transformed = pd.read_csv(transformed_data_path)

# Columns carried forward from the transformed table; 'County' is the
# key used for the merges further down.
retrieve_list = [
    'County',
    'population_density',
    'FPLi',
    'logG',
    'Logperin',
    'population',
    'pergdp',
    'Clinician_Count_DO',
    'Nurse_practitioner_Count',
    'Bed Census',
    'Total Staffed Bed Capacity',
    'Percent Remaining Vent Capacity',
    'Ventilator Capacity',
]

# Raises KeyError if any of the listed columns is missing from the file.
df_transformed_remain_use = df_transformed[retrieve_list]


In [None]:
# Per-county Florida COVID-19 case table, read from a local CSV file.
floridahealth_data_path = ('/Users/chenhaoyi/JupyterNotebook/SummerProject/data/Florida_COVID19_Cases_by_County.csv')
df_floridahealth = pd.read_csv(floridahealth_data_path)

# Columns kept from the file: county name, case counts by age band, race
# and Hispanic flag, test totals and deaths.
retrieve_list = ['County_1','C_Age_0_4', 'C_Age_5_14', 'C_Age_15_24', 'C_Age_25_34', 'C_Age_35_44', 
                 'C_Age_45_54' , 'C_Age_55_64', 'C_Age_65_74' , 'C_Age_75_84', 'C_Age_85plus', 'C_RaceWhite', 'C_RaceBlack',
                 'C_HispanicYES', 'C_HispanicNO', 'T_total' , 'T_negative', 'T_positive', 'Deaths']

# Rename the key column to 'County', order rows alphabetically by county
# and renumber them from 0 so that the row labels equal the row positions.
fhd_df_68 = df_floridahealth.loc[:,retrieve_list].rename(columns = {'County_1':'County'}).sort_values(by = 'County').reset_index(drop = True)

# Remove the row at position 64 of the sorted table.
# NOTE(review): presumably a non-county entry (the names suggest 68 rows
# -> 67 counties) -- confirm against the source file.
fhd_df_67 = fhd_df_68.drop([64], axis = 0).reset_index(drop = True)

# Overwrite the county names at positions 12 and 13 of the sorted table.
# NOTE(review): presumably these rows use a different spelling than the
# other data sets, which would break the merge on 'County' -- confirm.
# A single .loc[row, column] assignment is used instead of the chained
# `fhd_df_67.County.iloc[i] = ...`: the chained form writes through an
# intermediate Series, triggers SettingWithCopyWarning and has no effect on
# the DataFrame when pandas Copy-on-Write is enabled. The index was just
# reset, so label 12/13 is the same row as position 12/13.
fhd_df_67.loc[12, 'County'] = 'Miami-Dade'
fhd_df_67.loc[13, 'County'] = 'DeSoto'
fhd_df_67 = fhd_df_67.sort_values(by = 'County')

# merge data

In [None]:
# Join the three per-county tables on 'County'. Inner joins keep only the
# counties that are present in every table.
combined_1_df = df_transformed_remain_use.merge(fhd_df_67, how='inner', on='County')
combined_2_df = combined_1_df.merge(case_data, how='inner', on='County')

#combined_2_df.to_csv('/Users/chenhaoyi/JupyterNotebook/SummerProject/data/combined_all_df.csv', index = False)

# Both sides of the second merge carry a 'Deaths' column, so pandas
# suffixes them: '_x' is the value from combined_1_df, '_y' the value from
# case_data. Keep the '_x' one and drop the case_data figures.
combined_refine = (
    combined_2_df
    .drop(columns=['Confirmed', 'Deaths_y', 'Lethality'])
    .rename(columns={'Deaths_x': 'Deaths'})
)

# Cases aged 65 and over = sum of the three oldest age bands.
combined_refine['C_65_plus'] = (
    combined_refine['C_Age_65_74']
    + combined_refine['C_Age_75_84']
    + combined_refine['C_Age_85plus']
)

# Each derived column is its source count expressed per 100 'T_positive'.
# The mapping order fixes the order in which the columns are appended.
combined_refine = combined_refine.assign(**{
    derived: combined_refine[source].div(combined_refine['T_positive']).mul(100)
    for derived, source in {
        'C_65_plus_rate': 'C_65_plus',
        'C_RBlack_rate': 'C_RaceBlack',
        'C_RWhite_rate': 'C_RaceWhite',
        'C_HispanicYes_rate': 'C_HispanicYES',
        'C_HispanicNo_rate': 'C_HispanicNO',
        'Lethality': 'Deaths',
    }.items()
})

# Display the final table as the cell output.
combined_refine
#combined_refine.to_csv('/Users/chenhaoyi/JupyterNotebook/SummerProject/data/new_model_data.csv', index = False)


# dataset checking

In [None]:
# Locally saved daily report file 07-23-2020.csv.
case_data_path = '/Users/chenhaoyi/JupyterNotebook/SummerProject/data/07-23-2020.csv'
df_case = pd.read_csv(case_data_path)

# Keep only the rows of the selected state, discard every row that has a
# missing value in any column, and renumber the remaining rows from 0.
state = 'Florida'
case_df = (
    df_case[df_case['Province_State'] == state]
    .dropna()
    .reset_index(drop=True)
)
case_df = case_df.rename(
    columns={'Lat': 'Latitude', 'Long_': 'Longitude', 'Admin2': 'County'}
)

# Deaths per confirmed case, as a fraction (not multiplied by 100).
lethality_df = case_df['Deaths'] / case_df['Confirmed']

# Previously prepared model table, read from a local CSV file.
transformed_data_path = '/Users/chenhaoyi/JupyterNotebook/SummerProject/data/model_transformed_data.csv'
df_transformed = pd.read_csv(transformed_data_path)

# Append the freshly computed values as a new last column, 'Lethality'.
# The two objects are aligned on their row index (0, 1, 2, ...), not on the
# county name.
# NOTE(review): this assumes both tables list the counties in the same
# order -- confirm.
df_get_transformed = pd.concat(
    [df_transformed, lethality_df.rename('Lethality')],
    axis=1,
)

# Remove the lower-case 'lethality' column that came with the transformed
# file, leaving only the newly added 'Lethality'.
df_get_transformed = df_get_transformed.drop(columns=['lethality'])

# Display the result as the cell output.
df_get_transformed