In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Input files, resolved relative to the notebook's working directory.
TRAIN_CSV = '../data/cases_2021_train.csv'
TEST_CSV = '../data/cases_2021_test.csv'
LOCATION_CSV = '../data/location_2021.csv'

# Individual case records (train / test split) and the per-location table.
cases_train = pd.read_csv(TRAIN_CSV)
cases_test = pd.read_csv(TEST_CSV)
location = pd.read_csv(LOCATION_CSV)

In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of each
# loaded frame, in the order train, test, location.
for frame in (cases_train, cases_test, location):
    frame.info()

In [None]:
"""Basic check to see if there is some kind of numerical correlation in Locations"""
# Correlate only the numeric columns. `location` also holds text columns
# (e.g. Country_Region, Province_State), and since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently: it raises on them instead.
location_numeric = location.select_dtypes(include='number')
# NOTE(review): vmin=0 renders every negative correlation the same as zero;
# confirm that hiding negative correlations is intended.
ax = sns.heatmap(location_numeric.corr(),vmin=0,vmax=1,cmap='Blues')
plt.show()

In [None]:
"""Heatmap of confirmed cases across the world"""
# One point per location row, positioned by longitude/latitude and colored by
# the 'Confirmed' column; the result is written to figure1.png.
fig, ax = plt.subplots(figsize=(25, 10))
points = ax.scatter(location['Long_'], location['Lat'], c=location['Confirmed'], cmap='Reds')
cbar = fig.colorbar(points, ax=ax)
cbar.set_label('# of Confirmed cases based on location')
ax.set_title("Known confirmed cases based on location")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
fig.savefig('../plots/task-1.3/figure1.png')

In [None]:
"""Heatmap of Incident_Rates across the world"""
# Scatter every location by longitude/latitude, colored by 'Incident_Rate',
# and save the figure as figure2.png.
fig, ax = plt.subplots(figsize=(25, 10))
incident_points = ax.scatter(
    location['Long_'],
    location['Lat'],
    c=location['Incident_Rate'],
    cmap='gist_heat_r',
)
cbar = fig.colorbar(incident_points, ax=ax)
cbar.set_label('Incident_Rate based on location')
ax.set_title("Known Incident_Rate based on location")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
fig.savefig('../plots/task-1.3/figure2.png')

In [None]:
"""Heatmap of active cases across the world"""
plt.figure(figsize=(25,10))
plt.scatter(x=location['Long_'],y=location['Lat'],c=location['Active'],cmap='PuRd')
cbar = plt.colorbar()
cbar.set_label('Active cases #')
plt.xlabel("Longitude")
plt.title("Known Active # based on location")
plt.ylabel("Latitude")
plt.savefig('../plots/task-1.3/figure3.png')

In [None]:
"""Heatmap of recovered cases across the world"""
# Scatter every location by longitude/latitude, colored by the 'Recovered'
# column, and save the figure as figure4.png.
fig, ax = plt.subplots(figsize=(25, 10))
recovered_points = ax.scatter(
    location['Long_'],
    location['Lat'],
    c=location['Recovered'],
    cmap='viridis',
)
cbar = fig.colorbar(recovered_points, ax=ax)
cbar.set_label('Known Recovered cases #')
ax.set_title("Known Recovered cases based on location")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
fig.savefig('../plots/task-1.3/figure4.png')

In [None]:
"""Heatmap of case fatality ratio percentage across the world"""
# Scatter every location by longitude/latitude, colored by the
# 'Case_Fatality_Ratio' column, and save the figure as figure5.png.
fig, ax = plt.subplots(figsize=(25, 10))
fatality_points = ax.scatter(
    location['Long_'],
    location['Lat'],
    c=location['Case_Fatality_Ratio'],
    cmap='plasma',
)
cbar = fig.colorbar(fatality_points, ax=ax)
cbar.set_label('Case Fatality Ratio (%)')
ax.set_title("Known Case Fatality Ratio based on location")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
fig.savefig('../plots/task-1.3/figure5.png')

In [None]:
"""Creating a detailed map of mainland US since it is clustered. Therefore we will remove non mainland US territories"""
# States/territories dropped so that the remaining US points fit in one
# compact map.
# NOTE(review): American Samoa is not in this list; confirm whether it occurs
# in the data.
EXCLUDED_US_REGIONS = ['Northern Mariana Islands','Guam','Hawaii','Alaska','Puerto Rico','Virgin Islands']

is_us_row = location['Country_Region'] == 'US'
is_excluded_region = location['Province_State'].isin(EXCLUDED_US_REGIONS)
# Rows with a missing Province_State are not in the exclusion list, so they are kept.
location_US = location[is_us_row & ~is_excluded_region]

In [None]:
"""Detailed map of mainland US for Incident Rate"""
# Scatter the filtered US rows by longitude/latitude, colored by
# 'Incident_Rate'; larger fonts are used because the canvas is 30x15 inches.
fig, ax = plt.subplots(figsize=(30, 15))
us_points = ax.scatter(
    location_US['Long_'],
    location_US['Lat'],
    c=location_US['Incident_Rate'],
    cmap='gist_heat_r',
)
cbar = fig.colorbar(us_points, ax=ax)
cbar.set_label('Incident Rate', fontsize=30)
ax.set_title("Incident Rate in mainland US", fontsize=40)
ax.set_xlabel("Longitude", fontsize=30)
ax.set_ylabel("Latitude", fontsize=30)
fig.savefig('../plots/task-1.3/figure6.png')

In [None]:
"""Detailed map of mainland US for Case Fatality Ratio"""
# Scatter the filtered US rows by longitude/latitude, colored by
# 'Case_Fatality_Ratio'; larger fonts are used because the canvas is 30x15 inches.
fig, ax = plt.subplots(figsize=(30, 15))
us_points = ax.scatter(
    location_US['Long_'],
    location_US['Lat'],
    c=location_US['Case_Fatality_Ratio'],
    cmap='plasma',
)
cbar = fig.colorbar(us_points, ax=ax)
cbar.set_label('Case Fatality Ratio (%)', fontsize=30)
ax.set_title("Case Fatality Ratio in mainland US", fontsize=40)
ax.set_xlabel("Longitude", fontsize=30)
ax.set_ylabel("Latitude", fontsize=30)
fig.savefig('../plots/task-1.3/figure7.png')

In [None]:
"""Checking to see if there is any correlation on outcome through Chronic disease binary"""
# Split the training cases on the chronic-disease flag. Each side is selected
# by an explicit comparison, so a row whose flag is neither True nor False
# ends up in neither frame.
chronic_flag = cases_train['chronic_disease_binary']
chronic_d = cases_train[chronic_flag.eq(True)]
not_chronic_d = cases_train[chronic_flag.eq(False)]

In [None]:
# Number of training cases per outcome group, for cases with and without a
# chronic disease.
# The counts are derived from cases_train rather than from the current values
# of chronic_d / not_chronic_d: this cell rebinds those two names to the
# counts, so grouping the names themselves would operate on the counts (not
# the cases) whenever the cell is executed a second time.
chronic_flag = cases_train['chronic_disease_binary']
chronic_d = cases_train.loc[chronic_flag == True].groupby('outcome_group').size()
not_chronic_d = cases_train.loc[chronic_flag == False].groupby('outcome_group').size()

In [None]:
# Bar chart of outcome counts for cases with a chronic disease, saved as figure8.png.
# The chart is drawn on a freshly created axes: without an explicit `ax`,
# Series.plot reuses pyplot's current axes when a figure is already open, which
# would overlay these bars on an earlier plot outside the inline backend.
fig, ax = plt.subplots()
# rot=0 keeps the category labels horizontal.
chronic_d.plot.bar(title="Outcome of cases with a chronic disease from training data", rot=0, ax=ax)
fig.savefig('../plots/task-1.3/figure8.png')

In [None]:
# Bar chart of outcome counts for cases without a chronic disease, saved as figure9.png.
# The chart is drawn on a freshly created axes: without an explicit `ax`,
# Series.plot reuses pyplot's current axes when a figure is already open, which
# would overlay these bars on an earlier plot outside the inline backend.
fig, ax = plt.subplots()
# rot=0 keeps the category labels horizontal.
not_chronic_d.plot.bar(title="Outcome of cases without a chronic disease from training data", rot=0, ax=ax)
fig.savefig('../plots/task-1.3/figure9.png')

In [None]:
"""Categorize training cases by outcome group"""
# Map of the training cases, colored by outcome group, saved as figure10.png.
fig, ax = plt.subplots(figsize=(25, 10))
colors={'recovered':'green','deceased':'red','hospitalized':'yellow','nonhospitalized':'purple'}
# One scatter call per outcome group so that each color gets a legend entry;
# without a legend the colors cannot be interpreted. Rows whose outcome_group
# is not a key of `colors` are skipped here, whereas mapping them to NaN would
# hand matplotlib an invalid color. Groups are drawn in dict order, so later
# groups are painted on top of earlier ones where points overlap.
for outcome, color in colors.items():
    outcome_rows = cases_train[cases_train['outcome_group'] == outcome]
    ax.scatter(outcome_rows['longitude'], outcome_rows['latitude'], color=color, label=outcome)
ax.legend(fontsize=20)
ax.set_xlabel("Longitude",fontsize = 30)
ax.set_title("Cases train outcome group based on location",fontsize = 40)
ax.set_ylabel("Latitude",fontsize = 30)
fig.savefig('../plots/task-1.3/figure10.png')

In [None]:
"""Create a detailed version for India since it is cluttered"""
# Keep only the training cases whose 'country' value is exactly 'India'.
in_india = cases_train['country'].eq('India')
cases_train_india = cases_train[in_india]

In [None]:
# Map of the Indian training cases, colored by outcome group, saved as figure11.png.
fig, ax = plt.subplots(figsize=(20, 20))
colors={'recovered':'green','deceased':'red','hospitalized':'yellow','nonhospitalized':'purple'}
# One scatter call per outcome group so that each color gets a legend entry;
# without a legend the colors cannot be interpreted. Rows whose outcome_group
# is not a key of `colors` are skipped here, whereas mapping them to NaN would
# hand matplotlib an invalid color. Groups are drawn in dict order, so later
# groups are painted on top of earlier ones where points overlap.
for outcome, color in colors.items():
    outcome_rows = cases_train_india[cases_train_india['outcome_group'] == outcome]
    ax.scatter(outcome_rows['longitude'], outcome_rows['latitude'], color=color, label=outcome)
ax.legend(fontsize=20)
ax.set_xlabel("Longitude",fontsize = 30)
ax.set_title("Cases train outcome group in India",fontsize = 40)
ax.set_ylabel("Latitude",fontsize = 30)
fig.savefig('../plots/task-1.3/figure11.png')

In [None]:
"""Checking if there is a difference for confimed cases between train and test based on date"""
# Keep only rows whose date_confirmation string is exactly 10 characters long;
# rows with a missing date fail the comparison and are dropped as well.
# NOTE(review): presumably this keeps single dates and discards date ranges;
# confirm against the raw data.
has_single_date = cases_train['date_confirmation'].str.len() == 10
# assign() builds a new frame with a real datetime column. Writing the parsed
# values back through .loc on the filtered slice triggers
# SettingWithCopyWarning and can leave the column as object dtype.
# NOTE(review): no explicit format/dayfirst is passed to to_datetime; confirm
# the date layout used in the CSV.
cases_train_date = cases_train[has_single_date].assign(
    date_confirmation=lambda frame: pd.to_datetime(frame['date_confirmation'])
)

In [None]:
# Histogram (12 bins) of confirmation dates in the training data, saved as figure12.png.
# An explicit axes is passed because Series.hist otherwise falls back to
# pyplot's current figure, which would draw on top of an earlier plot if one
# is still open.
fig, ax = plt.subplots()
cases_train_date['date_confirmation'].hist(bins = 12, ax=ax)
ax.set_xlabel('Date')
ax.set_title('# of cases by date in training data')
fig.savefig('../plots/task-1.3/figure12.png')

In [None]:
# Keep only rows whose date_confirmation string is exactly 10 characters long;
# rows with a missing date fail the comparison and are dropped as well.
# NOTE(review): presumably this keeps single dates and discards date ranges;
# confirm against the raw data.
has_single_date = cases_test['date_confirmation'].str.len() == 10
# assign() builds a new frame with a real datetime column. Writing the parsed
# values back through .loc on the filtered slice triggers
# SettingWithCopyWarning and can leave the column as object dtype.
# NOTE(review): no explicit format/dayfirst is passed to to_datetime; confirm
# the date layout used in the CSV.
cases_test_date = cases_test[has_single_date].assign(
    date_confirmation=lambda frame: pd.to_datetime(frame['date_confirmation'])
)

In [None]:
# Histogram (12 bins) of confirmation dates in the test data, saved as figure13.png.
# An explicit axes is passed because Series.hist otherwise falls back to
# pyplot's current figure, which would draw on top of an earlier plot if one
# is still open.
fig, ax = plt.subplots()
cases_test_date['date_confirmation'].hist(bins = 12, ax=ax)
ax.set_xlabel('Date')
ax.set_title('# of cases by date in test data')
fig.savefig('../plots/task-1.3/figure13.png')

In [None]:
"""Checking if there are any differences in outcome depending on gender"""
# Outcome-group counts for male cases, female cases, and both combined.
# Rows whose 'sex' is anything else (including missing) are left out of all three.
sex_column = cases_train['sex']
cases_train_male = cases_train[sex_column == "male"].groupby("outcome_group").size()
cases_train_female = cases_train[sex_column == "female"].groupby("outcome_group").size()
# The combined counts only depend on which rows are selected, not on their order.
cases_train_mf = cases_train[sex_column.isin(["male", "female"])].groupby("outcome_group").size()

In [None]:
# Bar chart of outcome counts over all cases with a recorded male/female sex,
# saved as figure14.png. The title typo ("know gender cases") is corrected.
# The chart is drawn on a freshly created axes: without an explicit `ax`,
# Series.plot reuses pyplot's current axes when a figure is already open, which
# would overlay these bars on an earlier plot outside the inline backend.
fig, ax = plt.subplots()
# rot=0 keeps the category labels horizontal.
cases_train_mf.plot.bar(title="Total outcome group for cases with known gender", rot=0, ax=ax)
fig.savefig('../plots/task-1.3/figure14.png')

In [None]:
# Bar chart of outcome counts for male cases, saved as figure15.png.
# The chart is drawn on a freshly created axes: without an explicit `ax`,
# Series.plot reuses pyplot's current axes when a figure is already open, which
# would overlay these bars on an earlier plot outside the inline backend.
fig, ax = plt.subplots()
# rot=0 keeps the category labels horizontal.
cases_train_male.plot.bar(title="Outcome Group for male cases", rot=0, ax=ax)
fig.savefig('../plots/task-1.3/figure15.png')

In [None]:
# Bar chart of outcome counts for female cases, saved as figure16.png.
# The chart is drawn on a freshly created axes: without an explicit `ax`,
# Series.plot reuses pyplot's current axes when a figure is already open, which
# would overlay these bars on an earlier plot outside the inline backend.
fig, ax = plt.subplots()
# rot=0 keeps the category labels horizontal.
cases_train_female.plot.bar(title="Outcome Group for female cases", rot=0, ax=ax)
fig.savefig('../plots/task-1.3/figure16.png')

In [None]:
"""Printing out NaN sum"""
# Number of missing values in each column of the training cases.
missing_in_train = cases_train.isna().sum()
print(missing_in_train)

In [None]:
# Number of missing values in each column of the test cases.
missing_in_test = cases_test.isna().sum()
print(missing_in_test)

In [None]:
# Number of missing values in each column of the location table.
missing_in_location = location.isna().sum()
print(missing_in_location)