In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import pandas as pd 
import random
import math
import time
import os
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
import operator 
plt.style.use('fivethirtyeight')
%matplotlib inline

In [26]:
confirmed_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
deaths_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
recoveries_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')


In [29]:
# Analyse data
latest_data.columns

Index(['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update',
       'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active',
       'Combined_Key', 'Incidence_Rate', 'Case-Fatality_Ratio'],
      dtype='object')

In [28]:
## Test same columns
print(np.all(confirmed_df.columns == deaths_df.columns))
print(np.all(deaths_df.columns == recoveries_df.columns))
print(np.all(deaths_df.columns == latest_data.columns))

True
True


ValueError: Lengths must match to compare

In [5]:
print(confirmed_df['Country/Region'].unique().size, 
deaths_df['Country/Region'].unique().size,
recoveries_df['Country/Region'].unique().size)

188 188 188


In [6]:
for i in range(recoveries_df['Country/Region'].unique().size):
    print(
        recoveries_df['Country/Region'].unique()[i] == deaths_df['Country/Region'].unique()[i],
        confirmed_df['Country/Region'].unique()[i], 
        deaths_df['Country/Region'].unique()[i],
        recoveries_df['Country/Region'].unique()[i])
# décallage dans les indices pour deaths_df
# nom non consistant (ex : republic)
# -> simplification, selections de quelque pays


True Afghanistan Afghanistan Afghanistan
True Albania Albania Albania
True Algeria Algeria Algeria
True Andorra Andorra Andorra
True Angola Angola Angola
True Antigua and Barbuda Antigua and Barbuda Antigua and Barbuda
True Argentina Argentina Argentina
True Armenia Armenia Armenia
True Australia Australia Australia
True Austria Austria Austria
True Azerbaijan Azerbaijan Azerbaijan
True Bahamas Bahamas Bahamas
True Bahrain Bahrain Bahrain
True Bangladesh Bangladesh Bangladesh
True Barbados Barbados Barbados
True Belarus Belarus Belarus
True Belgium Belgium Belgium
False Benin Benin Belize
False Bhutan Bhutan Benin
False Bolivia Bolivia Bhutan
False Bosnia and Herzegovina Bosnia and Herzegovina Bolivia
False Brazil Brazil Bosnia and Herzegovina
False Brunei Brunei Brazil
False Bulgaria Bulgaria Brunei
False Burkina Faso Burkina Faso Bulgaria
False Cabo Verde Cabo Verde Burkina Faso
False Cambodia Cambodia Cabo Verde
False Cameroon Cameroon Cambodia
False Canada Canada Cameroon
False Cen

In [7]:
cols = confirmed_df.keys()
print(cols)

Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '7/24/20', '7/25/20', '7/26/20', '7/27/20', '7/28/20', '7/29/20',
       '7/30/20', '7/31/20', '8/1/20', '8/2/20'],
      dtype='object', length=198)


In [8]:
confirmed = confirmed_df.loc[:, cols[4]:cols[-1]]
deaths = deaths_df.loc[:, cols[4]:cols[-1]]
recoveries = recoveries_df.loc[:, cols[4]:cols[-1]]


In [9]:
dates = confirmed.keys()
world_cases = []
total_deaths = [] 
mortality_rate = []
recovery_rate = [] 
total_recovered = [] 
total_active = [] 

china_cases = [] 
italy_cases = []
us_cases = [] 
spain_cases = [] 
france_cases = [] 
germany_cases = [] 
uk_cases = [] 
russia_cases = [] 
brazil_cases = []
india_cases = []
peru_cases = [] 

china_deaths = [] 
italy_deaths = []
us_deaths = [] 
spain_deaths = [] 
france_deaths = [] 
germany_deaths = [] 
uk_deaths = [] 
russia_deaths = []
brazil_deaths = [] 
india_deaths = []
peru_deaths = []

china_recoveries = [] 
italy_recoveries = []
us_recoveries = [] 
spain_recoveries = [] 
france_recoveries = [] 
germany_recoveries = [] 
uk_recoveries = [] 
russia_recoveries = [] 
brazil_recoveries = [] 
india_recoveries = [] 
peru_recoveries = [] 

for i in dates:
    confirmed_sum = confirmed[i].sum()
    death_sum = deaths[i].sum()
    recovered_sum = recoveries[i].sum()
    
    # confirmed, deaths, recovered, and active
    world_cases.append(confirmed_sum)
    total_deaths.append(death_sum)
    total_recovered.append(recovered_sum)
    total_active.append(confirmed_sum-death_sum-recovered_sum)
    
    # calculate rates
    mortality_rate.append(death_sum/confirmed_sum)
    recovery_rate.append(recovered_sum/confirmed_sum)

    # case studies 
    china_cases.append(confirmed_df[confirmed_df['Country/Region']=='China'][i].sum())
    italy_cases.append(confirmed_df[confirmed_df['Country/Region']=='Italy'][i].sum())
    us_cases.append(confirmed_df[confirmed_df['Country/Region']=='US'][i].sum())
    spain_cases.append(confirmed_df[confirmed_df['Country/Region']=='Spain'][i].sum())
    france_cases.append(confirmed_df[confirmed_df['Country/Region']=='France'][i].sum())
    germany_cases.append(confirmed_df[confirmed_df['Country/Region']=='Germany'][i].sum())
    uk_cases.append(confirmed_df[confirmed_df['Country/Region']=='United Kingdom'][i].sum())
    russia_cases.append(confirmed_df[confirmed_df['Country/Region']=='Russia'][i].sum())
    brazil_cases.append(confirmed_df[confirmed_df['Country/Region']=='Brazil'][i].sum())
    india_cases.append(confirmed_df[confirmed_df['Country/Region']=='India'][i].sum())
    peru_cases.append(confirmed_df[confirmed_df['Country/Region']=='Peru'][i].sum())
    
    # moving average for case studies 
    china_deaths.append(deaths_df[deaths_df['Country/Region']=='China'][i].sum())
    italy_deaths.append(deaths_df[deaths_df['Country/Region']=='Italy'][i].sum())
    us_deaths.append(deaths_df[deaths_df['Country/Region']=='US'][i].sum())
    spain_deaths.append(deaths_df[deaths_df['Country/Region']=='Spain'][i].sum())
    france_deaths.append(deaths_df[deaths_df['Country/Region']=='France'][i].sum())
    germany_deaths.append(deaths_df[deaths_df['Country/Region']=='Germany'][i].sum())
    uk_deaths.append(deaths_df[deaths_df['Country/Region']=='United Kingdom'][i].sum())
    russia_deaths.append(deaths_df[deaths_df['Country/Region']=='Russia'][i].sum())
    brazil_deaths.append(deaths_df[deaths_df['Country/Region']=='Brazil'][i].sum())
    india_deaths.append(deaths_df[deaths_df['Country/Region']=='India'][i].sum())
    peru_deaths.append(deaths_df[deaths_df['Country/Region']=='Peru'][i].sum())
    
    china_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='China'][i].sum())
    italy_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='Italy'][i].sum())
    us_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='US'][i].sum())
    spain_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='Spain'][i].sum())
    france_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='France'][i].sum())
    germany_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='Germany'][i].sum())
    uk_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='United Kingdom'][i].sum())
    russia_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='Russia'][i].sum())
    brazil_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='Brazil'][i].sum())
    india_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='India'][i].sum())
    peru_recoveries.append(recoveries_df[recoveries_df['Country/Region']=='Peru'][i].sum())

In [10]:
recoveries = [["china"]+china_recoveries, 
["italy"]+italy_recoveries,
["us"]+us_recoveries, 
["spain"]+spain_recoveries, 
["france"]+france_recoveries, 
["germany"]+germany_recoveries, 
["uk"]+uk_recoveries, 
["russia"]+russia_recoveries, 
["brazil"]+brazil_recoveries, 
["india"]+india_recoveries, 
["peru"]+peru_recoveries]


In [11]:
deaths = [["china"]+china_deaths, 
["italy"]+italy_deaths,
["us"]+us_deaths, 
["spain"]+spain_deaths, 
["france"]+france_deaths, 
["germany"]+germany_deaths, 
["uk"]+uk_deaths, 
["russia"]+russia_deaths, 
["brazil"]+brazil_deaths, 
["india"]+india_deaths, 
["peru"]+peru_deaths]


In [12]:
cases = [["china"]+china_cases, 
["italy"]+italy_cases,
["us"]+us_cases, 
["spain"]+spain_cases, 
["france"]+france_cases, 
["germany"]+germany_cases, 
["uk"]+uk_cases, 
["russia"]+russia_cases, 
["brazil"]+brazil_cases, 
["india"]+india_cases, 
["peru"]+peru_cases]


In [13]:
columns = np.append(["country"],dates)
recoveries_df = pd.DataFrame(data=recoveries, columns=columns)
deaths_df = pd.DataFrame(data=deaths, columns=columns)
cases_df = pd.DataFrame(data=cases, columns=columns)

In [14]:
index=["cases", "total_deaths", "total_recovery", "total_active", "mortality_rate", "recovery_rate"]
data = [
    world_cases,
    total_deaths,
    total_recovered,
    total_active,
    mortality_rate,
    recovery_rate
]
world_summary = pd.DataFrame(data=data, index=index, columns=dates)

In [15]:
recoveries_df.to_csv(os.path.join("data",'recoveries_data.csv'))
deaths_df.to_csv(os.path.join("data",'deaths_data.csv'))
cases_df.to_csv(os.path.join("data",'cases_data.csv'))
world_summary.to_csv(os.path.join("data",'world_data.csv'))

In [16]:
confirmed_cases = cases_df.loc[cases_df["country"] == "china"]["7/27/20"][0]
print(confirmed_cases)
#cases_df.loc[cases_df["country"] == str(country)][str(date)][0]

86783


In [40]:
world_summary["2/1/20"].to_list()

[12038.0, 259.0, 284.0, 11495.0, 0.021515201860774213, 0.023591958797142383]

In [45]:
cases_df['toto'] = 0


In [59]:
n = cases_df.append({'country' : 'Sahil', 'Age' : 1}, ignore_index=True).fillna(0)

In [60]:
n

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,7/26/20,7/27/20,7/28/20,7/29/20,7/30/20,7/31/20,8/1/20,8/2/20,toto,Age
0,china,548.0,643.0,920.0,1406.0,2075.0,2877.0,5509.0,6087.0,8141.0,...,86570.0,86783.0,86990.0,87213.0,87489.0,87655.0,87827.0,87985.0,0.0,0.0
1,italy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,246118.0,246286.0,246488.0,246776.0,247158.0,247537.0,247832.0,248070.0,0.0,0.0
2,us,1.0,1.0,2.0,2.0,5.0,5.0,5.0,5.0,5.0,...,4233923.0,4290337.0,4356206.0,4426982.0,4495015.0,4562038.0,4620444.0,4667955.0,0.0,0.0
3,spain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,272421.0,278782.0,280610.0,282641.0,285430.0,288522.0,288522.0,288522.0,0.0,0.0
4,france,0.0,0.0,2.0,3.0,3.0,3.0,4.0,5.0,5.0,...,217801.0,220352.0,221077.0,221077.0,222469.0,225197.0,225198.0,225198.0,0.0,0.0
5,germany,0.0,0.0,0.0,0.0,0.0,1.0,4.0,4.0,4.0,...,206667.0,207112.0,207707.0,208546.0,209535.0,210399.0,211005.0,211220.0,0.0,0.0
6,uk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,301020.0,301708.0,302261.0,303063.0,303910.0,304793.0,305562.0,306309.0,0.0,0.0
7,russia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,811073.0,816680.0,822060.0,827509.0,832993.0,838461.0,843890.0,849277.0,0.0,0.0
8,brazil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2419091.0,2442375.0,2483191.0,2552265.0,2610102.0,2662485.0,2707877.0,2733677.0,0.0,0.0
9,india,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1435616.0,1480073.0,1531669.0,1581963.0,1634746.0,1695988.0,1750723.0,1803695.0,0.0,0.0


In [66]:
#cases_df.loc[cases_df["country"]=="italy"].index[0]
cases_df.loc[0,"1/22/20"] = 1
#["1/22/20"] = 0

In [68]:
cases_df.columns.to_list()[0]

'country'

In [67]:
cases_df

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,7/25/20,7/26/20,7/27/20,7/28/20,7/29/20,7/30/20,7/31/20,8/1/20,8/2/20,toto
0,china,1,643,920,1406,2075,2877,5509,6087,8141,...,86381,86570,86783,86990,87213,87489,87655,87827,87985,0
1,italy,0,0,0,0,0,0,0,0,0,...,245864,246118,246286,246488,246776,247158,247537,247832,248070,0
2,us,1,1,2,2,5,5,5,5,5,...,4178970,4233923,4290337,4356206,4426982,4495015,4562038,4620444,4667955,0
3,spain,0,0,0,0,0,0,0,0,0,...,272421,272421,278782,280610,282641,285430,288522,288522,288522,0
4,france,0,0,2,3,3,3,4,5,5,...,217801,217801,220352,221077,221077,222469,225197,225198,225198,0
5,germany,0,0,0,0,0,1,4,4,4,...,206278,206667,207112,207707,208546,209535,210399,211005,211220,0
6,uk,0,0,0,0,0,0,0,0,0,...,300270,301020,301708,302261,303063,303910,304793,305562,306309,0
7,russia,0,0,0,0,0,0,0,0,0,...,805332,811073,816680,822060,827509,832993,838461,843890,849277,0
8,brazil,0,0,0,0,0,0,0,0,0,...,2394513,2419091,2442375,2483191,2552265,2610102,2662485,2707877,2733677,0
9,india,0,0,0,0,0,0,0,0,1,...,1385635,1435616,1480073,1531669,1581963,1634746,1695988,1750723,1803695,0
