In [3]:
import pandas as pd
import numpy as np
import requests
import json

In [4]:
datasource = requests.get("https://raw.githubusercontent.com/pomber/covid19/master/docs/timeseries.json")

In [5]:
data = datasource.json()

In [6]:
type(data)

dict

In [7]:
#taking only of Italy.However can be done for any country(have to check how accurate the model is based on MAE)
country_df = data["Italy"]

In [8]:
country_df = pd.DataFrame(country_df)

In [9]:
country_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 4 columns):
date         104 non-null object
confirmed    104 non-null int64
deaths       104 non-null int64
recovered    104 non-null int64
dtypes: int64(3), object(1)
memory usage: 3.4+ KB


In [10]:
#converting date object to datetime 
country_df["date"] = pd.to_datetime(country_df["date"])

In [12]:
country_df["date"][0]

Timestamp('2020-01-22 00:00:00')

In [13]:
country_df_tmp = country_df.copy()
country_df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 4 columns):
date         104 non-null datetime64[ns]
confirmed    104 non-null int64
deaths       104 non-null int64
recovered    104 non-null int64
dtypes: datetime64[ns](1), int64(3)
memory usage: 3.4 KB


In [14]:
#mybad again converting datetime back to object(string) 
country_df_tmp["date"] = country_df_tmp["date"].apply(lambda x: x.strftime('%Y-%m-%d'))
country_df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 4 columns):
date         104 non-null object
confirmed    104 non-null int64
deaths       104 non-null int64
recovered    104 non-null int64
dtypes: int64(3), object(1)
memory usage: 3.4+ KB


In [15]:
#splitting into label and target as shown below
X = country_df_tmp.drop("confirmed",axis = 1)
y = country_df_tmp["confirmed"]

#lets split into training and validation
train_split = round(0.8 * len(country_df_tmp)) # 80 % for training the data
X_train,y_train = X[:train_split],y[:train_split]
X_valid,y_valid = X[train_split:],y[train_split:]

len(X_train),len(X_valid)

(83, 21)

In [16]:
from datetime import datetime,timedelta,date
FMT = '%Y-%m-%d'
date = country_df_tmp['date']
#note: I have counted the data from january 1,2020 .for e.g : if today is march 05,2020 then the data column is obtained by
#subtracting today date with jan 1,2020.

country_df_tmp['data'] = date.map(lambda x : (datetime.strptime(x, FMT) - datetime.strptime("2020-01-01", FMT)).days)

In [17]:
country_df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
date         104 non-null object
confirmed    104 non-null int64
deaths       104 non-null int64
recovered    104 non-null int64
data         104 non-null int64
dtypes: int64(4), object(1)
memory usage: 4.2+ KB


In [18]:
import math
def linearregression(date,growthrate):
    x,y = date,growthrate
    lr = {} #dict for returning
    n = len(growthrate)
    sum_x = 0
    sum_y = 0
    sum_xy = 0
    sum_xx = 0
    sum_yy = 0
    
    for i in range(n):
        sum_x += x[i]
        sum_y += y[i]
        sum_xy += x[i]*y[i]
        sum_xx += x[i]*x[i]
        sum_yy += y[i]*y[i]
        
    lr['slope'] = (n*sum_xy - sum_x * sum_y)/(n*sum_xx - sum_x * sum_y)
    lr['intercept'] = (sum_y - lr['slope'] * sum_x)/n
    return lr;
    

In [19]:
country_df_tmp

Unnamed: 0,date,confirmed,deaths,recovered,data
0,2020-01-22,0,0,0,21
1,2020-01-23,0,0,0,22
2,2020-01-24,0,0,0,23
3,2020-01-25,0,0,0,24
4,2020-01-26,0,0,0,25
...,...,...,...,...,...
99,2020-04-30,205463,27967,75945,120
100,2020-05-01,207428,28236,78249,121
101,2020-05-02,209328,28710,79914,122
102,2020-05-03,210717,28884,81654,123


In [20]:
growthdata = []  #for observing the daily growth rate
totalcase = []  #for observing and predicting the total future cases
dailycase = [] #for observing and predicting the daily future new cases
data = np.array(country_df_tmp["confirmed"])
result = 0 

for i in range(len(data)):
    totalcase.append({
        'date' : country_df_tmp["data"][i],
        'value': country_df_tmp["confirmed"][i]
    })
    if data[i] >= 2:# and data[i] != data[i-1] != data[i-2]:
        if(data[i] != data[i-1] != data[i-2]):
            current = data[i] - data[i-1]
            prev = data[i-1] -data[i-2]
            result = current / prev
            growthdata.append({
                'date':country_df_tmp["data"][i],
                'value':result
            })
            dailycase.append({
                'date' : country_df_tmp["data"][i],
                'value':current
            })
        else:
            growthdata.append({
                'date':country_df_tmp["data"][i],
                'value':result
            })
        
    else:
        growthdata.append({
            'date':country_df_tmp["data"][i],
            'value': 0
        })

In [21]:
growthdata[102]['date']

123

In [22]:
startindex = train_split     #80% of originaldata
endindex = len(country_df_tmp)-1  #remaining data

x_values = []
y_values = []

for i in range(startindex,endindex+1):
    x_values.append(growthdata[i]['date'])
    y_values.append(growthdata[i]['value'])
    

In [23]:
print(y_values)

[0.9425943545829368, 0.8973755047106325, 1.4195725534308212, 0.9226096143687269, 0.9994274262811337, 0.8728158120882269, 0.7404003938299967, 1.2096631205673758, 1.234884573103701, 0.7851632047477745, 1.1417233560090703, 0.7802052300562727, 0.9859991514637251, 0.7482788296041308, 1.2024151811385855, 0.9976087996174079, 0.8974113135186961, 1.0496794871794872, 0.9669211195928753, 0.7310526315789474, 0.8790496760259179]


In [24]:
result = linearregression(x_values,y_values)

In [25]:
result    #we get our slope and intercept value


{'slope': -1.802414520920869e-05, 'intercept': 0.9737143398632042}

In [26]:
growthtrend = []
for i in range(startindex,endindex+1):
    actualgrowthrate = max(result['slope'] * growthdata[i]['date'] + result['intercept'], 0)
    
    growthtrend.append({
    "date" : country_df_tmp["data"][i],
    "value":actualgrowthrate
      })


In [27]:
#calculating the mean of acutal growthrate:
sumrate = 0
for i,data in enumerate(growthtrend):
    sumrate += growthtrend[i]["value"]

growthratemean = (sumrate/(i+1))
growthratemean,i+1

(0.9716595873093541, 21)

In [28]:
estimatedTotal = []  #for prediction of totalcases
estimatedDailyCases= []  #for prediction of daily new cases
currentDatedata = country_df_tmp['data'][endindex]

newCases = country_df_tmp['confirmed'][endindex] - country_df_tmp['confirmed'][endindex-1]
totalCases = country_df_tmp['confirmed'][endindex]
print(newCases,totalCases)


1221 211938


In [29]:
#validation step
#we have y_valid step which is 20% of original data
#first finding y_predicts for the same range
y_predicts = []  #empty list
currentcase = country_df_tmp["confirmed"][startindex] - country_df_tmp['confirmed'][startindex-1]
totalvalidCases = country_df_tmp['confirmed'][startindex]
for i in range(startindex,endindex+1):
    newCases = currentcase * actualgrowthrate
    totalvalidCases += newCases
    y_predicts.append(totalvalidCases)

In [30]:
#now calculating Mean_absolute_error
tmp_df = pd.DataFrame(y_valid)
tmp_df["ypredicts"] = y_predicts
tmp_df["difference"] = tmp_df["ypredicts"] - tmp_df["confirmed"]

mae = np.mean(tmp_df["difference"]) #which in is bit fine for such small data....
print(mae)

3685.5075366703954


In [31]:
#now for future data prediction
nextdate = datetime.strptime(country_df_tmp["date"][endindex],FMT)

for i in range(1,366):
    nextDatedata = currentDatedata + i
    nextdate += timedelta(days=1)
    
    estimatedGrowthrate = max(result['slope']*nextDatedata+result['intercept'],0)
    #print(estimatedGrowthrate)
    newCases = newCases * estimatedGrowthrate
    totalCases += newCases
    
    estimatedDailyCases.append({
        'datedata': nextDatedata,
        'date': nextdate,
        'estimate':newCases
    })
    
    estimatedTotal.append({
        'datedata':nextDatedata,
        'date': nextdate,
        'estimate':totalCases
    })
#estimatedGrowthrate

In [35]:
#lets see the future daily cases result for Italy 
estimateddailycase_df = pd.DataFrame(estimatedDailyCases)

In [37]:
estimateddailycase_df.head(10)

Unnamed: 0,datedata,date,estimate
0,125,2020-05-05,2804.838699
1,126,2020-05-06,2724.741755
2,127,2020-05-07,2646.883004
3,128,2020-05-08,2571.201338
4,129,2020-05-09,2497.637276
5,130,2020-05-10,2426.13292
6,131,2020-05-11,2356.631919
7,132,2020-05-12,2289.079425
8,133,2020-05-13,2223.422054
9,134,2020-05-14,2159.60785


In [38]:
#saving to csv file
estimateddailycase_df.to_csv("EstimatedDailyCase.csv",index = False)

In [39]:
estimatedtotalcase_df = pd.DataFrame(estimatedTotal)
estimatedtotalcase_df.head(10)

Unnamed: 0,datedata,date,estimate
0,125,2020-05-05,214742.838699
1,126,2020-05-06,217467.580454
2,127,2020-05-07,220114.463458
3,128,2020-05-08,222685.664796
4,129,2020-05-09,225183.302072
5,130,2020-05-10,227609.434992
6,131,2020-05-11,229966.066911
7,132,2020-05-12,232255.146337
8,133,2020-05-13,234478.568391
9,134,2020-05-14,236638.176241


In [40]:
estimatedtotalcase_df.to_csv("EstimatedTotalCase.csv",index = False)