## Covid-19 Death Percentage Prediction 

In [1]:
# Standard library. `numpy.math` was only an alias of this module and was
# removed in NumPy 2.0, so it is imported directly.
import math

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [24]:
# Load the dataset; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer="covidData.csv")

In [25]:
# Display the loaded frame (pandas truncates the middle rows in the rendered output).
data

Unnamed: 0,newCasesByPublishDate,cumCasesByPublishDate,deathsToday,deathsTillDate,Recovered
0,24470,5880667,95,156048,200
1,26144,5856528,85,155953,223
2,29622,5830774,94,155868,342
3,31117,5801561,83,155774,140
4,27734,5770928,77,155691,221
...,...,...,...,...,...
118,2762,4362150,44,152537,167
119,2297,4359388,51,152493,145
120,3423,4357091,40,152442,123
121,3402,4353668,50,152402,100


In [26]:
# Rename the new-cases column; every other column keeps its name.
column_renames = {'newCasesByPublishDate': 'newCasesOnPublishDate'}
data = data.rename(columns=column_renames)

In [27]:
# Summarise column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   newCasesOnPublishDate  123 non-null    int64
 1   cumCasesByPublishDate  123 non-null    int64
 2   deathsToday            123 non-null    int64
 3   deathsTillDate         123 non-null    int64
 4   Recovered              123 non-null    int64
dtypes: int64(5)
memory usage: 4.9 KB


In [28]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

Unnamed: 0,newCasesOnPublishDate,cumCasesByPublishDate,deathsToday,deathsTillDate,Recovered
count,123.0,123.0,123.0,123.0,123.0
mean,12788.569106,4707550.0,30.552846,153893.390244,249.03252
std,14215.767197,434023.8,21.879028,818.762377,106.152761
min,1649.0,4350266.0,4.0,152352.0,100.0
25%,2490.5,4419366.0,14.5,153391.0,164.0
50%,4330.0,4490438.0,23.0,153789.0,239.0
75%,24359.0,4841816.0,42.5,154281.0,306.0
max,54674.0,5880667.0,95.0,156048.0,755.0


In [29]:
# Count missing values per column.
data.isna().sum()

newCasesOnPublishDate    0
cumCasesByPublishDate    0
deathsToday              0
deathsTillDate           0
Recovered                0
dtype: int64

## Splitting Data

In [30]:
# Separate the predictors from the target column.
# `drop` returns a new frame and plain column selection does not modify `data`,
# so `data` keeps all of its columns and this cell can be re-run without error.
target_column = 'deathsToday'
x = data.drop(columns=target_column)
y = data[target_column]
print(x.shape)
print(y.shape)

(123, 4)
(123,)


In [31]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=4,
)

## Linear Regression 

In [32]:
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X=train_x, y=train_y)

LinearRegression()

In [33]:
# Predict the target for the held-out rows and show the raw predictions.
prediction = model.predict(X=test_x)
prediction

array([65.76018843, 78.58218739, 35.13291979, 53.32123041, 46.83474281,
       15.88494874, 81.01045985, 35.64633262, 17.87263252, 45.06292275,
       10.9922925 , 15.00194685, 23.74127739, 46.40691441, 13.78563153,
       49.57112431, 72.38642502, 65.74688347, 20.2491987 , 34.35701112,
       23.20843109, 27.39087859, 19.49471258, 51.14274539, 16.27983404])

In [34]:
# Root-mean-squared error of the linear model on the held-out rows.
mse_linear = mean_squared_error(test_y, prediction)
err = math.sqrt(mse_linear)
err

6.419931701794294

In [35]:
# Score of the linear model on the held-out rows (for scikit-learn regressors, `score` is R^2).
model.score(X=test_x, y=test_y)

0.9299151083939202

In [36]:
# Side-by-side view of actual vs. predicted values and their difference.
df = pd.DataFrame({'actual': test_y, 'prediction': prediction})
df['diff'] = df['actual'] - df['prediction']
df

Unnamed: 0,actual,prediction,diff
13,83,65.760188,17.239812
2,94,78.582187,15.417813
25,35,35.13292,-0.13292
16,52,53.32123,-1.32123
19,48,46.834743,1.165257
41,12,15.884949,-3.884949
5,74,81.01046,-7.01046
24,40,35.646333,4.353667
83,8,17.872633,-9.872633
20,39,45.062923,-6.062923


In [37]:
# Predict the target for one hand-written observation with the linear model.
# Values are listed in the same order as the training columns.
test1 = np.array([4479, 4350266, 152352, 300]).reshape(1, -1)

# The model was fitted on a DataFrame, so give it a frame with the same column
# names; this avoids scikit-learn's "X does not have valid feature names"
# warning and makes the column order explicit.
test1_frame = pd.DataFrame(test1, columns=train_x.columns)
t1 = model.predict(test1_frame)
n1 = t1[0]

# Denominator used for the percentage below: first feature minus last feature.
# NOTE(review): this is new cases minus recovered — confirm that is the intended base.
d1 = test1[0][0] - test1[0][3]
print(round(n1))
print(d1)

57
4179


In [38]:
# Predicted value as a percentage of the denominator, rounded to two decimals.
pct1 = round((n1 / d1) * 100, 2)
print('Death percentage predicted for given data is', pct1)

Death percentage predicted for given data is 1.35


## DecisionTree Regressor 

In [39]:
# Fit a decision-tree regressor on the training split (seeded for reproducibility).
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X=train_x, y=train_y)

DecisionTreeRegressor(random_state=0)

In [40]:
# Held-out predictions and root-mean-squared error for the decision tree.
prediction2 = regressor.predict(X=test_x)
mse_tree = mean_squared_error(test_y, prediction2)
err = math.sqrt(mse_tree)
err

9.658157174119708

In [41]:
# Actual vs. decision-tree prediction, with the difference as a third column.
(
    pd.DataFrame({'actual': test_y, 'prediction': prediction2})
    .assign(diff=lambda frame: frame['actual'] - frame['prediction'])
)

Unnamed: 0,actual,prediction,diff
13,83,57.0,26.0
2,94,85.0,9.0
25,35,28.0,7.0
16,52,69.0,-17.0
19,48,35.0,13.0
41,12,16.0,-4.0
5,74,75.0,-1.0
24,40,47.0,-7.0
83,8,10.0,-2.0
20,39,35.0,4.0


In [42]:
# Score of the decision tree on the held-out rows (for scikit-learn regressors, `score` is R^2).
regressor.score(X=test_x, y=test_y)

0.8413821248809685

In [43]:
# Predict the target for one hand-written observation with the decision tree.
# Values are listed in the same order as the training columns.
test2 = np.array([4479, 4350266, 152352, 300]).reshape(1, -1)

# The tree was fitted on a DataFrame, so give it a frame with the same column
# names; this avoids scikit-learn's "X does not have valid feature names"
# warning and makes the column order explicit.
test2_frame = pd.DataFrame(test2, columns=train_x.columns)
t2 = regressor.predict(test2_frame)
n2 = t2[0]

# Denominator used for the percentage below: first feature minus last feature.
# NOTE(review): this is new cases minus recovered — confirm that is the intended base.
d2 = test2[0][0] - test2[0][3]
print(round(n2))
print(d2)


62
4179


In [44]:
# Predicted value as a percentage of the denominator, rounded to two decimals.
pct2 = round((n2 / d2) * 100, 2)
print('Death percentage predicted for given data is', pct2)

Death percentage predicted for given data is 1.48


## RandomForest Regressor 

In [45]:
# Fit a 100-tree random forest with depth capped at 10 (seeded for reproducibility).
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
regr.fit(X=train_x, y=train_y)

RandomForestRegressor(max_depth=10, random_state=0)

In [46]:
# Held-out predictions and root-mean-squared error for the random forest.
prediction3 = regr.predict(X=test_x)
mse_forest = mean_squared_error(test_y, prediction3)
err = math.sqrt(mse_forest)
err

6.330807520206957

In [47]:
# Actual vs. random-forest prediction, with the difference as a third column.
(
    pd.DataFrame({'actual': test_y, 'prediction': prediction3})
    .assign(diff=lambda frame: frame['actual'] - frame['prediction'])
)

Unnamed: 0,actual,prediction,diff
13,83,64.89,18.11
2,94,83.04,10.96
25,35,34.89,0.11
16,52,54.49,-2.49
19,48,41.95,6.05
41,12,17.374865,-5.374865
5,74,77.05,-3.05
24,40,43.03,-3.03
83,8,16.431813,-8.431813
20,39,40.98,-1.98


In [48]:
# Score of the random forest on the held-out rows (for scikit-learn regressors, `score` is R^2).
regr.score(X=test_x, y=test_y)

0.9318474971808105

In [49]:
# Predict the target for one hand-written observation with the random forest.
# Values are listed in the same order as the training columns.
test3 = np.array([4479, 4350266, 152352, 300]).reshape(1, -1)

# The forest was fitted on a DataFrame, so give it a frame with the same column
# names; this avoids scikit-learn's "X does not have valid feature names"
# warning and makes the column order explicit.
test3_frame = pd.DataFrame(test3, columns=train_x.columns)
t3 = regr.predict(test3_frame)
n3 = t3[0]

# Denominator used for the percentage below: first feature minus last feature.
# NOTE(review): this is new cases minus recovered — confirm that is the intended base.
d3 = test3[0][0] - test3[0][3]
print(round(n3))
print(d3)

55
4179


In [50]:
# Predicted value as a percentage of the denominator, rounded to two decimals.
pct3 = round((n3 / d3) * 100, 2)
print('Death percentage predicted for given data is', pct3)

Death percentage predicted for given data is 1.31
