In [1]:
# Importing relevant libraries
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("submission.csv")


In [19]:
# Check train dataset
train.head()

Unnamed: 0,Id,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
0,1,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0
1,2,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0
2,3,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0
3,4,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0
4,5,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0


In [54]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17892 entries, 0 to 17891
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              17892 non-null  int64  
 1   Country/Region  17892 non-null  object 
 2   Lat             17892 non-null  float64
 3   Long            17892 non-null  float64
 4   Date            17892 non-null  int32  
 5   ConfirmedCases  17892 non-null  float64
 6   Fatalities      17892 non-null  float64
dtypes: float64(4), int32(1), int64(1), object(1)
memory usage: 908.7+ KB


In [5]:
train.shape

(17892, 8)

In [6]:
test.shape

(12212, 6)

In [7]:
# Description of the dataset
train.describe()

Unnamed: 0,Id,Lat,Long,ConfirmedCases,Fatalities
count,17892.0,17892.0,17892.0,17892.0,17892.0
mean,13191.5,26.287693,4.766191,325.207523,11.974737
std,7624.675152,22.935092,79.923261,3538.599684,174.346267
min,1.0,-41.4545,-157.4983,0.0,0.0
25%,6596.25,13.145425,-71.516375,0.0,0.0
50%,13191.5,32.98555,9.775,0.0,0.0
75%,19786.75,42.501575,64.688975,10.0,0.0
max,26382.0,71.7069,174.886,69176.0,6820.0


In [8]:
# Basic information
print(f"Total Reported Cases: {len(train)}")
print(f"Total Confirmed Cases: {train['ConfirmedCases'].sum()}")
print(f"Total Fatalities Cases: {train['Fatalities'].sum()}")
print(f"Total Countries Cases: {len(train['Country/Region'].unique())}")

Total Reported Cases: 17892
Total Confirmed Cases: 5818613.0
Total Fatalities Cases: 214252.0
Total Countries Cases: 163


In [9]:
########################
## DATA CLEANING
########################

In [15]:
# Null values in Province/State Column
train.isnull().sum()

Id                0
Country/Region    0
Lat               0
Long              0
Date              0
ConfirmedCases    0
Fatalities        0
dtype: int64

In [11]:
# Remove NA's
train.drop(labels = 'Province/State', axis = 1, inplace = True)

In [51]:
# ML Method cant process date as an object. Need to convert to int or float
train['Date'] = train['Date'].str.replace("-", "").astype(int)

In [55]:
# Doing same to test dataset
test['Date'] = test['Date'].str.replace("-", "").astype(int)

In [17]:
test.isnull().sum()

ForecastId           0
Province/State    6622
Country/Region       0
Lat                  0
Long                 0
Date                 0
dtype: int64

In [18]:
test.drop(labels = 'Province/State', axis = 1, inplace = True)

In [61]:
# Prepare dataset for training
# 2 Y-Variables i.e. ConfirmedCases and Fatalities
# 3 X-Variables
y1 = train['ConfirmedCases']
y2 = train['Fatalities']
x = train[['Lat', 'Long', 'Date']]
test_x = test[['Lat', 'Long', 'Date']]

In [62]:
from sklearn.ensemble import RandomForestClassifier
Rf_model = RandomForestClassifier(max_depth=500, random_state=0)

In [64]:
Rf_model.fit(x, y1)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=500, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [65]:
pred1 = Rf_model.predict(test_x)

In [69]:
pred1 = pd.DataFrame(pred1)

In [72]:
pred1.columns = ['ConfirmedCases']

In [76]:
pred1.head()

Unnamed: 0,ConfirmedCases
0,7.0
1,7.0
2,11.0
3,21.0
4,21.0


In [83]:
Rf_model.fit(x, y2)
pred2 = Rf_model.predict(test_x)
pred2 = pd.DataFrame(pred2)
pred2.columns = ['Fatalities']

In [84]:
pred2.head()

Unnamed: 0,Fatalities
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [100]:
sub_new = submission[['ForecastId']]

In [103]:
submit = pd.concat([sub_new, pred1, pred2], axis = 1)

In [106]:
submit.columns = ['ForecastId', 'ConfirmedCases', 'Fatalities']

submit.head(

In [107]:
submit.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,7.0,0.0
1,2,7.0,0.0
2,3,11.0,0.0
3,4,21.0,0.0
4,5,21.0,0.0


In [108]:
submit.describe()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
count,12212.0,12212.0,12212.0
mean,6106.5,1208.889125,53.222486
std,3525.445078,6234.287452,417.608734
min,1.0,0.0,0.0
25%,3053.75,6.0,0.0
50%,6106.5,81.0,0.0
75%,9159.25,367.0,3.0
max,12212.0,67800.0,6077.0
