In [28]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline

#import plotly.express as px
#import plotly.offline as py
#import plotly.graph_objs as go

# ML
#from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Ignore warnings
import warnings  
warnings.filterwarnings('ignore')


In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [4]:
# Loading data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('submission.csv')

In [5]:
train.sample(10) # Labels = 'ConfirmedCases' and 'Fatalities'

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
11034,16285,,Paraguay,-23.4425,-58.4438,2020-01-31,0.0,0.0
5135,7566,,Congo (Brazzaville),-4.0383,21.7587,2020-02-23,0.0,0.0
8250,12151,,Iraq,33.0,44.0,2020-03-22,233.0,20.0
6372,9403,France,France,46.2276,2.2137,2020-01-31,5.0,0.0
14259,21040,Grand Princess,US,37.6489,-122.6655,2020-02-12,0.0,0.0
5247,7738,,Costa Rica,9.7489,-83.7534,2020-02-09,0.0,0.0
1247,1818,,Bahrain,26.0275,50.55,2020-03-12,195.0,0.0
829,1220,South Australia,Australia,-34.9285,138.6007,2020-02-01,1.0,0.0
14891,21972,Maine,US,44.6939,-69.3819,2020-02-14,0.0,0.0
12899,19020,,Switzerland,46.8182,8.2275,2020-03-09,374.0,2.0


In [6]:
print(train.shape) # All the columns
print(test.shape) # Two columns less (labels)
print(sub.shape) # Id and both labels 

(17892, 8)
(12212, 6)
(12212, 3)


In [7]:
# Checking missing values
train.isnull().sum()

Id                   0
Province/State    9702
Country/Region       0
Lat                  0
Long                 0
Date                 0
ConfirmedCases       0
Fatalities           0
dtype: int64

In [8]:
# Exploring data
grouped = train.groupby('Date')['Date', 'ConfirmedCases', 'Fatalities'].sum().reset_index()

fig = px.line(grouped, x="Date", y="ConfirmedCases", 
              title="Confirmed Cases x Time")
fig.show()


fig = px.line(grouped, x="Date", y="Fatalities", 
              title="Fatalities x Time")
fig.show()


NameError: name 'px' is not defined

In [9]:
# Remove columns
train.drop(columns = ['Province/State'], inplace=True)
test.drop(columns = ['Province/State'], inplace=True)

In [10]:
# Remove labels and ID
X_train = train.drop(["Fatalities", "ConfirmedCases"], axis=1)
Y_test = test.copy()

In [11]:
# Date column to datetime
X_train['Date'] = pd.to_datetime(X_train['Date']) 
Y_test['Date'] = pd.to_datetime(Y_test['Date']) 

In [12]:
# Set the index to the date
X_train = X_train.set_index(['Date'])
Y_test = Y_test.set_index(['Date'])

In [13]:
def create_time_features(df):

    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear
    
    X = df[['dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X

In [14]:
create_time_features(X_train)
X_train.drop(columns = ['date'], inplace=True)

X_train

Unnamed: 0_level_0,Id,Country/Region,Lat,Long,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-22,1,Afghanistan,33.0000,65.0000,2,1,1,2020,22,22,4
2020-01-23,2,Afghanistan,33.0000,65.0000,3,1,1,2020,23,23,4
2020-01-24,3,Afghanistan,33.0000,65.0000,4,1,1,2020,24,24,4
2020-01-25,4,Afghanistan,33.0000,65.0000,5,1,1,2020,25,25,4
2020-01-26,5,Afghanistan,33.0000,65.0000,6,1,1,2020,26,26,4
2020-01-27,6,Afghanistan,33.0000,65.0000,0,1,1,2020,27,27,5
2020-01-28,7,Afghanistan,33.0000,65.0000,1,1,1,2020,28,28,5
2020-01-29,8,Afghanistan,33.0000,65.0000,2,1,1,2020,29,29,5
2020-01-30,9,Afghanistan,33.0000,65.0000,3,1,1,2020,30,30,5
2020-01-31,10,Afghanistan,33.0000,65.0000,4,1,1,2020,31,31,5


In [15]:
create_time_features(Y_test)
Y_test.drop(columns = ['date'], inplace=True)

Y_test

Unnamed: 0_level_0,ForecastId,Country/Region,Lat,Long,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-03-12,1,Afghanistan,33.0000,65.0000,3,1,3,2020,72,12,11
2020-03-13,2,Afghanistan,33.0000,65.0000,4,1,3,2020,73,13,11
2020-03-14,3,Afghanistan,33.0000,65.0000,5,1,3,2020,74,14,11
2020-03-15,4,Afghanistan,33.0000,65.0000,6,1,3,2020,75,15,11
2020-03-16,5,Afghanistan,33.0000,65.0000,0,1,3,2020,76,16,12
2020-03-17,6,Afghanistan,33.0000,65.0000,1,1,3,2020,77,17,12
2020-03-18,7,Afghanistan,33.0000,65.0000,2,1,3,2020,78,18,12
2020-03-19,8,Afghanistan,33.0000,65.0000,3,1,3,2020,79,19,12
2020-03-20,9,Afghanistan,33.0000,65.0000,4,1,3,2020,80,20,12
2020-03-21,10,Afghanistan,33.0000,65.0000,5,1,3,2020,81,21,12


In [16]:
# Return index
X_train = X_train.set_index(['Id'])
Y_test = Y_test.set_index(['ForecastId'])

In [17]:
# Identifying data types 
display(X_train.dtypes)
display(X_train.dtypes.value_counts())

Country/Region     object
Lat               float64
Long              float64
dayofweek           int64
quarter             int64
month               int64
year                int64
dayofyear           int64
dayofmonth          int64
weekofyear          int64
dtype: object

int64      7
float64    2
object     1
dtype: int64

In [18]:
# One hot encode to Country/Region - Train
X_train = pd.concat([X_train,pd.get_dummies(X_train['Country/Region'], prefix='cr')],axis=1)
X_train.drop(['Country/Region'],axis=1, inplace=True)

# One hot encode to Country/Region - Test
Y_test = pd.concat([Y_test,pd.get_dummies(Y_test['Country/Region'], prefix='cr')],axis=1)
Y_test.drop(['Country/Region'],axis=1, inplace=True)

X_train

Unnamed: 0_level_0,Lat,Long,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,cr_Afghanistan,...,cr_Turkey,cr_US,cr_Ukraine,cr_United Arab Emirates,cr_United Kingdom,cr_Uruguay,cr_Uzbekistan,cr_Venezuela,cr_Vietnam,cr_Zambia
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,33.0000,65.0000,2,1,1,2020,22,22,4,1,...,0,0,0,0,0,0,0,0,0,0
2,33.0000,65.0000,3,1,1,2020,23,23,4,1,...,0,0,0,0,0,0,0,0,0,0
3,33.0000,65.0000,4,1,1,2020,24,24,4,1,...,0,0,0,0,0,0,0,0,0,0
4,33.0000,65.0000,5,1,1,2020,25,25,4,1,...,0,0,0,0,0,0,0,0,0,0
5,33.0000,65.0000,6,1,1,2020,26,26,4,1,...,0,0,0,0,0,0,0,0,0,0
6,33.0000,65.0000,0,1,1,2020,27,27,5,1,...,0,0,0,0,0,0,0,0,0,0
7,33.0000,65.0000,1,1,1,2020,28,28,5,1,...,0,0,0,0,0,0,0,0,0,0
8,33.0000,65.0000,2,1,1,2020,29,29,5,1,...,0,0,0,0,0,0,0,0,0,0
9,33.0000,65.0000,3,1,1,2020,30,30,5,1,...,0,0,0,0,0,0,0,0,0,0
10,33.0000,65.0000,4,1,1,2020,31,31,5,1,...,0,0,0,0,0,0,0,0,0,0


In [19]:
X_train.head()

Unnamed: 0_level_0,Lat,Long,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,cr_Afghanistan,...,cr_Turkey,cr_US,cr_Ukraine,cr_United Arab Emirates,cr_United Kingdom,cr_Uruguay,cr_Uzbekistan,cr_Venezuela,cr_Vietnam,cr_Zambia
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,33.0,65.0,2,1,1,2020,22,22,4,1,...,0,0,0,0,0,0,0,0,0,0
2,33.0,65.0,3,1,1,2020,23,23,4,1,...,0,0,0,0,0,0,0,0,0,0
3,33.0,65.0,4,1,1,2020,24,24,4,1,...,0,0,0,0,0,0,0,0,0,0
4,33.0,65.0,5,1,1,2020,25,25,4,1,...,0,0,0,0,0,0,0,0,0,0
5,33.0,65.0,6,1,1,2020,26,26,4,1,...,0,0,0,0,0,0,0,0,0,0


In [20]:
Y_test.head()

Unnamed: 0_level_0,Lat,Long,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,cr_Afghanistan,...,cr_Turkey,cr_US,cr_Ukraine,cr_United Arab Emirates,cr_United Kingdom,cr_Uruguay,cr_Uzbekistan,cr_Venezuela,cr_Vietnam,cr_Zambia
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,33.0,65.0,3,1,3,2020,72,12,11,1,...,0,0,0,0,0,0,0,0,0,0
2,33.0,65.0,4,1,3,2020,73,13,11,1,...,0,0,0,0,0,0,0,0,0,0
3,33.0,65.0,5,1,3,2020,74,14,11,1,...,0,0,0,0,0,0,0,0,0,0
4,33.0,65.0,6,1,3,2020,75,15,11,1,...,0,0,0,0,0,0,0,0,0,0
5,33.0,65.0,0,1,3,2020,76,16,12,1,...,0,0,0,0,0,0,0,0,0,0


In [24]:
#Asign columns for training and testing
y1_train = train[['ConfirmedCases']]
y2_train = train[['Fatalities']]


In [29]:
# Apply model to ConfirmedCases
random_forest = RandomForestClassifier(n_estimators=150, min_samples_leaf=3, max_features=0.5, n_jobs=-1)
random_forest.fit(X_train, y1_train)

random_forest.score(X_train, y1_train)
acc_random_forest = round(random_forest.score(X_train, y1_train) * 100, 2)
print('Accuracy model RandomForestClassifier:',acc_random_forest,"\n")

ConfirmedCases = random_forest.predict(Y_test)


MemoryError: could not allocate 33521664 bytes

In [None]:
# Apply model to Fatalities
random_forest = RandomForestClassifier(n_estimators=150, min_samples_leaf=3, max_features=0.5, n_jobs=-1)
random_forest.fit(X_train, y2_train)

random_forest.score(X_train, y2_train)
acc_random_forest = round(random_forest.score(X_train, y2_train) * 100, 2)
print('Accuracy model RandomForestClassifier:',acc_random_forest,"\n")

Fatalities = random_forest.predict(Y_test)

In [None]:
# Apply model to ConfirmedCases
decision_tree = DecisionTreeClassifier(criterion='entropy')
decision_tree.fit(X_train, y1_train)

decision_tree.score(X_train, y1_train)

acc_decision_tree = round(decision_tree.score(X_train, y1_train) * 100, 2)
print('Accuracy model DecisionTreeClassifier:',acc_decision_tree, "\n")

ConfirmedCases = random_forest.predict(Y_test)

In [None]:
# Apply model to Fatalities
decision_tree = DecisionTreeClassifier(criterion='entropy')
decision_tree.fit(X_train, y2_train)

decision_tree.score(X_train, y2_train)

acc_decision_tree = round(decision_tree.score(X_train, y2_train) * 100, 2)
print('Accuracy model DecisionTreeClassifier:',acc_decision_tree, "\n")

Fatalities = random_forest.predict(Y_test)

In [None]:
# Sumission
sub_df = pd.DataFrame()
sub_df['ForecastId'] = sub['ForecastId']
sub_df['ConfirmedCases'] = ConfirmedCases
sub_df['Fatalities'] = Fatalities

In [23]:
sub_df.shape,sub.shape

NameError: name 'sub_df' is not defined

In [None]:
sub_df.to_csv('submission.csv',index=False)