In [None]:
import numpy as np #linear algebra
import pandas as pd #data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline

#import plotly.express as px
#import plotly.offline as py
#import plotly.graph_objs as go

# ML
#from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Ignore warnings
# NOTE(review): the 'ignore' filter below suppresses every warning category
# for the rest of the session, including deprecation notices raised by
# pandas and scikit-learn in later cells.
import warnings  
warnings.filterwarnings('ignore')


In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the three competition files (training rows, rows to predict, and the
# submission template) from the working directory, in that order.
train, test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'submission.csv')
)

In [None]:
# Peek at 10 randomly chosen training rows. The two label columns are
# 'ConfirmedCases' and 'Fatalities'.
train.sample(10)

In [None]:
# (rows, columns) of each frame, one per line:
#   train - all the columns
#   test  - two columns less (the labels)
#   sub   - id and both labels
print(train.shape, test.shape, sub.shape, sep='\n')

In [None]:
# Count the missing entries in every training column.
train.isna().sum()

In [None]:
# Exploring data
# Total confirmed cases and fatalities per date, summed over every training
# row. Only the two label columns are selected (as a list): the grouping key
# 'Date' must not be aggregated, and reset_index() turns it back into a
# regular column.
grouped = (
    train.groupby('Date')[['ConfirmedCases', 'Fatalities']]
    .sum()
    .reset_index()
)

# Parse the dates for plotting only, so the x axis is a real time axis;
# `grouped` itself is left unchanged.
plot_dates = pd.to_datetime(grouped['Date'])

# plotly.express (`px`) is not imported in this notebook, so the charts are
# drawn with matplotlib, which is imported as `plt`.
for column, title in (("ConfirmedCases", "Confirmed Cases x Time"),
                      ("Fatalities", "Fatalities x Time")):
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(plot_dates, grouped[column])
    ax.set(title=title, xlabel="Date", ylabel=column)
    fig.autofmt_xdate()  # slant the date tick labels so they do not overlap
    plt.show()


In [None]:
# Drop the 'Province/State' column from both frames, modifying them in place.
for frame in (train, test):
    frame.drop(columns=['Province/State'], inplace=True)

In [None]:
# Feature frames used from here on.
# Y_test is an independent copy of the test rows, so later edits do not
# touch `test`. X_train is the training data minus the two label columns;
# the 'Id' column is still present at this point.
Y_test = test.copy()
X_train = train.drop(columns=["Fatalities", "ConfirmedCases"])

In [None]:
# Convert the 'Date' column of both feature frames to datetime values.
X_train = X_train.assign(Date=pd.to_datetime(X_train['Date']))
Y_test = Y_test.assign(Date=pd.to_datetime(Y_test['Date']))

In [None]:
# Move 'Date' out of the columns and use it as the row index of both frames.
X_train = X_train.set_index('Date')
Y_test = Y_test.set_index('Date')

In [None]:
def create_time_features(df):
    """Add calendar features derived from the frame's datetime index.

    The frame is modified in place: a helper ``date`` column (a copy of the
    index) and seven feature columns are added to ``df``. The helper column
    is left on ``df``; callers that do not want it must drop it themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetime values (the ``.dt`` accessor is
        applied to a copy of the index).

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the seven feature columns, in this order:
        dayofweek, quarter, month, year, dayofyear, dayofmonth, weekofyear.
    """
    df['date'] = df.index
    dates = df['date'].dt

    df['dayofweek'] = dates.dayofweek
    df['quarter'] = dates.quarter
    df['month'] = dates.month
    df['year'] = dates.year
    df['dayofyear'] = dates.dayofyear
    df['dayofmonth'] = dates.day

    if hasattr(dates, 'isocalendar'):
        # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in
        # 2.0; isocalendar().week is its replacement. It returns a nullable
        # UInt32 column, so cast to a plain NumPy dtype like the other
        # features: float (NaN) if any date is missing, int64 otherwise.
        week = dates.isocalendar().week
        target_dtype = 'float64' if week.isna().any() else 'int64'
        df['weekofyear'] = week.astype(target_dtype).to_numpy()
    else:
        # pandas < 1.1 has no isocalendar(); the legacy accessor still exists.
        df['weekofyear'] = dates.weekofyear

    X = df[['dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X

In [None]:
# Add the calendar feature columns to X_train in place (the returned frame is
# not needed), then remove the helper 'date' column the function leaves behind.
create_time_features(X_train)
del X_train['date']

X_train

In [None]:
# Same treatment for the test features: add the calendar columns in place,
# then remove the helper 'date' column.
create_time_features(Y_test)
del Y_test['date']

Y_test

In [None]:
# Replace the date index with each frame's row identifier
# ('Id' for train, 'ForecastId' for test).
X_train = X_train.set_index('Id')
Y_test = Y_test.set_index('ForecastId')

In [None]:
# Show the dtype of every training column, followed by how many columns
# there are of each dtype.
display(X_train.dtypes, X_train.dtypes.value_counts())

In [None]:
# One hot encode Country/Region - Train
# get_dummies(columns=...) replaces the column with 'cr_<value>' indicator
# columns appended after the remaining columns.
X_train = pd.get_dummies(X_train, columns=['Country/Region'], prefix='cr')

# One hot encode Country/Region - Test
Y_test = pd.get_dummies(Y_test, columns=['Country/Region'], prefix='cr')

# The two frames are encoded independently, so their indicator columns can
# differ in set or order. Align the test columns to the training columns:
# values missing from test become all-zero columns, and values seen only in
# test are dropped. This gives predict() the same features as fit().
Y_test = Y_test.reindex(columns=X_train.columns, fill_value=0)

X_train

In [None]:
# Preview the first rows of the training feature frame.
X_train.head()

In [None]:
# Preview the first rows of the test feature frame.
Y_test.head()

In [None]:
# Assign the label columns for training.
# Each label is selected as a 1-D Series rather than a one-column DataFrame:
# the classifiers expect y of shape (n_samples,), and the random forest
# emits a DataConversionWarning for a column vector.
y1_train = train['ConfirmedCases']
y2_train = train['Fatalities']


In [None]:
# Apply model to ConfirmedCases
# random_state fixes the bootstrap samples and feature subsets so that
# re-running the notebook rebuilds the same forest and predictions.
random_forest = RandomForestClassifier(
    n_estimators=150,
    min_samples_leaf=3,
    max_features=0.5,
    n_jobs=-1,
    random_state=42,
)
random_forest.fit(X_train, y1_train)

# score() re-predicts the whole training set, so call it only once.
# NOTE: this is accuracy on the rows the model was fitted on, not a
# held-out estimate.
acc_random_forest = round(random_forest.score(X_train, y1_train) * 100, 2)
print('Accuracy model RandomForestClassifier:',acc_random_forest,"\n")

# Predict confirmed cases for the test feature frame (named Y_test here).
ConfirmedCases = random_forest.predict(Y_test)


In [None]:
# Apply model to Fatalities
# random_state fixes the bootstrap samples and feature subsets so that
# re-running the notebook rebuilds the same forest and predictions.
random_forest = RandomForestClassifier(
    n_estimators=150,
    min_samples_leaf=3,
    max_features=0.5,
    n_jobs=-1,
    random_state=42,
)
random_forest.fit(X_train, y2_train)

# score() re-predicts the whole training set, so call it only once.
# NOTE: this is accuracy on the rows the model was fitted on, not a
# held-out estimate.
acc_random_forest = round(random_forest.score(X_train, y2_train) * 100, 2)
print('Accuracy model RandomForestClassifier:',acc_random_forest,"\n")

# Predict fatalities for the test feature frame (named Y_test here).
Fatalities = random_forest.predict(Y_test)

In [None]:
# Apply model to ConfirmedCases
# random_state makes the fitted tree reproducible between runs.
decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
decision_tree.fit(X_train, y1_train)

# score() re-predicts the whole training set, so call it only once.
# NOTE: this is accuracy on the rows the model was fitted on, not a
# held-out estimate.
acc_decision_tree = round(decision_tree.score(X_train, y1_train) * 100, 2)
print('Accuracy model DecisionTreeClassifier:',acc_decision_tree, "\n")

# Predict with the tree fitted in this cell. `random_forest` must not be
# used here: by this point it holds the model trained on Fatalities.
ConfirmedCases = decision_tree.predict(Y_test)

In [None]:
# Apply model to Fatalities
# random_state makes the fitted tree reproducible between runs.
decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
decision_tree.fit(X_train, y2_train)

# score() re-predicts the whole training set, so call it only once.
# NOTE: this is accuracy on the rows the model was fitted on, not a
# held-out estimate.
acc_decision_tree = round(decision_tree.score(X_train, y2_train) * 100, 2)
print('Accuracy model DecisionTreeClassifier:',acc_decision_tree, "\n")

# Predict with the tree fitted in this cell, not with `random_forest`.
Fatalities = decision_tree.predict(Y_test)

In [None]:
# Submission
# Assemble the output frame: forecast ids taken from the loaded submission
# template, plus the two prediction arrays in the same row order.
sub_df = pd.DataFrame({
    'ForecastId': sub['ForecastId'],
    'ConfirmedCases': ConfirmedCases,
    'Fatalities': Fatalities,
})

In [None]:
# Show the shape of the assembled submission next to the shape of the loaded
# template so the two can be compared by eye.
sub_df.shape,sub.shape

In [None]:
# Write the predictions to submission.csv without the DataFrame index.
# NOTE(review): this is the same relative path `sub` was read from earlier,
# so the loaded file is overwritten - confirm this is intended.
sub_df.to_csv('submission.csv',index=False)