# Milestone 2
## 2.1 and 2.2 Split Train Dataset, Build and Evaluate a Model

In [1]:
import collections as c
import datetime
from IPython.core.interactiveshell import InteractiveShell
import matplotlib as mp
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn import svm, tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm #creates progress bar to let you know how long is left till function is complete
import xgboost as xgb

InteractiveShell.ast_node_interactivity = 'all'

In [2]:
raw_data = pd.read_csv('../dataset/cases_train_processed.csv')

In [3]:
# this way I dont have to load data all the time
df = raw_data.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367634 entries, 0 to 367633
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              367634 non-null  int64  
 1   index                   367634 non-null  int64  
 2   age                     367634 non-null  int64  
 3   sex                     367634 non-null  object 
 4   province                367634 non-null  object 
 5   country                 367634 non-null  object 
 6   date_confirmation       367634 non-null  object 
 7   additional_information  367634 non-null  object 
 8   source                  367634 non-null  object 
 9   outcome                 367634 non-null  object 
 10  Province_State          334812 non-null  object 
 11  Country_Region          356275 non-null  object 
 12  Last_Update             356275 non-null  object 
 13  Lat_right               356275 non-null  float64
 14  Long_right          

In [4]:
%%time
#remove nan values
df = df.drop(['Unnamed: 0','index','source','additional_information','Last_Update','Lat_right','Long_right','Province_State','Country_Region'],1)
df['Confirmed'].fillna(df['Confirmed'].mean(),inplace=True)
df['Deaths'].fillna(df.Deaths.mean(),inplace=True)
df['Recovered'].fillna(df.Recovered.mean(),inplace=True)
df['Active'].fillna(df.Active.mean(),inplace=True)
df['Incidence_Rate'].fillna(df.Incidence_Rate.mean(),inplace=True)
df['Case-Fatality_Ratio'].fillna(df['Case-Fatality_Ratio'].mean(),inplace=True)
df.date_confirmation = pd.to_datetime(df.date_confirmation,infer_datetime_format=True) 
df.Combined_Key.fillna((df.province+" ,"+df.country),inplace=True)


CPU times: user 162 ms, sys: 14.3 ms, total: 177 ms
Wall time: 176 ms


In [5]:
# set this as a string so that my encode function doesn't pick it up
df.outcome = df.outcome.astype('string')

In [6]:
#might create duplicates so check and delete them
def dropDuplicates(data):
    duplicates = data.columns[data.columns.duplicated()]
    if len(duplicates) > 0:
        data = data.loc[:,~data.columns.duplicated()]
    return data

In [7]:
# once hot encode and add new cols to dataframe
def oneHotEncode_df(dataframe):
    col2Encode = list(dataframe.select_dtypes(include=['object'])) #gets a list of all the features that are objects assumption is that those are categorical
    dummies = pd.get_dummies(dataframe,columns=col2Encode,prefix=col2Encode,sparse=True)
    res = pd.concat([dataframe, dummies], axis=1)
    #if we decide to drop one hot encoded values
    res = res.drop(col2Encode, axis=1)
    output = dropDuplicates(res)
    return output

In [8]:
%%time
# add dummy variables  
ohe_df = oneHotEncode_df(df)
ohe_df

CPU times: user 1.17 s, sys: 0 ns, total: 1.17 s
Wall time: 1.27 s


Unnamed: 0,age,date_confirmation,outcome,Confirmed,Deaths,Recovered,Active,Incidence_Rate,Case-Fatality_Ratio,dist_between_in_km,...,"Combined_Key_Zamboanga del Sur ,Philippines","Combined_Key_Zamfara ,Nigeria","Combined_Key_Zanjan ,Iran","Combined_Key_Zeeland, Netherlands","Combined_Key_Zhejiang, China",Combined_Key_Zimbabwe,"Combined_Key_Zuid-Holland, Netherlands","Combined_Key_galapagos ,Ecuador","Combined_Key_Ñeembucu ,Paraguay","Combined_Key_ñeembucu ,Paraguay"
0,7,2020-05-26,recovered,238828.000000,4907.000000,201671.000000,32250.000000,1276.409575,2.054617,10.390897,...,0,0,0,0,0,0,0,0,0,0
1,7,2020-05-20,hospitalized,342788.000000,4869.000000,270094.000000,67825.000000,144.099577,1.420411,209.514800,...,0,0,0,0,0,0,0,0,0,0
2,7,2020-05-26,hospitalized,2859.000000,2.000000,2639.000000,218.000000,464.331421,0.069955,204.687238,...,0,0,0,0,0,0,0,0,0,0
3,3,2020-03-15,nonhospitalized,46779.000000,1871.000000,41228.000000,3680.000000,422.592353,3.999658,126.061115,...,0,0,0,0,0,0,0,0,0,0
4,7,2020-05-20,hospitalized,120336.000000,3286.000000,100974.000000,16076.000000,188.400627,2.730687,217.585376,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367629,3,2020-03-31,nonhospitalized,106331.902334,2739.064287,77838.956438,25753.923921,968.698456,2.721038,-1.000000,...,0,0,0,0,0,0,0,0,0,0
367630,7,2020-04-16,hospitalized,238828.000000,4907.000000,201671.000000,32250.000000,1276.409575,2.054617,10.390897,...,0,0,0,0,0,0,0,0,0,0
367631,7,2020-05-30,hospitalized,2859.000000,2.000000,2639.000000,218.000000,464.331421,0.069955,111.254821,...,0,0,0,0,0,0,0,0,0,0
367632,3,2020-03-02,recovered,57558.000000,27.000000,57142.000000,389.000000,983.839751,0.046909,0.367129,...,0,0,0,0,0,0,0,0,0,0


In [10]:
ohe_np = ohe_df.to_numpy()
ohe_np

array([[7, Timestamp('2020-05-26 00:00:00'), 'recovered', ..., 0, 0, 0],
       [7, Timestamp('2020-05-20 00:00:00'), 'hospitalized', ..., 0, 0,
        0],
       [7, Timestamp('2020-05-26 00:00:00'), 'hospitalized', ..., 0, 0,
        0],
       ...,
       [7, Timestamp('2020-05-30 00:00:00'), 'hospitalized', ..., 0, 0,
        0],
       [3, Timestamp('2020-03-02 00:00:00'), 'recovered', ..., 0, 0, 0],
       [1, Timestamp('2020-05-27 00:00:00'), 'nonhospitalized', ..., 0,
        0, 0]], dtype=object)

In [9]:
#split into dependent and independent var
x = ohe_df.copy().drop(columns=['outcome','date_confirmation','dist_between_in_km'])
y = ohe_df.outcome.copy()

In [10]:
%%time
#split train data
trainData_x, validationData_x, trainData_y, validationData_y = train_test_split(x,y, train_size=0.8, random_state=1)

CPU times: user 33.9 s, sys: 0 ns, total: 33.9 s
Wall time: 34 s


# ----------------------------------Model Building-----------------------------------

In [12]:
#create 3 diff classififers and append them to list
classifiers = []

xgb_model = xgb.XGBClassifier()
classifiers.append(xgb_model)

nn_model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
classifiers.append(nn_model)

svm_model = svm.SVC()
classifiers.append(svm_model)

In [22]:
# #test models with full samples
# for clas in tqdm(classifiers):
#     clas.fit(trainData_x,trainData_y)
#     predicted = clas.predict(validationData_x)
#     accuracy = accuracy_score(validationData_y,predicted)
#     print("Accuracy of %s is %s"%(clas, accuracy))
#     con_mat = confusion_matrix(validationData_y,predicted)
#     print("Confusion Matrix of %s is %s"%(clas, con_mat))
#     pickle.dump( clas, open( "{}{}.pkl".format(clas,i), "wb" ) )

In [17]:
y_trainInt = pd.Series(trainData_y).replace({'recovered' : 0, 'deceased' : 1, 'nonhospitalized' : 2, 'hospitalized' : 3})
y_trainInt

218247    2
33854     0
234440    3
316468    3
250495    2
         ..
117583    3
73349     2
312201    3
267336    3
128037    0
Name: outcome, Length: 294107, dtype: int64

In [18]:
y_validInt = pd.Series(validationData_y).replace({'recovered' : 0, 'deceased' : 1, 'nonhospitalized' : 2, 'hospitalized' : 3})
y_validInt

247886    2
364290    2
237307    3
22967     3
29696     3
         ..
47882     3
319748    2
153914    2
219937    2
352073    2
Name: outcome, Length: 73527, dtype: int64

In [1]:
%%time
#convert into Dmatrix
data_matrix = xgb.DMatrix(data=trainData_x,label=y_trainInt)
valid_matrix = xgb.DMatrix(data=validationData_x,label=y_validInt)

NameError: name 'xgb' is not defined

In [20]:
dm = xgb.DMatrix(data=trainData_x.iloc[0:1000],label=y_trainInt.iloc[0:1000])
vm = xgb.DMatrix(data=validationData_x.iloc[0:1000],label=y_validInt.iloc[0:1000])

In [41]:
n = [1,5,10,25,50]

for i in tqdm(n):
    xgb_model = xgb.XGBClassifier(n_estimators=i)
    xfit = xgb_model.fit(trainData_x,trainData_y)
    predicted = xgb_model.predict(validationData_x)

    tscore = xgb_model.score(trainData_x,y_trainInt)
    print("train score is {}".format(tscore))
    vscore = xgb_model.score(validationData_x,y_validInt)
    print("val score is {}".format(vscore))
    accuracy = accuracy_score(y_validInt,predicted)
    print("Accuracy of XGBClassifier is {}".format(accuracy))
    con_mat = confusion_matrix(validationData_y,predicted)
    print("Confusion Matrix of xgbmodel is {}".format(con_mat))
    pickle.dump( xgb_model, open( "xgboost{}.pkl".format(i), "wb" ) )


Error: connect ECONNREFUSED 127.0.0.1:45361

In [15]:
%%time
n = [1,5,10,25,50]
xgb_model = xgb.XGBClassifier(n_estimators=1)
xfit = xgb_model.fit(trainData_x,trainData_y)
predicted = xgb_model.predict(validationData_x)

tscore = xgb_model.score(trainData_x,trainData_y)
print("train score is {}".format(tscore))
vscore = xgb_model.score(validationData_x,validationData_y)
print("val score is {}".format(vscore))
accuracy = accuracy_score(validationData_y,predicted)
print("Accuracy of XGBClassifier is {}".format(accuracy))
pickle.dump( xgb_model, open( "xgboost_n1.pkl", "wb" ) )


train score is 0.7799916356972122
val score is 0.7778639139363771
Accuracy of XGBClassifier is 0.7778639139363771
CPU times: user 2min 45s, sys: 7.78 s, total: 2min 53s
Wall time: 1min 11s


In [16]:
%%time
xgb_model = xgb.XGBClassifier(n_estimators=5)
xfit = xgb_model.fit(trainData_x,trainData_y)
predicted = xgb_model.predict(validationData_x)

tscore = xgb_model.score(trainData_x,trainData_y)
print("train score is {}".format(tscore))
vscore = xgb_model.score(validationData_x,validationData_y)
print("val score is {}".format(vscore))
accuracy = accuracy_score(validationData_y,predicted)
print("Accuracy of XGBClassifier is {}".format(accuracy))
pickle.dump( xgb_model, open( "xgboost_n5.pkl", "wb" ) )

train score is 0.7809096689300152
val score is 0.7787343424864336
Accuracy of XGBClassifier is 0.7787343424864336
CPU times: user 9min 24s, sys: 8.86 s, total: 9min 32s
Wall time: 3min 6s


In [17]:
%%time
xgb_model = xgb.XGBClassifier(n_estimators=10)
xfit = xgb_model.fit(trainData_x,trainData_y)
predicted = xgb_model.predict(validationData_x)

tscore = xgb_model.score(trainData_x,trainData_y)
print("train score is {}".format(tscore))
vscore = xgb_model.score(validationData_x,validationData_y)
print("val score is {}".format(vscore))
accuracy = accuracy_score(validationData_y,predicted)
print("Accuracy of XGBClassifier is {}".format(accuracy))
pickle.dump( xgb_model, open( "xgboost_n10.pkl", "wb" ) )

train score is 0.7813380844386567
val score is 0.779196757653651
Accuracy of XGBClassifier is 0.779196757653651
CPU times: user 17min 34s, sys: 11.6 s, total: 17min 46s
Wall time: 5min 21s


In [None]:
%%time
xgb_model = xgb.XGBClassifier(n_estimators=25)
xfit = xgb_model.fit(trainData_x,trainData_y)
predicted = xgb_model.predict(validationData_x)

tscore = xgb_model.score(trainData_x,trainData_y)
print("train score is {}".format(tscore))
vscore = xgb_model.score(validationData_x,validationData_y)
print("val score is {}".format(vscore))
accuracy = accuracy_score(validationData_y,predicted)
print("Accuracy of XGBClassifier is {}".format(accuracy))
pickle.dump( xgb_model, open( "xgboost_n25.pkl", "wb" ) )

In [None]:
%%time
xgb_model = xgb.XGBClassifier(n_estimators=50)
xfit = xgb_model.fit(trainData_x,trainData_y)
predicted = xgb_model.predict(validationData_x)

tscore = xgb_model.score(trainData_x,trainData_y)
print("train score is {}".format(tscore))
vscore = xgb_model.score(validationData_x,validationData_y)
print("val score is {}".format(vscore))
accuracy = accuracy_score(validationData_y,predicted)
print("Accuracy of XGBClassifier is {}".format(accuracy))
pickle.dump( xgb_model, open( "xgboost_n50.pkl", "wb" ) )

# Exploratory Analysis