In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.model_selection as model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pickle
import xgboost as xgb
from sklearn.metrics import confusion_matrix
import pickle

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.neural_network import MLPClassifier
from sklearn import svm, tree
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from tqdm import tqdm #creates progress bar to let you know how long is left till function is complete
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTENC
from imblearn.pipeline import Pipeline
from pandas.tseries.offsets import DateOffset

# Original Processed Data

In [2]:
%%time
df = pd.read_csv('/content/drive/MyDrive/DoR/cases_train_processed.csv')
df = df.drop(['Unnamed: 0','index','source','additional_information','Last_Update','Lat_right','Long_right','Province_State','Country_Region','dist_between_in_km'],1)
df['Confirmed'].fillna(df['Confirmed'].mean(),inplace=True)
df['Deaths'].fillna(df.Deaths.mean(),inplace=True)
df['Recovered'].fillna(df.Recovered.mean(),inplace=True)
df['Active'].fillna(df.Active.mean(),inplace=True)
df['Incidence_Rate'].fillna(df.Incidence_Rate.mean(),inplace=True)
df['Case-Fatality_Ratio'].fillna(df['Case-Fatality_Ratio'].mean(),inplace=True)
df.date_confirmation = pd.to_datetime(df.date_confirmation,infer_datetime_format=True) 
df.Combined_Key.fillna((df.province+" ,"+df.country),inplace=True)

# Make 2 week Bins for date_confirmation

In [None]:
#add bins to dataframe
%%time
dateBinDict = getDateBins(df.date_confirmation)
df['date_labels'] = df.date_confirmation.apply(lambda curr_date : binDate(curr_date,dateBinDict))

In [None]:
def getDateBins(data):

  """ funtion takes in the date column, creates an array of 2 week intervals
   starting from the earliest day in the dataset
    and returns a dictionary of ordinal data where the k is the new label
    and value is the 2 week interval.
    ex: earliest confrimed date is 2020-01-02 -> 
    interval is (2020-01-02, 2020-01-16). dict returns 1:(2020-01-02, 2020-01-16)
  """
  start_date = data.min() - DateOffset(days=14)#earliest date + buffer
  print("start date is ",start_date)
  end_date = data.max() + DateOffset(days=14) #buffer to make sure nothing missed
  print("end date is ", end_date)
  date_interval = pd.interval_range(start=start_date,end=end_date,freq='14D')
  date_dict = {}
  for i in range(len(date_interval)):
    date_dict[i+1] = date_interval[i]
  return date_dict

def binDate(row_date,ddict):
  """ input is date_confrimation column. if date is in interval
  returns the dictionary key"""
  for k,v in ddict.items():
    # if row_date in v:
    #   print("here is key ",k)
    #   print("here is interval",v)
    if row_date in v:
      return k


# Over and Undersampling

In [None]:
#drops duplicated columns created
def dropDuplicates(data):
    duplicates = data.columns[data.columns.duplicated()]
    if len(duplicates) > 0:
        data = data.loc[:,~data.columns.duplicated()]
    return data

#drops all the columns not used in X_train
def colsToDrop(dataframe):
    if 'outcome' in dataframe.columns:
        dataframe = dataframe.drop('outcome',1)
        print("dropping the outcome column")
    if 'Combined_Key' in dataframe.columns:
        dataframe = dataframe.drop('Combined_Key',1)
        print("dropping Combined_Key")
    if 'dist_between_in_km' in dataframe.columns:
        dataframe = dataframe.drop('dist_between_in_km',1)
        print("dropping dist in km column")
    if 'date_confirmation' in dataframe.columns:
        dataframe = dataframe.drop('date_confirmation',1)
        print("dropping date")
    if 'date_labels' in dataframe.columns:
        dataframe.date_labels = dataframe.date_labels.astype('object')
        print("converting date to categorical")
    if 'age' in dataframe.columns:
        dataframe.age = dataframe.age.astype('object')
        print("converting age to categorical")
    return dataframe
    
    # once hot encode and add new cols to dataframe
def oneHotEncode_df(dataframe):
    dataframe = colsToDrop(dataframe)
    col2Encode = list(dataframe.select_dtypes(include=['object'])) #gets a list of all the features that are objects assumption is that those are categorical
    dummies = pd.get_dummies(dataframe,columns=col2Encode,prefix=col2Encode,sparse=True)
    res = pd.concat([dataframe, dummies], axis=1)
    #if we decide to drop one hot encoded values
    res = res.drop(col2Encode, axis=1)
    output = dropDuplicates(res)
    return output

    # SMOTENC needs a list of the indices of all the categorical variables in dataset
    # ex: if country is column 2 and age is col 13. returns [2,13]
def getCategoricalIndices(dataframe):
    #get the indices of all the categorical variables
    numericalVar = list(dataframe.select_dtypes(include=['float64']))
    catlist = list(dataframe.columns.difference(numericalVar))
    print("these are the categorical features: {}".format(catlist))
    indlist = []
    for i in catlist:
        indlist.append(dataframe.columns.get_loc(i))
    return indlist

## Code for  Creating OverSampled Data 

In [None]:
%%time
# Perform Oversampling before ohe.
#so drop columns not used in independent var 
sparse_df = colsToDrop(df)
#get list of categorical indices
catIndList = getCategoricalIndices(sparse_df)

In [None]:
%%time
#oversample 
oversample = SMOTENC(categorical_features=catIndList,random_state=0,sampling_strategy='not majority')
x_o,y_o = oversample.fit_resample(sparse_df,df.outcome)

In [None]:
# join dependent and independent var back together to save to csv
over_np = np.column_stack([x_o,y_o])

In [None]:
#creating headers to conver oversample data back to dataframe b4 saving to csv
coll = list(sparse_df.columns)
coll.append('outcome')

In [None]:
# save oversampled data to csv
pd.DataFrame(over_np,columns=coll).to_csv("oversampledTrain.csv")

### ------End of Synthertic Data Creation -----------------------

## Begin analysis with Synthetic Data

### load in oversample csv

In [None]:
o_df = pd.read_csv('/content/drive/MyDrive/DoR/oversampledTrain.csv')

In [None]:
# ensure age is categorical, drop unwanted columns
o_df.age = o_df.age.astype('object')
o_df.data_labels = o_df.date_labels.astype('object')
y_tr = o_df.outcome
X_tr = o_df
if 'outcome' in X_tr.columns:
      X_tr = X_tr.drop('outcome',1)
      print("dropping the outcome column")
if 'Unnamed: 0' in X_tr.columns:
      X_tr = X_tr.drop('Unnamed: 0',1)
      print("dropping the Unnamed column")      


In [None]:
## ------------------------------- End of Over Sampling Section------------------------------------------

In [4]:
%%time
X = res
y = y_tr
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

Wall time: 18.9 s


In [10]:
%%time
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 5, stop = 50, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [11]:
%%time
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, n_jobs = -1)
# Fit the random search model9
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 24.4min finished


Wall time: 25min 14s


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [5, 16, 27, 38, 50]},
                   verbose=2)

In [12]:
rf_random.cv_results_

{'mean_fit_time': array([157.13379264, 280.14043705, 741.09245086, 479.70631162,
        528.13843187, 338.64585431, 280.68518726,  85.83966589,
        356.48751616, 320.33887943]),
 'std_fit_time': array([48.65763395, 26.34444578, 34.5810544 , 27.10991371,  8.2603901 ,
        10.46760837, 17.88680982,  5.40086341, 17.04137483,  5.84005381]),
 'mean_score_time': array([ 4.35861023, 11.64101807, 18.48631295,  8.00393907, 10.12099997,
        12.51503968, 12.80548358,  3.40196125,  3.45836441,  2.49748786]),
 'std_score_time': array([0.05164713, 1.01132544, 6.38898783, 3.03023367, 3.60654071,
        2.52892186, 3.6641926 , 1.80489655, 1.24466674, 0.2019956 ]),
 'param_n_estimators': masked_array(data=[5, 27, 50, 27, 38, 27, 27, 5, 27, 38],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[5, 5, 5, 2, 5, 5, 2, 5, 5, 2],
        

In [14]:
pickle.dump( rf_random, open( "rf10overSample", "wb" ) )

In [15]:
rf_random.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=40, max_features='sqrt',
                       min_samples_split=5, n_estimators=5)

In [16]:
%%time
randomForestprediction = list(rf_random.best_estimator_.predict(X_test))
randomForestConfusionMatrix = confusion_matrix(y_test, randomForestprediction, labels=["recovered","nonhospitalized","hospitalized","deceased"])

In [17]:
randomForestConfusionMatrix

array([[11809,   192, 12821,  5178],
       [  316, 29126,    12,   545],
       [ 5327,     4, 20061,  4608],
       [ 3367,   122, 12152, 14359]], dtype=int64)

In [18]:
%%time
xgBoost = xgb.XGBClassifier(random_state = 0)

In [20]:
xgBoost.getparams()

AttributeError: 'XGBClassifier' object has no attribute 'getparams'

In [None]:
%%time
# Learning Rate
eta = [double(x) for x in np.linspace(start = 0, stop = 1, num = 0.25)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
%%time


clf_xgb = xgb.XGBClassifier()
param_dist = {'n_estimators': stats.randint(150, 1000),
              'learning_rate': stats.uniform(0.01, 0.6),
              'subsample': stats.uniform(0.3, 0.9),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],              
              'min_child_weight': [1, 2, 3, 4]
             }

xgb_rs = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = 3,  
                         n_iter = 10, 
                         verbose = 5, 
                         n_jobs = -1)
xgb_rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  5.3min
