In [None]:
from google.colab import files 
uploaded = files.upload()

Saving AllDataWithStateMeanDailyWithScale.csv to AllDataWithStateMeanDailyWithScale.csv


In [309]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn import preprocessing

In [310]:
df_dataset = pd.read_csv('./AllDataWithStateMeanDailyWithScale.csv')
print(df_dataset)

     open_covid_region_code        date  ...  symptom:pancreatitis  hospitalized_new
0                     US-WY  2020-03-09  ...             24.028174               0.0
1                     US-WY  2020-03-16  ...             24.028174               0.0
2                     US-WY  2020-03-23  ...             87.182260              15.0
3                     US-WY  2020-03-30  ...             24.028174               8.0
4                     US-WY  2020-04-06  ...             24.028174              17.0
...                     ...         ...  ...                   ...               ...
1445                  US-AK  2020-08-24  ...              5.303184               0.0
1446                  US-AK  2020-08-31  ...              7.839357               0.0
1447                  US-AK  2020-09-07  ...             10.658002               0.0
1448                  US-AK  2020-09-14  ...              5.303184               0.0
1449                  US-AK  2020-09-21  ...              9.60101

### Create a log file to save all the results 

In [311]:
logFile = open("log.txt", "w")

## Task 3.1 Split Data

### Data acquisition and preparation

In [312]:
# Convert Pandas dataframe to Numpy array
dataset_array = df_dataset.values
column_names = df_dataset.columns
print(dataset_array.shape)

(1450, 425)


In [313]:
# Extract all states
states = np.unique(dataset_array[:,0])


In [314]:
# Initialize a dict to store state (key) and the row ranges in the matrix (value)
state_row_dict = {state: [] for state in states} 
for state in states:
  state_row_dict[state] = [np.where(dataset_array == state)[0][0]
                             , np.where(dataset_array == state)[0][-1]]


In [315]:
# sort the dataset array by dates
dataset_array_sort = dataset_array.copy()
dataset_array_sort = dataset_array_sort[dataset_array_sort[:,1].argsort(kind='mergesort')]


In [316]:
# Extract all dates
dates = np.unique(dataset_array_sort[:,1])

In [317]:
# Initialize a dict to store dates (keys) and the row ranges in the matrix (values)
date_row_dict = {date: [] for date in dates} 
for date in dates:
  date_row_dict[date] = [np.where(dataset_array_sort == date)[0][0]
                             , np.where(dataset_array_sort == date)[0][-1]]
print(date_row_dict)

{'2020-03-09': [0, 49], '2020-03-16': [50, 99], '2020-03-23': [100, 149], '2020-03-30': [150, 199], '2020-04-06': [200, 249], '2020-04-13': [250, 299], '2020-04-20': [300, 349], '2020-04-27': [350, 399], '2020-05-04': [400, 449], '2020-05-11': [450, 499], '2020-05-18': [500, 549], '2020-05-25': [550, 599], '2020-06-01': [600, 649], '2020-06-08': [650, 699], '2020-06-15': [700, 749], '2020-06-22': [750, 799], '2020-06-29': [800, 849], '2020-07-06': [850, 899], '2020-07-13': [900, 949], '2020-07-20': [950, 999], '2020-07-27': [1000, 1049], '2020-08-03': [1050, 1099], '2020-08-10': [1100, 1149], '2020-08-17': [1150, 1199], '2020-08-24': [1200, 1249], '2020-08-31': [1250, 1299], '2020-09-07': [1300, 1349], '2020-09-14': [1350, 1399], '2020-09-21': [1400, 1449]}


### Key Functions for splitting data and shuffling

In [318]:
def validation_shuffle(dict,num_fold):
  '''This function changes data in the validation set for k-fold cross-validation.
    Arguments: 1. the dictionary that store states (key) and the row ranges in the dataset_array (value)
                2. the number of folds for k-fold cross-validation
    Returns: the shuffled dictionary with origianl key-value pairs in shuffled order
  '''
  validation_size = (len(dict)//num_fold)+1
  shuffled_dict = dict.copy() 
  for i in range(validation_size):
    # move the first key-value pair to the last position
    del shuffled_dict[list(dict.keys())[i]] # remove from current position
    shuffled_dict[list(dict.keys())[i]] = dict[list(dict.keys())[i]] #add to the last position

  return shuffled_dict

In [319]:
def data_split(row_index_dict):
  '''This function split 80% of the data into training set and 20% into validation(test) set
    Arguments: the (shuffled) dictionary that store states/dates (key) and the row ranges in the dataset_array (value)
    Returns: the dictionaries that hold the states/dates(keys) and their row ranges(values) in both training set and validation
  '''
  num_keys = len(row_index_dict) # count the number of states/dates in the post-processed dataset
  train_dict = {} # create a dict to hold the states/dates(keys) and their row ranges(values) in training set
  validation_dict = {} # create a dict to hold the states/dates(keys) and their row ranges(values) in validation set
  counter = 0 
  for key in row_index_dict:
    counter += 1
    if counter <= math.floor(num_keys*0.8): # keep ~80% of regions in training set, the rest in validation set 
      train_dict[key] = row_index_dict[key] # add to training set
    else:
      validation_dict[key] = row_index_dict[key] # add to validation(test) set

  return train_dict, validation_dict

In [320]:
def train_test_generator(train_dict, validation_dict):
  '''This function generate data for training set and validation(test) set
    Arguments: the dictionaries that store states/dates (key) and the row ranges in the dataset_array (value) for training set and validation set
    Returns: the numpy arrays for features and labels in training set and validation(test) set
  '''
  # Create numpy array for training set and validation(test) set
  training_set_array = np.empty((0,dataset_array.shape[1]))
  validation_set_array = np.empty((0,dataset_array.shape[1]))

  for key in train_dict:
    training_set_array = np.append(training_set_array, dataset_array[train_dict[key][0]:train_dict[key][1]+1,:], axis=0)

  x_train = training_set_array[:,2:-1] # Drop the region and date, this is a numpy array containing symptom features in training set
  y_train = training_set_array[:,-1] # this is a numpy array containing hospitalization_new as output in training set

  for key in validation_dict:
    validation_set_array = np.append(validation_set_array, dataset_array[validation_dict[key][0]:validation_dict[key][1]+1,:], axis=0)
 
  x_test = validation_set_array[:,2:-1] # Drop the region and date, this is a numpy array containing symptom features in test set
  y_test = validation_set_array[:,-1] # this is a numpy array containing hospitalization_new as output in test set

  return x_train, y_train, x_test, y_test

### Split data based on regions and prepare for K-fold cross-validation (Default K=5)

In [321]:
train_set_list = [] # create a list to hold (x_train, y_train) tuples for cross-validation 
test_set_list = [] # create a list to hold (x_test, y_test) tuples for cross-validation
num_fold = 5 
shuffled_state_row_dict = state_row_dict.copy()

for i in range(num_fold):
  train_region_dict, validation_region_dict = data_split(shuffled_state_row_dict)
  splitted_tuple = train_test_generator(train_region_dict, validation_region_dict) # This tuple is (x_train_region, y_train_region, x_test_region, y_test_region)
  train_set_list.append(splitted_tuple[0:2]) # list: [(x_train_region,y_train_region), (.. , ..), ...]
  test_set_list.append(splitted_tuple[2:]) # list: [(x_test_region,y_test_region), (.. , ..), ...]
  shuffled_state_row_dict = validation_shuffle(shuffled_state_row_dict,num_fold) # shuffle the dataset 

### Split data based on time (No cross-validation)

In [322]:
train_time_dict, validation_time_dict = data_split(date_row_dict)
x_train_time, y_train_time, x_test_time, y_test_time = train_test_generator(train_time_dict, validation_time_dict) 

## Task 3.2 KNN and Decision Tree

In [323]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt 
plt.style.use('seaborn')
from sklearn.decomposition import PCA
import pandas as pd

# Import model from sklearn
from sklearn.metrics import r2_score
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor


### Hyper parameter sets for grid search

In [324]:
nn_range = [3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
p_range = [1,2]
weights_range = ['uniform', 'distance']

criterion_range = ['mse', 'friedman_mse', 'mae']
max_depth_range = [5,6,7,8,9,10,11,12,13]

### Function for finding the best hyperparameters for KNN with the training data set



In [325]:
# Find best hyperparameters
def find_best_hyperparameters_for_knn(x_train, y_train):
  clf = GridSearchCV(neighbors.KNeighborsRegressor(), param_grid=dict(n_neighbors=nn_range, p=p_range
                                                 , weights=weights_range))

  x = StandardScaler().fit_transform(x_train)
  X_reduced = PCA(n_components=0.995).fit_transform(x)
  best_model = clf.fit(X_reduced, y_train)
  
  return [best_model.best_estimator_.get_params()['n_neighbors']
          ,best_model.best_estimator_.get_params()['p']
          ,best_model.best_estimator_.get_params()['weights']]

### Function For Training the Model Of KNN and Get the Accuracy

In [326]:
def knn_regression(x_train, y_train, x_test, y_test, parameters):

  # create the model, here k-nearest neighbours model
  knn = neighbors.KNeighborsRegressor(n_neighbors=parameters[0], p=parameters[1]
                                      , weights=parameters[2])

# Use PCA to reduce dimension for the model
  pca = PCA(n_components=0.995)
  X_reduced_train = pca.fit_transform(x_train)
  X_reduced_test = pca.transform(x_test)

  clf = knn.fit(X_reduced_train, y_train)
  y_ = clf.predict(X_reduced_test)
  mode_r2_score = r2_score(y_, y_test)
  accuracy = clf.score(X_reduced_test, y_test)
  
  mse = mean_squared_error(y_, y_test)
  mae = mean_absolute_error(y_, y_test)
  logFile.write("MSE is {} accuracy is {} RMSE is {} MAE is {} with n_neighbors: {}, p: {}, weights: {}\n"
                    .format(mse, accuracy, math.sqrt(mse), mae, parameters[0],
                            parameters[1], parameters[2]))
  return accuracy, mae

### Train the Model Of KNN

In [327]:
# Prediction based on region 
sum_accuracy = 0
min_mse = math.inf
accuracies_knn = [[], [], [], [], []]
errors_knn = [[], [], [], [], []]
hyper_params_knn = []

# Write to log file

logFile.write("KNN Start\n")

# Find the best hypter parameters for each training data set first
for k in range(5):
  x_train = train_set_list[k][0]
  y_train = train_set_list[k][1]
  hyper_params_knn.append(find_best_hyperparameters_for_knn(x_train, y_train))
for m in range(5):
  param = hyper_params_knn[m]
  for k in range(5):
    sum_accuracy = 0
    x_train = train_set_list[k][0]
    y_train = train_set_list[k][1]
    x_test = test_set_list[k][0]
    y_test = test_set_list[k][1]
    accuracy, mse = knn_regression(x_train, y_train, x_test, y_test, param)
    sum_accuracy += accuracy
    accuracies_knn[k].append(accuracy)
    errors_knn[k].append(mse)
    if (mse < min_mse): 
      min_mse = mse


### Print Results based on Time KNN

In [328]:
# Prediction based on time 
knn_regression(x_train_time, y_train_time, x_test_time, y_test_time
              ,find_best_hyperparameters_for_knn(x_train_time, y_train_time))


(0.17676243090422272, 341.226991095283)

In [329]:
# Find best hyperparameters
def find_best_hyperparameters_for_dtr(x_train, y_train):
  clf = GridSearchCV(DecisionTreeRegressor(), param_grid=dict(criterion=criterion_range
                                                 , max_depth=max_depth_range))
  x = StandardScaler().fit_transform(x_train)
  X_reduced = PCA(n_components=0.995).fit_transform(x)    
  best_model = clf.fit(X_reduced, y_train)

  # print("Best criterion is : {}".format(best_model.best_estimator_.get_params()['criterion']))
  # print("Best max_depth is : {}".format(best_model.best_estimator_.get_params()['max_depth']))

  return [best_model.best_estimator_.get_params()['criterion']
          ,best_model.best_estimator_.get_params()['max_depth']]

In [330]:
def decision_tree_regression(x_train, y_train, x_test, y_test, parameters): 
  dtr = DecisionTreeRegressor(criterion=parameters[0], max_depth=parameters[1])

  pca = PCA(n_components=0.995)
  X_reduced_train = pca.fit_transform(x_train)
  X_reduced_test = pca.transform(x_test)

  clf = dtr.fit(X_reduced_train, y_train)
  y_ = clf.predict(X_reduced_test)
  accuracy = clf.score(X_reduced_test, y_test)
  mode_r2_score = r2_score(y_, y_test)
  mse = mean_squared_error(y_, y_test)
  mae = mean_absolute_error(y_, y_test)

  logFile.write("MSE is {} accuracy is {} RMSE is {} MAE is {} with criterion: {}, max_depth: {}\n"
                    .format(mse, accuracy, math.sqrt(mse), mae, parameters[0],
                            parameters[1]))
  return accuracy, mae


### Train the Model Of Decision Tree

In [331]:
# Prediction based on region 
logFile.write("Decision Tree Start\n")
sum_accuracy = 0
min_mse = math.inf
accuracies_dtr = [[],[],[],[],[]]
errors_dtr = [[],[],[],[],[]]
hyper_params_dtr = []
for m in range(5):
  x_train = train_set_list[m][0]
  y_train = train_set_list[m][1]
  hyper_params_dtr.append(
      find_best_hyperparameters_for_dtr(x_train, y_train))
for m in range(5):
  params = hyper_params_dtr[m]
  for k in range(5): 
    sum_accuracy = 0
    x_train = train_set_list[k][0]
    y_train = train_set_list[k][1]
    x_test = test_set_list[k][0]
    y_test = test_set_list[k][1]
    accuracy, mae = decision_tree_regression(x_train, y_train, x_test, y_test, params)
    sum_accuracy += accuracy
    errors_dtr[k].append(mae)
    accuracies_dtr[k].append(accuracy)
    if (mse < min_mse):
      min_mse = mse


### Print Results based on Time Decision Tree




In [332]:
# Prediction based on time 
decision_tree_regression(x_train_time, y_train_time, x_test_time, y_test_time
                         , find_best_hyperparameters_for_dtr(x_train_time, y_train_time))

(0.1208152364982451, 349.7475998656696)

#### Save the log file

In [333]:
logFile.close()

# Results For KNN

In [334]:
accuracies_knn = np.asarray(accuracies_knn)
errors_knn = np.asarray(errors_knn)

#### Best Hyper parameters For Each Fold Of Cross Validation KNN From GridSearch



In [335]:
print(hyper_params_knn)

[[20, 1, 'distance'], [20, 1, 'distance'], [20, 1, 'distance'], [17, 1, 'uniform'], [20, 2, 'distance']]


#### MAEs For Each Fold OF Cross Validation KNN




In [336]:
print(errors_knn)

[[115.44735428 115.44735428 115.44735428 119.02393509 119.17000466]
 [362.58724089 362.58724089 362.58724089 380.37565923 467.5061806 ]
 [130.51334646 130.51334646 130.51334646 114.79858012 125.27939021]
 [152.36886134 152.36886134 152.36886134 152.4693712  153.5718456 ]
 [487.26064668 487.26064668 487.26064668 489.95070994 484.88625215]]


#### Average MAEs For Each Fold OF Cross Validation KNN

In [337]:
print(np.sum(errors_knn, axis=1)/5)

[116.90720052 387.1287125  126.32360194 152.62956016 487.32378042]


#### Accuries For Each Fold Of Cross Validation KNN

In [338]:
print(accuracies_knn)

[[ 0.30021149  0.30021149  0.30021149  0.28112466  0.2926249 ]
 [ 0.18811184  0.18811184  0.18811184  0.12013042 -0.13632791]
 [ 0.14707453  0.14707453  0.14707453  0.31443828  0.14357576]
 [ 0.11083796  0.11083796  0.11083796  0.07375554  0.09864731]
 [ 0.06581623  0.06581623  0.06581623  0.06345678  0.09741683]]


#### Average Accuracies For Each Fold Of Cross Validation

In [339]:
print(np.sum(accuracies_knn, axis=1)/5)

[0.29487681 0.10962761 0.17984752 0.10098335 0.07166446]


### Print The Results For Decision Tree

In [340]:
accuracies_dtr = np.asarray(accuracies_dtr)
errors_dtr = np.asarray(errors_dtr)

#### Best Hyper parameters For Each Fold Of Cross Validation Decision Tree

In [341]:
print(hyper_params_dtr)

[['mae', 6], ['mse', 6], ['mse', 5], ['mse', 5], ['mae', 5]]


#### MAEs For Each Fold OF Cross Validation Decision Tree

In [342]:
print(errors_dtr)

[[121.10517241 127.396496   110.56625262 110.56625262 115.14827586]
 [434.06034483 376.32248771 384.05861202 384.05861202 410.28793103]
 [131.73103448 185.06700475 159.75932644 159.75932644 132.62931034]
 [146.55517241 193.29000189 209.0040861  220.77649989 135.58103448]
 [607.07413793 943.45778871 463.27145306 463.27145306 553.83793103]]


#### Average MAEs For Each Fold OF Cross Validation KNN

In [343]:
print(np.sum(errors_dtr, axis=1)/5)

[116.9564899  397.75759752 153.78920049 181.04135895 606.18255276]


#### Accuracies For Each Fold Of Cross Validation Decision Tree

In [344]:
print(accuracies_dtr)

[[ 0.21407896  0.07979046  0.2941593   0.2941593   0.29460506]
 [-0.10252094  0.10333043  0.09755086  0.09755086 -0.03225535]
 [ 0.25456408 -3.25050883 -1.29862403 -1.29862403  0.22091835]
 [-0.02782158 -0.09354829 -0.09255744 -0.19643405  0.03373176]
 [-0.13376242 -2.07478902  0.00721834  0.00721834 -0.05374279]]


#### Average Accuracies For Each Fold Of Cross Validation Decision Tree

In [345]:
print(np.sum(accuracies_dtr, axis=1)/5)

[ 0.23535862  0.03273117 -1.07445489 -0.07532592 -0.44957151]
