In [1]:
# dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
import math
import time
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
import operator
plt.style.use('seaborn')
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.svm import SVC 

##### Read dataset from PostgreSQL

In [2]:
import os
from sqlalchemy import create_engine
# Create a POSTGRES database with the name 'COVID19_db'
# Replace username:password if it's not set to postgres:postgres
DATABASE_URI = os.environ.get('DATABASE_URL', '') or "postgresql://postgres:password@localhost:5432/COVID19_db"
print(DATABASE_URI)

engine = create_engine(DATABASE_URI)
case_data=pd.read_sql('select * from case_study_all_v', con=engine)

postgresql://postgres:password@localhost:5432/COVID19_db


In [3]:
case_data.head()

Unnamed: 0,status,id,age,gender,final_outcome,age_0_39,age_40_49,age_50_59,age_60_69,age_70_79,...,malaise,breath,fatigue,diarrhea,headache,throat_ache,soreness,precondition,visit_hotspot,from_hotspot
0,open,7289.0,,,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,open,9874.0,,,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,open,6832.0,,,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,open,12994.0,,,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,open,1768.0,,,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
feature_names=case_data.columns
feature_names=feature_names.drop('final_outcome')

In [5]:
case_data=case_data.drop(["status","id","age","gender"],axis=1)

In [6]:
data=case_data.drop("final_outcome",axis=1)
target=case_data["final_outcome"]
print(data.shape,target.shape)

(14126, 23) (14126,)


In [7]:
target_names="final_outcome"

###### Train Test Split

In [11]:
X=data
y=target

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [14]:
X_train.head()
y_train

13292    3
8381     3
10123    3
1494     3
2680     3
        ..
9964     3
206      3
5605     3
378      3
2079     3
Name: final_outcome, Length: 10594, dtype: int64

#### Preprocessing

In [15]:
# Scale your data
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import MinMaxScaler
X_scaler = StandardScaler().fit(X_train)

In [16]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Random Forest 

In [19]:
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X_train_scaled, y_train)

In [21]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 0.9613932414574288
Testing Data Score: 0.9467723669309174


#### Model Tuning

In [None]:

# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [100, 200,500,1000],
              'min_impurity_decrease': [0,0.0000001,0.000001,0.0001,0.0001]}
rf = RandomForestClassifier()
grid = GridSearchCV(rf, param_grid, verbose=3)

In [None]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)


In [25]:
print(grid.best_params_)
print(grid.best_score_)

{'min_impurity_decrease': 1e-06, 'n_estimators': 100}
0.9461015669246744


In [26]:
# {'min_impurity_decrease': 1e-06, 'n_estimators': 100} 0.9461015669246744
best_rf=RandomForestClassifier(min_impurity_decrease=0.000001,n_estimators=100)
best_rf.fit(X_train_scaled, y_train)
best_rf.score(X_train_scaled, y_train)

0.9611100622994148

In [31]:
best_rf.score(data,target)

0.9326065411298315

In [27]:
# Random Forests in sklearn will automatically calculate feature importance
importances = best_rf.feature_importances_
importances

array([0.07321724, 0.06534263, 0.0435299 , 0.03594911, 0.02250893,
       0.01479305, 0.15422489, 0.11514029, 0.04208734, 0.09615871,
       0.03544173, 0.00407798, 0.00460544, 0.01326235, 0.01016987,
       0.01035086, 0.00296242, 0.00743821, 0.01137377, 0.00826462,
       0.00456364, 0.12902681, 0.09551022])

In [44]:
# We can sort the features by their importance
sorted(zip(best_rf.feature_importances_, feature_names), reverse=True)

[(0.15422488819158756, 'age_50_59'),
 (0.12902681124669735, 'headache'),
 (0.11514028685796932, 'age_60_69'),
 (0.0961587091871975, 'age_80_up'),
 (0.09551022011724056, 'throat_ache'),
 (0.0732172422009843, 'status'),
 (0.06534263499257414, 'id'),
 (0.043529898909291126, 'age'),
 (0.042087343289645904, 'age_70_79'),
 (0.035949106891628675, 'gender'),
 (0.035441731883560035, 'gender_male'),
 (0.022508929654449398, 'age_0_39'),
 (0.01479304912944536, 'age_40_49'),
 (0.013262351964355793, 'fever'),
 (0.011373768367424202, 'breath'),
 (0.010350856393305798, 'sputum'),
 (0.01016986853658362, 'cough'),
 (0.008264622120320479, 'fatigue'),
 (0.007438208624668093, 'malaise'),
 (0.004605435052695855, 'pneumonia'),
 (0.004563637608969157, 'diarrhea'),
 (0.004077981342956446, 'gender_female'),
 (0.0029624174364492667, 'chills')]

### Decision Tree

In [33]:
# Create and score a decision tree classifier
clf = tree.DecisionTreeClassifier(min_impurity_decrease=0.00001, min_samples_split=4)
clf = clf.fit(X_train_scaled , y_train)


In [34]:
print(f"Training Data Score: {clf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {clf.score(X_test_scaled, y_test)}")

Training Data Score: 0.9598829526146876
Testing Data Score: 0.9476217440543602


#### Model Tuning

In [35]:
# Create the GridSearchCV model
param_grid = {'min_samples_split': [2, 4, 8, 16],
              'min_impurity_decrease': [0, 0.00001, 0.0001, 0.001]}
grid = GridSearchCV(clf, param_grid, verbose=3)

In [36]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] min_impurity_decrease=0, min_samples_split=2 ....................
[CV]  min_impurity_decrease=0, min_samples_split=2, score=0.941, total=   0.0s
[CV] min_impurity_decrease=0, min_samples_split=2 ....................
[CV]  min_impurity_decrease=0, min_samples_split=2, score=0.948, total=   0.0s
[CV] min_impurity_decrease=0, min_samples_split=2 ....................
[CV]  min_impurity_decrease=0, min_samples_split=2, score=0.940, total=   0.0s
[CV] min_impurity_decrease=0, min_samples_split=4 ....................
[CV]  min_impurity_decrease=0, min_samples_split=4, score=0.941, total=   0.0s
[CV] min_impurity_decrease=0, min_samples_split=4 ....................
[CV]  min_impurity_decrease=0, min_samples_split=4, score=0.948, total=   0.0s
[CV] min_impurity_decrease=0, min_samples_split=4 ....................
[CV]  min_impurity_decrease=0, min_samples_split=4, score=0.938, total=   0.0s
[CV] min_impurity_decrease=0, min_sampl

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    0.7s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=1e-05,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=4,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'min_impurity_decrease': [0, 1e-05, 0.0001, 0.001],
                         'min_samples_split': [2, 4,

In [37]:
print(grid.best_params_)
print(grid.best_score_)

{'min_impurity_decrease': 0, 'min_samples_split': 16}
0.9441193128185765


In [40]:
# {'min_impurity_decrease': 0, 'min_samples_split': 16} 0.9441193128185765
best_clf=RandomForestClassifier(min_impurity_decrease=0,min_samples_split=16)
best_clf.fit(X_train_scaled, y_train)
best_clf.score(X_train_scaled, y_train)



0.9562960166131773

In [41]:
best_clf.score(data,target)

0.9326065411298315

In [43]:

sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.3182950537701415, 'age_50_59'),
 (0.2592677927219476, 'age_60_69'),
 (0.12498391800669037, 'headache'),
 (0.04973248206832567, 'age_70_79'),
 (0.043177846121547396, 'throat_ache'),
 (0.025283879547241775, 'gender_male'),
 (0.021095693660446127, 'age_0_39'),
 (0.020491209372630437, 'age_80_up'),
 (0.018461060101922697, 'status'),
 (0.01801249762737346, 'id'),
 (0.014714501024186729, 'age_40_49'),
 (0.014045516625961521, 'fever'),
 (0.013147352829884726, 'gender'),
 (0.012073018964887883, 'age'),
 (0.009800257909439156, 'breath'),
 (0.008280674763772252, 'sputum'),
 (0.006254673092105946, 'fatigue'),
 (0.005719346243255835, 'cough'),
 (0.005410286940051453, 'gender_female'),
 (0.004464220761714095, 'diarrhea'),
 (0.003164603850623587, 'malaise'),
 (0.002478389294911084, 'chills'),
 (0.0016457247009388494, 'pneumonia')]

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'min_samples_split': [2, 4, 8, 16],
              'min_impurity_decrease': [0, 0.00001, 0.000001]}
grid = GridSearchCV(clf, param_grid, verbose=3)

In [None]:
grid.fit(data, target)
grid.score(data, target)

In [None]:
# Create a decision tree graph
import graphviz 
dot_data = tree.export_graphviz(
    clf, out_file=None, 
    feature_names=feature_names,  
    class_names=target_names,  
    filled=True, rounded=True,  
    special_characters=True)  

import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('case_data.png')

graph = graphviz.Source(dot_data)  
graph 

### K-nearest neighbour

In [None]:
X_train.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

X_scaler = StandardScaler().fit(X_train.reshape(-1, 1))

In [None]:
# Transform the training and testing data using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Loop through different k values to see which has the highest accuracy
# Note: We only use odd numbers because we don't want any ties
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
plt.plot(range(1, 20, 2), train_scores, marker='o')
plt.plot(range(1, 20, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

In [None]:
# Note that k: 9 provides the best accuracy where the classifier starts to stablize
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_scaled, y_train)
print('k=9 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

In [None]:
>>> zero_count = random.randint(0, 20)
>>> one_count = 40 - zero_count

>>> my_list = [0]*zero_count + [1]*one_count
>>> random.shuffle(my_list)
>>> my_list

In [None]:
import random
zero_count = random.randint(0, 24)
one_count = 23 - zero_count
my_list = [0]*zero_count + [1]*one_count

random.shuffle(my_list)
my_list

In [None]:
new_case_data=[my_list]
predicted_class=knn.predict(new_case_data)
print(predicted_class)

### SVM

In [None]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [None]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

In [None]:
target_names=['Death','Hospital','Stay at Home']

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))