## Group_29 Project Phase 4

### Part B. Classification (Supervised Learning)

#### onnecting to DB

In this first part, we'll be connecting to our database and fetching our labels and features

In [1]:
import sys
sys.path.insert(1, '../lib/python3.9/site-packages')
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
import matplotlib as mp
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score
#Can be very helpful to notice any imbalance in classes
from collections import Counter 
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn.svm import SVC
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns

connect to PSQL server

In [2]:
def config(filename='../database.ini', section='postgresql'):
    parser = ConfigParser()
    # read config file
    parser.read(filename) 
 
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
 
    return db

Get the configuration file as a python dictionary

In [3]:
cfg = config()

Establish the connection and create a cursor to the database

In [4]:
try:
    print("Attempting to connect to the database")
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("Connection Successful")
    
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Attempting to connect to the database
Connection Successful


We'll now attempt to get our data

In [5]:
try:
    #Lets get our data 
    cursor.execute('''
                    select p.outbreak_related, p.age_group, s.special_measures_key, m.grocery_and_pharmarcy, m.parks,
                    f.total_resolved, f.total_unresolved, f.total_fatal
                    from  covid_19_tracking_fact_table f,
                    patient_dimension p, special_measures_dimension s, mobility_dimension m
                    where f.patient_key =p.patient_key and f.special_measures_key = s.special_measures_key
                    and f.mobility_key = m.mobility_key
                    group by (p.outbreak_related,  p.age_group, s.special_measures_key, m.grocery_and_pharmarcy, m.parks,
                    f.total_resolved, f.total_unresolved, f.total_fatal)''')



    #Get the complete result set. It will be a list of tuples where each tuple is a row from the result set
    result_list = cursor.fetchall()
        
except (Exception, psycopg2.DatabaseError) as error:
    print(error)
    print("Rolling back...")
    conn.rollback()
    print("Rolled back successfully")

In [6]:
#Ensure to run this cell at the end of all your experiments to close all connections
cursor.close()
conn.close()

In [7]:
df = pd.DataFrame(result_list, columns=["outbreak_related","age_group","lockdown_type", "grocery_and_pharmacy",
                                               "parks","total_resolved", "total_unresolved","total_fatal"])

In [8]:
df.head()

Unnamed: 0,outbreak_related,age_group,lockdown_type,grocery_and_pharmacy,parks,total_resolved,total_unresolved,total_fatal
0,False,20s,1,-15,-36,1,0,0
1,False,20s,1,-15,-36,2,0,0
2,False,20s,1,-14,-18,1,0,0
3,False,20s,1,-14,-18,2,0,0
4,False,20s,1,-13,-31,1,0,0


#### preprocess the dataset

In [9]:
def outbreak(x):
    if x == True:
        return 1
    else:
        return 0

In [10]:
df['outbreak_related'] = df['outbreak_related'].apply(outbreak)

##### set up label

In [11]:
def age_group(x):
    if x == '<20':
        return 0
    elif x == '20s':
        return 1
    elif x == '30s':
        return 2
    elif x == '40s':
        return 3
    elif x == '50s':
        return 4
    elif x == '60s':
        return 5
    elif x == '70s':
        return 6
    elif x == '80s':
        return 7
    else:
        return 8

In [12]:
df['label'] = df['age_group'].apply(age_group)

In [13]:
label = pd.DataFrame()
label = df[['label']]

In [14]:
df.drop(['age_group'],axis=1,inplace = True)

In [15]:
df.head()

Unnamed: 0,outbreak_related,lockdown_type,grocery_and_pharmacy,parks,total_resolved,total_unresolved,total_fatal,label
0,0,1,-15,-36,1,0,0,1
1,0,1,-15,-36,2,0,0,1
2,0,1,-14,-18,1,0,0,1
3,0,1,-14,-18,2,0,0,1
4,0,1,-13,-31,1,0,0,1
...,...,...,...,...,...,...,...,...
23002,1,11,6,77,1,0,0,0
23003,1,11,8,2,1,0,0,0
23004,1,11,9,-24,1,0,0,0
23005,1,11,9,106,1,0,0,0


#### split it to train and test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df, label['label'], test_size=0.2)

### Random Forest 

In [17]:
start = time.time()

In [18]:
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [19]:
stop = time.time()

In [20]:
precision, recall, fscore, support = score(y_test, y_pred, average='micro')

In [21]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Random Forest Model: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))
print(f"Training time: {stop - start}s")

[[622   0   0   0   0   0   0   0   0]
 [  0 716   0   0   0   0   0   0   0]
 [  0   0 550   0   1   2   0   0   0]
 [  0   0   0 572   3   2   0   0   0]
 [  0   0   0   0 571   6   0   0   0]
 [  0   0   0   0   1 510   2   1   0]
 [  0   0   0   0   1   7 365   3   0]
 [  0   0   0   0   0   0  12 383   1]
 [  0   0   0   0   0   0   0  29 242]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       622
           1       1.00      1.00      1.00       716
           2       1.00      0.99      1.00       553
           3       1.00      0.99      1.00       577
           4       0.99      0.99      0.99       577
           5       0.97      0.99      0.98       514
           6       0.96      0.97      0.97       376
           7       0.92      0.97      0.94       396
           8       1.00      0.89      0.94       271

    accuracy                           0.98      4602
   macro avg       0.98      0.98      0.98      460

### Gradient Boosting

In [22]:
start = time.time()

In [23]:
gbt = GradientBoostingClassifier(n_estimators=50, learning_rate = 0.2, max_depth=20)
gbt_model = gbt.fit(X_train, y_train)
y_pred = gbt_model.predict(X_test)

In [24]:
stop = time.time()

In [25]:
precision, recall, fscore, support = score(y_test, y_pred, average='micro')

In [26]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Gradient Boosting: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))
print(f"Training time: {stop - start}s")

[[622   0   0   0   0   0   0   0   0]
 [  0 716   0   0   0   0   0   0   0]
 [  0   0 553   0   0   0   0   0   0]
 [  0   0   0 577   0   0   0   0   0]
 [  0   0   0   0 577   0   0   0   0]
 [  0   0   0   0   0 514   0   0   0]
 [  0   0   0   0   0   0 376   0   0]
 [  0   0   0   0   0   0   0 396   0]
 [  0   0   0   0   0   0   0   0 271]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       622
           1       1.00      1.00      1.00       716
           2       1.00      1.00      1.00       553
           3       1.00      1.00      1.00       577
           4       1.00      1.00      1.00       577
           5       1.00      1.00      1.00       514
           6       1.00      1.00      1.00       376
           7       1.00      1.00      1.00       396
           8       1.00      1.00      1.00       271

    accuracy                           1.00      4602
   macro avg       1.00      1.00      1.00      460

### Decision Tree

In [27]:
start = time.time()

In [28]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [29]:
stop = time.time()

In [30]:
precision, recall, fscore, support = score(y_test, y_pred, average='micro')

In [31]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Decision Tree: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

print(f"Training time: {stop - start}s")

[[622   0   0   0   0   0   0   0   0]
 [  0 716   0   0   0   0   0   0   0]
 [  0   0 553   0   0   0   0   0   0]
 [  0   0   0 577   0   0   0   0   0]
 [  0   0   0   0 577   0   0   0   0]
 [  0   0   0   0   0 514   0   0   0]
 [  0   0   0   0   0   0 376   0   0]
 [  0   0   0   0   0   0   0 396   0]
 [  0   0   0   0   0   0   0   0 271]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       622
           1       1.00      1.00      1.00       716
           2       1.00      1.00      1.00       553
           3       1.00      1.00      1.00       577
           4       1.00      1.00      1.00       577
           5       1.00      1.00      1.00       514
           6       1.00      1.00      1.00       376
           7       1.00      1.00      1.00       396
           8       1.00      1.00      1.00       271

    accuracy                           1.00      4602
   macro avg       1.00      1.00      1.00      460

## Part C. Detecting Outliers SVC

#### DBSCAN

In [32]:
outlier_detection = DBSCAN(min_samples = 2, eps = 3)
clusters = outlier_detection.fit_predict(df)
list(clusters).count(-1)

133

#### SVC

In [33]:
df['label'] = label['label']

In [34]:
svclassifier = SVC()
svclassifer_model = svclassifier.fit(X_train, y_train)
y_pred = svclassifer_model.predict(X_test)