# Clean Data

Run the code in this section once, and then run the code in the SVM section. The Clean Data section is slow to run, so its outputs are saved as CSV files that the SVM section loads later.

In [79]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.impute import KNNImputer
from sklearn.neural_network import MLPClassifier
from sklearn import svm

In [23]:
# Read the raw test records; the literal string 'None' is treated as a missing value
df = pd.read_csv('./Datasets/corona_tested_individuals.csv', low_memory=False, na_values='None')
# The test date column is discarded before any further processing
df = df.drop(columns='test_date')
# Remove rows with an invalid COVID result (invalid results are listed as 'other');
# the remaining rows keep their original index labels
df = df.drop(index=df.index[df['corona_result'] == 'other'])
df

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,0.0,0.0,0.0,0.0,0.0,negative,,female,Other
1,1.0,0.0,0.0,0.0,0.0,negative,,female,Other
2,0.0,1.0,0.0,0.0,0.0,negative,,male,Other
3,1.0,0.0,0.0,0.0,0.0,negative,,female,Other
4,1.0,0.0,0.0,0.0,0.0,negative,,male,Other
...,...,...,...,...,...,...,...,...,...
278842,0.0,0.0,0.0,0.0,0.0,negative,,,Other
278843,0.0,0.0,0.0,0.0,0.0,negative,,,Other
278844,0.0,0.0,0.0,0.0,0.0,negative,,,Other
278845,0.0,0.0,0.0,0.0,0.0,positive,,,Contact with confirmed


In [24]:
# Lookup table that turns the text categories of the dataset into numeric codes.
# It is applied to every column, so the numeric entries at the end let values
# that are already 0/1 pass through unchanged.
value_map = {
    # corona_result
    'negative': 0, 'positive': 1,
    # age_60_and_above
    'No': 0, 'Yes': 1,
    # gender
    'male': 0, 'female': 1,
    # test_indication
    'Other': 0, 'Contact with confirmed': 1, 'Abroad': 2,
    # already-numeric values
    0: 0, 1: 1,
}

In [25]:
# Encode every column through value_map, one column at a time.
# Series.map leaves NaN in place of any value that has no entry in the map.
cols = df.columns
for column_name in cols:
    encoded_column = df[column_name].map(value_map)
    df[column_name] = encoded_column


In [26]:
# Separate the target column (as an (N, 1) array) from the feature matrix
labels = df['corona_result'].to_numpy()[:, np.newaxis]
features = df.drop(columns='corona_result').to_numpy()

# Fill the missing feature values with a default-configured KNNImputer.
# The labels are handed to fit() as `y`.
# NOTE(review): scikit-learn documents `y` as ignored by KNNImputer - confirm.
imputer = KNNImputer()
features = imputer.fit(features, labels).transform(features)

print("Imputation Done")

Imputation Done


In [27]:
# Round the imputed features to whole numbers and store them as integers,
# then write features and labels to the CSV files read back by load_data()
features = np.round(features).astype(int)
np.savetxt(fname = "./Datasets/features_filled.csv", X = features, delimiter = ',', fmt = '%d')
np.savetxt(fname = "./Datasets/labels_filled.csv", X = labels, delimiter = ',', fmt = '%d')

# SVM

In [83]:
def load_data():
    """Load the cleaned dataset from the two comma-separated files on disk.

    Reads ``./Datasets/features_filled.csv`` and
    ``./Datasets/labels_filled.csv`` with ``np.genfromtxt``.

    Returns
    -------
    tuple
        ``(labels, features)`` - note that the labels come first.
    """
    csv_paths = ("./Datasets/features_filled.csv", "./Datasets/labels_filled.csv")
    # Files are parsed in the order listed: features first, then labels
    features, labels = (np.genfromtxt(path, delimiter = ',') for path in csv_paths)
    return labels, features

def undersample(labels, features, ratio):
    """Randomly discard negative samples to reduce class imbalance.

    After the call, the number of negative samples (label 0) is at most
    ``int(number_of_positives * ratio)``. Positive samples (label 1) are
    never removed. If fewer negatives exist than requested, all of them are
    kept instead of failing.

    Parameters
    ----------
    labels : array of 0/1 class labels, one per row of ``features``.
    features : 2-D array whose rows correspond to ``labels``.
    ratio : non-negative number; desired negatives-per-positive ratio.

    Returns
    -------
    tuple
        ``(labels, features)`` with the selected negative rows removed.
        ``labels`` is returned as a flattened array (``np.delete`` is called
        without an axis).

    Raises
    ------
    ValueError
        If ``ratio`` is negative.

    Notes
    -----
    The rows to drop are drawn with ``np.random.choice``, so the result
    depends on NumPy's global random state.
    """
    if ratio < 0:
        raise ValueError("ratio must be non-negative")

    negative_idx = np.where(labels == 0)[0]
    n_positive = int(np.sum(labels == 1))

    # Clamp the number of negatives to keep: asking for more negatives than
    # exist used to produce a negative sample size and crash np.random.choice.
    n_keep = min(int(n_positive * ratio), negative_idx.size)
    n_drop = negative_idx.size - n_keep

    if n_drop > 0:
        drop_idx = np.random.choice(negative_idx, size = n_drop, replace = False)
    else:
        # Nothing to remove; still go through np.delete so copies are returned
        drop_idx = np.empty(0, dtype = int)

    features = np.delete(features, drop_idx, axis=0)
    labels = np.delete(labels, drop_idx)

    return labels, features

def resample(labels, features):
    """Shuffle labels and features together with ``sklearn.utils.shuffle``.

    Despite its name, this function does not draw a new sample; it only
    reorders the data. No ``random_state`` is passed, so the order is not
    reproducible between calls.

    Returns
    -------
    tuple
        ``(labels, features)`` in shuffled order.
    """
    shuffled_labels, shuffled_features = sklearn.utils.shuffle(labels, features)
    return shuffled_labels, shuffled_features

def confusion_matrix(labels_predict, labels_test):
    """Count binary classification outcomes for one set of predictions.

    Parameters
    ----------
    labels_predict : array of predicted labels.
    labels_test : array of true labels, where 1 is positive and 0 is negative.

    Returns
    -------
    ndarray
        2x2 array laid out as ``[[TP, FP], [FN, TN]]``.
    """
    is_correct = labels_predict == labels_test
    is_wrong = labels_predict != labels_test
    truly_positive = labels_test == 1
    truly_negative = labels_test == 0

    # Each cell combines "was the prediction right?" with the true class
    TP = np.sum(is_correct & truly_positive)
    TN = np.sum(is_correct & truly_negative)
    FP = np.sum(is_wrong & truly_negative)
    FN = np.sum(is_wrong & truly_positive)

    return np.array([[TP, FP], [FN, TN]])

def cross_validate(labels, features, hyperparams, kfold = 10):
    """Run k-fold cross-validation of ``svm.SVC`` and collect confusion matrices.

    The data are cut into ``kfold`` contiguous folds of equal size, in the
    order given (no shuffling happens here). Trailing samples that do not fit
    into equal folds are dropped. For each fold, the remaining folds form the
    training set, which is undersampled before fitting; the held-out fold is
    used as-is for testing. Progress is printed to stdout.

    Parameters
    ----------
    labels : array of 0/1 class labels, one per row of ``features``.
    features : 2-D array of shape (N, D).
    hyperparams : sequence ``[undersample_ratio, regularization, gamma]``;
        the last two are passed to ``svm.SVC`` as ``C`` and ``gamma``.
    kfold : number of folds.

    Returns
    -------
    ndarray
        Integer array of shape ``(kfold, 2, 2)``; entry ``i`` is the confusion
        matrix of fold ``i`` as produced by ``confusion_matrix``.
    """
    print("For hyperparameters: ", hyperparams)
    print("Fitted fold:", end = '')

    #Separate hyperparameters
    undersample_ratio = hyperparams[0]
    regularization = hyperparams[1]
    gamma = hyperparams[2]

    #Remove last few data points to make total number of datapoints divisible by kfold
    N = features.shape[0]
    N -= N % kfold
    features = features[0 : N, :]
    labels = labels[0 : N]

    #Number of data points per fold
    step = N // kfold

    #Confusion matrices for each fold
    confusion_matrices = np.zeros((kfold, 2, 2), dtype = int)

    #Run cross-validation
    for i in range(kfold):

        #Mark the held-out fold; N is a multiple of kfold, so every fold
        #(including the last) spans exactly `step` samples
        test_mask = np.zeros(N, dtype = bool)
        test_mask[i * step : (i + 1) * step] = True

        features_train = features[~test_mask]
        labels_train = labels[~test_mask]
        features_test = features[test_mask]
        labels_test = labels[test_mask]

        #Undersample training data only; the test fold keeps its class balance
        labels_train, features_train = undersample(labels_train, features_train, undersample_ratio)

        #Fit SVC
        clf = svm.SVC(C = regularization, gamma = gamma)
        clf.fit(features_train, labels_train)

        #Test on testing data
        labels_predict = clf.predict(features_test)

        #Get confusion matrix
        confusion_matrices[i, :, :] = confusion_matrix(labels_predict, labels_test)

        print(" " + str(i + 1), end = '')
    print("\n")
    return confusion_matrices
    

def performance_metric(confusion_matricies, kfold = None):
    """Average per-fold confusion matrices and derive summary metrics.

    Parameters
    ----------
    confusion_matricies : array of shape ``(n_folds, 2, 2)`` whose entries are
        laid out as ``[[TP, FP], [FN, TN]]``.
    kfold : divisor used for the average. Defaults to the number of matrices
        supplied, so the average is correct for any fold count; an explicit
        value is still honoured.

    Returns
    -------
    tuple
        ``(CM_avg, metrics)`` where ``CM_avg`` is the 2x2 averaged confusion
        matrix and ``metrics`` is a ``(1, 4)`` array holding
        ``[recall, precision, F1, accuracy]``.

    Notes
    -----
    The metrics are ratios of the averaged counts, so they do not depend on
    ``kfold``. Zero denominators are not guarded against.
    """
    confusion_matricies = np.asarray(confusion_matricies)
    if kfold is None:
        # Previously fixed at 10, which mis-scaled CM_avg for other fold counts
        kfold = confusion_matricies.shape[0]

    CM_avg = np.sum(confusion_matricies, axis = 0) / kfold

    TP = CM_avg[0, 0]
    FP = CM_avg[0, 1]
    FN = CM_avg[1, 0]
    TN = CM_avg[1, 1]

    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    F1 = 2 * precision * recall / (precision + recall)
    accuracy = (TP + TN) / (TP + TN + FP + FN)

    return CM_avg, np.array([[recall, precision, F1, accuracy]])

## Optimize Undersampling Hyperparameter

In [85]:
# Sweep the undersampling ratio with C = 1 and gamma = 'scale',
# collecting one row of [recall, precision, F1, accuracy] per ratio.
labels, features = load_data()
labels, features = resample(labels, features)

# Use the same fold count for cross-validation and for averaging the
# confusion matrices; otherwise CM_avg is scaled by the wrong divisor.
n_folds = 5

undersample_ratios = [1, 2, 3, 4, 5]
PMs = np.empty((0, 4))
for undersample_ratio in undersample_ratios:
    hyperparams = [undersample_ratio, 1, 'scale']
    CMs = cross_validate(labels, features, hyperparams, kfold = n_folds)
    CM_avg, PM = performance_metric(CMs, kfold = n_folds)
    PMs = np.append(PMs, PM, axis = 0)

For hyperparameters:  [1, 1, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [2, 1, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [3, 1, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [4, 1, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [5, 1, 'scale']
Fitted fold: 1 2 3 4 5



In [86]:
# Display the metric table (columns: recall, precision, F1, accuracy;
# one row per undersampling ratio) and save it with four decimals
print(PMs)
np.savetxt(fname = "./Datasets/undersampling_hyperparameter.csv", X = PMs, delimiter = ',', fmt = '%.4f')

[[0.79075294 0.33058954 0.466253   0.90301686]
 [0.73901826 0.47064165 0.57505877 0.94149224]
 [0.68572204 0.58493079 0.63132892 0.95709843]
 [0.65462693 0.67834529 0.66627509 0.96487062]
 [0.65102858 0.69129839 0.67055944 0.96573257]]


## Optimize Regularization Hyperparameter

In [87]:
# Sweep the SVC regularization strength C with undersampling ratio 4 and
# gamma = 'scale', collecting one row of [recall, precision, F1, accuracy]
# per value of C.
labels, features = load_data()
labels, features = resample(labels, features)

# Use the same fold count for cross-validation and for averaging the
# confusion matrices; otherwise CM_avg is scaled by the wrong divisor.
n_folds = 5

regularizations = [1, 10, 50, 100, 500, 1000, 5000]
PMs = np.empty((0, 4))
for regularization in regularizations:
    hyperparams = [4, regularization, 'scale']
    CMs = cross_validate(labels, features, hyperparams, kfold = n_folds)
    CM_avg, PM = performance_metric(CMs, kfold = n_folds)
    PMs = np.append(PMs, PM, axis = 0)

For hyperparameters:  [4, 1, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [4, 10, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [4, 50, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [4, 100, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [4, 500, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [4, 1000, 'scale']
Fitted fold: 1 2 3 4 5

For hyperparameters:  [4, 5000, 'scale']
Fitted fold: 1 2 3 4 5



In [89]:
# Display the metric table (columns: recall, precision, F1, accuracy;
# one row per regularization value) and save it with four decimals
print(PMs)
np.savetxt(fname = "./Datasets/regularization_hyperparameter.csv", X = PMs, delimiter = ',', fmt = '%.4f')

[[0.65483061 0.68008743 0.66722009 0.96500882]
 [0.65401589 0.68537887 0.66933018 0.96538343]
 [0.65360853 0.68402728 0.66847203 0.96527068]
 [0.65666372 0.67645825 0.66641403 0.96478333]
 [0.65659583 0.67506631 0.66570298 0.96467422]
 [0.65646004 0.67610657 0.66613848 0.9647506 ]
 [0.65605269 0.67905833 0.6673573  0.96496518]]


## Optimize Gamma

In [90]:
# Sweep the SVC gamma value with undersampling ratio 5 and C = 1.
# Both cross_validate and performance_metric run with their default fold count.
labels, features = resample(*load_data())

gammas = [1, 10, 50, 100, 200, 500, 1000]
PMs = np.empty((0, 4))
for gamma in gammas:
    hyperparams = [5, 1, gamma]
    CMs = cross_validate(labels, features, hyperparams)
    CM_avg, PM = performance_metric(CMs)
    # Stack this gamma's [recall, precision, F1, accuracy] row under the others
    PMs = np.vstack((PMs, PM))

For hyperparameters:  [5, 1, 1]
Fitted fold: 1 2 3 4 5 6 7 8 9 10

For hyperparameters:  [5, 1, 10]
Fitted fold: 1 2 3 4 5 6 7 8 9 10

For hyperparameters:  [5, 1, 50]
Fitted fold: 1 2 3 4 5 6 7 8 9 10

For hyperparameters:  [5, 1, 100]
Fitted fold: 1 2 3 4 5 6 7 8 9 10

For hyperparameters:  [5, 1, 200]
Fitted fold: 1 2 3 4 5 6 7 8 9 10

For hyperparameters:  [5, 1, 500]
Fitted fold: 1 2 3 4 5 6 7 8 9 10

For hyperparameters:  [5, 1, 1000]
Fitted fold: 1 2 3 4 5 6 7 8 9 10



In [62]:
# Display the metric table (columns: recall, precision, F1, accuracy;
# one row per gamma value) and save it with four decimals
print(PMs)
np.savetxt(fname = "./Datasets/gamma_hyperparameter.csv", X = PMs, delimiter = ',', fmt = '%.4f')

[[0.65211488 0.68862919 0.66987481 0.96556829]
 [0.65157173 0.69018339 0.67032199 0.96566648]
 [0.65292959 0.68800973 0.6700108  0.96554646]
 [0.65218277 0.68964032 0.67038872 0.96564466]
 [0.65252224 0.6892076  0.6703634  0.96562284]
 [0.65340485 0.68742857 0.66998503 0.96551737]
 [0.65245434 0.68809967 0.6698031  0.96553919]]
