In [58]:
#imports
from pathlib import Path
import numpy as np
import pandas as pd
import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, precision_score
from libsvm.svmutil import *
from libsvm.svm import *

# Data Prep

In [59]:
data_path = "../data"
output_path = "./output"

csv_name = 'frequencies.csv'
csv_path = Path(data_path, csv_name)

train_file_name = Path(output_path, "freq_train") 
test_file_name = Path(output_path, "freq_test") 

df = pd.read_csv(csv_path)

In [60]:
#remove index and get outcome from csv
df = df.drop("Unnamed: 0", axis = 1)
y = df['hasHospitilization']
x = df.drop(['hasHospitilization'], axis=1)

#split into training and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

convert_outcome = lambda x: -2*x+1

#convert outcomes to LibSVM format: -1 (anomaly) and 1(normal)
y_train_convert = convert_outcome(y_train)
y_test_convert = convert_outcome(y_test)

#scale data from 0 to 1 to avoid domination of one factor
scaler = MinMaxScaler(copy = False)
scaler.fit_transform(x_train)
scaler.transform(x_test)

array([[0.91803279, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.16836735, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.37755102, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06557377, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31967213, 0.        , 0.        , ..., 0.        , 0.        ,
        0.112     ],
       [0.        , 0.        , 0.35714286, ..., 0.        , 0.        ,
        0.        ]])

In [61]:
outcomes_train = y_train_convert.to_list()
outcomes_test = y_test_convert.to_list()

input_train = x_train.values.tolist()
input_test = x_test.values.tolist()

train_prob = svm_problem(outcomes_train,input_train)

#params:
#  -s 2: Choose OCSVM model
#  -b 1: Choose to predict probability
#  -n:   nu hyperparameter, upper limit of incorrect labels, lower means less tolerance
#  -g:   gamma hyperparameter, determines similarity required to be in same class, higher means more curvature
#  -h:   Use shrinking heuristic or not
params = svm_parameter('-s 2 -b 1 -n 0.01 -g 0.0001 -h 0')

#train and save model
model = svm_train(train_prob, params)

p_labels, p_acc, p_vals = svm_predict(outcomes_test, input_test, model, '-b 1')

Accuracy = 98.7384% (9235/9353) (classification)


In [62]:
pred_outcomes = np.array(p_labels) < 0
actual_outcomes = np.array(y_test_convert) < 0

#numpy wizardy lets you find number of bools based on sum of bools
num_gauss_preds = pred_outcomes.sum()
real_positives = actual_outcomes.sum()

gauss_pred_pos_percent = num_gauss_preds/pred_outcomes.shape[0]
real_pos_percentage = real_positives/actual_outcomes.shape[0]


#find positivity rate for prediction and actual outcomes
print("Number of Predicted Positives:", num_gauss_preds)
print("Actual Positives in Dataset:", real_positives)

print("Percentage of Predicted Positives:",gauss_pred_pos_percent)
print("Percentage of Actual Positives in Test Dataset", real_pos_percentage)

#compute anomaly detection metrics: precision, recall, and f1 score
prec_score = precision_score(actual_outcomes, pred_outcomes, average = "binary")
rec_score = recall_score(actual_outcomes, pred_outcomes, average = "binary")
f_score = f1_score(actual_outcomes, pred_outcomes, average = "binary")

print("Precision Score: " + str(prec_score))
print("Recall Score: " + str(rec_score))
print("F1 Score: " + str(f_score))

Number of Predicted Positives: 86
Actual Positives in Dataset: 32
Percentage of Predicted Positives: 0.009194910723831926
Percentage of Actual Positives in Test Dataset 0.0034213621297979257
Precision Score: 0.0
Recall Score: 0.0
F1 Score: 0.0
