In [2]:
import pandas as pd
import numpy as np
import os
import time
from sklearn import linear_model
from sklearn.metrics import f1_score,roc_curve,precision_score,recall_score,accuracy_score,auc
from collections import defaultdict
from sklearn import metrics
import random
random.seed(10)

One dataset is held out. All the other datasets are concatenated. We implement a Logistic Regression on the concatenated data and then test out on the held out the data.

In [3]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Test/"

In [4]:
def read_files(filename):
    return pd.read_csv(filename)

In [5]:
def get_training_data(train_files_,train_directory):
    data = []
    for i in train_files_:
        data.append(read_files(train_directory+i))
    training_data = pd.concat(data)
    x_train = training_data.drop(['virus'],axis = 1)
    y_train = training_data['virus']
    return x_train,y_train

In [6]:
def get_test_data(test_file_,test_directory):
    data = read_files(test_directory+test_file_)
    x_test = data.drop(['virus'],axis = 1)
    y_test = data['virus']
    return x_test,y_test

In [7]:
def linear_regression_model(x_train,x_test,y_train,y_test):
    lm = linear_model.LogisticRegression()
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)
    fpr,tpr,threshold = roc_curve(y_test,y_pred)
    auc_score = auc(fpr,tpr)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    return auc_score,accuracy,precision,recall,f1

In [8]:
def predict_on_unknown(train_files,test_file,train_directory = TRAIN_DIRECTORY,test_directory = TEST_DIRECTORY):
    results = defaultdict()
    x_train,y_train = get_training_data(train_files,train_directory)
    x_test,y_test = get_test_data(test_file,test_directory)
    auc_score,accuracy,precision,recall,f1 = linear_regression_model(x_train,x_test,y_train,y_test)
    results['AUC'] = auc_score
    results['Accuracy'] = accuracy
    results['Precision'] = precision
    results['Recall'] = recall
    results['F1'] = f1
    for k,v in results.items():
        print(k,"\t",v)
    return results

### Predict on NYUMC

In [9]:
train_files = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
test_file = 'nyumc.csv'

In [10]:
print("NYUMC!")
results_nyumc = predict_on_unknown(train_files,test_file)

NYUMC!


FileNotFoundError: File b'../Data/Symptoms_Demo/Test/nyumc.csv' does not exist

### Predict on GoViral

In [16]:
train_files = ['goviral.csv']
test_file = 'goviral.csv'

In [17]:
print("GoViral!")
results_goviral = predict_on_unknown(train_files,test_file)

GoViral!
Recall 	 0.8470588235294118
Accuracy 	 0.8076923076923077
Precision 	 0.8089887640449438
F1 	 0.8275862068965517
AUC 	 0.8038111019055509


### Predict on FluWatch

In [18]:
train_files = ['fluwatch.csv']
test_file = 'fluwatch.csv'
print("FluWatch!")
results_fluwatch = predict_on_unknown(train_files,test_file)

FluWatch!
Recall 	 0.8488372093023255
Accuracy 	 0.7454545454545455
Precision 	 0.7684210526315789
F1 	 0.8066298342541436
AUC 	 0.710826371641454


### Predict on HongKong

In [19]:
train_files = ['hongkong.csv']
test_file = 'hongkong.csv'
print("HongKong")
results_hongkong = predict_on_unknown(train_files,test_file)

HongKong
Recall 	 0.9713024282560706
Accuracy 	 0.9791386271870794
Precision 	 0.9606986899563319
F1 	 0.9659714599341382
AUC 	 0.9769387262287129


### Predict on Hutterite

In [20]:
train_files = ['hutterite.csv']
test_file = 'hutterite.csv'
print("Hutterite!")
results_hutterite = predict_on_unknown(train_files,test_file)

Hutterite!
Recall 	 0.8529411764705882
Accuracy 	 0.7526041666666666
Precision 	 0.7718631178707225
F1 	 0.8103792415169662
AUC 	 0.7209911361804997
