In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn import linear_model
from sklearn.metrics import f1_score,roc_curve,precision_score,recall_score,accuracy_score,auc
from collections import defaultdict
from sklearn import metrics
import random
random.seed(10)

One dataset is held out. All the other datasets are concatenated. We implement a Logistic Regression on the concatenated data and then test out on the held out the data.

In [2]:
TRAIN_DIRECTORY = "./Data/Only_Symptoms/Total/"
#TEST_DIRECTORY = "../Data/Symptoms/Test/"

In [3]:
def read_files(filename):
    return pd.read_csv(filename)

In [26]:
def get_training_data(train_files_,train_directory):
    data = []
    for i in train_files_:
        data.append(read_files(train_directory+i))
    training_data = pd.concat(data)
    x_train = training_data.drop(['virus'],axis = 1)
    y_train = training_data['virus']
    return x_train,y_train

In [27]:
def get_test_data(test_file_,test_directory):
    data = read_files(test_directory+test_file_)
    x_test = data.drop(['virus'],axis = 1)
    y_test = data['virus']
    return x_test,y_test

In [28]:
def linear_regression_model(x_train,x_test,y_train,y_test):
    lm = linear_model.LogisticRegression()
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)
    fpr,tpr,threshold = roc_curve(y_test,y_pred)
    auc_score = auc(fpr,tpr)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    return auc_score,accuracy,precision,recall,f1

In [29]:
def predict_on_unknown(train_files,test_file,train_directory = TRAIN_DIRECTORY,test_directory = TRAIN_DIRECTORY):
    results = defaultdict()
    x_train,y_train = get_training_data(train_files,train_directory)
    x_test,y_test = get_test_data(test_file,train_directory)
    auc_score,accuracy,precision,recall,f1 = linear_regression_model(x_train,x_test,y_train,y_test)
    results['AUC'] = auc_score
    results['Accuracy'] = accuracy
    results['Precision'] = precision
    results['Recall'] = recall
    results['F1'] = f1
    for k,v in results.items():
        print(k,"\t",v)
    return results

### Predict on NYUMC

In [30]:
train_files = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
test_file = 'nyumc.csv'

In [31]:
print("NYUMC!")
results_nyumc = predict_on_unknown(train_files,test_file)

NYUMC!
AUC 	 0.19569142799819433
Recall 	 0.0
Precision 	 0.0
Accuracy 	 0.35618751997078557
F1 	 0.0


### Predict on GoViral

In [32]:
train_files = ['nyumc.csv','fluwatch.csv','hutterite.csv','hongkong.csv']
test_file = 'goviral.csv'

In [33]:
print("GoViral!")
results_goviral = predict_on_unknown(train_files,test_file)

GoViral!
AUC 	 0.6449076366692177
Recall 	 0.42955326460481097
Precision 	 0.7961783439490446
Accuracy 	 0.6192307692307693
F1 	 0.5580357142857143


### Predict on FluWatch

In [34]:
train_files = ['nyumc.csv','goviral.csv','hongkong.csv','hutterite.csv']
test_file = 'fluwatch.csv'
print("FluWatch!")
results_fluwatch = predict_on_unknown(train_files,test_file)

FluWatch!
AUC 	 0.5572356017758318
Recall 	 0.5943562610229277
Precision 	 0.6686507936507936
Accuracy 	 0.566120218579235
F1 	 0.6293183940242764


### Predict on HongKong

In [35]:
train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hutterite.csv']
test_file = 'hongkong.csv'
print("HongKong")
results_hongkong = predict_on_unknown(train_files,test_file)

HongKong
AUC 	 0.7346887172481743
Recall 	 0.5560842963970088
Precision 	 0.7303571428571428
Accuracy 	 0.807226483649576
F1 	 0.6314164415283674


### Predict on Hutterite

In [36]:
train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv']
test_file = 'hutterite.csv'
print("Hutterite!")
results_hutterite = predict_on_unknown(train_files,test_file)

Hutterite!
AUC 	 0.7139280000411545
Recall 	 0.6404066073697586
Precision 	 0.8275862068965517
Accuracy 	 0.697111631537861
F1 	 0.7220630372492838
