In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn import linear_model
from sklearn.metrics import f1_score,roc_curve,precision_score,recall_score,accuracy_score,auc
from collections import defaultdict
from sklearn import metrics
import random
random.seed(10)

One dataset is held out. All the other datasets are concatenated. We implement a Logistic Regression on the concatenated data and then test out on the held out the data.

In [2]:
TRAIN_DIRECTORY = "../Data/With_Improved_Target/Symptoms/Total/"
TEST_DIRECTORY = "../Data/Symptoms/Test/"

In [3]:
def read_files(filename):
    return pd.read_csv(filename)

In [4]:
def get_training_data(train_files_,train_directory):
    data = []
    for i in train_files_:
        data.append(read_files(train_directory+i))
    training_data = pd.concat(data)
    x_train = training_data.drop(['virus'],axis = 1)
    y_train = training_data['virus']
    return x_train,y_train

In [5]:
def get_test_data(test_file_,test_directory):
    data = read_files(test_directory+test_file_)
    x_test = data.drop(['virus'],axis = 1)
    y_test = data['virus']
    return x_test,y_test

In [6]:
def linear_regression_model(x_train,x_test,y_train,y_test):
    lm = linear_model.LogisticRegression()
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)
    fpr,tpr,threshold = roc_curve(y_test,y_pred)
    auc_score = auc(fpr,tpr)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    return auc_score,accuracy,precision,recall,f1

In [7]:
def predict_on_unknown(train_files,test_file,train_directory = TRAIN_DIRECTORY,test_directory = TEST_DIRECTORY):
    results = defaultdict()
    x_train,y_train = get_training_data(train_files,train_directory)
    x_test,y_test = get_test_data(test_file,test_directory)
    auc_score,accuracy,precision,recall,f1 = linear_regression_model(x_train,x_test,y_train,y_test)
    results['AUC'] = auc_score
    results['Accuracy'] = accuracy
    results['Precision'] = precision
    results['Recall'] = recall
    results['F1'] = f1
    for k,v in results.items():
        print(k,"\t",v)
    return results

### Predict on NYUMC

In [8]:
train_files = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
test_file = 'nyumc.csv'

In [9]:
print("NYUMC!")
results_nyumc = predict_on_unknown(train_files,test_file)

NYUMC!
AUC 	 0.565581830953
Accuracy 	 0.912805295595
Precision 	 0.0696202531646
Recall 	 0.2
F1 	 0.103286384977


### Predict on GoViral

In [10]:
train_files = ['nyumc.csv','fluwatch.csv','hutterite.csv','hongkong.csv']
test_file = 'goviral.csv'

In [11]:
print("GoViral!")
results_goviral = predict_on_unknown(train_files,test_file)

GoViral!
AUC 	 0.591666666667
Accuracy 	 0.605769230769
Precision 	 0.650793650794
Recall 	 0.683333333333
F1 	 0.666666666667


### Predict on FluWatch

In [12]:
train_files = ['nyumc.csv','goviral.csv','hongkong.csv','hutterite.csv']
test_file = 'fluwatch.csv'
print("FluWatch!")
results_fluwatch = predict_on_unknown(train_files,test_file)

FluWatch!
AUC 	 0.563760452533
Accuracy 	 0.590163934426
Precision 	 0.631147540984
Recall 	 0.719626168224
F1 	 0.672489082969


### Predict on HongKong

In [13]:
train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hutterite.csv']
test_file = 'hongkong.csv'
print("HongKong")
results_hongkong = predict_on_unknown(train_files,test_file)

HongKong
AUC 	 0.762028350693
Accuracy 	 0.839556004036
Precision 	 0.661971830986
Recall 	 0.618421052632
F1 	 0.639455782313


### Predict on Hutterite

In [14]:
train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv']
test_file = 'hutterite.csv'
print("Hutterite!")
results_hutterite = predict_on_unknown(train_files,test_file)

Hutterite!
AUC 	 0.577288031833
Accuracy 	 0.578125
Precision 	 0.552845528455
Recall 	 0.561983471074
F1 	 0.55737704918
