In [10]:
import pandas as pd
import numpy as np
import os
import time
from sklearn import linear_model
from sklearn.metrics import f1_score,roc_curve,precision_score,recall_score,accuracy_score,auc
from collections import defaultdict
from sklearn import metrics
import random
random.seed(10)

One dataset is held out. All the other datasets are concatenated. We implement a Logistic Regression on the concatenated data and then test out on the held out the data.

In [11]:
symptoms = ['fever','cough','muscle','sorethroat','virus']

In [12]:
def read_files(filename):
    data = pd.read_csv(filename)
    data = data[symptoms]
    print(data.head())
    return data

In [13]:
def get_training_data(train_files_,train_directory):
    data = []
    for i in train_files_:
        print(train_directory+i)
        data.append(read_files(train_directory+i))
        
    training_data = pd.concat(data)
    x_train = training_data.drop(['virus'],axis = 1)
    y_train = training_data['virus']
    return x_train,y_train

In [14]:
def get_test_data(test_file_,test_directory):
    data = read_files(test_directory+test_file_)
    x_test = data.drop(['virus'],axis = 1)
    y_test = data['virus']
    return x_test,y_test

In [15]:
def linear_regression_model(x_train,x_test,y_train,y_test):
    lm = linear_model.LogisticRegression()
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)
    fpr,tpr,threshold = roc_curve(y_test,y_pred)
    auc_score = auc(fpr,tpr)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    return auc_score,accuracy,precision,recall,f1

In [16]:
def predict_on_unknown(train_files,test_file,train_directory,test_directory ):
    results = defaultdict()
    x_train,y_train = get_training_data(train_files,train_directory)
    x_test,y_test = get_test_data(test_file,test_directory)
    auc_score,accuracy,precision,recall,f1 = linear_regression_model(x_train,x_test,y_train,y_test)
    results['AUC'] = auc_score
    results['Accuracy'] = accuracy
    results['Precision'] = precision
    results['Recall'] = recall
    results['F1'] = f1
    for k,v in results.items():
        print(k,"\t",v)
    return results

### Predict on GoViral

In [20]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Goviral/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Goviral/Test/"

In [21]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['goviral.csv']
test_file = 'goviral.csv'

In [22]:
print("GoViral!")
results_goviral = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

GoViral!
../Data/Symptoms_Demo/Goviral/Train/goviral.csv
   fever  cough  muscle  sorethroat  virus
0      0      1       1           0      0
1      0      0       0           1      1
2      1      1       1           1      1
3      0      1       0           1      0
4      0      0       0           1      1
   fever  cough  muscle  sorethroat  virus
0      1      1       1           1      1
1      1      1       0           1      1
2      0      1       0           1      0
3      0      0       0           0      0
4      1      1       0           0      1
Precision 	 0.6816608996539792
F1 	 0.7150635208711434
AUC 	 0.652653227599496
Accuracy 	 0.6645299145299145
Recall 	 0.7519083969465649


### Predict on FluWatch

In [23]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Fluwatch/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Fluwatch/Test/"

In [24]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['fluwatch.csv']
test_file = 'fluwatch.csv'
print("FluWatch!")
results_fluwatch = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

FluWatch!
../Data/Symptoms_Demo/Fluwatch/Train/fluwatch.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         1.0      1
1    0.0    1.0     0.0         1.0      0
2    0.0    1.0     0.0         0.0      1
3    0.0    1.0     0.0         0.0      0
4    0.0    1.0     0.0         1.0      0
   fever  cough  muscle  sorethroat  virus
0    1.0    0.0     1.0         1.0      0
1    0.0    0.0     0.0         0.0      1
2    0.0    1.0     0.0         1.0      0
3    0.0    1.0     0.0         0.0      1
4    0.0    1.0     0.0         1.0      0
Precision 	 0.6844919786096256
F1 	 0.43611584327086883
AUC 	 0.5904245283018869
Accuracy 	 0.5983009708737864
Recall 	 0.32


### Predict on HongKong

In [25]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Hongkong/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Hongkong/Test/"

In [26]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['hongkong.csv']
test_file = 'hongkong.csv'
print("HongKong")
results_hongkong = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY ,TEST_DIRECTORY)

HongKong
../Data/Symptoms_Demo/Hongkong/Train/hongkong.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         1.0      1
1    0.0    0.0     0.0         0.0      0
2    0.0    0.0     1.0         0.0      0
3    1.0    1.0     1.0         1.0      1
4    0.0    0.0     0.0         0.0      0
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     1.0         1.0      1
1    0.0    0.0     0.0         0.0      0
2    0.0    1.0     0.0         0.0      1
3    1.0    1.0     1.0         1.0      1
4    0.0    1.0     0.0         1.0      1
Precision 	 0.8785117691723614
F1 	 0.8554528650646951
AUC 	 0.8907365969837758
Accuracy 	 0.9123121776183001
Recall 	 0.8335734870317003


### Predict on Hutterite

In [27]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Hutterite/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Hutterite/Test/"

In [28]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['hutterite.csv']
test_file = 'hutterite.csv'
print("Hutterite!")
results_hutterite = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

Hutterite!
../Data/Symptoms_Demo/Hutterite/Train/hutterite.csv
   fever  cough  muscle  sorethroat  virus
0      0      1       0           0      0
1      0      0       0           0      0
2      0      1       0           0      1
3      0      0       0           0      0
4      0      1       0           0      0
   fever  cough  muscle  sorethroat  virus
0      0      0       0           0      0
1      0      0       0           1      0
2      1      0       0           0      1
3      0      1       0           0      1
4      0      1       0           0      1
Precision 	 0.6513249651324965
F1 	 0.7538337368845843
AUC 	 0.7492197509274945
Accuracy 	 0.7354726799653079
Recall 	 0.8946360153256705


### Predict on Loeb

In [29]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Loeb/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Loeb/Test/"

In [31]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['loeb.csv']
test_file = 'loeb.csv'
print("Loeb!")
results_hutterite = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

Loeb!
../Data/Symptoms_Demo/Loeb/Train/loeb.csv
   fever  cough  muscle  sorethroat  virus
0      0      1       0           0      1
1      0      1       1           1      1
2      0      1       1           1      0
3      1      1       0           1      1
4      0      1       0           0      0
   fever  cough  muscle  sorethroat  virus
0      0      1       0           1      1
1      0      1       0           1      1
2      1      0       0           1      0
3      0      1       0           1      1
4      1      1       0           1      1
Precision 	 0.5333333333333333
F1 	 0.6674968866749689
AUC 	 0.48791564839850254
Accuracy 	 0.5202156334231806
Recall 	 0.891846921797005
