In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn import linear_model
from sklearn.metrics import f1_score,roc_curve,precision_score,recall_score,accuracy_score,auc
from collections import defaultdict
from sklearn import metrics
import random
random.seed(10)

One dataset is held out. All the other datasets are concatenated. We implement a Logistic Regression on the concatenated data and then test out on the held out the data.

In [91]:
symptoms = ['fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue', 'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze', 'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain', 'runnynose', 'virus']


In [141]:
def read_files(filename):
    data = pd.read_csv(filename)
    data = data[symptoms]
    print(data.head())
    return data

In [142]:
def get_training_data(train_files_,train_directory):
    data = []
    for i in train_files_:
        print(train_directory+i)
        data.append(read_files(train_directory+i))
        
    training_data = pd.concat(data)
    x_train = training_data.drop(['virus'],axis = 1)
    y_train = training_data['virus']
    return x_train,y_train

In [143]:
def get_test_data(test_file_,test_directory):
    data = read_files(test_directory+test_file_)
    x_test = data.drop(['virus'],axis = 1)
    y_test = data['virus']
    return x_test,y_test

In [144]:
def linear_regression_model(x_train,x_test,y_train,y_test):
    lm = linear_model.LogisticRegression()
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)
    fpr,tpr,threshold = roc_curve(y_test,y_pred)
    auc_score = auc(fpr,tpr)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    return auc_score,accuracy,precision,recall,f1

In [145]:
def predict_on_unknown(train_files,test_file,train_directory,test_directory ):
    results = defaultdict()
    x_train,y_train = get_training_data(train_files,train_directory)
    x_test,y_test = get_test_data(test_file,test_directory)
    auc_score,accuracy,precision,recall,f1 = linear_regression_model(x_train,x_test,y_train,y_test)
    results['AUC'] = auc_score
    results['Accuracy'] = accuracy
    results['Precision'] = precision
    results['Recall'] = recall
    results['F1'] = f1
    for k,v in results.items():
        print(k,"\t",v)
    return results

### Predict on NYUMC

In [146]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/NYUMC/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/NYUMC/Test/"

In [147]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutter']
train_files = ['nyumc.csv']
test_file = 'nyumc.csv'
name=""

In [148]:
print("NYUMC!")
results_nyumc = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

NYUMC!
../Data/Symptoms_Demo/NYUMC/Train/nyumc.csv
   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      1           0      0       0         0        0      0       0   
1      1           0      0       0         0        0      0       0   
2      0           1      0       0         0        0      0       0   
3      1           0      0       0         0        0      0       0   
4      1           0      0       0         0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0       0            0   
1         0       0       0                    0       0            0   
2         0       0       0                    0       0            0   
3         0       0       0                    0       0            0   
4         0       0       0                    0       0            0   

   earache  leg pain  runnynose  virus  
0        0         0          

### Predict on GoViral

In [149]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Goviral/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Goviral/Test/"

In [150]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['goviral.csv']
test_file = 'goviral.csv'

In [151]:
print("GoViral!")
results_goviral = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

GoViral!
../Data/Symptoms_Demo/Goviral/Train/goviral.csv
   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      1           1      1       1         0        1      0       1   
1      0           0      1       0         0        0      0       0   
2      0           1      0       1         0        1      1       1   
3      0           0      0       0         0        0      0       0   
4      1           1      1       1         0        1      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         1       1       0                    0       0            0   
1         0       0       0                    0       0            0   
2         1       1       0                    1       0            0   
3         0       0       0                    0       0            0   
4         0       1       0                    0       0            0   

   earache  leg pain  runnynose  virus  
0        0         1    

### Predict on FluWatch

In [135]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Fluwatch/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Fluwatch/Test/"

In [136]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['fluwatch.csv']
test_file = 'fluwatch.csv'
print("FluWatch!")
results_fluwatch = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

FluWatch!
../Data/Symptoms_Demo/Fluwatch/Train/fluwatch.csv
(91, 18)
(824, 18)
AUC 	 0.7369187773144182
Precision 	 0.7768313458262351
F1 	 0.8306010928961748
Recall 	 0.8923679060665362
Accuracy 	 0.7742718446601942


### Predict on HongKong

In [137]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Hongkong/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Hongkong/Test/"

In [138]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['hongkong.csv']
test_file = 'hongkong.csv'
print("HongKong")
results_hongkong = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY ,TEST_DIRECTORY)

HongKong
../Data/Symptoms_Demo/Hongkong/Train/hongkong.csv
(495, 18)
(4459, 18)
AUC 	 0.9603794349516314
Precision 	 0.9582689335394127
F1 	 0.948012232415902
Recall 	 0.9379727685325264
Accuracy 	 0.9694998878672348


### Predict on Hutterite

In [139]:
TRAIN_DIRECTORY = "../Data/Symptoms_Demo/Hutterite/Train/"
TEST_DIRECTORY = "../Data/Symptoms_Demo/Hutterite/Test/"

In [140]:
# train_files = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
train_files = ['hutterite.csv']
test_file = 'hutterite.csv'
print("Hutterite!")
results_hutterite = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

Hutterite!
../Data/Symptoms_Demo/Hutterite/Train/hutterite.csv
(128, 18)
(1153, 18)
AUC 	 0.6410573952975054
Precision 	 0.7052505966587113
F1 	 0.7650485436893203
Recall 	 0.8359264497878359
Accuracy 	 0.6851691240242844
