In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn import linear_model
from sklearn.metrics import f1_score,roc_curve,precision_score,recall_score,accuracy_score,auc
from collections import defaultdict
from sklearn import metrics
import random
# Seeds only Python's built-in `random` module; NumPy's global RNG is not seeded by this call.
random.seed(10)

One dataset is held out and all the other datasets are concatenated. We fit a logistic regression model on the concatenated data and then evaluate it on the held-out dataset.

In [2]:
# Columns selected from every CSV by read_files(); 'virus' is the label column,
# the remaining columns are used as model features.
symptoms = ['fever','cough','muscle','sorethroat','virus']

In [3]:
def read_files(filename):
    """Load one CSV file and keep only the columns named in ``symptoms``.

    The first rows of the selected columns are printed as a quick preview.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file's contents restricted to the ``symptoms`` columns.
    """
    frame = pd.read_csv(filename)[symptoms]
    # Preview so each loaded file can be checked in the cell output.
    print(frame.head())
    return frame

In [4]:
def get_training_data(train_files_,train_directory):
    """Read several training CSVs and stack them into one feature/label pair.

    Parameters
    ----------
    train_files_ : iterable of str
        File names to load; each is appended to ``train_directory``.
    train_directory : str
        Directory prefix (string-concatenated with each file name).

    Returns
    -------
    tuple
        ``(x_train, y_train)``: every column except ``'virus'``, and the
        ``'virus'`` column, of the concatenated data.
    """
    frames = []
    for name in train_files_:
        path = train_directory + name
        print(path)
        frames.append(read_files(path))

    combined = pd.concat(frames)
    features = combined.drop(columns=['virus'])
    labels = combined['virus']
    return features, labels

In [5]:
def get_test_data(test_file_,test_directory):
    """Read the test CSV and split it into features and labels.

    Parameters
    ----------
    test_file_ : str
        File name of the test CSV; appended to ``test_directory``.
    test_directory : str
        Directory prefix (string-concatenated with the file name).

    Returns
    -------
    tuple
        ``(x_test, y_test)``: every column except ``'virus'``, and the
        ``'virus'`` column.
    """
    held_out = read_files(test_directory + test_file_)
    features = held_out.drop(columns=['virus'])
    labels = held_out['virus']
    return features, labels

In [6]:
def linear_regression_model(x_train,x_test,y_train,y_test):
    """Fit a logistic regression classifier and score it on the test split.

    Despite its name, this trains ``linear_model.LogisticRegression`` (a
    classifier), not a linear regression.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Test features and the true labels the predictions are compared with.

    Returns
    -------
    tuple
        ``(auc_score, accuracy, precision, recall, f1)``. The AUC is computed
        from predicted probabilities; the other four metrics are computed
        from the hard class predictions.
    """
    lm = linear_model.LogisticRegression()
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)

    # ROC/AUC needs a continuous score. Feeding it the hard 0/1 predictions
    # collapses the curve to a single operating point and misreports the AUC.
    # Column 1 is the probability of lm.classes_[1]; this assumes a binary
    # target whose positive class is the larger label (e.g. 0/1).
    y_score = lm.predict_proba(x_test)[:, 1]
    fpr,tpr,_ = roc_curve(y_test,y_score)
    auc_score = auc(fpr,tpr)

    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    return auc_score,accuracy,precision,recall,f1

In [7]:
def predict_on_unknown(train_files,test_file,train_directory,test_directory):
    """Train on the given files, evaluate on the test file, and report metrics.

    Parameters
    ----------
    train_files : iterable of str
        File names (relative to ``train_directory``) concatenated for training.
    test_file : str
        File name (relative to ``test_directory``) used for evaluation.
    train_directory, test_directory : str
        Directory prefixes; they are string-concatenated with the file names.

    Returns
    -------
    dict
        Metric name -> value for the keys ``'AUC'``, ``'Accuracy'``,
        ``'Precision'``, ``'Recall'`` and ``'F1'``. Each pair is also printed.
    """
    x_train,y_train = get_training_data(train_files,train_directory)
    x_test,y_test = get_test_data(test_file,test_directory)
    auc_score,accuracy,precision,recall,f1 = linear_regression_model(x_train,x_test,y_train,y_test)

    # A plain dict is sufficient: a defaultdict created without a default
    # factory behaves exactly like a dict.
    results = {
        'AUC': auc_score,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
    }
    for k,v in results.items():
        print(k,"\t",v)
    return results

### Predict on GoViral

In [8]:
# Base data directory (relative path); the per-dataset Train/ and Test/ paths below are built from it.
dire = "../Data/Symptoms_Demo"

In [9]:
# Train/test directories for the GoViral run. These two names are reassigned
# in every dataset section, so the cells must be executed in order.
TRAIN_DIRECTORY = dire+"/Goviral/Train/"
TEST_DIRECTORY = dire+"/Goviral/Test/"

In [10]:
# Files concatenated for training (read from TRAIN_DIRECTORY).
# NOTE(review): this list includes goviral.csv, the dataset being tested, while the
# notebook text says one dataset is held out. Confirm that Train/goviral.csv and
# Test/goviral.csv are disjoint splits, or drop it from this list.
train_files = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# train_files = ['goviral.csv']
# File evaluated on (read from TEST_DIRECTORY).
test_file = 'goviral.csv'

In [11]:
# Train on train_files, evaluate on the GoViral test file, and keep the metric dict.
print("GoViral!")
results_goviral = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

GoViral!
../Data/Symptoms_Demo/Goviral/Train/goviral.csv
   fever  cough  muscle  sorethroat  virus
0      0      1       0           1      0
1      0      1       0           0      1
2      1      1       1           0      1
3      1      1       1           1      1
4      0      1       1           1      0
../Data/Symptoms_Demo/Goviral/Train/fluwatch.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         0.0      1
1    0.0    1.0     0.0         0.0      1
2    0.0    1.0     0.0         0.0      1
3    1.0    1.0     0.0         1.0      0
4    0.0    1.0     1.0         1.0      0
../Data/Symptoms_Demo/Goviral/Train/hongkong.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         1.0      1
1    0.0    0.0     1.0         0.0      0
2    0.0    1.0     0.0         0.0      0
3    0.0    0.0     0.0         0.0      0
4    0.0    0.0     0.0         0.0      0
../Data/Symptoms_Demo/Goviral/Train/hutterite.csv
   fever  cough  musc

### Predict on FluWatch

In [12]:
# Train/test directories for the FluWatch run (overwrites the values set for the previous dataset).
TRAIN_DIRECTORY = dire+"/Fluwatch/Train/"
TEST_DIRECTORY = dire+"/Fluwatch/Test/"

In [13]:
# Files concatenated for training (read from TRAIN_DIRECTORY).
# NOTE(review): the list includes fluwatch.csv, the dataset being tested; confirm
# that Train/fluwatch.csv does not overlap Test/fluwatch.csv.
train_files = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# train_files = ['fluwatch.csv']
# File evaluated on (read from TEST_DIRECTORY).
test_file = 'fluwatch.csv'
print("FluWatch!")
# Train, evaluate on the FluWatch test file, and keep the metric dict.
results_fluwatch = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

FluWatch!
../Data/Symptoms_Demo/Fluwatch/Train/goviral.csv
   fever  cough  muscle  sorethroat  virus
0      0      1       0           1      1
1      0      1       0           1      1
2      1      1       1           1      0
3      0      0       0           0      0
4      1      1       1           1      0
../Data/Symptoms_Demo/Fluwatch/Train/fluwatch.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         0.0      1
1    1.0    1.0     1.0         1.0      1
2    1.0    1.0     1.0         1.0      0
3    0.0    1.0     0.0         0.0      0
4    0.0    0.0     0.0         0.0      0
../Data/Symptoms_Demo/Fluwatch/Train/hongkong.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         1.0      1
1    0.0    0.0     1.0         0.0      0
2    0.0    1.0     0.0         0.0      0
3    0.0    0.0     0.0         0.0      0
4    0.0    0.0     0.0         0.0      0
../Data/Symptoms_Demo/Fluwatch/Train/hutterite.csv
   fever  cough 

### Predict on HongKong

In [14]:
# Train/test directories for the HongKong run (overwrites the values set for the previous dataset).
TRAIN_DIRECTORY = dire+"/Hongkong/Train/"
TEST_DIRECTORY = dire+"/Hongkong/Test/"

In [15]:
# Files concatenated for training (read from TRAIN_DIRECTORY).
# NOTE(review): the list includes hongkong.csv, the dataset being tested; confirm
# that Train/hongkong.csv does not overlap Test/hongkong.csv.
train_files = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# train_files = ['hongkong.csv']
# File evaluated on (read from TEST_DIRECTORY).
test_file = 'hongkong.csv'
print("HongKong")
# Train, evaluate on the HongKong test file, and keep the metric dict.
results_hongkong = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY ,TEST_DIRECTORY)

HongKong
../Data/Symptoms_Demo/Hongkong/Train/goviral.csv
   fever  cough  muscle  sorethroat  virus
0      0      1       0           1      1
1      0      1       0           1      1
2      1      1       1           1      0
3      0      0       0           0      0
4      1      1       1           1      0
../Data/Symptoms_Demo/Hongkong/Train/fluwatch.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         0.0      1
1    0.0    1.0     0.0         0.0      1
2    0.0    1.0     0.0         0.0      1
3    1.0    1.0     0.0         1.0      0
4    0.0    1.0     1.0         1.0      0
../Data/Symptoms_Demo/Hongkong/Train/hongkong.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    0.0     0.0         0.0      0
1    1.0    1.0     1.0         1.0      1
2    0.0    0.0     0.0         0.0      0
3    1.0    1.0     1.0         1.0      1
4    1.0    1.0     1.0         0.0      1
../Data/Symptoms_Demo/Hongkong/Train/hutterite.csv
   fever  cough  

### Predict on Hutterite

In [16]:
# Train/test directories for the Hutterite run (overwrites the values set for the previous dataset).
TRAIN_DIRECTORY = dire+"/Hutterite/Train/"
TEST_DIRECTORY = dire+"/Hutterite/Test/"

In [17]:
# Files concatenated for training (read from TRAIN_DIRECTORY).
# NOTE(review): the list includes hutterite.csv, the dataset being tested; confirm
# that Train/hutterite.csv does not overlap Test/hutterite.csv.
train_files = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# train_files = ['hutterite.csv']
# File evaluated on (read from TEST_DIRECTORY).
test_file = 'hutterite.csv'
print("Hutterite!")
# Train, evaluate on the Hutterite test file, and keep the metric dict.
results_hutterite = predict_on_unknown(train_files,test_file,TRAIN_DIRECTORY,TEST_DIRECTORY)

Hutterite!
../Data/Symptoms_Demo/Hutterite/Train/goviral.csv
   fever  cough  muscle  sorethroat  virus
0      0      1       0           1      1
1      0      1       0           1      1
2      1      1       1           1      0
3      0      0       0           0      0
4      1      1       1           1      0
../Data/Symptoms_Demo/Hutterite/Train/fluwatch.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         0.0      1
1    0.0    1.0     0.0         0.0      1
2    0.0    1.0     0.0         0.0      1
3    1.0    1.0     0.0         1.0      0
4    0.0    1.0     1.0         1.0      0
../Data/Symptoms_Demo/Hutterite/Train/hongkong.csv
   fever  cough  muscle  sorethroat  virus
0    0.0    1.0     0.0         1.0      1
1    0.0    0.0     1.0         0.0      0
2    0.0    1.0     0.0         0.0      0
3    0.0    0.0     0.0         0.0      0
4    0.0    0.0     0.0         0.0      0
../Data/Symptoms_Demo/Hutterite/Train/hutterite.csv
   fever  c