In [1]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,roc_curve
from sklearn import metrics

* The symptoms included are as follows:

In [2]:
# Column names used across the datasets, grouped a few per row for readability.
# NOTE(review): 'virus' is the label column that read_file() drops from the
# feature matrix -- confirm it is meant to be listed alongside the symptoms.
symptoms = [
    'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue',
    'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze', 'shortness of breath',
    'phlegm', 'blockednose', 'earache', 'leg pain', 'runnynose',
    'virus',
]

In [3]:
def read_file(filename):
    """Load a dataset CSV and split off the feature matrix.

    Parameters
    ----------
    filename : str
        Path of a CSV file that contains a 'virus' column.

    Returns
    -------
    (data, train_data)
        `data` is the full DataFrame as read from disk; `train_data` is a
        NumPy array holding every column except 'virus'.

    Raises
    ------
    KeyError
        If the file has no 'virus' column.
    """
    data = pd.read_csv(filename)
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values returns the same ndarray and works on old and new versions alike.
    train_data = data.drop(['virus'], axis=1).values
    return data, train_data

In [4]:
def read_parameters(filename):
    """Read a parameter CSV and return it as a DataFrame."""
    return pd.read_csv(filename)

#### Get the parameters for the different dataset combinations

In [5]:
# Folder that holds the generated parameter files.
directory_ = "./Generated_Parameters_1/"

# One parameter file per dataset, in the order nyumc, goviral, fluwatch,
# hongkong, hutterite (later cells index these lists by position).
only_symptoms_ = [
    'only_symptoms_nyumc.csv',
    'only_symptoms_goviral.csv',
    'only_symptoms_fluwatch.csv',
    'only_symptoms_hongkong.csv',
    'only_symptoms_hutterite.csv',
]
with_demographics_ = [
    'with_demographics_nyumc.csv',
    'with_demographics_goviral.csv',
    'with_demographics_fluwatch.csv',
    'with_demographics_hongkong.csv',
    'with_demographics_hutterite.csv',
]

# Empty containers; no default factory is set, so they behave like plain dicts.
only_symptoms_parameters = defaultdict()
with_demographic_parameters = defaultdict()

In [6]:
def return_parameters(file,dataset_names):
    """Load a parameter file and pick out the columns for the given datasets.

    Parameters
    ----------
    file : str
        Path of the parameter CSV, passed to read_parameters().
    dataset_names : iterable of str
        Column names to extract from the file.

    Returns
    -------
    defaultdict
        Maps each requested name to the list of values in that column.
    """
    table = read_parameters(file)
    selected = defaultdict()
    selected.update((name, list(table[name])) for name in dataset_names)
    return selected
    

In [7]:
def get_parameters(dataset_name,parameters):
    """Return the values stored under `dataset_name` as a NumPy array."""
    values = list(parameters[dataset_name])
    return np.array(values)

In [8]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied through NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [9]:
def get_results(param,sample_points):
    """Score samples: sigmoid of the dot product of `param` with `sample_points.T`."""
    linear_scores = np.dot(param, sample_points.T)
    return sigmoid(linear_scores)

In [10]:
def save_results_for_finding_threshold(filename,dataframe,predicted):
    """Write actual labels and predictions side by side to a CSV file.

    The 'Actual' column is taken from dataframe['virus'] and the 'Predicted'
    column from `predicted`. The first rows are printed before the table is
    saved to `filename` without the index.
    """
    table = pd.DataFrame()
    table['Actual'] = dataframe['virus']
    table['Predicted'] = predicted
    preview = table.head()
    print(preview)
    table.to_csv(filename, index=False)

In [11]:
def get_all_datasets(training_data_, directory=None):
    """Load several dataset files and key them by file name without extension.

    Parameters
    ----------
    training_data_ : iterable of str
        File names such as 'goviral.csv'.
    directory : str, optional
        Prefix prepended to every file name. When omitted, the module-level
        `training_directory` is used, which is what this function always
        did before the argument existed.

    Returns
    -------
    defaultdict
        Maps the file name minus its last four characters (e.g. '.csv') to
        the (data, train_data) tuple returned by read_file().
    """
    if directory is None:
        # Backward-compatible fallback to the notebook-level global.
        directory = training_directory
    datasets = defaultdict()
    for file_name in training_data_:
        datasets[file_name[:-4]] = read_file(directory + file_name)
    return datasets

In [12]:
def get_all_results(data_dict,param):
    """Score each dataset with the parameters stored under the same key.

    `data_dict` maps a name to a (data, feature_matrix) pair and `param` maps
    the same names to parameter vectors. Returns a defaultdict that maps each
    name in `param` to get_results(parameters, feature_matrix).
    """
    scores = defaultdict()
    for name, weights in param.items():
        _, features = data_dict[name]
        scores[name] = get_results(weights, features)
    return scores

In [13]:
def result_statistics(list_):
    """Return the smallest and largest value of `list_` as a (min, max) tuple."""
    lowest = min(list_)
    highest = max(list_)
    return lowest, highest

In [14]:
def return_class(threshold,list_):
    """Turn scores into 0/1 labels: 1 where score >= threshold, else 0.

    Parameters
    ----------
    threshold : float
        Cut-off value; scores equal to it are labelled 1.
    list_ : array-like
        Scores. Lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    list of int
    """
    # The previous expression `list_ >= threshold * 1` multiplied the threshold
    # (not the comparison result) by 1 and only worked for array inputs.
    scores = np.asarray(list_)
    return (scores >= threshold).astype(int).tolist()

def metrics_pred(list1,list2):
    """Return the area under the ROC curve for labels `list1` and predictions `list2`.

    Parameters
    ----------
    list1 : array-like
        True labels, passed to roc_curve as y_true.
    list2 : array-like
        Predicted labels or scores, passed to roc_curve as y_score.

    Returns
    -------
    float
        metrics.auc of the (fpr, tpr) points produced by roc_curve.
    """
    # Only the AUC is returned, so the F1/precision/recall/accuracy scores that
    # used to be computed and thrown away on every call are no longer evaluated
    # (this function runs once per candidate threshold in find_threshold).
    fpr, tpr, _ = roc_curve(list1, list2)
    return metrics.auc(fpr, tpr)

In [15]:
def find_threshold(min_,max_,list1,list2,step_size = 1e-3):
    """Scan thresholds from `min_` towards `max_` and keep the best-scoring one.

    Parameters
    ----------
    min_, max_ : float
        Search range; candidates are min_, min_ + step_size, ... while the
        candidate is below max_.
    list1 : mapping / DataFrame
        Must provide the true labels under the key 'virus'.
    list2 : array-like
        Scores that are thresholded with return_class().
    step_size : float, optional
        Distance between consecutive candidates; must be positive.

    Returns
    -------
    (threshold, auc)
        The first candidate that reaches the highest metrics_pred() value.

    Raises
    ------
    ValueError
        If `step_size` is not positive (the scan would never terminate).
    """
    if step_size <= 0:
        raise ValueError("step_size must be positive")
    auc_thresholds = defaultdict()
    step = 0
    value = min_
    # min_ itself is always scored, so a degenerate range (min_ == max_, e.g.
    # all scores identical) yields one candidate instead of max() failing on
    # an empty collection.
    while True:
        auc_thresholds[value] = metrics_pred(list1['virus'],return_class(value,list2))
        step += 1
        # Derive each candidate from the step index rather than repeatedly
        # adding step_size, which accumulates floating-point error.
        value = min_ + step * step_size
        if not value < max_:
            break
    optimal_threshold = max(auc_thresholds.items(), key=lambda item: item[1])
    return optimal_threshold

In [16]:
def return_all_thresholds(results,data):
    """Find the best threshold for every dataset in `data`.

    `results` maps a dataset name to its scores and `data` maps the same name
    to a pair whose first element holds the labels. Progress is printed for
    each dataset. Returns a defaultdict mapping each name to whatever
    find_threshold() returns for it.
    """
    thresholds = defaultdict()
    for name, entry in data.items():
        print("_____________________")
        print(name)
        scores = results[name]
        lowest, highest = result_statistics(scores)
        best = find_threshold(lowest, highest, entry[0], scores)
        print("Found threshold for : ",name)
        thresholds[name] = best
    return thresholds

In [17]:
def test(filename_,param,thresholds_):
    """Evaluate every parameter set on the file `filename_`.

    For each key in `param`, the file's feature matrix is scored with
    get_results(), thresholded at thresholds_[key][0] and compared against the
    file's 'virus' column with metrics_pred(). Returns a defaultdict mapping
    each key to that metric.
    """
    data, features = read_file(filename_)
    aucs = defaultdict()
    for name, weights in param.items():
        scores = get_results(weights, features)
        predicted = return_class(thresholds_[name][0], scores)
        aucs[name] = metrics_pred(data['virus'], predicted)
    return aucs

In [18]:
def return_final_auc_scores(training_data_,training_directory,filename_,parameters):
    """Tune thresholds on the training sets, then score the held-out file.

    Parameters
    ----------
    training_data_ : iterable of str
        Training file names such as 'goviral.csv'.
    training_directory : str
        Prefix prepended to each training file name.
    filename_ : str
        Path of the file that is evaluated with the tuned thresholds.
    parameters : mapping
        Parameter vectors keyed by training-set name (file name without its
        last four characters).

    Returns
    -------
    defaultdict
        The per-key metric returned by test().
    """
    # Load the training sets from the directory supplied by the caller.
    # Previously loading went through a helper that read the module-level
    # `training_directory`, so this argument was silently ignored.
    data = defaultdict()
    for file_name in training_data_:
        data[file_name[:-4]] = read_file(training_directory + file_name)
    results = get_all_results(data,parameters)
    # Find the thresholds on the training sets.
    thresholds = return_all_thresholds(results,data)
    # Apply them to the evaluation file and collect the AUC values.
    aucs_ = test(filename_,parameters,thresholds)
    return aucs_


#### Load the parameters (only symptoms and with demographics) for each held-out dataset

In [19]:
# Dataset names in the same order as the entries of only_symptoms_ and
# with_demographics_.
_DATASET_NAMES = ['nyumc', 'goviral', 'fluwatch', 'hongkong', 'hutterite']


def _trained_on(held_out):
    """Return every dataset name except `held_out`, keeping the order above."""
    return [name for name in _DATASET_NAMES if name != held_out]


# Each parameter file is read for the four datasets other than the one it is
# named after.
symp_nyumc = return_parameters(directory_ + only_symptoms_[0], _trained_on('nyumc'))
symp_goviral = return_parameters(directory_ + only_symptoms_[1], _trained_on('goviral'))
symp_fluwatch = return_parameters(directory_ + only_symptoms_[2], _trained_on('fluwatch'))
symp_hongkong = return_parameters(directory_ + only_symptoms_[3], _trained_on('hongkong'))
symp_hutterite = return_parameters(directory_ + only_symptoms_[4], _trained_on('hutterite'))
demo_nyumc = return_parameters(directory_ + with_demographics_[0], _trained_on('nyumc'))
demo_goviral = return_parameters(directory_ + with_demographics_[1], _trained_on('goviral'))
demo_fluwatch = return_parameters(directory_ + with_demographics_[2], _trained_on('fluwatch'))
demo_hongkong = return_parameters(directory_ + with_demographics_[3], _trained_on('hongkong'))
demo_hutterite = return_parameters(directory_ + with_demographics_[4], _trained_on('hutterite'))


#### Leave-one-dataset-out evaluation (only symptoms and with demographics)

##### NYUMC not considered in the training data

In [20]:
# NYUMC is evaluated; thresholds are tuned on the other four datasets.
training_directory = "../../Data/Symptoms/Train/"
training_data_nyumc = [
    'goviral.csv',
    'fluwatch.csv',
    'hongkong.csv',
    'hutterite.csv',
]
filename_ = "../../Data/Symptoms/Total/nyumc.csv"

In [21]:
# Score nyumc.csv with the symptoms-only parameters; the bare name on the
# last line displays the returned dict as the cell output.
print("Only symptoms")
aucs_nyumc_sym = return_final_auc_scores(training_data_nyumc,training_directory,filename_,symp_nyumc)
aucs_nyumc_sym

Only symptoms
_____________________
goviral
Found threshold for :  goviral
_____________________
fluwatch
Found threshold for :  fluwatch
_____________________
hongkong
Found threshold for :  hongkong
_____________________
hutterite
Found threshold for :  hutterite


defaultdict(None,
            {'fluwatch': 0.44610337670243594,
             'goviral': 0.49848225032842952,
             'hongkong': 0.48008613652692611,
             'hutterite': 0.50219138004094632})

In [22]:
# Same evaluation on nyumc.csv, using the with-demographics parameters.
print("With demographics")
aucs_nyumc_demo = return_final_auc_scores(training_data_nyumc,training_directory,filename_,demo_nyumc)
aucs_nyumc_demo

With demographics
_____________________
goviral
Found threshold for :  goviral
_____________________
fluwatch
Found threshold for :  fluwatch
_____________________
hongkong
Found threshold for :  hongkong
_____________________
hutterite
Found threshold for :  hutterite


defaultdict(None,
            {'fluwatch': 0.49406852150903502,
             'goviral': 0.54274003506465462,
             'hongkong': 0.4724306243973162,
             'hutterite': 0.47852398492522297})

##### Goviral not considered in training data

In [23]:
# GoViral is evaluated; thresholds are tuned on the other four datasets.
training_directory = "../../Data/Symptoms/Train/"
training_data_goviral = [
    'nyumc.csv',
    'fluwatch.csv',
    'hongkong.csv',
    'hutterite.csv',
]
filename_ = "../../Data/Symptoms/Total/goviral.csv"


In [24]:
# Score goviral.csv with the symptoms-only parameters; the bare name on the
# last line displays the returned dict as the cell output.
print("Only Symptoms")
aucs_goviral_sym = return_final_auc_scores(training_data_goviral,training_directory,filename_,symp_goviral)
aucs_goviral_sym

Only Symptoms
_____________________
nyumc
Found threshold for :  nyumc
_____________________
fluwatch
Found threshold for :  fluwatch
_____________________
hongkong
Found threshold for :  hongkong
_____________________
hutterite
Found threshold for :  hutterite


defaultdict(None,
            {'fluwatch': 0.49945644788694116,
             'hongkong': 0.50794190031858188,
             'hutterite': 0.5,
             'nyumc': 0.48767193610242937})

In [25]:
# Same evaluation on goviral.csv, using the with-demographics parameters.
# The result is only stored in this cell, not displayed.
print("With demographics")
aucs_goviral_demo = return_final_auc_scores(training_data_goviral,training_directory,filename_,demo_goviral)

With demographics
_____________________
nyumc
Found threshold for :  nyumc
_____________________
fluwatch
Found threshold for :  fluwatch
_____________________
hongkong
Found threshold for :  hongkong
_____________________
hutterite
Found threshold for :  hutterite


In [26]:
# Display the with-demographics scores for goviral.csv.
aucs_goviral_demo

defaultdict(None,
            {'fluwatch': 0.44141716114810292,
             'hongkong': 0.56936328909423084,
             'hutterite': 0.5039784994941946,
             'nyumc': 0.42948166266551913})

##### FluWatch not considered in training data

In [27]:
# FluWatch is evaluated; thresholds are tuned on the other four datasets.
training_directory = "../../Data/Symptoms/Train/"
training_data_fluwatch = [
    'nyumc.csv',
    'goviral.csv',
    'hongkong.csv',
    'hutterite.csv',
]
filename_ = "../../Data/Symptoms/Total/fluwatch.csv"


In [28]:
# Score fluwatch.csv with the symptoms-only parameters; the bare name on the
# last line displays the returned dict as the cell output.
print("Only Symptoms")
aucs_fluwatch_sym = return_final_auc_scores(training_data_fluwatch,training_directory,filename_,symp_fluwatch)
aucs_fluwatch_sym

Only Symptoms
_____________________
nyumc
Found threshold for :  nyumc
_____________________
goviral
Found threshold for :  goviral
_____________________
hongkong
Found threshold for :  hongkong
_____________________
hutterite
Found threshold for :  hutterite


defaultdict(None,
            {'goviral': 0.51812285111669698,
             'hongkong': 0.52016699893097573,
             'hutterite': 0.52471757533732055,
             'nyumc': 0.51747276877293347})

In [29]:
# Same evaluation on fluwatch.csv, using the with-demographics parameters.
print("With demographics")
aucs_fluwatch_demo = return_final_auc_scores(training_data_fluwatch,training_directory,filename_,demo_fluwatch)
aucs_fluwatch_demo

With demographics
_____________________
nyumc
Found threshold for :  nyumc
_____________________
goviral
Found threshold for :  goviral
_____________________
hongkong
Found threshold for :  hongkong
_____________________
hutterite
Found threshold for :  hutterite


defaultdict(None,
            {'goviral': 0.48426078414376927,
             'hongkong': 0.46575510675796716,
             'hutterite': 0.51095749906099219,
             'nyumc': 0.50304816387853568})

##### Hongkong not considered in training data

In [30]:
# Hong Kong is evaluated; thresholds are tuned on the other four datasets.
training_directory = "../../Data/Symptoms/Train/"
training_data_hongkong = [
    'nyumc.csv',
    'goviral.csv',
    'fluwatch.csv',
    'hutterite.csv',
]
filename_ = "../../Data/Symptoms/Total/hongkong.csv"

In [31]:
# Score hongkong.csv with the symptoms-only parameters; the bare name on the
# last line displays the returned dict as the cell output.
print("Only Symptoms")
aucs_hongkong_sym = return_final_auc_scores(training_data_hongkong,training_directory,filename_,symp_hongkong)
aucs_hongkong_sym

Only Symptoms
_____________________
nyumc
Found threshold for :  nyumc
_____________________
goviral
Found threshold for :  goviral
_____________________
fluwatch
Found threshold for :  fluwatch
_____________________
hutterite
Found threshold for :  hutterite


defaultdict(None,
            {'fluwatch': 0.39511452837131666,
             'goviral': 0.58800489593262928,
             'hutterite': 0.32192922971781335,
             'nyumc': 0.59251545359382607})

In [32]:
# Same evaluation on hongkong.csv, using the with-demographics parameters.
print("With Demographics")
aucs_hongkong_demo = return_final_auc_scores(training_data_hongkong,training_directory,filename_,demo_hongkong)
aucs_hongkong_demo

With Demographics
_____________________
nyumc
Found threshold for :  nyumc
_____________________
goviral
Found threshold for :  goviral
_____________________
fluwatch
Found threshold for :  fluwatch
_____________________
hutterite
Found threshold for :  hutterite


defaultdict(None,
            {'fluwatch': 0.55463119327528165,
             'goviral': 0.76849206519277158,
             'hutterite': 0.73059858813358469,
             'nyumc': 0.63113993339522378})

##### Hutterite not considered in training data

In [33]:
# Hutterite is evaluated; thresholds are tuned on the other four datasets.
training_directory = "../../Data/Symptoms/Train/"
training_data_hutterite = [
    'nyumc.csv',
    'goviral.csv',
    'fluwatch.csv',
    'hongkong.csv',
]
filename_ = "../../Data/Symptoms/Total/hutterite.csv"

In [34]:
# Score hutterite.csv with the symptoms-only parameters; the bare name on the
# last line displays the returned dict as the cell output.
print("Only Symptoms")
aucs_hutterite_sym = return_final_auc_scores(training_data_hutterite,training_directory,filename_,symp_hutterite)
aucs_hutterite_sym

Only Symptoms
_____________________
nyumc
Found threshold for :  nyumc
_____________________
goviral
Found threshold for :  goviral
_____________________
fluwatch
Found threshold for :  fluwatch
_____________________
hongkong
Found threshold for :  hongkong


defaultdict(None,
            {'fluwatch': 0.52308612440191393,
             'goviral': 0.54924812030075187,
             'hongkong': 0.52631578947368418,
             'nyumc': 0.52601674641148333})

In [35]:
# Same evaluation on hutterite.csv, using the with-demographics parameters.
print("With Demographics")
aucs_hutterite_demo = return_final_auc_scores(training_data_hutterite,training_directory,filename_,demo_hutterite)
aucs_hutterite_demo

With Demographics
_____________________
nyumc
Found threshold for :  nyumc
_____________________
goviral
Found threshold for :  goviral
_____________________
fluwatch
Found threshold for :  fluwatch
_____________________
hongkong
Found threshold for :  hongkong


defaultdict(None,
            {'fluwatch': 0.48887559808612435,
             'goviral': 0.57445317840054688,
             'hongkong': 0.58078434723171557,
             'nyumc': 0.54471975393028027})