In [131]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,roc_curve
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.cross_validation import train_test_split

* The symptoms included are as follows:

In [2]:
# Column names used to select the symptom-only view of a loaded dataset.
# Besides the symptom indicators the list carries 'intercept' (the constant column
# added by read_file) and 'virus' (the label column, dropped again before the
# feature matrix is built).
symptoms = ['intercept',
            'fever',
            'sorethroat',
            'cough',
            'muscle',
            'headache',
            'fatigue',
            'vomit',
            'nausea',
            'diarrhea',
            'chills',
            'sneeze',
            'shortness of breath',
            'phlegm',
            'blockednose',
            'earache',
            'leg pain',
            'runnynose',
            'virus']

In [176]:
def read_file(filename):
    """Load a dataset CSV and return it with a leading constant ``intercept`` column.

    Parameters
    ----------
    filename : str
        Path of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A new frame whose first column is ``intercept`` (all ones), followed by
        the remaining columns of the file in their original order.
    """
    data = pd.read_csv(filename)
    # Record the file's own columns before adding the constant. Leaving out any
    # pre-existing 'intercept' keeps the ordering right when the CSV already has
    # one; rotating "the last column" to the front would move the wrong column
    # in that case.
    other_columns = [name for name in data.columns if name != 'intercept']
    data['intercept'] = 1
    return data[['intercept'] + other_columns]

In [177]:
def read_parameters(filename):
    """Read the parameter table stored in the CSV file `filename` into a DataFrame."""
    return pd.read_csv(filename)

#### Get the parameters for the different dataset combinations

In [173]:
# Folder that holds the generated parameter CSV files read via return_parameters.
directory_ = "./Generated_Parameters_3/"
# Parameter file names; each one carries a dataset name in its file name.
with_demographics_ = ['with_demographics_nyumc.csv','with_demographics_goviral.csv','with_demographics_fluwatch.csv','with_demographics_hongkong.csv','with_demographics_hutterite.csv']
# Created empty here; a defaultdict without a factory behaves like a plain dict.
with_demographic_parameters = defaultdict()

In [186]:
def return_parameters(file,parameters_of):
    """Collect selected columns of a parameter file as plain Python lists.

    Parameters
    ----------
    file : str
        Path of the parameter CSV, read through ``read_parameters``.
    parameters_of : iterable
        Column names to extract.

    Returns
    -------
    defaultdict
        Maps each requested column name to ``list`` of that column's values.
        No default factory is set, so missing keys still raise ``KeyError``.
    """
    table = read_parameters(file)
    return defaultdict(None, ((name, list(table[name])) for name in parameters_of))
    

In [7]:
def get_parameters(dataset_name,parameters):
    """Return the entries stored under `dataset_name` in `parameters` as a NumPy array."""
    values = [entry for entry in parameters[dataset_name]]
    return np.array(values)

In [8]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated through NumPy."""
    denominator = 1 + np.exp(-x)
    return 1/denominator

In [102]:
def get_results(param,sample_points):
    """Return ``sigmoid`` of the dot product of `param` with ``sample_points.T``."""
    linear_scores = np.dot(param, sample_points.T)
    return sigmoid(linear_scores)

In [103]:
def save_results_for_finding_threshold(filename,dataframe,predicted):
    """Write actual labels next to predictions as a two-column CSV.

    The ``virus`` column of `dataframe` becomes ``Actual`` and `predicted`
    becomes ``Predicted``. The first rows are printed, then the table is
    written to `filename` without the index.
    """
    results = pd.DataFrame().assign(Actual=dataframe['virus'])
    results = results.assign(Predicted=predicted)
    print(results.head())
    results.to_csv(filename,index = False)

In [178]:
def get_all_datasets(training_data_, directory=None):
    """Load several dataset files and key them by file name without extension.

    Parameters
    ----------
    training_data_ : iterable of str
        File names (e.g. ``'goviral.csv'``) located inside `directory`.
    directory : str, optional
        Folder containing the files. When omitted, the notebook-level
        ``training_directory`` variable is used, which must then already be
        defined (this keeps existing one-argument calls working).

    Returns
    -------
    defaultdict
        Maps each file's base name (extension stripped) to whatever
        ``read_file`` returns for it.
    """
    if directory is None:
        # Backward-compatible fallback to the notebook global.
        directory = training_directory
    datasets = defaultdict()
    for file_name in training_data_:
        # splitext strips the extension whatever its length, unlike slicing [:-4].
        dataset_name = os.path.splitext(file_name)[0]
        datasets[dataset_name] = read_file(os.path.join(directory, file_name))
    return datasets

In [12]:
def get_all_results(data_dict,param):
    """Score every dataset with the parameter vector of the same name.

    Parameters
    ----------
    data_dict : mapping
        Dataset name -> either a ``(data, train)`` tuple or a single DataFrame
        (the form ``read_file`` returns).
    param : mapping
        Dataset name -> parameter vector. Every key must also be in `data_dict`.

    Returns
    -------
    defaultdict
        Dataset name -> output of ``get_results`` for that dataset.
    """
    results = defaultdict()
    for name in param:
        entry = data_dict[name]
        if isinstance(entry, tuple):
            # Tuple layout: the second item is the ready-made feature matrix.
            _, features = entry
        else:
            # DataFrame layout: use every column except the 'virus' label.
            # NOTE(review): confirm each parameter vector has one entry per
            # remaining column.
            features = entry.drop(['virus'], axis=1).values
        results[name] = get_results(param[name], features)
    return results

In [13]:
def result_statistics(list_):
    """Return the smallest and largest value of `list_` as a ``(min, max)`` tuple."""
    lowest = min(list_)
    highest = max(list_)
    return lowest, highest

In [14]:
def return_class(threshold,list_):
    """Turn scores into 0/1 class labels using a cut-off.

    Parameters
    ----------
    threshold : float
        Scores greater than or equal to this value are labelled 1.
    list_ : array-like
        Scores to classify (NumPy array, pandas Series or plain list).

    Returns
    -------
    list of int
        One 0/1 label per score.
    """
    # np.asarray makes the comparison element-wise for plain Python lists too;
    # comparing a list with a float directly raises TypeError.
    flags = np.asarray(list_) >= threshold
    return [int(flag) for flag in flags]

def metrics_pred(list1,list2):
    """Return the area under the ROC curve of `list2` measured against `list1`.

    Parameters
    ----------
    list1 : array-like
        True labels, passed to ``roc_curve`` as ``y_true``.
    list2 : array-like
        Predictions, passed to ``roc_curve`` as the score argument.

    Returns
    -------
    float
        ``metrics.auc`` of the resulting false/true positive rates.
    """
    # Only the AUC is returned, so the F1, precision, recall and accuracy
    # scores that used to be computed and discarded here are no longer
    # evaluated on each call.
    fpr, tpr, _ = roc_curve(list1, list2)
    return metrics.auc(fpr, tpr)

In [15]:
def find_threshold(min_,max_,list1,list2,step_size = 1e-3):
    """Scan cut-offs from `min_` towards `max_` and return the one with the best AUC.

    Parameters
    ----------
    min_, max_ : float
        Range of candidate thresholds; `max_` itself is not evaluated.
    list1 : mapping or DataFrame
        Must provide the true labels under the key ``'virus'``.
    list2 : array-like
        Scores that are thresholded with ``return_class``.
    step_size : float, optional
        Increment between candidate thresholds; must be positive.

    Returns
    -------
    tuple
        ``(threshold, auc)`` for the candidate with the highest AUC; the first
        such candidate wins on ties.

    Raises
    ------
    ValueError
        If `step_size` is not positive (the scan would never terminate).
    """
    if step_size <= 0:
        raise ValueError("step_size must be positive")
    labels = list1['virus']
    auc_thresholds = {}
    value = min_
    # Always score `min_` first: when min_ >= max_ (e.g. all scores are equal)
    # a plain `while value < max_` loop would leave the mapping empty and
    # max() below would raise on an empty sequence.
    while True:
        auc_thresholds[value] = metrics_pred(labels, return_class(value, list2))
        value += step_size
        if not value < max_:
            break
    return max(auc_thresholds.items(), key=lambda item: item[1])

In [16]:
def return_all_thresholds(results,data):
    """Find the best threshold for every dataset in `data`.

    Parameters
    ----------
    results : mapping
        Dataset name -> scores for that dataset.
    data : mapping
        Dataset name -> either a tuple whose first item is the labelled frame,
        or the labelled frame itself (the form ``read_file`` returns). The
        frame must contain a ``'virus'`` column.

    Returns
    -------
    defaultdict
        Dataset name -> the value returned by ``find_threshold``.
    """
    thresholds = defaultdict()
    for name in data:
        print("_____________________")
        print(name)
        entry = data[name]
        # Indexing a DataFrame with [0] would be a column lookup, so only
        # unpack when the entry really is a tuple.
        frame = entry[0] if isinstance(entry, tuple) else entry
        min_, max_ = result_statistics(results[name])
        threshold = find_threshold(min_, max_, frame, results[name])
        print("Found threshold for : ",name)
        thresholds[name] = threshold
    return thresholds

In [17]:

def test(filename_,param,thresholds_):
    """Evaluate every parameter vector on a held-out file and return the AUCs.

    Parameters
    ----------
    filename_ : str
        Path of the test CSV, loaded through ``read_file``.
    param : mapping
        Name -> parameter vector.
    thresholds_ : mapping
        Name -> sequence whose first item is the classification threshold.

    Returns
    -------
    defaultdict
        Name -> AUC from ``metrics_pred`` on the thresholded predictions.
    """
    loaded = read_file(filename_)
    if isinstance(loaded, tuple):
        # Tuple layout: (labelled frame, feature matrix).
        data, features = loaded
    else:
        # read_file returns one DataFrame; use every column except the
        # 'virus' label as features.
        # NOTE(review): confirm each parameter vector has one entry per
        # remaining column.
        data = loaded
        features = data.drop(['virus'], axis=1).values
    aucs = defaultdict()
    for name in param:
        test_results = get_results(param[name], features)
        predicted = return_class(thresholds_[name][0], test_results)
        aucs[name] = metrics_pred(data['virus'], predicted)
    return aucs

In [18]:
def return_final_auc_scores(training_data_,training_directory,filename_,parameters):
    """Run the full pipeline: load training sets, score them, pick thresholds, test.

    `training_directory` is accepted but not used inside this body. The return
    value is whatever ``test`` produces for `filename_`.
    """
    training_sets = get_all_datasets(training_data_)
    training_scores = get_all_results(training_sets, parameters)
    # Thresholds are chosen on the training scores ...
    chosen_thresholds = return_all_thresholds(training_scores, training_sets)
    # ... and then applied to the held-out file.
    return test(filename_, parameters, chosen_thresholds)


In [19]:
def create_dict(dict_):
    """Return the ``(key, value)`` pairs of `dict_` as a list, in iteration order."""
    return [(key, value) for key, value in dict_.items()]
        

In [20]:
results_symp = defaultdict()
results_demo = defaultdict()

#### Get the symptoms

In [24]:
parameters = read_parameters(directory_+'with_demographics_nyumc.csv')
parameters.head()

Unnamed: 0,symptoms,goviral,fluwatch,hongkong,hutterite,clinically_collected,individually_reported,health_worker,female,male,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,population
0,intercept,-0.13837,-0.187797,0.030844,-0.426161,-2.906815,-1.151237,-1.795861,-2.070173,-2.068259,-2.041379,-2.107281,-2.076364,-2.104053,-1.995266,-2.06806
1,fever,0.204137,0.131731,0.85864,0.333287,2.77167,1.246586,1.849353,2.008316,2.043778,2.030737,2.041785,2.033293,2.038176,1.977947,2.025554
2,sorethroat,0.07701,0.061443,0.720418,0.220058,-0.161843,-0.041248,0.074922,-0.052179,-0.070802,-0.064612,-0.06586,-0.066387,-0.066858,-0.063318,-0.064339
3,cough,0.087195,0.114197,0.663564,0.300647,2.032616,0.96721,1.33469,1.508208,1.502671,1.495051,1.530207,1.500596,1.522124,1.461916,1.503482
4,muscle,0.046507,0.083707,0.396633,0.065074,-0.334765,-0.127,-0.109791,-0.168658,-0.170333,-0.177561,-0.181513,-0.175814,-0.182052,-0.171453,-0.175438


In [25]:
cols = list(parameters.columns)[1:]
print(cols)

['goviral', 'fluwatch', 'hongkong', 'hutterite', 'clinically_collected', 'individually_reported', 'health_worker', 'female', 'male', 'age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+', 'population']


In [27]:
x = return_parameters(directory_+'with_demographics_nyumc.csv',cols)
print(x.keys())

dict_keys(['age 16-44', 'age 45-64', 'clinically_collected', 'individually_reported', 'hutterite', 'female', 'age 5-15', 'age 65+', 'fluwatch', 'goviral', 'hongkong', 'health_worker', 'age 0-4', 'male', 'population'])


In [69]:
# read_file returns a single DataFrame, so it is bound to one name; unpacking it
# into two names fails unless the frame happens to have exactly two columns.
data = read_file("../../Data/With_Improved_Target/With_Demographics/goviral.csv")
data.head()

Unnamed: 0,intercept,fever,sorethroat,cough,muscle,headache,fatigue,vomit,nausea,diarrhea,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,1,0,1,1,0,0,1,0,0,0,...,0,1,0,0,1,0,0,1.0,0.0,0
1,1,0,1,0,1,0,1,0,0,1,...,1,0,0,0,1,0,0,1.0,0.0,1
2,1,0,1,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0.0,1.0,1
3,1,1,1,1,1,0,1,1,1,1,...,1,1,0,0,0,0,1,1.0,0.0,0
4,1,0,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0.0,1.0,1


In [70]:
def get_gender(dataframe_):
    """Return, per row, the name of the gender column holding the largest value.

    Parameters
    ----------
    dataframe_ : pandas.DataFrame
        Must contain the columns ``'male'`` and ``'female'``.

    Returns
    -------
    pandas.Series
        ``'male'`` or ``'female'`` for each row; ``'male'`` wins on ties because
        it is listed first.
    """
    # idxmax returns the column *label*. Series.argmax returns an integer
    # position in current pandas, which cannot be used as a parameter key.
    return dataframe_[['male','female']].idxmax(axis=1)

In [71]:
def get_age(dataframe_):
    """Return, per row, the name of the age-band column holding the largest value.

    Parameters
    ----------
    dataframe_ : pandas.DataFrame
        Must contain the columns ``'age 0-4'``, ``'age 5-15'``, ``'age 16-44'``,
        ``'age 45-64'`` and ``'age 65+'``.

    Returns
    -------
    pandas.Series
        One age-band column name per row; the first listed band wins on ties
        (including rows where every band is 0).
    """
    age_columns = ['age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+']
    # idxmax returns the column *label*. Series.argmax returns an integer
    # position in current pandas, which cannot be used as a parameter key.
    return dataframe_[age_columns].idxmax(axis=1)

In [72]:
temp_age = get_age(data)

In [73]:
temp_gender = get_gender(data)

In [154]:
# Symptom columns without the 'virus' label. drop() returns a new frame, which
# avoids mutating a column selection of `data` in place.
only_symp_data = data[symptoms].drop('virus',axis = 1)
# .values is the supported replacement for the removed DataFrame.as_matrix().
train_data_symp = only_symp_data.values

In [155]:
def get_predictions(train,param_dict,temp_age,temp_gender,collection_mode = 'clinically_collected',population ='population',dataset = 'goviral'):
    """Build, per row of `train`, a list of four scores from different parameter sets.

    Parameters
    ----------
    train : numpy.ndarray
        2-D feature matrix; one row per sample.
    param_dict : mapping
        Parameter-set name -> parameter vector.
    temp_age, temp_gender : indexable
        ``temp_age[i]`` / ``temp_gender[i]`` give the `param_dict` key for the
        age band and gender of row ``i``.
    collection_mode : str, optional
        `param_dict` key for the collection-mode parameters.
    population : str, optional
        `param_dict` key for the population-level parameters.
    dataset : str, optional
        `param_dict` key for the dataset-level parameters. Defaults to
        ``'goviral'``, the value that used to be fixed in the body.

    Returns
    -------
    list of list
        For each row: ``[p_dataset, p_collection, p_gender + p_age, p_population]``.
        The gender and age scores are summed into one entry.
    """
    results = []
    for row_number in range(train.shape[0]):
        sample_point = train[row_number,:]
        p_data = get_results(param_dict[dataset],sample_point)
        p_collection = get_results(param_dict[collection_mode],sample_point)
        p_gender = get_results(param_dict[temp_gender[row_number]],sample_point)
        p_age = get_results(param_dict[temp_age[row_number]],sample_point)
        p_population = get_results(param_dict[population],sample_point)
        results.append([p_data,p_collection,p_gender+p_age,p_population])
    return results
    

In [156]:
results = get_predictions(train_data_symp,x,temp_age,temp_gender)

In [157]:
y_true = list(data['virus'])

In [158]:
x_train,x_test,y_train,y_test = train_test_split(results,y_true,test_size = 0.3,random_state = 10)

In [159]:
len(x_train)

364

In [160]:
len(x_test)

156

In [161]:
len(y_train)

364

In [162]:
len(y_test)

156

In [163]:
lm = linear_model.LogisticRegression()

In [164]:
lm.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [165]:
y_pred = lm.predict(x_test)

In [166]:
acc = accuracy_score(y_test,y_pred)
# NOTE(review): y_pred comes from lm.predict, i.e. class labels rather than
# probabilities, so this ROC curve is built from hard labels — confirm that is
# intended.
fpr,tpr,threshold = roc_curve(y_test,y_pred)
auc_score = metrics.auc(fpr,tpr)

In [167]:
acc

0.6602564102564102

In [168]:
auc_score

0.6651515151515152

In [169]:
lm.coef_

array([[0.24173545, 0.33503834, 0.74179403, 0.35937587]])

In [139]:
lm.intercept_

array([-1.31331783])

In [97]:
p_collection

0.6828732067708851

In [98]:
p_gender

0.7127442124098128

In [99]:
p_age

0.7109822961576486

In [100]:
p_popultaion

0.7140686717950651

In [171]:
# Each parameter file is read for the columns of the other four datasets; the
# dataset named in the file name is left out of its own column list.
demo_nyumc = return_parameters(directory_+with_demographics_[0],['goviral','fluwatch','hongkong','hutterite'])
demo_goviral = return_parameters(directory_+with_demographics_[1],['nyumc','fluwatch','hongkong','hutterite'])
demo_fluwatch = return_parameters(directory_+with_demographics_[2],['nyumc','goviral','hongkong','hutterite'])
demo_hongkong = return_parameters(directory_+with_demographics_[3],['nyumc','goviral','fluwatch','hutterite'])
demo_hutterite = return_parameters(directory_+with_demographics_[4],['nyumc','goviral','fluwatch','hongkong'])

In [179]:
def process(training_data_list,training_directory,filename_,parameters):
    """Load every dataset named in `training_data_list` via ``get_all_datasets``.

    Only `training_data_list` is used; `training_directory`, `filename_` and
    `parameters` are accepted but ignored inside this body.
    """
    return get_all_datasets(training_data_list)

#### With demographics

##### Finding the coefficients for all the datasets that were included for training for NYUMC

In [174]:
# Files loaded as training data for the NYUMC run; nyumc.csv itself is kept
# apart and referenced through filename_.
training_data_nyumc = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# Folder containing the dataset CSV files (get_all_datasets reads this global).
training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = training_directory+'nyumc.csv'

In [187]:
cols = ['goviral', 'fluwatch', 'hongkong', 'hutterite', 'clinically_collected', 'individually_reported', 'health_worker', 'female', 'male', 'age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+', 'population']
demo_nyumc = return_parameters(directory_+'with_demographics_nyumc.csv',cols)
demo_nyumc.keys()

dict_keys(['age 16-44', 'age 45-64', 'clinically_collected', 'individually_reported', 'hutterite', 'female', 'age 5-15', 'age 65+', 'fluwatch', 'goviral', 'hongkong', 'health_worker', 'age 0-4', 'male', 'population'])

In [180]:
print("With demographics!")

With demographics!


In [188]:
data_nyumc = process(training_data_nyumc,training_directory,filename_,demo_nyumc)
