In [1]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,roc_curve
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
import random
random.seed(1)
from sklearn.linear_model import Lasso



* The symptoms included are as follows:

In [2]:
symptoms = ['intercept',
            'fever',
            'sorethroat',
            'cough',
            'muscle',
            'headache',
            'fatigue',
            'vomit',
            'nausea',
            'diarrhea',
            'chills',
            'sneeze',
            'shortness of breath',
            'phlegm',
            'blockednose',
            'earache',
            'leg pain',
            'runnynose',
            'virus']

In [3]:
print(symptoms)

['intercept', 'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue', 'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze', 'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain', 'runnynose', 'virus']


In [4]:
def read_file(filename):
    data = pd.read_csv(filename)
    data['intercept'] = 1
    columns = list(data.columns)
    columns = columns[-1:] + columns[:-1]
    data = data[columns]
#     train_data = data.drop(['virus'],axis =1).as_matrix()
    return data

In [5]:
def read_parameters(filename):
    parameters = pd.read_csv(filename)
    return parameters

#### Get the parameters for the different dataset combinations

In [35]:
directory_ = "./Generated_Parameters_3/"
with_demographics_ = ['with_demographics_nyumc.csv','with_demographics_goviral.csv','with_demographics_fluwatch.csv','with_demographics_hongkong.csv','with_demographics_hutterite.csv']
with_demographic_parameters = defaultdict()

In [36]:
def return_parameters(file,parameters_of):
    param = read_parameters(file)
    parameter_dict = defaultdict()
    for i in parameters_of:
        parameter_dict[i] = list(param[i])
    return parameter_dict
    

In [37]:
def get_parameters(dataset_name,parameters):
    return np.array(list(parameters[dataset_name]))

In [38]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [39]:
def get_results(param,sample_points):
    return sigmoid(np.dot(param,sample_points.T)  )

In [40]:
def save_results_for_finding_threshold(filename,dataframe,predicted):
    results = pd.DataFrame()
    results['Actual'] = dataframe['virus']
    results['Predicted'] = predicted
    print(results.head())
    results.to_csv(filename,index = False)

In [41]:
def get_all_datasets(training_data_,training_directory):
    datasets = defaultdict()
    for i in training_data_:
        data = read_file(training_directory+i)
        datasets[i[:-4]] = (data)
    return datasets

In [42]:
def get_all_results(data_dict,param):
    results = defaultdict()
    for i in list(param.keys()):
        data,train = data_dict[i]
        results[i] = get_results(param[i],train)
    return results

In [43]:
def result_statistics(list_):
#     print("Min : ",min(list_))
#     print("Max : ",max(list_))
#     print("Mean : ",np.mean(list_))
#     print("Standard Deviation : ",np.std(list_))
    return min(list_),max(list_)

In [44]:
def return_class(threshold,list_):
    ans = [1 if x >= threshold else 0 for x in list_]
    return ans

def metrics_pred(list1,list2):
    f1 =f1_score(list1,list2)
    precision = precision_score(list1,list2)
    recall = recall_score(list1,list2)
    accuracy = accuracy_score(list1,list2)
    fpr,tpr,threshold = roc_curve(list1,list2)
    auc = metrics.auc(fpr,tpr)
#     print("f1 score : ",f1)
#     print("Precision score : ",precision)
#     print("Recall : ",recall)
#     print("Accuracy : ",accuracy)
#     print("Area under the curve : ",auc)
    return auc

In [45]:
def find_threshold(min_,max_,list1,list2,step_size = 1e-3):
    auc_thresholds = defaultdict()
    value = min_
    while value < max_:
        auc_thresholds[value] = metrics_pred(list1,return_class(value,list2))
        value += step_size
    optimal_threshold = max(auc_thresholds.items(), key=lambda x: x[1]) 
    return optimal_threshold

In [46]:
def get_threshold(pred,true):
    min_,max_ = result_statistics(pred)
    threshold = find_threshold(min_,max_,true,pred)
    return threshold

In [47]:
def return_all_thresholds(results,data,y_true):
    thresholds = defaultdict()
    for i in list(data.keys()):
        print("_____________________")
        min_,max_ = result_statistics(results[i])
        
        threshold = find_threshold(min_,max_,y_true[i],results[i])
        print("Found threshold for : ",i)
        thresholds[i] = threshold
    return thresholds

In [48]:

def test(filename_,param,thresholds_):
    aucs = defaultdict()
    data,train = read_file(filename_)
    for i in list(param.keys()):
        test_results = get_results(param[i],train)
        auc_ = metrics_pred(data['virus'],return_class(thresholds_[i][0],test_results))
        aucs[i] = auc_
    return aucs

In [49]:
def return_final_auc_scores(training_data_,training_directory,filename_,parameters):
    data = get_all_datasets(training_data_)
    results = get_all_results(data,parameters)
    #find the thresholds
    thresholds = return_all_thresholds(results,data)
    #get the auc values
    aucs_= test(filename_,parameters,thresholds)
    return aucs_


In [50]:
def create_dict(dict_):
    temp = []
    for k,v in dict_.items():
        temp.append((k,v))
    return temp
        

In [51]:
results_symp = defaultdict()
results_demo = defaultdict()

#### Get the symptoms

In [52]:
def get_gender(dataframe_):
    df = dataframe_[['male','female']]
    temp = df.apply(lambda x:x.argmax(),axis =1)
    return temp

In [53]:
def get_age(dataframe_):
    df = dataframe_[['age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+']]
    temp = df.apply(lambda x: x.argmax(), axis=1)
    return temp

In [54]:
def get_predictions(name,train,param_dict,temp_age,temp_gender,collection_mode = 'clinically_collected',population ='population'):
    results = []
    for i in range(train.shape[0]):
        sample_point = train[i,:]
        p_data = get_results(param_dict[name],sample_point)
#         p_collection = get_results(param_dict[collection_mode],sample_point)
#         p_gender = get_results(param_dict[temp_gender[i]],sample_point)
#         p_age = get_results(param_dict[temp_age[i]],sample_point)
#         p_population = get_results(param_dict[population],sample_point)
        result = [p_data]
        results.append(result)
    return results
    

In [55]:
def get_coeff(X,Y):
    lm = linear_model.LogisticRegression()
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 10)
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)
    acc = accuracy_score(y_test,y_pred)
#     print("Accuracy :",acc)
    fpr,tpr,threshold = roc_curve(y_test,y_pred)
    auc_score = metrics.auc(fpr,tpr)
#     print("AUC :",auc_score)
    coefficients = lm.coef_.tolist()[0]
    print("Coefficients : ",coefficients)
    intercept = lm.intercept_.tolist()[0]
    return coefficients,intercept
    

In [56]:
def norm(list_):
    min_ = min(list_)
    max_ = max(list_)
    denom = max_ - min_
    ans = [x-min_/denom for x in list_]
    return ans

In [57]:
COLLECTION_MODE = {'nyumc':'clinically_collected',
                   'goviral':'individually_reported',
                   'fluwatch':'individually_reported',
                   'hongkong': 'health_worker',
                   'hutterite':'health_worker'}

In [58]:
def process(training_data_list,training_directory,filename_,parameters,collection_mode = COLLECTION_MODE):
    name_dataset = filename_.split('/')[-1]
    thresholds = defaultdict()
    print(name_dataset)
    data = get_all_datasets(training_data_list,training_directory)
    print("Got the data")
    print("Now finding coefficients for the the datasets!")
    weights = defaultdict()
    for i in data.keys():
        print("Analyzing the dataset : ",i)
        data_ = data[i]
        temp_age = get_age(data_)
        temp_gender = get_gender(data_)
        only_symp_data = data_[symptoms]
        only_symp_data.drop('virus',axis = 1,inplace = True)
        train_data_symp = only_symp_data.as_matrix()
        prediction = get_predictions(i,train_data_symp,parameters,temp_age,temp_gender,COLLECTION_MODE[i])
        y_true = list(data_['virus'])
        values = [i[0] for i in prediction]
#         coefficient,intercept = get_coeff(temp,y_true)
#         weights[i] = (coefficient,intercept)
#         value = np.array(np.dot(temp,np.array(weights[i][0]).T)+weights[i][1]+first)
#         values = [sigmoid(j) for j in value]
        threshold = get_threshold(values,y_true)
        print("Found threshold for ",i)
        thresholds[i] = threshold[0]
    return weights,thresholds

In [117]:
def process_test(training_data_nyumc,training_directory,filename_,parameters,weights,thresholds,collection_mode = COLLECTION_MODE):
    aucs_ = defaultdict()
    predictions = defaultdict()
    test_data = get_all_datasets([filename_],training_directory)
    name = filename_.split('.')[0]
    print("Name : ",name) 
    data_ = test_data[name]
    temp_age = get_age(data_)
    temp_gender = get_gender(data_)
    only_symp_data = data_[symptoms]
    only_symp_data.drop('virus',axis = 1,inplace = True)
    y_true = list(data_['virus'])
    train_data_symp = only_symp_data.as_matrix()
    for i in training_data_nyumc:
        i = i[:-4]
        print("Using the parameters of : ",i)
        prediction = get_predictions(i,train_data_symp,parameters,temp_age,temp_gender,COLLECTION_MODE[i])
        values = [j[0] for j in prediction]
        predictions[i] = values
    print("Got the predicitions from the different parameters")
    for i in training_data_nyumc:
        i = i[:-4]
        print(i)
        auc_ = metrics_pred(y_true,return_class(thresholds[i],predictions[i]))
        print(auc_)
        aucs_[i[:-4]] = auc_
        print("Found the auc for ",i[:-4])
    return aucs_
#         

#### With demographics

##### Generating the results for NYUMC

In [133]:
training_data_nyumc = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = 'nyumc.csv'

In [134]:
cols = ['goviral', 'fluwatch', 'hongkong', 'hutterite', 'clinically_collected', 'individually_reported', 'health_worker','population']
demo_nyumc = return_parameters(directory_+'only_symptoms_nyumc.csv',cols)
demo_nyumc.keys()

dict_keys(['goviral', 'fluwatch', 'hongkong', 'hutterite', 'clinically_collected', 'individually_reported', 'health_worker', 'population'])

In [135]:
print("With demographics!")

With demographics!


In [136]:
weights_nyumc,thresholds_nyumc = process(training_data_nyumc,training_directory,filename_,demo_nyumc)

nyumc.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  goviral
Found threshold for  goviral
Analyzing the dataset :  fluwatch
Found threshold for  fluwatch
Analyzing the dataset :  hongkong
Found threshold for  hongkong
Analyzing the dataset :  hutterite
Found threshold for  hutterite


In [137]:
thresholds_nyumc

defaultdict(None,
            {'fluwatch': 0.59018767927501981,
             'goviral': 0.57397113252353771,
             'hongkong': 0.69671047566889888,
             'hutterite': 0.52704347720486799})

In [138]:
weights_nyumc

defaultdict(None, {})

In [139]:
aucs_nyumc = process_test(training_data_nyumc,training_directory,filename_,demo_nyumc,weights_nyumc,thresholds_nyumc)

Name :  nyumc
Using the parameters of :  goviral
Using the parameters of :  fluwatch
Using the parameters of :  hongkong
Using the parameters of :  hutterite
Got the predicitions from the different parameters
goviral
0.499172393038
Found the auc for  gov
fluwatch
0.5
Found the auc for  fluw
hongkong
0.645546249875
Found the auc for  hong
hutterite
0.915425290172
Found the auc for  hutte


In [140]:
aucs_nyumc

defaultdict(None,
            {'fluw': 0.5,
             'gov': 0.49917239303806993,
             'hong': 0.64554624987492326,
             'hutte': 0.91542529017165486})

##### Generating the results for Goviral

In [142]:
training_data_goviral = ['nyumc.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = 'goviral.csv'

In [143]:
cols = ['nyumc', 'fluwatch', 'hongkong', 'hutterite', 'clinically_collected', 'individually_reported', 'health_worker','population']
demo_goviral = return_parameters(directory_+'only_symptoms_goviral.csv',cols)
demo_goviral.keys()

dict_keys(['nyumc', 'fluwatch', 'hongkong', 'hutterite', 'clinically_collected', 'individually_reported', 'health_worker', 'population'])

In [144]:
weights_goviral,thresholds_goviral = process(training_data_goviral,training_data_goviral,training_directory,filename_,demo_goviral)




TypeError: can only concatenate list (not "str") to list

In [123]:
aucs_goviral1 = process_test(training_directory,filename_,demo_goviral,weights_goviral,thresholds_goviral)

TypeError: process_test() missing 1 required positional argument: 'thresholds'

In [None]:
aucs_goviral1


##### Generating the results for fluwatch

In [None]:
training_data_fluwatch = ['nyumc.csv','goviral.csv','hongkong.csv','hutterite.csv']
training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = 'fluwatch.csv'

In [None]:
cols = ['nyumc', 'goviral', 'hongkong', 'hutterite', 'clinically_collected', 'individually_reported', 'health_worker', 'female', 'male', 'age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+', 'population']
demo_fluwatch = return_parameters(directory_+'with_demographics_fluwatch.csv',cols)
demo_fluwatch.keys()

In [None]:
weights_fluwatch,thresholds_fluwatch = process(training_data_fluwatch,training_directory,filename_,demo_fluwatch)

In [None]:
aucs_fluwatch1 = process_test(training_directory,filename_,demo_fluwatch,weights_fluwatch,thresholds_fluwatch)

In [None]:
aucs_fluwatch1

##### Generating the results for HongKong

In [None]:
training_data_hongkong = ['nyumc.csv','goviral.csv','fluwatch.csv','hutterite.csv']
training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = 'hongkong.csv'

In [None]:
cols = ['nyumc', 'goviral', 'fluwatch', 'hutterite', 'clinically_collected', 'individually_reported', 'health_worker', 'female', 'male', 'age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+', 'population']
demo_hongkong = return_parameters(directory_+'with_demographics_hongkong.csv',cols)
demo_hongkong.keys()

In [None]:
weights_hongkong,thresholds_hongkong = process(training_data_hongkong,training_directory,filename_,demo_hongkong)

In [None]:
aucs_hongkong = process_test(training_directory,filename_,demo_hongkong,weights_hongkong,thresholds_hongkong)

In [None]:
aucs_hongkong

##### Generating results for hutterite

In [None]:
training_data_hutterite = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv']
training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = 'hutterite.csv'

In [None]:
cols = ['nyumc', 'goviral', 'fluwatch', 'hongkong', 'clinically_collected', 'individually_reported', 'health_worker', 'female', 'male', 'age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+', 'population']
demo_hutterite = return_parameters(directory_+'with_demographics_hutterite.csv',cols)
demo_hutterite.keys()

In [None]:
weights_hutterite,thresholds_hutterite = process(training_data_hutterite,training_directory,filename_,demo_hutterite)

In [None]:
aucs_hutterite = process_test(training_directory,filename_,demo_hutterite,weights_hutterite,thresholds_hutterite)

In [None]:
aucs_hutterite

In [None]:
aucs_ = defaultdict()
aucs_['nyumc'] = aucs_nyumc
aucs_['goviral'] = aucs_goviral1
aucs_['fluwatch'] = aucs_fluwatch1
aucs_['hongkong'] = aucs_hongkong
aucs_['hutterite'] = aucs_hutterite

In [None]:
aucs_

In [None]:
def create_plots(columns,dict_,name,order,label,title):
    df = pd.DataFrame(columns)
    df_list = []
    for i in dict_.keys():
        df_list.append(pd.DataFrame({k:[v] for (k,v) in dict(dict_[i]).items()}))
    df = pd.concat(df_list)
#     df.fillna(0,inplace = True)
    df[name] = columns
    df.set_index(name,inplace = True)
    df = df[order]
    sns.set()
    ax = plt.axes()
    sns.heatmap(df,annot=True,linewidth = 0.8,ax = ax,cbar_kws = {'label' : label},fmt="f")
    ax.set_title(title)
    ax.set_xlabel('Predicted using datatset')
    plt.show()
    
    return df

In [None]:
nyumc = create_plots(['nyumc','goviral','fluwatch','hongkong','hutterite'], 
                     aucs_, 
                     'Dataset',
                     ['hutterite','hongkong','fluwatch','goviral','nyumc'],'AUC Scores ','Hierarchical : AUC scores with demographics')

In [None]:
aucs_