In [1]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,roc_curve
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
import random
random.seed(1)
from sklearn.linear_model import Lasso



* The symptoms included are as follows:

In [2]:
symptoms = ['intercept',
            'fever',
            'sorethroat',
            'cough',
            'muscle',
            'headache',
            'fatigue',
            'vomit',
            'nausea',
            'diarrhea',
            'chills',
            'sneeze',
            'shortness of breath',
            'phlegm',
            'blockednose',
            'earache',
            'leg pain',
            'runnynose',
            'virus']
aucs_ = defaultdict()

In [3]:
print(symptoms)

['intercept', 'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue', 'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze', 'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain', 'runnynose', 'virus']


In [4]:
def read_file(filename):
    data = pd.read_csv(filename)
    data['intercept'] = 1
    columns = list(data.columns)
    columns = columns[-1:] + columns[:-1]
    data = data[columns]
#     train_data = data.drop(['virus'],axis =1).as_matrix()
    return data

In [5]:
def read_parameters(filename):
    parameters = pd.read_csv(filename)
    return parameters

#### Get the parameters for the different dataset combinations

In [13]:
directory_ = "./Parameters_Age/"
with_demographics_ = ['with_demographics_new.csv']
with_demographic_parameters = defaultdict()

In [14]:
def return_parameters(file,parameters_of):
    param = read_parameters(file)
    parameter_dict = defaultdict()
    for i in parameters_of:
        parameter_dict[i] = list(param[i])
    return parameter_dict
    

In [15]:
def get_parameters(dataset_name,parameters):
    return np.array(list(parameters[dataset_name]))

In [16]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [17]:
def get_results(param,sample_points):
    return sigmoid(np.dot(param,sample_points.T)  )

In [18]:
def save_results_for_finding_threshold(filename,dataframe,predicted):
    results = pd.DataFrame()
    results['Actual'] = dataframe['virus']
    results['Predicted'] = predicted
    print(results.head())
    results.to_csv(filename,index = False)

In [19]:
def get_all_datasets(training_data_,training_directory):
    datasets = defaultdict()
    for i in training_data_:
        data = read_file(training_directory+i)
        datasets[i[:-4]] = (data)
    return datasets

In [20]:
def get_all_results(data_dict,param):
    results = defaultdict()
    for i in list(param.keys()):
        data,train = data_dict[i]
        results[i] = get_results(param[i],train)
    return results

In [21]:
def result_statistics(list_):
#     print("Min : ",min(list_))
#     print("Max : ",max(list_))
#     print("Mean : ",np.mean(list_))
#     print("Standard Deviation : ",np.std(list_))
    return min(list_),max(list_)

In [22]:
def return_class(threshold,list_):
    ans = [1 if x >= threshold else 0 for x in list_]
    return ans

def metrics_pred(list1,list2):
    f1 =f1_score(list1,list2)
    precision = precision_score(list1,list2)
    recall = recall_score(list1,list2)
    accuracy = accuracy_score(list1,list2)
    fpr,tpr,threshold = roc_curve(list1,list2)
    auc = metrics.auc(fpr,tpr)
#     print("f1 score : ",f1)
#     print("Precision score : ",precision)
#     print("Recall : ",recall)
#     print("Accuracy : ",accuracy)
#     print("Area under the curve : ",auc)
    return auc

In [23]:
def find_threshold(min_,max_,list1,list2,step_size = 1e-3):
    auc_thresholds = defaultdict()
    value = min_
    while value < max_:
        auc_thresholds[value] = metrics_pred(list1,return_class(value,list2))
        value += step_size
    optimal_threshold = max(auc_thresholds.items(), key=lambda x: x[1]) 
    return optimal_threshold

In [24]:
def get_threshold(pred,true):
    min_,max_ = result_statistics(pred)
    threshold = find_threshold(min_,max_,true,pred)
    return threshold

In [25]:
def return_all_thresholds(results,data,y_true):
    thresholds = defaultdict()
    for i in list(data.keys()):
        print("_____________________")
        min_,max_ = result_statistics(results[i])
        
        threshold = find_threshold(min_,max_,y_true[i],results[i])
        print("Found threshold for : ",i)
        thresholds[i] = threshold
    return thresholds

In [26]:

def test(filename_,param,thresholds_):
    aucs = defaultdict()
    data,train = read_file(filename_)
    for i in list(param.keys()):
        test_results = get_results(param[i],train)
        auc_ = metrics_pred(data['virus'],return_class(thresholds_[i][0],test_results))
        aucs[i] = auc_
    return aucs

In [27]:
def return_final_auc_scores(training_data_,training_directory,filename_,parameters):
    data = get_all_datasets(training_data_)
    results = get_all_results(data,parameters)
    #find the thresholds
    thresholds = return_all_thresholds(results,data)
    #get the auc values
    aucs_= test(filename_,parameters,thresholds)
    return aucs_


In [28]:
def create_dict(dict_):
    temp = []
    for k,v in dict_.items():
        temp.append((k,v))
    return temp
        

In [29]:
results_symp = defaultdict()
results_demo = defaultdict()

#### Get the symptoms

In [30]:
def get_gender(dataframe_):
    df = dataframe_[['male','female']]
    temp = df.apply(lambda x:x.argmax(),axis =1)
    return temp

In [31]:
def get_age(dataframe_):
    df = dataframe_[['age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+']]
    temp = df.apply(lambda x: x.argmax(), axis=1)
    return temp

In [32]:
def get_predictions_all(name,train,only_symp_age,only_symp_gender,param_dict,temp_age,temp_gender,collection_mode = 'clinically_collected',population ='population'):
    results = []
    for i in range(train.shape[0]):
        result = []
        sample_point = train[i,:]
        gender = list(only_symp_gender.iloc[i][:])
        age = list(only_symp_age.iloc[i][:])
        p_data = get_results(param_dict[name],sample_point)
        result.append(p_data)
#         result.append(gender[0]*get_results(param_dict['male'],sample_point))
#         result.append(gender[1]*get_results(param_dict['female'],sample_point))

        result.append(age[0]*get_results(param_dict['age 0-4'],sample_point))
        result.append(age[1]*get_results(param_dict['age 5-15'],sample_point))
        result.append(age[2]*get_results(param_dict['age 16-44'],sample_point))
        result.append(age[3]*get_results(param_dict['age 45-64'],sample_point))
        result.append(age[4]*get_results(param_dict['age 65+'],sample_point))
        
#         p_collection = get_results(param_dict[collection_mode],sample_point)
#         p_gender = get_results(param_dict[temp_gender[i]],sample_point)
#         p_age = get_results(param_dict[temp_age[i]],sample_point)
# #         p_population = get_results(param_dict[population],sample_point)
#         result = [p_data,p_collection,p_gender+p_age]
        results.append(result)
    return results
    
    

In [33]:
def get_coeff(X,Y):
    lm = linear_model.LogisticRegression()
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 10)
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)
    acc = accuracy_score(y_test,y_pred)
#     print("Accuracy :",acc)
    fpr,tpr,threshold = roc_curve(y_test,y_pred)
    auc_score = metrics.auc(fpr,tpr)
#     print("AUC :",auc_score)
    coefficients = lm.coef_.tolist()[0]
    print("Coefficients : ",coefficients)
    intercept = lm.intercept_.tolist()[0]
    return coefficients,intercept
    

In [34]:
def norm(list_):
    min_ = min(list_)
    max_ = max(list_)
    denom = max_ - min_
    ans = [x-min_/denom for x in list_]
    return ans

In [35]:
COLLECTION_MODE = {'nyumc':'clinically_collected',
                   'goviral':'individually_reported',
                   'fluwatch':'individually_reported',
                   'hongkong': 'health_worker',
                   'hutterite':'health_worker'}

In [43]:
def process_all(training_data_list,training_directory,filename_,parameters,collection_mode = COLLECTION_MODE):
    name_dataset = filename_.split('/')[-1]
    thresholds = defaultdict()
    print(name_dataset)
    data = get_all_datasets(training_data_list,training_directory)
    print("Got the data")
    print("Now finding coefficients for the the datasets!")
    weights = defaultdict()
    for i in data.keys():
        print("Analyzing the dataset : ",i)
        data_ = data[i]
        temp_age = get_age(data_)
        temp_gender = get_gender(data_)
        only_symp_data = data_[symptoms]
        only_symp_gender = data_[['male','female']]
        only_symp_age = data_[['age 0-4','age 5-15','age 16-44','age 45-64','age 65+']]
        only_symp_data.drop('virus',axis = 1,inplace = True)
        train_data_symp = only_symp_data.as_matrix()
        prediction = get_predictions_all(i,train_data_symp,only_symp_age,only_symp_gender,parameters,temp_age,temp_gender,COLLECTION_MODE[i])
        y_true = list(data_['virus'])
        coefficient,intercept = get_coeff(prediction,y_true)
        weights[i] = (coefficient,intercept)
        value = np.array(np.dot(prediction,np.array(weights[i][0]).T)+weights[i][1])
        values = [sigmoid(j) for j in value]
        
        
        threshold = get_threshold(values,y_true)
        print("Found threshold for ",i)
        thresholds[i] = threshold[0]
        ans = [(y_true[i],values[i]) for i in range(len(y_true))]
    return weights,thresholds,ans

In [61]:
def process_test(training_directory,filename_,parameters,weights,thresholds,collection_mode = COLLECTION_MODE):
    aucs_ = defaultdict()
    predictions = defaultdict()
    test_data = get_all_datasets([filename_],training_directory)
    name = filename_.split('.')[0]
    print("Name : ",name) 
    data_ = test_data[name]
    temp_age = get_age(data_)
    temp_gender = get_gender(data_)
    only_symp_data = data_[symptoms]
    only_symp_data.drop('virus',axis = 1,inplace = True)
    only_symp_gender = data_[['male','female']]
    only_symp_age = data_[['age 0-4','age 5-15','age 16-44','age 45-64','age 65+']]
    y_true = list(data_['virus'])
    train_data_symp = only_symp_data.as_matrix()
    for i in weights.keys():
        print("Using the parameters of : ",i)
        prediction = get_predictions_all(i,train_data_symp,only_symp_age,only_symp_gender,parameters,temp_age,temp_gender,COLLECTION_MODE[i])
#         temp = [i[1:] for i in prediction]
#         first = [i[0] for i in prediction]
        prediction = np.array(prediction)
        value = np.array(np.dot(prediction,np.array(weights[i][0]).T)+weights[i][1])
        values = [sigmoid(i) for i in value]
        predictions[i] = values
   
    print("Got the predicitions from the different parameters")
    for i in weights.keys():
        auc_ = metrics_pred(y_true,return_class(thresholds[i],predictions[i]))
        aucs_[i] = auc_
        print("Found the auc for ",i)
    return aucs_
#         

#### With demographics

##### Generating the results for NYUMC

In [53]:
training_data_nyumc = ['goviral.csv']
training_directory = "../../Data/Symptoms_Demo/Train/"
testing_directory = "../../Data/Symptoms_Demo/Test/"
filename_ = 'goviral.csv'

In [54]:
cols = ['goviral',  'individually_reported', 'health_worker', 'age 0-4','age 5-15','age 16-44','age 45-64','age 65+', 'population']
demo_nyumc = return_parameters(directory_+'with_demographics_new.csv',cols)
demo_nyumc.keys()

dict_keys(['age 45-64', 'age 16-44', 'age 65+', 'age 0-4', 'population', 'individually_reported', 'age 5-15', 'goviral', 'health_worker'])

In [55]:
print("With demographics!")

With demographics!


In [56]:
weights_nyumc,thresholds_nyumc,pred = process_all(training_data_nyumc,training_directory,filename_,demo_nyumc)

goviral.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  goviral
Coefficients :  [2.2169113027114857, 0.7529693784700504, -0.3176581623014775, 2.1155451345499166, 1.2440772444435613, 1.5097576564968958]
Found threshold for  goviral


In [57]:
thresholds_nyumc

defaultdict(None, {'goviral': 0.5745458310588295})

In [58]:
weights_nyumc

defaultdict(None,
            {'goviral': ([2.2169113027114857,
               0.7529693784700504,
               -0.3176581623014775,
               2.1155451345499166,
               1.2440772444435613,
               1.5097576564968958],
              -2.41142767356107)})

In [62]:
aucs_nyumc = process_test(testing_directory,filename_,demo_nyumc,weights_nyumc,thresholds_nyumc)

Name :  goviral
Using the parameters of :  goviral
Got the predicitions from the different parameters
Found the auc for  goviral


In [63]:
aucs_nyumc

defaultdict(None, {'goviral': 0.6827671913835958})

In [64]:
aucs_['goviral'] = aucs_nyumc

##### Generating the results for FluWatch

In [65]:
training_data_goviral = ['fluwatch.csv']
# training_directory = "../../Data/Symptoms_Demo/Balanced_Data/Train/"
filename_ = 'fluwatch.csv'

In [67]:
cols = ['fluwatch',  'individually_reported', 'health_worker', 'age 0-4','age 5-15','age 16-44','age 45-64','age 65+', 'population']
demo_goviral = return_parameters(directory_+'with_demographics_new.csv',cols)
demo_goviral.keys()

dict_keys(['age 45-64', 'population', 'age 65+', 'age 0-4', 'individually_reported', 'fluwatch', 'age 5-15', 'age 16-44', 'health_worker'])

In [68]:
weights_goviral,thresholds_goviral,ans = process_all(training_data_goviral,training_directory,filename_,demo_goviral)

fluwatch.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  fluwatch
Coefficients :  [0.3678178968555492, 0.6325855781707181, 0.7439766779519751, 0.4990589828294488, 0.8938304829036621, 0.9014762627864858]
Found threshold for  fluwatch


In [69]:
aucs_goviral1 = process_test(testing_directory,filename_,demo_goviral,weights_goviral,thresholds_goviral)

Name :  fluwatch
Using the parameters of :  fluwatch
Got the predicitions from the different parameters
Found the auc for  fluwatch


In [70]:
aucs_goviral1

defaultdict(None, {'fluwatch': 0.5860521562429442})

In [71]:
aucs_['nyumc'] = aucs_goviral1


##### Generating the results for hongkong

In [72]:
training_data_fluwatch = ['hongkong.csv']
# training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = 'hongkong.csv'

In [73]:
cols = ['hongkong',  'individually_reported', 'health_worker', 'age 0-4','age 5-15','age 16-44','age 45-64','age 65+', 'population']
demo_fluwatch = return_parameters(directory_+'with_demographics_new.csv',cols)
demo_fluwatch.keys()

dict_keys(['age 45-64', 'hongkong', 'age 65+', 'age 0-4', 'population', 'individually_reported', 'age 5-15', 'age 16-44', 'health_worker'])

In [74]:
weights_fluwatch,thresholds_fluwatch,ans = process_all(training_data_fluwatch,training_directory,filename_,demo_fluwatch)

hongkong.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  hongkong
Coefficients :  [0.30299531993630846, 6.210442628999582, 7.340794408257192, 7.558320157981965, 7.2541358649368135, 6.2621319748346815]
Found threshold for  hongkong


In [75]:
aucs_fluwatch1 = process_test(testing_directory,filename_,demo_fluwatch,weights_fluwatch,thresholds_fluwatch)

Name :  hongkong
Using the parameters of :  hongkong
Got the predicitions from the different parameters
Found the auc for  hongkong


In [76]:
aucs_fluwatch1

defaultdict(None, {'hongkong': 0.9349961213722008})

In [77]:
aucs_['hongkong'] = aucs_fluwatch1

##### Generating the results for Hutterite

In [79]:
training_data_hongkong = ['hutterite.csv']
# training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = 'hutterite.csv'

In [80]:
cols = ['hutterite',  'individually_reported', 'health_worker', 'age 0-4','age 5-15','age 16-44','age 45-64','age 65+', 'population']
demo_hongkong = return_parameters(directory_+'with_demographics_new.csv',cols)
demo_hongkong.keys()

dict_keys(['age 45-64', 'population', 'age 65+', 'age 0-4', 'individually_reported', 'age 5-15', 'age 16-44', 'hutterite', 'health_worker'])

In [81]:
weights_hongkong,thresholds_hongkong,ans = process_all(training_data_hongkong,training_directory,filename_,demo_hongkong)

hutterite.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  hutterite
Coefficients :  [1.9475855592665567, 1.3180521894619541, 1.7345791144586418, 1.5549686441273287, 2.183000572425849, 1.4018967820285873]
Found threshold for  hutterite


In [82]:
aucs_hongkong = process_test(testing_directory,filename_,demo_hongkong,weights_hongkong,thresholds_hongkong)

Name :  hutterite
Using the parameters of :  hutterite
Got the predicitions from the different parameters
Found the auc for  hutterite


In [83]:
aucs_hongkong

defaultdict(None, {'hutterite': 0.6947162426614482})

In [84]:
aucs_['hutterite'] = aucs_hongkong

In [85]:
aucs_

defaultdict(None,
            {'goviral': defaultdict(None, {'goviral': 0.6827671913835958}),
             'hongkong': defaultdict(None, {'hongkong': 0.9349961213722008}),
             'hutterite': defaultdict(None, {'hutterite': 0.6947162426614482}),
             'nyumc': defaultdict(None, {'fluwatch': 0.5860521562429442})})

In [237]:
def create_plots(columns,dict_,name,order,label,title):
    df = pd.DataFrame()
    df_list = []
    for i in columns:
        print(i)
        df_list.append(pd.DataFrame({k:[v] for (k,v) in dict(dict_[i]).items()}))
    df = pd.concat(df_list)
#     df.fillna(0,inplace = True)
    df[name] = columns
    df.set_index(name,inplace = True)
    df = df[order]
    print(df)
    sns.set()
    ax = plt.axes()
    sns.heatmap(df,annot=True,linewidth = 0.8,ax = ax,cbar_kws = {'label' : label},fmt="f")
    ax.set_title(title)
    ax.set_xlabel('Predicted using datatset')
    plt.show()
    
    return df

In [238]:
nyumc = create_plots(['fluwatch','hutterite','hongkong','nyumc','goviral'], 
                     aucs_, 
                     'Dataset',
                     ['goviral','nyumc','hongkong','hutterite','fluwatch'],'AUC Scores ','Hierarchical : AUC scores with demographics')

fluwatch


KeyError: 'fluwatch'

In [None]:
nyumc = create_plots(['nyumc','goviral','fluwatch','hongkong','hutterite'], 
                     aucs_, 
                     'Dataset',
                     ['hutterite','hongkong','fluwatch','goviral','nyumc'],'AUC Scores ','Hierarchical : AUC scores with demographics')

In [None]:
nyumc = create_plots(['nyumc','goviral','fluwatch','hongkong','hutterite'], 
                     aucs_, 
                     'Dataset',
                     ['hutterite','hongkong','fluwatch','goviral','nyumc'],'AUC Scores ','Hierarchical : AUC scores with demographics')