In [1]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,roc_curve
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
import random
random.seed(1)
from sklearn.linear_model import Lasso



* The symptoms included are as follows:

In [49]:
symptoms = ['intercept','fever','sorethroat','cough','muscle','headache','fatigue','vomit','nausea','diarrhea','chills','sneeze','shortness of breath','phlegm','blockednose','earache','leg pain','runnynose','virus']
age = ['age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+']
gender = ['male','female']

aucs_ = defaultdict()

In [3]:
print(symptoms)

['intercept', 'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue', 'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze', 'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain', 'runnynose', 'virus']


In [4]:
def read_file(filename):
    data = pd.read_csv(filename)
    data['intercept'] = 1
    columns = list(data.columns)
    columns = columns[-1:] + columns[:-1]
    data = data[columns]
#     train_data = data.drop(['virus'],axis =1).as_matrix()
    return data

In [5]:
def read_parameters(filename):
    parameters = pd.read_csv(filename)
    return parameters

#### Get the parameters for the different dataset combinations

In [6]:
directory_ = "./Generated_Parameters_5/"
with_demographics_ = ['with_demographics_nyumc.csv','with_demographics_goviral.csv','with_demographics_fluwatch.csv','with_demographics_hongkong.csv','with_demographics_hutterite.csv']
with_demographic_parameters = defaultdict()

In [7]:
def return_parameters(file,parameters_of):
    param = read_parameters(file)
    parameter_dict = defaultdict()
    for i in parameters_of:
        parameter_dict[i] = list(param[i])
    return parameter_dict
    

In [8]:
def get_parameters(dataset_name,parameters):
    return np.array(list(parameters[dataset_name]))

In [9]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [10]:
def get_results(param,sample_points):
    return sigmoid(np.dot(param,sample_points.T)  )

In [11]:
def save_results_for_finding_threshold(filename,dataframe,predicted):
    results = pd.DataFrame()
    results['Actual'] = dataframe['virus']
    results['Predicted'] = predicted
    print(results.head())
    results.to_csv(filename,index = False)

In [12]:
def get_all_datasets(training_data_,training_directory):
    datasets = defaultdict()
    for i in training_data_:
        data = read_file(training_directory+i)
        datasets[i[:-4]] = (data)
    return datasets

In [13]:
def get_all_results(data_dict,param):
    results = defaultdict()
    for i in list(param.keys()):
        data,train = data_dict[i]
        results[i] = get_results(param[i],train)
    return results

In [14]:
def result_statistics(list_):
#     print("Min : ",min(list_))
#     print("Max : ",max(list_))
#     print("Mean : ",np.mean(list_))
#     print("Standard Deviation : ",np.std(list_))
    return min(list_),max(list_)

In [15]:
def return_class(threshold,list_):
    ans = [1 if x >= threshold else 0 for x in list_]
    return ans

def metrics_pred(list1,list2):
    f1 =f1_score(list1,list2)
    precision = precision_score(list1,list2)
    recall = recall_score(list1,list2)
    accuracy = accuracy_score(list1,list2)
    fpr,tpr,threshold = roc_curve(list1,list2)
    auc = metrics.auc(fpr,tpr)
#     print("f1 score : ",f1)
#     print("Precision score : ",precision)
#     print("Recall : ",recall)
#     print("Accuracy : ",accuracy)
#     print("Area under the curve : ",auc)
    return auc

In [16]:
def find_threshold(min_,max_,list1,list2,step_size = 1e-3):
    auc_thresholds = defaultdict()
    value = min_
    while value < max_:
        auc_thresholds[value] = metrics_pred(list1,return_class(value,list2))
        value += step_size
    optimal_threshold = max(auc_thresholds.items(), key=lambda x: x[1]) 
    return optimal_threshold

In [17]:
def get_threshold(pred,true):
    min_,max_ = result_statistics(pred)
    threshold = find_threshold(min_,max_,true,pred)
    return threshold

In [18]:
def return_all_thresholds(results,data,y_true):
    thresholds = defaultdict()
    for i in list(data.keys()):
        print("_____________________")
        min_,max_ = result_statistics(results[i])
        
        threshold = find_threshold(min_,max_,y_true[i],results[i])
        print("Found threshold for : ",i)
        thresholds[i] = threshold
    return thresholds

In [19]:

def test(filename_,param,thresholds_):
    aucs = defaultdict()
    data,train = read_file(filename_)
    for i in list(param.keys()):
        test_results = get_results(param[i],train)
        auc_ = metrics_pred(data['virus'],return_class(thresholds_[i][0],test_results))
        aucs[i] = auc_
    return aucs

In [20]:
def return_final_auc_scores(training_data_,training_directory,filename_,parameters):
    data = get_all_datasets(training_data_)
    results = get_all_results(data,parameters)
    #find the thresholds
    thresholds = return_all_thresholds(results,data)
    #get the auc values
    aucs_= test(filename_,parameters,thresholds)
    return aucs_


In [21]:
def create_dict(dict_):
    temp = []
    for k,v in dict_.items():
        temp.append((k,v))
    return temp
        

In [22]:
results_symp = defaultdict()
results_demo = defaultdict()

#### Get the symptoms

In [23]:
def get_gender(dataframe_):
    df = dataframe_[['male','female']]
    temp = df.apply(lambda x:x.argmax(),axis =1)
    return temp

In [24]:
def get_age(dataframe_):
    df = dataframe_[['age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+']]
    temp = df.apply(lambda x: x.argmax(), axis=1)
    return temp

In [77]:
def get_predictions_all(name,train,only_gender_data,only_age_data,param_dict,temp_age,temp_gender,collection_mode = 'clinically_collected',population ='population'):
    results = []
#     for i in range(train.shape[0]):
    for i in range(0,1):
        result = []
        sample_point = train[i,:]
        p_data = [get_results(param_dict[name],sample_point)]
        result.append(p_data)
        print("Data : ",p_data)
        p_collection = [get_results(param_dict[collection_mode],sample_point)]
        print('Collection : ',p_collection)
        result.append(p_collection)
        gender = only_gender_data.iloc[i][:]
        age = only_age_data.loc[i][:]
        p_gender = [gender['male']*get_results(param_dict['male'] ,sample_point),gender['female']*get_results(param_dict['female'] ,sample_point) ]
        print("Gender :",p_gender)
        p_age = [age['age 0-4']*get_results(param_dict['age 0-4'] ,sample_point),
                 age['age 5-15']*get_results(param_dict['age 5-15'] ,sample_point),
                 age['age 16-44']* get_results(param_dict['age 16-44'] ,sample_point),
                 age['age 45-64']*get_results(param_dict['age 45-64'] ,sample_point),
                 age['age 65+']*get_results(param_dict['age 65+'] ,sample_point)]
#         p_population = get_results(param_dict[population],sample_point)
        print("Age : ",p_age)
        result = [p_data] + [p_collection] + [p_gender] +[p_age]
        print("Final result : ",result)
#         result = [p_data,p_collection,p_gender+p_age]
        results.append(result)
    return results
    

In [63]:
def get_coeff(X,Y):
    lm = linear_model.LogisticRegression()
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 10)
    lm.fit(x_train,y_train)
    y_pred = lm.predict(x_test)
    acc = accuracy_score(y_test,y_pred)
#     print("Accuracy :",acc)
    fpr,tpr,threshold = roc_curve(y_test,y_pred)
    auc_score = metrics.auc(fpr,tpr)
#     print("AUC :",auc_score)
    coefficients = lm.coef_.tolist()[0]
    print("Coefficients : ",coefficients)
    intercept = lm.intercept_.tolist()[0]
    return coefficients,intercept
    

In [64]:
def norm(list_):
    min_ = min(list_)
    max_ = max(list_)
    denom = max_ - min_
    ans = [x-min_/denom for x in list_]
    return ans

In [65]:
COLLECTION_MODE = {'nyumc':'clinically_collected',
                   'goviral':'individually_reported',
                   'fluwatch':'individually_reported',
                   'hongkong': 'health_worker',
                   'hutterite':'health_worker'}

In [66]:
# predicitions = [prediction_dataset,
#                 prediction_collection_mode,
#                 predicition_male,
#                 prediction_female,
#                 prediction_age0-4,
#                 prediction_age5-15,
#                 prediction_age6-44,
#                 prediction_age45-64,
#                 prediction_age65+]

In [67]:
def process_all(training_data_list,training_directory,filename_,parameters,collection_mode = COLLECTION_MODE):
    name_dataset = filename_.split('/')[-1]
    thresholds = defaultdict()
    print(name_dataset)
    data = get_all_datasets(training_data_list,training_directory)
    print("Got the data")
    print("Now finding coefficients for the the datasets!")
    weights = defaultdict()
    for i in data.keys():
        print("Analyzing the dataset : ",i)
        data_ = data[i]
        temp_age = get_age(data_)
        temp_gender = get_gender(data_)
        only_symp_data = data_[symptoms]
        only_symp_data.drop('virus',axis = 1,inplace = True)
        only_age_data = data_[age]
        only_gender_data = data_[gender]
        train_data_symp = only_symp_data.as_matrix()
        prediction = get_predictions_all(i,train_data_symp,only_gender_data,only_age_data,parameters,temp_age,temp_gender,COLLECTION_MODE[i])
        print(prediction)
        y_true = list(data_['virus'])
        coefficient,intercept = get_coeff(prediction,y_true)
        weights[i] = (coefficient,intercept)
        value = np.array(np.dot(prediction,np.array(weights[i][0]).T)+weights[i][1])
        values = [sigmoid(j) for j in value]
        threshold = get_threshold(values,y_true)
        print("Found threshold for ",i)
        thresholds[i] = threshold[0]
        ans = [(y_true[i],values[i]) for i in range(len(y_true))]
    return weights,thresholds,ans

In [68]:
nyid@nationalgrid.com 0232053636

SyntaxError: invalid token (<ipython-input-68-5425f3a136ac>, line 1)

#### With demographics

##### Generating the results for HongKong

In [69]:
training_data_nyumc = ['nyumc.csv']
training_directory = "../../Data/With_Improved_Target/With_Demographics/"
filename_ = 'nyumc.csv'

In [74]:
cols = [ 'nyumc', 'clinically_collected', 'individually_reported', 'health_worker', 'female', 'male', 'age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+', 'population']
demo_nyumc = return_parameters(directory_+'with_demographics_goviral.csv',cols)

In [75]:
print("With demographics!")

With demographics!


##### All the levels

In [78]:
weights_all,thresholds_all,prediction_all = process_all(training_data_nyumc,training_directory,filename_,demo_nyumc)

nyumc.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  nyumc
Data :  [[0.4131297785795559]]
Collection :  [[0.31545297259793503]]
Gender : [0.36369572312417436, 0.0]
Age :  [0.0, 0.3640005953384614, 0.0, 0.0, 0.0]
Final result :  [[[0.4131297785795559]], [[0.31545297259793503]], [0.36369572312417436, 0.0], [0.0, 0.3640005953384614, 0.0, 0.0, 0.0]]
[[[[0.4131297785795559]], [[0.31545297259793503]], [0.36369572312417436, 0.0], [0.0, 0.3640005953384614, 0.0, 0.0, 0.0]]]


ValueError: Found input variables with inconsistent numbers of samples: [1, 21907]

In [134]:
thresholds_all

defaultdict(None, {'nyumc': 0.70308413882679366})

In [135]:
weights_all['nyumc'][0]

[-13.374995305463436, 4.172228255717748, 7.069313833670369]

##### Demographic level

In [136]:
weights_demographic,thresholds_demographic,prediction_demographic = process_demographic(training_data_nyumc,training_directory,filename_,demo_nyumc)

nyumc.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  nyumc
Coefficients :  [8.441103973703346]
Found threshold for  nyumc


In [137]:
weights_demographic['nyumc'][0]

[8.441103973703346]

In [138]:
thresholds_demographic

defaultdict(None, {'nyumc': 0.56501044335588047})

##### Collection level

In [139]:
weights_collection,thresholds_collection,prediction_collection = process_collection(training_data_nyumc,training_directory,filename_,demo_nyumc)

nyumc.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  nyumc
Coefficients :  [14.637663901470939]
Found threshold for  nyumc


In [140]:
weights_collection['nyumc'][0]

[14.637663901470939]

In [141]:
thresholds_collection

defaultdict(None, {'nyumc': 0.40504697887531044})

##### Only dataset

In [142]:
weights_dataset,thresholds_dataset,prediction_dataset = process_only_dataset(training_data_nyumc,training_directory,filename_,demo_nyumc)

nyumc.csv
Got the data
Now finding coefficients for the the datasets!
Analyzing the dataset :  nyumc
Found threshold for  nyumc


In [143]:
weights_dataset

defaultdict(None, {})

In [144]:
thresholds_dataset

defaultdict(None, {'nyumc': 0.49808848602681988})

In [145]:
predictions = pd.DataFrame()
predictions['Predicted_Value_All'] = [i[1] for i in prediction_all]

In [146]:
predictions['Predicted_Value_Collection'] = [i[1] for i in prediction_collection]

In [147]:
predictions['Predicted_Value_Demographic'] = [i[1] for i in prediction_demographic]

In [148]:
predictions['Predicted_Value_Dataset'] = [i[1] for i in prediction_dataset]

In [149]:
predictions['True'] = [i[0] for i in prediction_all]

In [150]:
predictions[0:10]

Unnamed: 0,Predicted_Value_All,Predicted_Value_Collection,Predicted_Value_Demographic,Predicted_Value_Dataset,True
0,0.008524,0.004458,0.004193,0.41313,0
1,0.000149,0.000122,5.2e-05,0.363979,0
2,0.032877,0.040791,0.037608,0.497823,0
3,0.03306,0.040791,0.037858,0.497823,0
4,0.033501,0.040791,0.038458,0.497823,0
5,0.030729,0.023143,0.027188,0.469584,1
6,0.03306,0.040791,0.037858,0.497823,0
7,0.008726,0.004458,0.004312,0.41313,0
8,0.0335,0.040791,0.038457,0.497823,0
9,0.030736,0.023671,0.026957,0.469507,0


In [151]:
predictions['All_Dataset_Specific'] = weights_all['nyumc'][0][0]

In [152]:
predictions['All_Collection_Mode'] = weights_all['nyumc'][0][1]

In [153]:
# predictions['Only_Dataset'] = weights_dataset['nyumc'][0][0]

In [154]:
predictions['All_Demographic'] = weights_all['nyumc'][0][2]

In [155]:
predictions['Only_Collection'] = weights_collection['nyumc'][0][0]

In [156]:
predictions['Only_Demographic'] = weights_demographic['nyumc'][0][0]

In [157]:
predictions[0:10]

Unnamed: 0,Predicted_Value_All,Predicted_Value_Collection,Predicted_Value_Demographic,Predicted_Value_Dataset,True,All_Dataset_Specific,All_Collection_Mode,All_Demographic,Only_Collection,Only_Demographic
0,0.008524,0.004458,0.004193,0.41313,0,-13.374995,4.172228,7.069314,14.637664,8.441104
1,0.000149,0.000122,5.2e-05,0.363979,0,-13.374995,4.172228,7.069314,14.637664,8.441104
2,0.032877,0.040791,0.037608,0.497823,0,-13.374995,4.172228,7.069314,14.637664,8.441104
3,0.03306,0.040791,0.037858,0.497823,0,-13.374995,4.172228,7.069314,14.637664,8.441104
4,0.033501,0.040791,0.038458,0.497823,0,-13.374995,4.172228,7.069314,14.637664,8.441104
5,0.030729,0.023143,0.027188,0.469584,1,-13.374995,4.172228,7.069314,14.637664,8.441104
6,0.03306,0.040791,0.037858,0.497823,0,-13.374995,4.172228,7.069314,14.637664,8.441104
7,0.008726,0.004458,0.004312,0.41313,0,-13.374995,4.172228,7.069314,14.637664,8.441104
8,0.0335,0.040791,0.038457,0.497823,0,-13.374995,4.172228,7.069314,14.637664,8.441104
9,0.030736,0.023671,0.026957,0.469507,0,-13.374995,4.172228,7.069314,14.637664,8.441104


In [158]:
predictions.to_csv("../Predictions/predictions_nyumc.csv",index=False)