In [None]:
import time
import numpy as np
import pandas as pd
import os
from os.path import join, abspath, dirname, realpath
import sys
import datetime
from dateutil.relativedelta import relativedelta
from datetime import date, timedelta
from dateutil import relativedelta
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score


from dateutil.relativedelta import relativedelta

from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc
import argparse
import csv


import multiprocessing as mp
from multiprocessing import Process

from functools import partial
from skopt import gp_minimize
from skopt import space

from functions import *
from Train_bayes import *


In [None]:
def data_prep(data):
    """Cast the text columns of a raw frame to working dtypes and order the rows.

    The CSV files in this notebook are read with ``dtype='unicode'``, so every
    column arrives as text and has to be converted before modelling.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``date``, ``migraine_start``, ``order``, ``imputation``
        and the seven weather columns listed in ``weather_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``order`` and then ``date`` with a fresh
        0..n-1 index. The frame passed in is not modified.
    """
    weather_cols = ['temp_avg', 'sun_perc', 'precip_tot', 'pres_avg',
                    'cloud_avg', 'wind_avg', 'hum_avg']

    # Work on a copy so re-running the calling cell never sees half-converted input.
    prepared = data.copy()

    prepared['date'] = pd.to_datetime(prepared['date'])
    prepared['migraine_start'] = prepared['migraine_start'].astype(int)
    # 'order' and 'imputation' go through float first so text such as '3.0' parses.
    prepared['order'] = prepared['order'].astype(float).astype(int)
    prepared['imputation'] = prepared['imputation'].astype(float).astype(int)

    # Replace the columns outright: ``frame.loc[:, cols] = values`` writes in
    # place on recent pandas and would leave these columns as object dtype.
    prepared[weather_cols] = prepared[weather_cols].astype(float)

    prepared = prepared.sort_values(['order', 'date'])
    return prepared.reset_index(drop=True)

In [None]:
#returns a table with the predictions and the true values for the outcome variable
def prd2(df_p,model):
    """Predict every sequence in ``df_p`` and collect the non-imputed rows.

    Rows are grouped by ``order``; each group is sorted by ``date`` and fed to
    the model one row per batch, and the model state is reset after each group.

    Parameters
    ----------
    df_p : pandas.DataFrame
        Frame with ``order``, ``date``, ``imputation``, ``migraine_start`` and
        the seven weather columns listed in ``feature_cols``.
    model : object
        Anything exposing ``predict(X, batch_size=...)`` and ``reset_states()``.
        ``predict`` must return one value per input row.

    Returns
    -------
    tuple of numpy.ndarray
        ``(predictions, true_values)`` restricted to rows whose ``imputation``
        flag is falsy, in group order. Both arrays are empty when ``df_p`` has
        no rows.
    """
    feature_cols = ['temp_avg', 'sun_perc', 'precip_tot', 'pres_avg',
                    'cloud_avg', 'wind_avg', 'hum_avg']
    predictions = []
    true_values = []

    # Process every sequence individually.
    for o in np.unique(df_p['order']):
        df_batch = (df_p[df_p['order'] == o]
                    .sort_values(by='date')
                    .reset_index(drop=True))

        # Shape (rows, 1, features): one time step per batch for the recurrent model.
        X = df_batch[feature_cols].to_numpy(dtype=float)
        X = X.reshape(X.shape[0], 1, X.shape[1])

        pred = model.predict(X, batch_size=1)
        # Reset state so this sequence does not leak into the next one.
        model.reset_states()
        pred = np.asarray(pred).reshape(-1)

        # Keep only the rows that were actually observed (imputation flag == 0).
        observed = ~df_batch['imputation'].to_numpy().astype(bool)
        predictions.append(pred[observed])
        true_values.append(df_batch['migraine_start'].to_numpy()[observed])

    if not predictions:
        return (np.asarray([]), np.asarray([]))

    # No metrics are computed here: the original calculated PR-AUC and ROC-AUC
    # and discarded them, which could only raise on degenerate inputs.
    return (np.concatenate(predictions), np.concatenate(true_values))

In [None]:
#creation of the model given the parameters and the cell type
def modelcreation_bayes2(params,cell_type):
    """Build and compile a stacked stateful recurrent binary classifier.

    Parameters
    ----------
    params : sequence of three values
        ``(units, layers, dropout)``. ``dropout`` is multiplied by 0.1 to get
        the Dropout rate; ``0`` disables the Dropout layers.
    cell_type : str
        ``'LSTM'`` or ``'GRU'``.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with Adam, binary cross-entropy and accuracy.

    Raises
    ------
    ValueError
        If ``cell_type`` is not ``'LSTM'`` or ``'GRU'``. Previously such a value
        silently produced a network with no recurrent layer.
    """
    units, layers, dropout = params

    # Validate before building anything so a typo fails fast and loudly.
    if cell_type not in ('LSTM', 'GRU'):
        raise ValueError(
            f"cell_type must be 'LSTM' or 'GRU', got {cell_type!r}")
    recurrent_layer = (tf.keras.layers.LSTM if cell_type == 'LSTM'
                       else tf.keras.layers.GRU)

    # Plain ints keep Keras happy when the values arrive as NumPy integers.
    units = int(units)
    layers = int(layers)

    new_model = tf.keras.Sequential()
    # Add the recurrent layers one by one, each optionally followed by Dropout.
    for _ in range(layers):
        new_model.add(recurrent_layer(units, activation='tanh', return_sequences=True,
                                      stateful=True, batch_input_shape=(1,1,7)))
        if dropout != 0:
            new_model.add(tf.keras.layers.Dropout(dropout*0.1))

    # Final layer producing the outcome probability.
    new_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    opt = tf.keras.optimizers.Adam()
    new_model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

    return new_model

In [None]:
#training of the model and then it also makes predictions 

def fit_lstm2(df,df_test,params,cell_type,fold=None,epochs=1):
    """Train one model sequence by sequence, then predict on train and test.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame with ``order``, ``date``, ``migraine_start``,
        ``imputation`` and the seven weather columns in ``feature_cols``.
    df_test : pandas.DataFrame
        Test frame with the same columns.
    params : sequence
        Passed unchanged to ``modelcreation_bayes2``.
    cell_type : str
        Passed unchanged to ``modelcreation_bayes2``.
    fold : optional
        Accepted for interface compatibility; not used by this function.
    epochs : int, default 1
        Number of passes over all training sequences.

    Returns
    -------
    tuple
        ``(prd2(df, model), prd2(df_test, model))``, each a
        ``(predictions, true_values)`` pair.

    Notes
    -----
    If module-level ``train_scores`` / ``test_scores`` mappings exist, the
    results are also stored in them under ``'Train:'`` / ``'Test:'``.
    """
    feature_cols = ['temp_avg', 'sun_perc', 'precip_tot', 'pres_avg',
                    'cloud_avg', 'wind_avg', 'hum_avg']
    batch_size = 1

    # The classes are re-weighted with sklearn's 'balanced' scheme, computed
    # over the whole training frame.
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.asarray([0,1]),
                                                      y=df.loc[:,'migraine_start'])
    weights = {label: class_weights[label] for label in range(2)}

    orders = np.unique(df['order'])
    model = modelcreation_bayes2(params, cell_type=cell_type)

    for _ in range(int(epochs)):
        # Every sequence is fitted on its own, in date order.
        for o in orders:
            df_batch = (df[df['order'] == o]
                        .sort_values(by='date')
                        .reset_index(drop=True))

            X = df_batch[feature_cols].to_numpy(dtype=float)
            X = X.reshape(X.shape[0], 1, X.shape[1])
            Y = df_batch[['migraine_start']].to_numpy()

            # Fit the same model on each sequence without shuffling the rows.
            model.fit(X, Y, epochs=1, batch_size=batch_size, shuffle=False,
                      class_weight=weights, verbose=0)
            # Reset state so one sequence does not influence the next.
            model.reset_states()

    # Run each prediction pass once; the original ran prd2 twice per split.
    train_result = prd2(df, model=model)
    test_result = prd2(df_test, model=model)

    # Publish to the shared score dicts only when the driver cell created them,
    # so the function also works when those globals are absent.
    for store_name, key, result in (('train_scores', 'Train:', train_result),
                                    ('test_scores', 'Test:', test_result)):
        store = globals().get(store_name)
        if store is not None:
            store[key] = result

    return (train_result, test_result)



In [None]:
def main(args,cell_type):
    """Load the train/test CSVs, standardise the weather features and train.

    Parameters
    ----------
    args : object
        Must expose ``params`` (forwarded to ``fit_lstm2``). If it exposes
        ``epochs`` that value is used for training; otherwise 1 epoch is run.
    cell_type : str
        Forwarded to ``fit_lstm2``.

    Returns
    -------
    tuple
        ``(train_values, test_values)`` exactly as returned by ``fit_lstm2``.

    Notes
    -----
    Reads ``train_plus.csv`` and ``test_plus.csv`` from the working directory
    and prints the elapsed training time in whole seconds.
    """
    feature_cols = ['temp_avg', 'sun_perc', 'precip_tot', 'pres_avg',
                    'cloud_avg', 'wind_avg', 'hum_avg']

    train = data_prep(
        pd.read_csv('train_plus.csv', sep=',', index_col=False, dtype='unicode'))
    test = data_prep(
        pd.read_csv('test_plus.csv', sep=',', index_col=False, dtype='unicode'))

    # Scale by column name rather than by position (the original used
    # ``iloc[:, 3:-2]``). NOTE(review): this assumes that slice covered exactly
    # these seven weather columns -- confirm against the CSV layout.
    # The scaler is fitted on train only and then applied to test.
    scaler = StandardScaler()
    train[feature_cols] = scaler.fit_transform(train[feature_cols].astype(float))
    test[feature_cols] = scaler.transform(test[feature_cols].astype(float))

    start = time.time()

    # Honour the configured epoch count; it used to be silently ignored.
    train_values, test_values = fit_lstm2(train, test, args.params,
                                          cell_type=cell_type,
                                          epochs=getattr(args, 'epochs', 1))

    end = time.time()

    print("Time:"+str(int(end - start)))
    return (train_values, test_values)

In [None]:
if __name__=='__main__':
    # Driver cell: seed the random generators, train one model and write its
    # train/test predictions to disk.
    import random

    # A single manager process serves all three shared dicts; the original
    # started a separate manager (and server process) for each one.
    manager = mp.Manager()
    train_scores = manager.dict()
    test_scores = manager.dict()
    patient_scores = manager.dict()

    # When running as a .py file, parse the arguments instead of using the
    # inline ``args`` class below:
    #args=parse_args()

    # Seed every generator involved, not only Python's ``random``.
    SEED = 441995
    random.seed(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    class args:
        epochs=10
        # LSTM hyper-parameters (units, layers, dropout step)
        params=39, 4, 4
        # GRU hyper-parameters: use this line instead of the one above
        #params=10, 4, 2

    #FOR LSTM
    train_values,test_values=main(args,cell_type='LSTM')
    (pd.DataFrame(train_values).T).to_csv('performance_lstm_train.zip', index=False)
    (pd.DataFrame(test_values).T).to_csv('performance_lstm_test.zip', index=False)

    # FOR GRU: uncomment the lines below (and the GRU params above)
    #train_values,test_values=main(args,cell_type='GRU')
    #(pd.DataFrame(train_values).T).to_csv('performance_gru_train.zip', index=False)
    #(pd.DataFrame(test_values).T).to_csv('performance_gru_test.zip', index=False)
    

In [None]:
# Evaluation cell: reload the saved predictions, report the area under the
# precision-recall curve, and compare with the positive-class rate.

# LSTM results (column '0' holds the predictions, column '1' the true labels)
train_values_p = pd.read_csv('performance_lstm_train.zip', sep=',', index_col=False, dtype='unicode')
test_values_p = pd.read_csv('performance_lstm_test.zip', sep=',', index_col=False, dtype='unicode')

# GRU results: uncomment these two lines to evaluate the GRU run instead
#train_values_p = pd.read_csv('performance_gru_train.zip', sep=',', index_col=False, dtype='unicode')
#test_values_p = pd.read_csv('performance_gru_test.zip', sep=',', index_col=False, dtype='unicode')

# Everything was read as text: predictions become floats, labels become ints
# (going through float first).
train_values_p = train_values_p.astype({'1': float, '0': float}).astype({'1': int})
test_values_p = test_values_p.astype({'1': float, '0': float}).astype({'1': int})
train_values_p.columns = ['pred', 'real']
test_values_p.columns = ['pred', 'real']

# Area under the precision-recall curve on the training predictions
precision, recall, thresholds = precision_recall_curve(
    train_values_p['real'].to_numpy(),
    train_values_p['pred'].to_numpy(),
)
auc_pr_train = auc(recall, precision)

# ... and on the test predictions
precision, recall, thresholds = precision_recall_curve(
    test_values_p['real'].to_numpy(),
    test_values_p['pred'].to_numpy(),
)
auc_pr_test = auc(recall, precision)
print(f'AUC PR: Test={round(auc_pr_test,3)}, Train={round(auc_pr_train,3)}')


# Reload the modelling data to compute the baselines.
# Training data: train_plus.csv
train = data_prep(pd.read_csv('train_plus.csv', sep=',', index_col=False, dtype='unicode'))

# Testing data: test_plus.csv
test = data_prep(pd.read_csv('test_plus.csv', sep=',', index_col=False, dtype='unicode'))

# Baseline = share of migraine-start days among the non-imputed rows.
real_train = train[train['imputation'] == 0]
pr_train = int((real_train['migraine_start'] == 1).sum()) / len(real_train)
real_test = test[test['imputation'] == 0]
pr_test = int((real_test['migraine_start'] == 1).sum()) / len(real_test)

print(f'Baseline Train AUC PR: {round(pr_train,3)}')
print(f'Baseline Test AUC PR: {round(pr_test,3)}')
