# Exploration Notebook

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Data Preprocessing

In [None]:
# Load the symptoms, data and vax tables for each of the three report years.
# All nine files use the same options: VAERS_ID becomes the index and the
# text is decoded as latin-1.
csv_options = {'index_col': ['VAERS_ID'], 'encoding': 'latin-1'}

symptoms19 = pd.read_csv('../data/2019/symptoms19.csv', **csv_options)
data19     = pd.read_csv('../data/2019/data19.csv', **csv_options)
vax19      = pd.read_csv('../data/2019/vax19.csv', **csv_options)

symptoms20 = pd.read_csv('../data/2020/symptoms20.csv', **csv_options)
data20     = pd.read_csv('../data/2020/data20.csv', **csv_options)
vax20      = pd.read_csv('../data/2020/vax20.csv', **csv_options)

symptoms21 = pd.read_csv('../data/2021/symptoms21.csv', **csv_options)
data21     = pd.read_csv('../data/2021/data21.csv', **csv_options)
vax21      = pd.read_csv('../data/2021/vax21.csv', **csv_options)

In [None]:
# Sanity check: count the 2019 vax rows whose VAX_TYPE is 'COVID19'.
# There were no Covid-19 vaccinations until 2020.
vax19['VAX_TYPE'].eq('COVID19').sum()

In [None]:
# Stack the 2020 and 2021 tables (the years containing covid vaccinations).
# pd.concat keeps each row's VAERS_ID index label.
combined_symptoms = pd.concat([symptoms20, symptoms21])
combined_data = pd.concat([data20, data21])
combined_vax = pd.concat([vax20, vax21])

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Preview the vaccination rows recorded with VAX_TYPE 'COVID19'.
is_covid_vax = combined_vax['VAX_TYPE'] == 'COVID19'
combined_vax[is_covid_vax]

In [None]:
# Attach the report-level data to every vaccination row (the right join keeps
# all vax rows), then attach the symptom rows for each report (left join).
# NOTE(review): if a VAERS_ID has several vax or symptom rows, these joins
# repeat the report once per combination — confirm that is intended.
datavax = combined_data.merge(combined_vax, on='VAERS_ID', how='right')
dvs = datavax.merge(combined_symptoms, on='VAERS_ID', how='left')

In [None]:
# Isolate the covid-19 vaccination rows as the base dataframe.
# .copy() makes df an independent frame: it is modified in place and assigned
# to column-by-column further down, which on a filtered slice of dvs triggers
# pandas' SettingWithCopyWarning and can leave the edits on a temporary.
df = dvs[dvs['VAX_TYPE'] == 'COVID19'].copy()
df.head(4)

In [None]:
# Set the free-text columns aside in their own frame to deal with later.
text_columns = ['SYMPTOM_TEXT', 'LAB_DATA', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'ALLERGIES',
                'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
df_text_cols = df[text_columns]

In [None]:
# Number of rows in the COVID19 subset.
len(df)

In [None]:
# Number of rows that would remain after removing fully identical rows.
len(df.drop_duplicates())

In [None]:
# Remove fully identical rows. Reassigning (instead of inplace=True) gives a
# fresh frame, so the edit never lands on a slice of dvs and does not raise
# SettingWithCopyWarning.
df = df.drop_duplicates()

In [None]:
# Let's drop unnecessary columns
# Let's move this until after EDA is completed
# df.drop(axis=1, inplace=True, columns=['CAGE_YR', 'CAGE_MO', 'RPT_DATE', 'ER_VISIT','V_FUNDBY', 'SPLTTYPE', 'FORM_VERS',
#                                        'TODAYS_DATE','OFC_VISIT', 'ER_ED_VISIT', 'VAX_TYPE', 'VAX_NAME', 'VAX_LOT',
#                                        'SYMPTOM_TEXT','LAB_DATA','OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'ALLERGIES',
#                                        'SYMPTOM1', 'SYMPTOM2','SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5', 'VAX_DATE', 'ONSET_DATE',
#                                       'PRIOR_VAX', 'DATEDIED'])

In [None]:
# Outcome flags become 0/1: a missing value turns into 0 and 'Y' into 1.
for flag in ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']:
    df[flag] = df[flag].fillna(0).replace('Y', 1)

# SEX is recoded to the strings '0'/'1' (it is cast to int in a later cell).
# NOTE(review): 'U' shares the code '0' with 'F' — confirm that is intended.
df['SEX'] = df['SEX'].replace({'U': '0', 'F': '0', 'M': '1'})

df['HOSPDAYS'] = df['HOSPDAYS'].fillna(0)

# RECOVD: only 'Y' maps to 1; missing, 'U' and 'N' all map to 0.
df['RECOVD'] = df['RECOVD'].fillna(0).replace({'U': 0, 'N': 0, 'Y': 1})

# VAX_DOSE_SERIES: missing -> 0, '7+' -> 7, 'UNK' -> 1.
df['VAX_DOSE_SERIES'] = df['VAX_DOSE_SERIES'].fillna(0).replace({'7+': 7, 'UNK': 1})

# Missing symptom-version entries are filled with 0.
for version_col in ['SYMPTOMVERSION2', 'SYMPTOMVERSION3', 'SYMPTOMVERSION4', 'SYMPTOMVERSION5']:
    df[version_col] = df[version_col].fillna(0)

In [None]:
# One-hot encode the categorical vaccine columns. Each source column is
# replaced by its dummy columns, named '<prefix>_<value>'.
dummy_specs = [('VAX_MANU', 'BRAND: '), ('VAX_SITE', 'VAX_SITE: '),
               ('VAX_ROUTE', 'VAX_ROUTE: '), ('V_ADMINBY', 'ADMINBY: ')]
for source_col, prefix in dummy_specs:
    dummies = pd.get_dummies(df[source_col], prefix=prefix)
    df = pd.concat([df, dummies], axis=1).drop([source_col], axis=1)

In [None]:
# Missing ages are filled with a fixed value, chosen according to the average.
AGE_FILL_VALUE = 50
df['AGE_YRS'] = df['AGE_YRS'].fillna(AGE_FILL_VALUE)

In [None]:
# Keep NUMDAYS values below 120 and replace everything else with 7.
# A missing value fails the `< 120` comparison, so it is replaced with 7 too.
lmno = df['NUMDAYS']
below_cutoff = lmno < 120
df['NUMDAYS'] = lmno.where(below_cutoff, 7)

In [None]:
# Cast the numeric feature columns to int.
for int_col in ['SEX', 'AGE_YRS', 'HOSPDAYS', 'NUMDAYS', 'VAX_DOSE_SERIES']:
    df[int_col] = df[int_col].astype(int)

# The symptom-version columns are stored as categoricals.
for version_col in ['SYMPTOMVERSION1', 'SYMPTOMVERSION2', 'SYMPTOMVERSION3',
                    'SYMPTOMVERSION4', 'SYMPTOMVERSION5']:
    df[version_col] = df[version_col].astype('category')

# labelbinarizer on STATE

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fold the listed codes into a single 'OTH' bucket. The result is assigned
# back to the column: calling replace(..., inplace=True) on df['STATE'] acts
# on an intermediate Series and is not guaranteed to update df itself.
df['STATE'] = df['STATE'].replace(['AS', 'VI', 'MP', 'Ca', 'XB', 'FM', 'MH', 'GU'], 'OTH')
df['STATE'] = df['STATE'].fillna('N/A')

# Replace each state label with an integer code.
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(df['STATE'])
df['STATE'] = label_encoder.transform(df['STATE'])

## ALLERGIES

In [None]:
# Lower-case the free text so the exact-match replacements below are case-insensitive.
df['ALLERGIES'] = df['ALLERGIES'].str.lower()

In [None]:
# Spellings of "no allergies" that are collapsed into the single token 'none'.
nonelist = [
    'no', 'no known allergies', 'unknown', 'none known', 'n/a', 'none reported', 'na', 'none.',
    'no known drug allergies', 'no allergies', 'na', 'no known', 'no known allergies.', 'none listed',
    'unk', 'none known.',
]

# Missing -> 'none', followed by exact-match replacements.
# NOTE(review): replace() is called without regex=True, so 'penicillin|sulfa'
# only matches cells equal to that literal text — confirm that is the intent.
allergies = df['ALLERGIES'].fillna('none')
allergies = allergies.replace('penicillin|sulfa', 'penicillin')
df['ALLERGIES'] = allergies.replace(nonelist, 'none')

In [None]:
# Entries containing a comma become a list of stripped parts; entries without
# a comma are kept unchanged as plain strings.
allall = [
    [weach.strip() for weach in each.split(',')] if ',' in each else each
    for each in df['ALLERGIES']
]

In [None]:
# Re-join the split entries with '|' so each row is one delimited string again.
allall2 = ["|".join(each) if type(each) == list else each for each in allall]

In [None]:
# Keep only values that occur more than 30 times; rarer values fall back to 'none'.
value_freq = pd.Series(allall2).value_counts()
listy = list(value_freq[value_freq > 30].index)
allall3 = ['none' if x not in listy else x for x in allall2]

In [None]:
# One indicator column per '|'-separated token ('|' is also the default separator).
datufrayme = pd.Series(allall3).str.get_dummies(sep='|')

In [None]:
# str.get_dummies splits on '|', so an entry such as 'penicillin|sulfa' never
# exists as a column of datufrayme. Keep only the names that are real columns:
# removing one hard-coded entry raises ValueError whenever that entry is not
# in the list and misses any other '|' entry that passes the threshold.
listy = [name for name in listy if name in datufrayme.columns]

In [None]:
# del listy[31]

In [None]:
# Move the current index into a column and renumber the rows from 0, so df
# lines up row-for-row with the default-indexed dummy frame.
df = df.reset_index()

In [None]:
# Append the selected allergy indicator columns; rows are matched on the index.
allergy_dummies = datufrayme.loc[:, listy]
df = df.join(allergy_dummies)

In [None]:
# Display the frame after the allergy indicator columns were joined.
df

In [None]:
# Summary statistics of the numeric columns.
df.describe()

## ALLERGIES
## CURRENT ILLNESS

In [None]:
# Lower-case the free text so the exact-match replacements below are case-insensitive.
df['CUR_ILL'] = df['CUR_ILL'].str.lower()

In [None]:
# Spellings of "no current illness" that are collapsed into the token 'none'.
nonelist = [
    'no', 'unknown', 'none.', 'none reported', 'n/a', 'na', 'none known', 'denies', 'none noted', '0',
    'no illness', 'none listed', 'not known', 'no known', 'non', 'no acute illnesses', 'no.', 'denied',
    'see below', 'no illnesses', 'unk', 'unkown', 'none documented', 'none stated', 'nothing',
    'none known.', 'unknown.', 'no known illnesses', 'n/a.', 'no e', 'none reported.', 'no acute illness',
]
# Variants that are unified under the single label 'covid-19'.
covid_variants = ['covid 19', 'covid', 'covid- 19 diagnosis 12/11/2020 asymptomatic',
                  'covid-19 (diagnosed 10/26/20)', 'covid-19  (diagnosed 10/26/20)']

# Missing -> 'none', followed by exact-match replacements.
cur_ill = df['CUR_ILL'].fillna('none')
cur_ill = cur_ill.replace(nonelist, 'none')
df['CUR_ILL'] = cur_ill.replace(covid_variants, 'covid-19')

In [None]:
# Entries containing a comma become a list of stripped parts; entries without
# a comma are kept unchanged as plain strings.
allall = [
    [weach.strip() for weach in each.split(',')] if ',' in each else each
    for each in df['CUR_ILL']
]

# Re-join the split entries with '|' so each row is one delimited string again.
allall2 = ["|".join(each) if isinstance(each, list) else each for each in allall]

# Keep only values that occur more than 13 times; rarer values become 'none'.
value_freq = pd.Series(allall2).value_counts()
listy = list(value_freq[value_freq > 13].index)
frequent = set(listy)
allall3 = [x if x in frequent else 'none' for x in allall2]

# One indicator column per '|'-separated token.
datufrayme = pd.Series(allall3).str.get_dummies()

# Multi-token entries (anything containing '|') are split by get_dummies and so
# are not columns of datufrayme. Filtering on the real columns replaces the
# hard-coded listy.remove(...) calls, which raise ValueError as soon as the
# data no longer contains exactly those strings.
listy = [name for name in listy if name in datufrayme.columns]

# df already carries a 0..n-1 index from the earlier reset_index, so the rows
# line up with datufrayme. rsuffix tags the INCOMING columns on a name clash
# (e.g. 'none'); lsuffix would instead rename the columns already in df and
# leave the new current-illness column under the old, unsuffixed name.
df = df.join(datufrayme[listy], rsuffix=" cur_ill")

In [None]:
# Summary statistics after the current-illness indicator columns were joined.
df.describe()

# HISTORY

In [None]:
# Lower-case the free text so the exact-match replacements below are case-insensitive.
df['HISTORY'] = df['HISTORY'].str.lower()

In [None]:
# Spellings of "no medical history" that are collapsed into the token 'none'.
nonelist = [
    'no', 'unknown', 'none.', 'none reported', 'n/a', 'na', 'none known', 'denies', 'none noted', '0',
    'no illness', 'none listed', 'not known', 'no known', 'non', 'no acute illnesses', 'no.', 'denied',
    'see below', 'no illnesses', 'unk', 'unkown', 'none documented', 'none stated', 'nothing',
    'none known.', 'unknown.', 'as above', 'no known illnesses', 'n/a.', 'no e', 'none reported.',
    'medical history/concurrent conditions: no adverse event (no reported medical history)',
    'medical history/concurrent conditions: no adverse event (no reported medical history.)',
    'see above',
    'medical history/concurrent conditions: no adverse event',
    'medical history/concurrent conditions: no adverse event (no medical history reported.)',
    'medical history/concurrent conditions: no adverse event (no medical history reported)',
    'medical history/concurrent conditions: no adverse event (medical history not provided)',
    'comments: list of non-encoded patient relevant history: patient other relevant history 1: none',
]

# Exact-match rewrites, applied in this order after the 'none' collapse.
history_rewrites = [
    ('medical history/concurrent conditions: covid-19', 'covid-19'),
    ('medical history/concurrent conditions: hypertension', 'hypertension'),
    ('medical history/concurrent conditions: penicillin allergy', 'penicillin allergy'),
    (['medical history/concurrent conditions: asthma', 'mild asthma', 'exercise induced asthma'], 'asthma'),
    ('medical history/concurrent conditions: blood pressure high', 'high blood pressure'),
    ('medical history/concurrent conditions: sulfonamide allergy', 'sulfonamide allergy'),
    (['diabetic', 'type 2 diabetes', 'type 1 diabetes'], 'diabetes'),
    ('medical history/concurrent conditions: migraine', 'migraines'),
]

df['HISTORY'] = df['HISTORY'].fillna('none')
df['HISTORY'] = df['HISTORY'].replace(nonelist, 'none')
for old_value, new_value in history_rewrites:
    df['HISTORY'] = df['HISTORY'].replace(old_value, new_value)


In [None]:
# Entries containing a comma become a list of stripped parts; entries without
# a comma are kept unchanged as plain strings.
allall = [
    [weach.strip() for weach in each.split(',')] if ',' in each else each
    for each in df['HISTORY']
]

# Re-join the split entries with '|' so each row is one delimited string again.
allall2 = ["|".join(each) if isinstance(each, list) else each for each in allall]

# Keep only values that occur more than 40 times; rarer values become 'none'.
value_freq = pd.Series(allall2).value_counts()
listy = list(value_freq[value_freq > 40].index)
frequent = set(listy)
allall3 = [x if x in frequent else 'none' for x in allall2]

# One indicator column per '|'-separated token.
datufrayme = pd.Series(allall3).str.get_dummies()

# Multi-token entries (anything containing '|') are split by get_dummies and so
# are not columns of datufrayme. Filtering on the real columns replaces the
# hard-coded listy.remove(...) call, which raises ValueError as soon as the
# data no longer contains exactly that string.
listy = [name for name in listy if name in datufrayme.columns]

# df already carries a 0..n-1 index from the earlier reset_index, so the rows
# line up with datufrayme. rsuffix tags the INCOMING columns on a name clash
# (e.g. 'none'); lsuffix would instead rename the columns already in df and
# leave the new history column under the old, unsuffixed name.
df = df.join(datufrayme[listy], rsuffix=" history")

In [None]:
# Summary statistics after the history indicator columns were joined.
df.describe()

In [None]:
# import gc
# gc.collect()

# OTHER_MEDS

In [None]:
# Lower-case the free text so the exact-match replacements below are case-insensitive.
df['OTHER_MEDS'] = df['OTHER_MEDS'].str.lower()

In [None]:
# Spellings of "no other medication" that are collapsed into the token 'none'.
nonelist = [
    'unknown', 'no', 'none.', 'n/a', 'none reported', 'unk', 'none known', ';', 'not known', 'na',
    'denies', ';  ;', 'nothing',
]

# Missing -> 'none', followed by the exact-match collapse.
other_meds = df['OTHER_MEDS'].fillna('none')
df['OTHER_MEDS'] = other_meds.replace(nonelist, 'none')

In [None]:
# Entries containing a comma become a list of stripped parts; entries without
# a comma are kept unchanged as plain strings.
allall = [
    [weach.strip() for weach in each.split(',')] if ',' in each else each
    for each in df['OTHER_MEDS']
]

# Re-join the split entries with '|' so each row is one delimited string again.
allall2 = ["|".join(each) if isinstance(each, list) else each for each in allall]

# Keep only values that occur more than 20 times; rarer values become 'none'.
value_freq = pd.Series(allall2).value_counts()
listy = list(value_freq[value_freq > 20].index)
frequent = set(listy)
allall3 = [x if x in frequent else 'none' for x in allall2]

# One indicator column per '|'-separated token.
datufrayme = pd.Series(allall3).str.get_dummies()

# Multi-token entries (anything containing '|') are split by get_dummies and so
# are not columns of datufrayme; selecting them would raise KeyError. Keep only
# the names that are real columns.
listy = [name for name in listy if name in datufrayme.columns]

# df already carries a 0..n-1 index from the earlier reset_index, so the rows
# line up with datufrayme. rsuffix tags the INCOMING columns on a name clash
# (e.g. 'none'); lsuffix would instead rename the columns already in df and
# leave the new medication column under the old, unsuffixed name.
df = df.join(datufrayme[listy], rsuffix=" meds")

In [None]:
# Display the frame after all four free-text fields were encoded.
df

# EDA

In [None]:
# Column dtypes, non-null counts and memory footprint.
df.info()

In [None]:
# Histogram of the encoded SEX column, one bin per code.
sex_values = df['SEX']
fig5, ax5 = plt.subplots(figsize=(8, 8))
sns.histplot(sex_values, bins=2, ax=ax5)

In [None]:
# Pool the five symptom columns into one long Series, column after column.
symplist = [df[name].values for name in ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']]
sl = [weach for each in symplist for weach in each]
dfsl = pd.Series(sl)

In [None]:
# Display the pooled symptom Series.
dfsl

In [None]:
# How often each symptom term occurs across SYMPTOM1..SYMPTOM5.
vcsym = dfsl.value_counts()
vcsym

In [None]:
# Keep the symptoms reported more than 1000 times and collect their names as
# plain strings for use as plot labels.
vcsym = vcsym[vcsym > 1000]
keys = [str(each) for each in vcsym.keys()]

In [None]:
# X holds the symptom names (labels), Y the matching counts (bar lengths).
X = keys
Y = vcsym.values

In [None]:
# Horizontal bar chart of the most frequent symptoms: counts on the x axis,
# symptom names on the y axis. x and y are passed by keyword because current
# seaborn releases no longer accept them positionally.
fig, ax = plt.subplots(figsize=(8,25))
sns.barplot(x=Y, y=X, orient='h', ax=ax)

In [None]:
# Horizontal bar chart of the number of rows per (label-encoded) state.
state_counts = df['STATE'].value_counts()
fig2, ax2 = plt.subplots(figsize=(8,30))
sns.barplot(x=state_counts.values, y=state_counts.keys(), orient='h', ax=ax2)

In [None]:
# Age distribution (missing ages were filled with 50 earlier in the notebook).
sns.histplot(df['AGE_YRS'])

In [None]:
# DIED is encoded as 0/1, so the sum is the number of rows flagged as died.
df['DIED'].sum()

In [None]:
# Horizontal bar chart of the number of rows per vaccine name.
vax_name_counts = df['VAX_NAME'].value_counts()
fig3, ax3 = plt.subplots(figsize=(8,8))
sns.barplot(x=vax_name_counts.values, y=vax_name_counts.keys(), orient='h', ax=ax3)

In [None]:
# Print the number of DIED == 1 rows for each vaccine name, most common name first.
for vax_name in df['VAX_NAME'].value_counts().index:
    died_rows = df[(df['VAX_NAME'] == vax_name) & (df['DIED'] == 1)]
    print(len(died_rows))

In [None]:
# Vaccine names in the same order as the counts printed by the loop above.
df['VAX_NAME'].value_counts().index

In [None]:
# Correlations are only defined for numeric data. df still holds free-text and
# date columns here, and pandas >= 2.0 raises on them instead of silently
# skipping them, so restrict the frame to numeric/bool columns first.
df.select_dtypes(include=['number', 'bool']).corr().head(6)

In [None]:
# Heatmap of the pairwise correlations. The frame is restricted to numeric/bool
# columns first: df still holds free-text columns here and pandas >= 2.0
# raises on them instead of silently skipping them.
numeric_corr = df.select_dtypes(include=['number', 'bool']).corr()
fig4, ax4 = plt.subplots(figsize =(35,30))
sns.heatmap(numeric_corr, ax=ax4)

In [None]:
# One sub-frame per manufacturer, selected through its one-hot brand column.
dfjan = df.loc[df['BRAND: _JANSSEN'] == 1]
dfmod = df.loc[df['BRAND: _MODERNA'] == 1]
dfpfi = df.loc[df['BRAND: _PFIZER\\BIONTECH'] == 1]

In [None]:
# Summary statistics for the JANSSEN rows.
dfjan.describe()

In [None]:
# Summary statistics for the MODERNA rows.
dfmod.describe()

In [None]:
# Summary statistics for the PFIZER\BIONTECH rows.
dfpfi.describe()

In [None]:
# EDA is done: drop the date, free-text, symptom and other columns that are
# not used as model features.
unused_columns = ['CAGE_YR', 'CAGE_MO', 'RPT_DATE', 'ER_VISIT','V_FUNDBY', 'SPLTTYPE', 'FORM_VERS',
                  'TODAYS_DATE','OFC_VISIT', 'ER_ED_VISIT', 'VAX_TYPE', 'VAX_NAME', 'VAX_LOT',
                  'SYMPTOM_TEXT','LAB_DATA','OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'ALLERGIES',
                  'SYMPTOM1', 'SYMPTOM2','SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5', 'VAX_DATE', 'ONSET_DATE',
                  'PRIOR_VAX', 'DATEDIED', 'SYMPTOMVERSION1', 'SYMPTOMVERSION2', 'SYMPTOMVERSION3',
                  'SYMPTOMVERSION4', 'SYMPTOMVERSION5']
df = df.drop(columns=unused_columns)

In [None]:
# Names of the columns that remain after the drop above.
allcols = list(df.columns)
allcols

In [None]:
# Cast the outcome/feature columns and every one-hot dummy column to int.
# NOTE(review): the original hand-written column list was cut off in the middle
# of a string (a syntax error), so its tail is unknown. The dummy columns are
# collected by prefix instead — confirm this covers every column that the
# original list was meant to name.
base_cols = ['AGE_YRS', 'SEX', 'DIED', 'L_THREAT', 'HOSPITAL', 'HOSPDAYS', 'X_STAY', 'DISABLE',
             'RECOVD', 'NUMDAYS', 'BIRTH_DEFECT', 'VAX_DOSE_SERIES']
# Prefixes produced by the get_dummies calls ('<prefix>' + '_' + value).
dummy_prefixes = ('BRAND: _', 'VAX_SITE: _', 'VAX_ROUTE: _', 'ADMINBY: _')
cols = base_cols + [col for col in df.columns if str(col).startswith(dummy_prefixes)]
df[cols] = df[cols].astype(int)

# Model

In [None]:
# Index by report id and remove everything that must not be a model feature:
# dates, free text, and the other outcome columns (DIED, L_THREAT, HOSPDAYS, ...).
df = df.set_index('VAERS_ID')
# errors='ignore': many of these columns were already dropped at the end of
# the EDA section, and a plain drop raises KeyError for names no longer present.
df = df.drop(columns = ['RECVDATE', 'CAGE_MO', 'CAGE_YR', 'RPT_DATE', 'SYMPTOM_TEXT', 'DIED',
                        'DATEDIED', 'L_THREAT', 'ER_VISIT', 'HOSPDAYS', 'X_STAY', 'RECOVD', 'VAX_DATE', 'ONSET_DATE',
                        'NUMDAYS', 'LAB_DATA', 'V_FUNDBY', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'SPLTTYPE',
                        'FORM_VERS', 'TODAYS_DATE', 'OFC_VISIT', 'ER_ED_VISIT', 'ALLERGIES', 'VAX_TYPE', 'VAX_LOT',
                        'SYMPTOM1', 'SYMPTOM2','SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5','SYMPTOMVERSION1', 'SYMPTOMVERSION2',
                        'SYMPTOMVERSION3','SYMPTOMVERSION4', 'SYMPTOMVERSION5', 'VAX_NAME'],
             errors='ignore')

In [None]:
# Every remaining column becomes int; STATE is then marked as categorical.
df = df.astype(int).astype({'STATE': 'category'})

In [None]:
# Final list of model columns (features plus the HOSPITAL target).
list(df.columns)

In [None]:
# The report id is not a feature: move it out of the index and discard it.
df = df.reset_index().drop(columns='VAERS_ID')
df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn import metrics
# Features are every column except the HOSPITAL target.
x = df.drop(columns = ['HOSPITAL'])
y = df['HOSPITAL']

# A fixed seed makes both splits repeatable, so metrics from different runs of
# the notebook are computed on the same rows.
SPLIT_SEED = 42
# 80/20 train/test, then 80/20 train/validation on the remaining training part.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = .2, random_state=SPLIT_SEED)
xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, test_size = .2, random_state=SPLIT_SEED)

In [None]:
# Check the dtypes and size of the training features.
xtrain.info()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report

In [None]:
# Baseline model: a 200-tree random forest. The fixed random_state makes the
# fitted forest (and every metric derived from it) repeatable across runs.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(xtrain, ytrain)

In [None]:
# Random-forest predictions for the validation split.
ypred = rfc.predict(xval)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the random forest on the validation split.
ConfusionMatrixDisplay.from_estimator(rfc, xval, yval)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the random forest on the validation split.
ConfusionMatrixDisplay.from_estimator(rfc, xval, yval)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the random forest on the validation split.
ConfusionMatrixDisplay.from_estimator(rfc, xval, yval)

# Lazy Predict

In [None]:
!pip install lazypredict

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [None]:
# Fit LazyClassifier on the training split and score it on the validation split.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(xtrain, xval, ytrain, yval)

print(models)

# XGBoost

In [1]:
# Every column becomes int; STATE is then marked as categorical.
df = df.astype(int).astype({'STATE': 'category'})

NameError: name 'df' is not defined

In [None]:
# Confirm the dtypes before building the XGBoost matrices.
df.info()

In [2]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import GaussianNB
# from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, mean_squared_error
from bayes_opt import BayesianOptimization
import xgboost as xgb

In [None]:
# STATE is a pandas categorical column, and DMatrix refuses categorical data
# unless enable_categorical=True — so the flag is needed on all three matrices,
# not only on the training one.
# NOTE(review): missing=0 makes XGBoost treat every 0 as a missing value —
# confirm this is intended for the 0/1 indicator columns.
dtrain = xgb.DMatrix(xtrain, label=ytrain, missing=0, enable_categorical=True)
dtest  = xgb.DMatrix(xtest, label=ytest, missing=0, enable_categorical=True)
dval   = xgb.DMatrix(xval, label=yval, missing=0, enable_categorical=True)

In [None]:
def bo_tune_xgb(max_depth, gamma, n_estimators ,learning_rate, scale_pos_weight, min_child_weight, colsample_bytree, subsample):
    """Objective for the Bayesian optimiser: negated 5-fold CV RMSE on ``dtrain``.

    The arguments are one candidate hyper-parameter set. ``max_depth`` and
    ``n_estimators`` are truncated with ``int()`` because the search bounds
    are continuous.

    Returns:
        Minus the mean test RMSE of the last boosting round, so that
        maximising this value minimises the error.
    """
    # NOTE(review): no 'objective' is set, so XGBoost's default applies and the
    # 0/1 target is scored with RMSE — confirm this is intended.
    params = {'max_depth'       : int(max_depth),
              'gamma'           : gamma,
              'learning_rate'   : learning_rate,
              'subsample'       : subsample,
              'eval_metric'     : 'rmse',
              'min_child_weight': min_child_weight,
              'scale_pos_weight': scale_pos_weight,
              'colsample_bytree': colsample_bytree,
              'tree_method'     : 'gpu_hist'}
    # In the native API the number of trees is the num_boost_round argument,
    # not an 'n_estimators' entry in params. Passing the candidate value here
    # makes the n_estimators search dimension actually affect the score.
    cv_result = xgb.cv(params, dtrain, num_boost_round=int(n_estimators), nfold=5)
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [None]:
# Search space for bo_tune_xgb.
# subsample and colsample_bytree must be strictly positive for XGBoost, and a
# learning rate of 0 cannot learn anything, so these ranges start just above 0
# instead of at 0 (the optimiser is free to probe the bounds themselves).
pbounds = {'max_depth'        : (1, 20),
           'gamma'            : (0, 2),
           'subsample'        : (0.1, 1),
           'learning_rate'    : (0.01, 1),
           'n_estimators'     : (100, 400),
           'scale_pos_weight' : (5, 10),
           'min_child_weight' : (1, 10),
           'colsample_bytree' : (0.1, 1)}
# random_state makes the sequence of probed points repeatable.
xgb_bo = BayesianOptimization(bo_tune_xgb, pbounds, random_state=42, verbose=3)

In [None]:
# Run the search with init_points=15 and n_iter=20, using the 'ei' acquisition setting.
# NOTE(review): newer bayes_opt releases may no longer accept `acq` in
# maximize() — confirm against the installed version.
xgb_bo.maximize(n_iter=20, init_points=15, acq='ei')

In [None]:
# Best parameter set found by the search; the two integer-valued settings are
# truncated to int in place.
params = xgb_bo.max['params']
for int_key in ('max_depth', 'n_estimators'):
    params[int_key] = int(params[int_key])
params

In [None]:
# xgb.train takes the number of trees through num_boost_round (default 10),
# not through an 'n_estimators' entry in params. Without this the final model
# is trained for only 10 rounds, whatever the search selected.
train_params = {key: value for key, value in params.items() if key != 'n_estimators'}
xgb_opt = xgb.train(train_params, dtrain, num_boost_round=int(params.get('n_estimators', 200)))

In [None]:
# Raw model outputs for the training matrix.
predsopt = xgb_opt.predict(dtrain)

In [None]:
# Round the raw outputs to the nearest integer to use them as class labels.
predsopt.round()

In [None]:
# confusion_matrix expects (y_true, y_pred). With the arguments the other way
# round the matrix is transposed: rows and columns swap meaning.
cm = confusion_matrix(ytrain, predsopt.round())
cm

In [None]:
# cm = confusion_matrix(predsopt.round(), ytrain)
# cm

In [None]:
# cm = confusion_matrix(predsopt.round(), ytrain)
# cm

In [None]:
# Accuracy = correctly classified rows (the diagonal) / all rows.
np.trace(cm) / cm.sum()

In [None]:
# classification_report expects (y_true, y_pred). Swapping them exchanges
# precision and recall in the printed report.
print(classification_report(ytrain, predsopt.round()))

In [None]:
# Validation-set report. classification_report expects (y_true, y_pred);
# swapping them exchanges precision and recall in the printed report.
predsoptval = xgb_opt.predict(dval)
print(classification_report(yval, predsoptval.round()))

In [None]:
# Metrics line kept for reference (it looks like a pasted training log). It is
# not valid Python, so it has to stay commented out for the cell to run:
# loss: 0.2705 - accuracy: 0.8928 - precision_9: 0.7092 - recall_9: 0.4204 - val_loss: 0.3487 - val_accuracy: 0.8827 - val_precision_9: 0.6594 - val_recall_9: 0.3908

# Correlation of every column with the HOSPITAL target, weakest first.
df.corr()['HOSPITAL'].sort_values()

# DEEP LEARNING

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Show the TensorFlow version in use.
tf.__version__

In [None]:
# swish is a recent activation function that is said to remedy the issues of ReLU. Let's put it to the test
def swish(x, b = 1):
    """Swish activation: ``x * sigmoid(b * x)``.

    Args:
        x: input tensor.
        b: scale applied to ``x`` inside the sigmoid (default 1).

    Returns:
        A tensor with the same shape as ``x``.
    """
    # The bare name `sigmoid` is never defined or imported in this notebook;
    # use TensorFlow's implementation explicitly.
    return x * tf.math.sigmoid(b * x)

In [None]:
def newmod():
    """Build the (uncompiled) feed-forward binary classifier.

    Five swish-activated Dense layers of 176, 88, 44, 22 and 11 units, each
    followed by Dropout(.2), then a single sigmoid output unit. The input
    width is the number of columns of the global ``xtrain``.
    """
    n_features = len(xtrain.columns)

    model = tf.keras.Sequential()
    model.add(Dense(176, input_dim=n_features, activation='swish'))
    model.add(Dropout(.2))

    # The remaining hidden layers halve in width at every step.
    for units in (88, 44, 22, 11):
        model.add(Dense(units, activation='swish'))
        model.add(Dropout(.2))

    model.add(Dense(1, activation='sigmoid'))

    return model


# Build the network and compile it for binary classification, tracking
# accuracy, precision and recall.
estimator = newmod()
tracked_metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
estimator.compile(optimizer='nadam', loss='binary_crossentropy', metrics=tracked_metrics)

In [None]:
# Train for 100 epochs, evaluating on the validation split after every epoch.
history = estimator.fit(
    xtrain,
    ytrain,
    validation_data=(xval, yval),
    epochs=100,
)

In [None]:
# eleventh model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# tenth model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# ninth model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# ninth model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# eighth model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# seventh model 1k
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# seventh model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# sixth model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# fifth model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# fourth model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# third model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# second model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()

In [None]:
# first model
# Training vs. validation loss per epoch, read from the current `history`.
history_df = pd.DataFrame(history.history)
for curve in ('loss', 'val_loss'):
    plt.plot(history_df[curve], label=curve)

plt.legend()