## Import libraries and the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
wave_1 = pd.read_excel("/Users/kazuma/Desktop/Vaccine_hesitancy/Vaccnien_national_survey/vaccine_hesidency/2nd_time_files/vaccine_hesidency/data/wave1_final.xlsx")
wave_2 = pd.read_excel("/Users/kazuma/Desktop/Vaccine_hesitancy/Vaccnien_national_survey/vaccine_hesidency/2nd_time_files/vaccine_hesidency/data/wave2_final.xlsx")
wave_3 = pd.read_excel("/Users/kazuma/Desktop/Vaccine_hesitancy/Vaccnien_national_survey/vaccine_hesidency/2nd_time_files/vaccine_hesidency/data/wave3_final.xlsx")

In [3]:
wave_1["wave"] = 1
wave_2["wave"] = 2
wave_3["wave"] = 3

In [4]:
waves = pd.concat([wave_1,wave_2,wave_3])
waves.reset_index(drop=True , inplace = True)

## Train-Test Split

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

X = waves.drop("vaccined", axis = 1)
y = waves["vaccined"]

split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

for train_index, test_index in split.split(X, y):
    X_train = X.loc[train_index]
    X_test = X.loc[test_index]
    y_train = y.loc[train_index]
    y_test = y.loc[test_index]

In [6]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3393, 21)
X_test shape: (377, 21)
y_train shape: (3393,)
y_test shape: (377,)


In [7]:
strat_train_set = pd.concat([X_train, y_train], axis = 1)
strat_test_set = pd.concat([X_test, y_test], axis = 1)

In [8]:
corr_matrix = strat_train_set.corr()
corr_matrix.style.background_gradient("BuGn")

Unnamed: 0,sex,age,education,marital_status,job_status,city_rural,previous_covid_infection,percevedrisk,worry,trust,easy_hard,behaviour,knowledge,attitude,collectiveresponse,confidence,convenience,safety,complacency,calculation,wave,vaccined
sex,1.0,-0.225459,0.023446,0.064999,0.634583,-0.062047,-0.007722,0.05235,0.158034,-0.001255,0.002871,0.13165,0.080885,-0.00033,0.007664,-0.028296,0.039557,-0.056206,0.070337,0.023218,-0.028173,0.015076
age,-0.225459,1.0,-0.362017,0.376432,-0.154126,0.073902,0.003109,-0.000208,-0.02801,0.052317,0.054103,0.03688,-0.105491,0.110166,0.08438,0.110287,0.107906,0.111518,0.00853,-0.055998,0.040005,0.086666
education,0.023446,-0.362017,1.0,-0.206998,-0.090386,-0.296823,-0.050304,0.077156,0.01575,-0.057986,-0.205266,0.144354,0.327766,-0.000952,0.005118,-0.125445,0.046819,-0.021025,0.095257,0.088297,-0.012568,0.028038
marital_status,0.064999,0.376432,-0.206998,1.0,-0.090871,0.012503,0.004269,0.016818,-0.000971,0.026048,0.047629,0.011469,-0.020697,0.028785,0.022572,0.057511,0.031237,0.035686,0.015014,-0.047782,0.005939,0.02422
job_status,0.634583,-0.154126,-0.090386,-0.090871,1.0,0.021535,0.03516,-0.017335,0.125377,0.023101,0.036657,0.067488,-0.02253,-0.009881,0.012981,-0.019689,0.04096,-0.059024,0.044668,0.020256,-0.059511,-0.017812
city_rural,-0.062047,0.073902,-0.296823,0.012503,0.021535,1.0,0.03385,-0.16436,-0.037194,0.053092,0.036878,-0.086507,-0.161807,0.064496,0.059119,0.099388,0.040968,0.019849,0.013955,-0.031824,-0.008237,-0.002927
previous_covid_infection,-0.007722,0.003109,-0.050304,0.004269,0.03516,0.03385,1.0,-0.043718,0.043523,0.029925,0.102325,0.19367,-0.057106,-0.215787,0.091412,0.025227,0.009051,0.295204,-0.084884,-0.245586,-0.668976,-0.012389
percevedrisk,0.05235,-0.000208,0.077156,0.016818,-0.017335,-0.16436,-0.043718,1.0,0.341246,0.083032,0.068921,0.074495,0.073647,-0.031748,0.038872,0.023506,-0.144328,-0.131414,-0.043198,0.031481,-0.012348,0.047971
worry,0.158034,-0.02801,0.01575,-0.000971,0.125377,-0.037194,0.043523,0.341246,1.0,0.182654,0.001196,0.206426,0.060974,0.097246,0.213937,0.128651,-0.022249,-0.13192,0.115338,0.038366,-0.110908,0.069629
trust,-0.001255,0.052317,-0.057986,0.026048,0.023101,0.053092,0.029925,0.083032,0.182654,1.0,-0.021013,0.229861,0.190938,0.403677,0.396157,0.492793,0.076647,0.015689,0.181739,0.107086,-0.034205,0.176327


In [9]:
pd.DataFrame(corr_matrix["vaccined"].sort_values(ascending=False)).style.background_gradient("BuGn")

Unnamed: 0,vaccined
vaccined,1.0
collectiveresponse,0.451345
attitude,0.439269
confidence,0.319683
knowledge,0.301257
complacency,0.240297
trust,0.176327
behaviour,0.145921
safety,0.132265
convenience,0.130963


## Scaling and Encoding Variables

In [10]:
from sklearn.preprocessing import OrdinalEncoder

education_array = X_train[['education']]

ordinal_encoder = OrdinalEncoder()
education_encoded = ordinal_encoder.fit_transform(education_array)

education_encoded

array([[1.],
       [3.],
       [3.],
       ...,
       [3.],
       [3.],
       [3.]])

In [11]:
X_train['ord_education'] = education_encoded
X_train.drop(['education'], axis = 1 , inplace = True)
X_train.rename(columns = {'ord_education':'education'}, inplace = True )

In [12]:
convert_dict = {'sex': 'category',
                'marital_status': 'category',
                'job_status': 'category',
                'city_rural': 'category',
                'previous_covid_infection': 'category',
                'wave': 'category'
                }

waves = waves.astype(convert_dict)

In [13]:
cat_list = [    'sex',
                'marital_status',
                'job_status',
                'city_rural',
                'previous_covid_infection',
                'wave',
           ]

ordinal_list = ['education'
               ]

num_list = ['age',
            'percevedrisk',         
            'worry',                       
            'trust',                       
            'easy_hard',                   
            'behaviour',                   
            'knowledge',                    
            'attitude',  
            "collectiveresponse",
            "confidence",
            "complacency",
            "convenience"
            ]

whole_list = cat_list + ordinal_list + num_list
whole_list

['sex',
 'marital_status',
 'job_status',
 'city_rural',
 'previous_covid_infection',
 'wave',
 'education',
 'age',
 'percevedrisk',
 'worry',
 'trust',
 'easy_hard',
 'behaviour',
 'knowledge',
 'attitude',
 'collectiveresponse',
 'confidence',
 'complacency',
 'convenience']

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

full_pipeline_nom = ColumnTransformer([
    ('cat', OneHotEncoder(), cat_list),
    ('full', StandardScaler(), whole_list) 
])

waves_prepared = full_pipeline_nom.fit_transform(X_train)



waves_prepared

array([[ 1.        ,  0.        ,  0.        , ...,  0.3917921 ,
         0.32049645, -0.58816848],
       [ 0.        ,  1.        ,  0.        , ..., -1.87243107,
         0.32049645,  0.39249769],
       [ 1.        ,  0.        ,  0.        , ...,  0.64337245,
        -0.81799016,  0.63766423],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.48873914,
        -0.62824239, -1.3236681 ],
       [ 0.        ,  1.        ,  0.        , ...,  0.76916262,
        -0.24874685,  0.63766423],
       [ 1.        ,  0.        ,  1.        , ..., -1.99822125,
        -2.52572008,  0.63766423]])

In [15]:
feature_names = np.concatenate((full_pipeline_nom.transformers_[0][1].get_feature_names_out(), num_list))
feature_names

array(['sex_1', 'sex_2', 'marital_status_1', 'marital_status_2',
       'marital_status_3', 'job_status_1', 'job_status_2', 'job_status_3',
       'job_status_4', 'job_status_5', 'city_rural_1', 'city_rural_2',
       'previous_covid_infection_0', 'previous_covid_infection_1',
       'previous_covid_infection_2', 'wave_1', 'wave_2', 'wave_3', 'age',
       'percevedrisk', 'worry', 'trust', 'easy_hard', 'behaviour',
       'knowledge', 'attitude', 'collectiveresponse', 'confidence',
       'complacency', 'convenience'], dtype=object)

In [16]:
X_train = pd.DataFrame(waves_prepared)
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.250738,0.455151,-0.152215,-0.443937,-0.334285,-0.114592,0.028885,0.391792,0.320496,-0.588168
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.320101,0.811028,-0.788254,0.965116,1.593039,-0.296806,0.513273,-1.872431,0.320496,0.392498
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.250738,-0.256603,0.483825,0.260590,1.593039,0.031180,-0.455502,0.643372,-0.817990,0.637664
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.582455,-0.434541,0.483825,0.260590,-0.334285,-0.588349,-1.424277,0.140212,-0.438495,0.637664
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.054794,-0.256603,-0.788254,-0.091674,0.629377,0.432052,-0.697696,-0.740319,1.458983,0.147331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3388,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.778399,1.522782,-0.152215,0.260590,-0.816117,0.176952,1.159123,-0.488739,1.648731,0.637664
3389,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-1.124157,-0.612480,-1.424294,0.965116,0.629377,-0.296806,0.190348,-1.117690,0.320496,0.637664
3390,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.429689,-1.502172,0.483825,0.965116,-2.261610,-1.863850,-2.393052,-0.488739,-0.628242,-1.323668
3391,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.014568,1.522782,-0.470234,0.260590,0.629377,0.687152,1.159123,0.769163,-0.248747,0.637664


In [17]:
X_train.shape

(3393, 37)

In [18]:
y_train.shape

(3393,)

In [19]:
waves_prepared_test = full_pipeline_nom.transform(X_test)

X_test = pd.DataFrame(waves_prepared_test)

X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.167335,0.633090,-0.788254,0.612853,1.111208,1.124466,1.159123,1.272323,1.648731,0.637664
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-1.735221,-0.612480,-1.424294,0.965116,0.147546,0.468495,0.190348,0.894953,0.320496,0.637664
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.124157,0.277213,0.483825,0.260590,1.593039,0.140509,0.271079,0.266002,0.699992,-0.343002
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.818624,-0.612480,-1.106274,-0.091674,0.629377,-0.369692,-1.262815,-0.740319,0.699992,0.637664
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.403504,0.633090,1.119864,0.260590,0.147546,1.160909,0.513273,0.140212,1.648731,0.637664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,-1.887987,-2.925679,-1.424294,-3.262042,0.147546,-0.734120,-1.989396,0.894953,-0.817990,0.637664
373,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.971391,-0.434541,-0.152215,-0.443937,0.629377,0.832923,-0.213308,-0.237159,1.458983,0.637664
374,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.582455,0.811028,-1.424294,0.965116,0.147546,1.926210,0.674735,1.272323,1.648731,0.637664
375,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.556270,-0.256603,1.755904,-2.557516,-0.816117,0.614266,0.432542,0.014422,-0.438495,0.637664


## Implement the models

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [21]:
# now it's time for some magic!

# first create a dictionary for each model
models = {
    "Logistic Regression" : LogisticRegression(),
    "Random Forest" : RandomForestClassifier(),
    "GBM" : GradientBoostingClassifier(),
    "Naive Bayes" : GaussianNB()
}

# then a for loop 

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
# make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    

    
# training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average = "weighted")
    model_train_rocauc = roc_auc_score(y_train, y_train_pred)
    
# test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average = "weighted")
    model_test_rocauc = roc_auc_score(y_test, y_test_pred)
    
    
    
# outcome
    print(list(models.keys())[i])
    print("model performance for training set: \n")
    print(f"- accuracy is: {model_train_accuracy: .4f}")
    print(f"- precision is: {model_train_precision: .4f}")
    print(f"- recall is: {model_train_recall: .4f}")
    print(f"- f1_score is: {model_train_f1: .4f}")
    print(f"- auc-roc score is: {model_train_rocauc: .4f}")
    
    print("------------ \n")
    
    print("model performance for test set: \n")
    print("model performance for training set:")
    print(f"- accuracy is: {model_test_accuracy: .4f}")
    print(f"- precision is: {model_test_precision: .4f}")
    print(f"- recall is: {model_test_recall: .4f}")
    print(f"- f1_score is: {model_test_f1: .4f}")
    print(f"- auc-roc score is: {model_test_rocauc: .4f}")
    
    
    print("-------------------------------------------------\n")

Logistic Regression
model performance for training set: 

- accuracy is:  0.9443
- precision is:  0.9532
- recall is:  0.9882
- f1_score is:  0.9368
- auc-roc score is:  0.6984
------------ 

model performance for test set: 

model performance for training set:
- accuracy is:  0.9363
- precision is:  0.9475
- recall is:  0.9856
- f1_score is:  0.9268
- auc-roc score is:  0.6652
-------------------------------------------------

Random Forest
model performance for training set: 

- accuracy is:  1.0000
- precision is:  1.0000
- recall is:  1.0000
- f1_score is:  1.0000
- auc-roc score is:  1.0000
------------ 

model performance for test set: 

model performance for training set:
- accuracy is:  0.9337
- precision is:  0.9474
- recall is:  0.9828
- f1_score is:  0.9247
- auc-roc score is:  0.6638
-------------------------------------------------

GBM
model performance for training set: 

- accuracy is:  0.9661
- precision is:  0.9661
- recall is:  0.9984
- f1_score is:  0.9620
- auc-roc

In [22]:
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy="minority")


X_over, y_over = oversample.fit_resample(X_train, y_train)

In [23]:

for i in range(len(list(models))):
    model_bal = list(models.values())[i]
    model_bal.fit(X_over, y_over)
    
# make predictions
    y_over_pred = model_bal.predict(X_over)
    y_over_test_pred = model_bal.predict(X_test)
    
# training set performance
    model_bal_train_accuracy = accuracy_score(y_over, y_over_pred)
    model_bal_train_precision = precision_score(y_over, y_over_pred)
    model_bal_train_recall = recall_score(y_over, y_over_pred)
    model_bal_train_f1 = f1_score(y_over, y_over_pred, average = "weighted")
    model_bal_train_rocauc = roc_auc_score(y_over, y_over_pred)
    
# test set performance
    model_bal_test_accuracy = accuracy_score(y_test, y_over_test_pred)
    model_bal_test_precision = precision_score(y_test, y_over_test_pred)
    model_bal_test_recall = recall_score(y_test, y_over_test_pred)
    model_bal_test_f1 = f1_score(y_test, y_over_test_pred, average = "weighted")
    model_bal_test_rocauc = roc_auc_score(y_test, y_over_test_pred)
    
    
# outcome
    print(list(models.keys())[i])
    print("model performance for training set:")
    print(f"- accuracy is: {model_bal_train_accuracy: .4f}")
    print(f"- precision is: {model_bal_train_precision: .4f}")
    print(f"- recall is: {model_bal_train_recall: .4f}")
    print(f"- f1_score is: {model_bal_train_f1: .4f}")
    print(f"- auc-roc score is: {model_bal_train_rocauc: .4f}")
    
    print("------------ \n")
    
    print("model performance for test set: \n")
    print("model performance for training set:")
    print(f"- accuracy is: {model_bal_test_accuracy: .4f}")
    print(f"- precision is: {model_bal_test_precision: .4f}")
    print(f"- recall is: {model_bal_test_recall: .4f}")
    print(f"- f1_score is: {model_bal_test_f1: .4f}")
    print(f"- auc-roc score is: {model_bal_test_rocauc: .4f}")
    
    
    print("-------------------------------------------------\n")

Logistic Regression
model performance for training set:
- accuracy is:  0.8304
- precision is:  0.8207
- recall is:  0.8453
- f1_score is:  0.8303
- auc-roc score is:  0.8304
------------ 

model performance for test set: 

model performance for training set:
- accuracy is:  0.8462
- precision is:  0.9833
- recall is:  0.8477
- f1_score is:  0.8753
- auc-roc score is:  0.8376
-------------------------------------------------

Random Forest
model performance for training set:
- accuracy is:  1.0000
- precision is:  1.0000
- recall is:  1.0000
- f1_score is:  1.0000
- auc-roc score is:  1.0000
------------ 

model performance for test set: 

model performance for training set:
- accuracy is:  0.9337
- precision is:  0.9549
- recall is:  0.9741
- f1_score is:  0.9295
- auc-roc score is:  0.7112
-------------------------------------------------

GBM
model performance for training set:
- accuracy is:  0.9268
- precision is:  0.9370
- recall is:  0.9152
- f1_score is:  0.9268
- auc-roc score