In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# LOADING THE DATASET INTO THE CODE

df=pd.read_csv("https://raw.githubusercontent.com/Premalatha-success/Datasets/main/h1n1_vaccine_prediction.csv")
df.sample(5)

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,race,sex,income_level,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
2079,2079,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,Black,Male,"<= $75,000, Above Poverty",Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0
23198,23198,3.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,White,Female,"<= $75,000, Above Poverty",Not Married,Rent,Employed,Non-MSA,0.0,1.0,0
16040,16040,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,Black,Female,"> $75,000",Not Married,Own,Employed,"MSA, Principle City",1.0,1.0,0
12563,12563,3.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",0.0,0.0,0
19616,19616,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,White,Male,,Not Married,Own,Employed,Non-MSA,0.0,0.0,0


In [3]:
df.shape

(26707, 34)

In [4]:
df.describe()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,has_health_insur,is_h1n1_vacc_effective,is_h1n1_risky,sick_from_h1n1_vacc,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,no_of_adults,no_of_children,h1n1_vaccine
count,26707.0,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,...,14433.0,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0,26707.0
mean,13353.0,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,...,0.87972,3.850623,2.342566,2.35767,4.025986,2.719162,2.118112,0.886499,0.534583,0.212454
std,7709.791156,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,...,0.3253,1.007436,1.285539,1.362766,1.086565,1.385055,1.33295,0.753422,0.928173,0.409052
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,6676.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0,0.0
50%,13353.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0,0.0
75%,20029.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0,0.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0,1.0


In [5]:
# CHECKING FOR MISSING VALUES

df.isnull().sum()

unique_id                        0
h1n1_worry                      92
h1n1_awareness                 116
antiviral_medication            71
contact_avoidance              208
bought_face_mask                19
wash_hands_frequently           42
avoid_large_gatherings          87
reduced_outside_home_cont       82
avoid_touch_face               128
dr_recc_h1n1_vacc             2160
dr_recc_seasonal_vacc         2160
chronic_medic_condition        971
cont_child_undr_6_mnths        820
is_health_worker               804
has_health_insur             12274
is_h1n1_vacc_effective         391
is_h1n1_risky                  388
sick_from_h1n1_vacc            395
is_seas_vacc_effective         462
is_seas_risky                  514
sick_from_seas_vacc            537
age_bracket                      0
qualification                 1407
race                             0
sex                              0
income_level                  4423
marital_status                1408
housing_status      

In [6]:
df.dtypes

unique_id                      int64
h1n1_worry                   float64
h1n1_awareness               float64
antiviral_medication         float64
contact_avoidance            float64
bought_face_mask             float64
wash_hands_frequently        float64
avoid_large_gatherings       float64
reduced_outside_home_cont    float64
avoid_touch_face             float64
dr_recc_h1n1_vacc            float64
dr_recc_seasonal_vacc        float64
chronic_medic_condition      float64
cont_child_undr_6_mnths      float64
is_health_worker             float64
has_health_insur             float64
is_h1n1_vacc_effective       float64
is_h1n1_risky                float64
sick_from_h1n1_vacc          float64
is_seas_vacc_effective       float64
is_seas_risky                float64
sick_from_seas_vacc          float64
age_bracket                   object
qualification                 object
race                          object
sex                           object
income_level                  object
m

In [7]:
# FUNCTION TO REPLACE MISSING VALUES IN NUMERICAL COLUMNS WITH THEIR MEDIAN

def replaceMV(col1):
  med=col1.median()
  col1.replace(np.nan,med,inplace=True)

col=['unique_id','no_of_adults','no_of_children','h1n1_worry','h1n1_awareness','antiviral_medication','contact_avoidance','bought_face_mask','wash_hands_frequently','avoid_large_gatherings','reduced_outside_home_cont','avoid_touch_face','dr_recc_h1n1_vacc','dr_recc_seasonal_vacc','chronic_medic_condition','cont_child_undr_6_mnths','is_health_worker','has_health_insur','is_h1n1_vacc_effective','is_h1n1_risky','sick_from_h1n1_vacc','is_seas_vacc_effective','is_seas_risky','sick_from_seas_vacc']
for i in col :
  replaceMV(df[i])

In [8]:
df.isnull().sum()

unique_id                       0
h1n1_worry                      0
h1n1_awareness                  0
antiviral_medication            0
contact_avoidance               0
bought_face_mask                0
wash_hands_frequently           0
avoid_large_gatherings          0
reduced_outside_home_cont       0
avoid_touch_face                0
dr_recc_h1n1_vacc               0
dr_recc_seasonal_vacc           0
chronic_medic_condition         0
cont_child_undr_6_mnths         0
is_health_worker                0
has_health_insur                0
is_h1n1_vacc_effective          0
is_h1n1_risky                   0
sick_from_h1n1_vacc             0
is_seas_vacc_effective          0
is_seas_risky                   0
sick_from_seas_vacc             0
age_bracket                     0
qualification                1407
race                            0
sex                             0
income_level                 4423
marital_status               1408
housing_status               2042
employment    

In [9]:
# REPLACING THE MISSING VALUES IN THE CATEGORICAL DATA COLUMNS WITH THEIE MODE

modeQ=df['qualification'].mode().values[0]
modeIL=df['income_level'].mode().values[0]
modeE=df['employment'].mode().values[0]

df['qualification']=df['qualification'].replace("?",np.nan)
df['qualification']=df['qualification'].replace(np.nan,modeQ)

df['income_level']=df['income_level'].replace("?",np.nan)
df['income_level']=df['income_level'].replace(np.nan,modeQ)

df['employment']=df['employment'].replace("?",np.nan)
df['employment']=df['employment'].replace(np.nan,modeE)

In [10]:
df.isnull().sum()

unique_id                       0
h1n1_worry                      0
h1n1_awareness                  0
antiviral_medication            0
contact_avoidance               0
bought_face_mask                0
wash_hands_frequently           0
avoid_large_gatherings          0
reduced_outside_home_cont       0
avoid_touch_face                0
dr_recc_h1n1_vacc               0
dr_recc_seasonal_vacc           0
chronic_medic_condition         0
cont_child_undr_6_mnths         0
is_health_worker                0
has_health_insur                0
is_h1n1_vacc_effective          0
is_h1n1_risky                   0
sick_from_h1n1_vacc             0
is_seas_vacc_effective          0
is_seas_risky                   0
sick_from_seas_vacc             0
age_bracket                     0
qualification                   0
race                            0
sex                             0
income_level                    0
marital_status               1408
housing_status               2042
employment    

In [11]:
# REMOVING DUPLICATES AND COLUMNS WHICH HAVE NO IMPACT ON THE PROBLEM

df.drop_duplicates(inplace=True)
df=df.drop(['marital_status','housing_status','race'],axis=1)

In [12]:
df.isnull().sum()

unique_id                    0
h1n1_worry                   0
h1n1_awareness               0
antiviral_medication         0
contact_avoidance            0
bought_face_mask             0
wash_hands_frequently        0
avoid_large_gatherings       0
reduced_outside_home_cont    0
avoid_touch_face             0
dr_recc_h1n1_vacc            0
dr_recc_seasonal_vacc        0
chronic_medic_condition      0
cont_child_undr_6_mnths      0
is_health_worker             0
has_health_insur             0
is_h1n1_vacc_effective       0
is_h1n1_risky                0
sick_from_h1n1_vacc          0
is_seas_vacc_effective       0
is_seas_risky                0
sick_from_seas_vacc          0
age_bracket                  0
qualification                0
sex                          0
income_level                 0
employment                   0
census_msa                   0
no_of_adults                 0
no_of_children               0
h1n1_vaccine                 0
dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  26707 non-null  int64  
 1   h1n1_worry                 26707 non-null  float64
 2   h1n1_awareness             26707 non-null  float64
 3   antiviral_medication       26707 non-null  float64
 4   contact_avoidance          26707 non-null  float64
 5   bought_face_mask           26707 non-null  float64
 6   wash_hands_frequently      26707 non-null  float64
 7   avoid_large_gatherings     26707 non-null  float64
 8   reduced_outside_home_cont  26707 non-null  float64
 9   avoid_touch_face           26707 non-null  float64
 10  dr_recc_h1n1_vacc          26707 non-null  float64
 11  dr_recc_seasonal_vacc      26707 non-null  float64
 12  chronic_medic_condition    26707 non-null  float64
 13  cont_child_undr_6_mnths    26707 non-null  flo

In [14]:
df.describe()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,has_health_insur,is_h1n1_vacc_effective,is_h1n1_risky,sick_from_h1n1_vacc,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,no_of_adults,no_of_children,h1n1_vaccine
count,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,...,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0
mean,13353.0,1.6198,1.261392,0.048714,0.727749,0.068933,0.825888,0.357472,0.336279,0.678811,...,0.934998,3.85281,2.337589,2.35238,4.025536,2.705321,2.115737,0.887558,0.529599,0.212454
std,7709.791156,0.909016,0.617047,0.215273,0.445127,0.253345,0.379213,0.479264,0.472444,0.466942,...,0.246533,1.000195,1.276825,1.353339,1.077131,1.375216,1.319585,0.74998,0.925264,0.409052
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,6676.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0,0.0
50%,13353.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0,0.0
75%,20029.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,5.0,4.0,4.0,5.0,4.0,2.0,1.0,1.0,0.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0,1.0


In [15]:
# ENCODING THE OBJECT DATATYPE COLUMNS

col2=['age_bracket','qualification','sex','income_level','employment','census_msa']
dum=pd.get_dummies(df[['age_bracket','qualification','sex','income_level','employment','census_msa']])
df=df.drop(col2,axis=1)
df=pd.concat([df,dum],axis=1)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 46 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   unique_id                               26707 non-null  int64  
 1   h1n1_worry                              26707 non-null  float64
 2   h1n1_awareness                          26707 non-null  float64
 3   antiviral_medication                    26707 non-null  float64
 4   contact_avoidance                       26707 non-null  float64
 5   bought_face_mask                        26707 non-null  float64
 6   wash_hands_frequently                   26707 non-null  float64
 7   avoid_large_gatherings                  26707 non-null  float64
 8   reduced_outside_home_cont               26707 non-null  float64
 9   avoid_touch_face                        26707 non-null  float64
 10  dr_recc_h1n1_vacc                       26707 non-null  fl

In [17]:
# There are no outliers for Categorical and Binary data, thus no need for outlier elimination

In [18]:
# THE DATA IS DIVIDED INTO DEPENDENT AND INDEPENDENT VARIABLES. x CONTAONS THE INDEPENDENT VARIABLES AND y CONTAINS THE DEPENDENT VARIABLES

y=df['h1n1_vaccine']
x=df.drop(['h1n1_vaccine'],axis=1)

In [19]:
# THE DATA IS DIVIDED INTO TEST AND TRAIN SETS. THE SUBSCRIPT "ns" DENOTES THAT THE DATA IS NON STANDARD

xtrain_ns,xtest_ns,ytrain_ns,ytest_ns = train_test_split(x,y,test_size=0.3, random_state=1)

In [20]:
xtrain_ns.shape

(18694, 45)

In [21]:
xtest_ns.shape

(8013, 45)

In [22]:
# APPLYING LOGISITC REGRESSION

model1 = LogisticRegression()
model1.fit(xtrain_ns,ytrain_ns)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
predictions=model1.predict(xtrain_ns)
accuracy_score(ytrain_ns,predictions)

0.8223494169252167

In [24]:
predictions=model1.predict(xtest_ns)
accuracy_score(ytest_ns,predictions)

0.8286534381629852

In [25]:
# STANDADISING THE DATA. THE VARIABLES NOW WILL NOT HAVE THE SUBSCRIPT "ns"

std_model=StandardScaler()

In [26]:
std_model.fit_transform(df)

array([[-1.73198595, -0.68184881, -2.04427905, ..., -0.87928249,
        -0.64602114,  1.64630987],
       [-1.73185625,  1.51837305,  1.19702709, ...,  1.13729093,
        -0.64602114, -0.60741906],
       [-1.73172654, -0.68184881, -0.42362598, ...,  1.13729093,
        -0.64602114, -0.60741906],
       ...,
       [ 1.73172654,  0.41826212,  1.19702709, ...,  1.13729093,
        -0.64602114, -0.60741906],
       [ 1.73185625, -0.68184881, -0.42362598, ..., -0.87928249,
        -0.64602114,  1.64630987],
       [ 1.73198595, -1.78195974, -2.04427905, ..., -0.87928249,
         1.54793697, -0.60741906]])

In [27]:
x=df.drop(['h1n1_vaccine'],axis=1)
y=df['h1n1_vaccine']

In [28]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)

In [29]:
# LOGISTIC REGRESSION

model2 = LogisticRegression()
model2.fit(xtrain,ytrain)

In [30]:
predictions=model2.predict(xtrain)
accuracy_score(ytrain,predictions)

0.8241048443716359

In [31]:
predictions=model2.predict(xtest)
accuracy_score(ytest,predictions)

0.8186072631973044

In [32]:
# DECISION TREE CLASSIFIER

model3=tree.DecisionTreeClassifier(max_depth=6)
model3.fit(xtrain,ytrain)

In [33]:
predictions=model3.predict(xtrain)
accuracy_score(ytrain,predictions)

0.8378656681488416

In [34]:
predictions=model3.predict(xtest)
accuracy_score(ytest,predictions)

0.8229127667540247

In [35]:
# BAGGING CLASSIFIER

model4=BaggingClassifier(n_estimators = 25,max_features=5)
model4.fit(xtrain,ytrain)

In [36]:
predictions=model4.predict(xtrain)
accuracy_score(ytrain,predictions)

0.7897963959747251

In [37]:
predictions=model4.predict(xtest)
accuracy_score(ytest,predictions)

0.7789217521527517

In [38]:
# Gradient boost Classifier

model5=GradientBoostingClassifier(n_estimators=250,max_depth=4)
model5.fit(xtrain,ytrain)

In [39]:
predictions=model5.predict(xtrain)
accuracy_score(ytrain,predictions)

0.8658553709337702

In [40]:
predictions=model5.predict(xtest)
accuracy_score(ytest,predictions)

0.8330213403219768

In [41]:
# ADABOOST CLASSIFIER

model6=AdaBoostClassifier(n_estimators=75)
model6.fit(xtrain,ytrain)

In [42]:
predictions=model6.predict(xtrain)
accuracy_score(ytrain,predictions)

0.836789141118652

In [43]:
predictions=model6.predict(xtest)
accuracy_score(ytest,predictions)

0.8294646199925122

In [44]:
# Random Forest Classifier

model7=RandomForestClassifier(max_depth=8)
model7.fit(xtrain,ytrain)

In [45]:
predictions=model7.predict(xtrain)
accuracy_score(ytrain,predictions)

0.8469459396208753

In [46]:
predictions=model7.predict(xtest)
accuracy_score(ytest,predictions)

0.8304005990265818

In [47]:
# SVC

model8=SVC()
model8.fit(xtrain,ytrain)

In [48]:
predictions=model8.predict(xtrain)
accuracy_score(ytrain,predictions)

0.7897027849286216

In [49]:
predictions=model8.predict(xtest)
accuracy_score(ytest,predictions)

0.7789217521527517

In [50]:
# Classifiers have been used in this predicitons project because the dependent variable is binary, i.e. it takes only the values 1 and 0 for vaccinated and unvaccinated

# Since the h1n1 vaccine prediciton data set is a big dataset, we get sufficiently good scores
# The highest score is observed in Gradient Boost Classifier, which is 0.868 for the train set and 0.827 for the test set
# Adaboost Classifier got the second highest score, which is 0.839 for the train set and 0.820 for the test set
# Since Gradient Boost is a tree based classifier which improve its previous error, it got the highest score