## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Loading datasets

In [2]:
# Read the training and test feature tables, then show how many values
# are missing in each training column.
feature_files = ('training_set_features.csv', 'test_set_features.csv')
train_df, test_df = map(pd.read_csv, feature_files)
train_df.isna().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

## A few more libraries 

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression


## Extracting the target variables from the dataset

In [4]:
# The two label columns to be modelled, one classifier per column.
target_var = ['xyz_vaccine', 'seasonal_vaccine']

# Load the label table and keep only the target columns.
train_labels = pd.read_csv('training_set_labels.csv')
train_target = train_labels.loc[:, target_var]

## Filling the empty values

In [5]:
# Impute every column with its most frequent value. The fill values are
# learned from the training set only and then reused for the test set, so
# both frames go through exactly the same transformation (previously the
# test set was filled with its own modes, which could differ from train).
def _most_frequent(col):
    """Return the most common non-null value of `col`, or NaN if it has none."""
    counts = col.value_counts()  # sorted by frequency, NaN excluded
    # Guard against an all-NaN column, where index[0] would raise IndexError.
    return counts.index[0] if len(counts) else np.nan


fill_values = train_df.apply(_most_frequent)  # Series: column name -> fill value
train_df = train_df.fillna(fill_values)
test_df = test_df.fillna(fill_values)
train_df.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,fcxhlnwr,xtkaffoo
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,fcxhlnwr,xtkaffoo
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


## Categorical data into numerical data

In [6]:
# One-hot encode the categorical (object) columns of both frames.
train_df = pd.get_dummies(train_df)

# get_dummies only creates columns for the category levels it actually sees,
# so encoding the two frames independently can yield different column sets
# or a different column order. Align the test frame to the training columns:
# levels missing from the test set become all-False indicator columns and
# levels unseen during training are dropped, which keeps the fitted models
# usable on the test features.
test_df = pd.get_dummies(test_df).reindex(columns=train_df.columns, fill_value=False)
test_df.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False


## Defining variables and targets

In [7]:
# Features: every encoded column except the row identifier.
# Targets: the two vaccine label columns selected earlier.
x, y = train_df.drop(columns='respondent_id'), train_target

## Splitting the data into training and testing variables

In [8]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Training the model 

In [10]:
# Hyper-parameter grid for the grid search: regularization type and the
# inverse regularization strength C.
parameters = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 10, 100],
)

Both models below use logistic regression, tuned over L1 and L2 regularization and the inverse regularization strength `C`.

## Logistic Regression for xyz_vaccine

In [17]:
# Grid-search a liblinear logistic regression for the xyz_vaccine target,
# selecting hyper-parameters by 5-fold cross-validated ROC AUC.
xyz_estimator = LogisticRegression(random_state=42, solver='liblinear')
xyz = GridSearchCV(
    estimator=xyz_estimator,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5,
)
xyz.fit(x_train, y_train['xyz_vaccine'])

# Score the refit best model on the held-out split, using the
# positive-class probability (column 1).
xyz_holdout_proba = xyz.predict_proba(x_test)
xyz_pred = xyz_holdout_proba[:, 1]
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], xyz_pred)

## Logistic Regression for seasonal_vaccine

In [12]:
# Grid-search a liblinear logistic regression for the seasonal_vaccine
# target, selecting hyper-parameters by 5-fold cross-validated ROC AUC.
seasonal_estimator = LogisticRegression(random_state=42, solver='liblinear')
seasonal = GridSearchCV(
    estimator=seasonal_estimator,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5,
)
seasonal.fit(x_train, y_train['seasonal_vaccine'])

# Score the refit best model on the held-out split, using the
# positive-class probability (column 1).
seasonal_holdout_proba = seasonal.predict_proba(x_test)
seasonal_pred = seasonal_holdout_proba[:, 1]
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], seasonal_pred)

## Mean ROC AUC score

In [13]:
# Average the two held-out ROC AUC scores into a single summary figure.
holdout_scores = (roc_auc_xyz, roc_auc_seasonal)
mean_roc_auc = sum(holdout_scores) / 2

In [14]:
# Report the held-out ROC AUC of each model and their mean, one per line.
print(
    f'Best ROC AUC for xyz vaccine: {roc_auc_xyz}',
    f'Best ROC AUC for seasonal vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)

Best ROC AUC for xyz vaccine: 0.8316884333845986
Best ROC AUC for seasonal vaccine: 0.8561971684095084
Mean ROC AUC: 0.8439428008970535


## Predicting probabilities 

In [15]:
# Build the test feature matrix once (row identifier removed) and take the
# positive-class probability (column 1) from each fitted model.
test_features = test_df.drop(columns=['respondent_id'])
xyz_test_pred, seasonal_test_pred = (
    model.predict_proba(test_features)[:, 1] for model in (xyz, seasonal)
)

## Submission file

In [18]:
# Assemble the submission table: one row per test respondent with the
# predicted probability for each vaccine, then write it without the index.
submission_columns = {
    'respondent_id': test_df['respondent_id'],
    'xyz_vaccine': xyz_test_pred,
    'seasonal_vaccine': seasonal_test_pred,
}
final = pd.DataFrame(submission_columns)
final.to_csv('final.csv', index=False)