In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)

In [2]:
features=pd.read_csv('training_set_features.csv')
labels=pd.read_csv('training_set_labels.csv')

In [3]:
labels1=labels.drop('xyz_vaccine',axis=1)

In [4]:
data=features.merge(labels1,how='outer')

In [5]:
data.drop(['health_insurance','employment_industry','employment_occupation','respondent_id','hhs_geo_region'],axis=1,inplace=True)

In [6]:
X=data.drop('seasonal_vaccine',axis=1)
y=data['seasonal_vaccine']

In [7]:
best_labels=['xyz_concern',
 'xyz_knowledge',
 'behavioral_antiviral_meds',
 'behavioral_avoidance',
 'behavioral_face_mask',
 'behavioral_wash_hands',
 'behavioral_large_gatherings',
 'behavioral_outside_home',
 'behavioral_touch_face',
 'doctor_recc_xyz',
 'doctor_recc_seasonal',
 'chronic_med_condition',
 'child_under_6_months',
 'health_worker',
 'opinion_xyz_vacc_effective',
 'opinion_xyz_risk',
 'opinion_xyz_sick_from_vacc',
 'opinion_seas_vacc_effective',
 'opinion_seas_risk',
 'opinion_seas_sick_from_vacc',
 'age_group',
 'education',
 'race',
 'sex',
 'income_poverty',
 'marital_status',
 'rent_or_own',
 'employment_status',
 'census_msa',
 'household_adults',
 'household_children']

In [8]:
X_new=X[best_labels]

In [9]:
num_cols_new=[i for i in X_new if X_new[i].dtype!=object]
cat_cols_new=[i for i in X_new if X_new[i].dtype==object]

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import make_column_transformer

In [11]:
num_pipe=make_pipeline(SimpleImputer(strategy='most_frequent'))
nominal_pipe=make_pipeline(SimpleImputer(strategy='most_frequent'),OneHotEncoder())
ordinal_pipe=make_pipeline(SimpleImputer(strategy='most_frequent'),OrdinalEncoder(categories=[['18 - 34 Years','35 - 44 Years','45 - 54 Years','55 - 64 Years','65+ Years'],['< 12 Years', '12 Years', 'Some College','College Graduate'],['Below Poverty','<= $75,000, Above Poverty','> $75,000'],['Unemployed','Not in Labor Force','Employed']]))

In [12]:
col_trans=make_column_transformer((num_pipe,num_cols_new),(nominal_pipe,['race','sex','marital_status','rent_or_own','census_msa']),(ordinal_pipe,['age_group','education','income_poverty','employment_status']),remainder='passthrough')

In [13]:
from sklearn.ensemble import HistGradientBoostingClassifier as GBC

In [14]:
pipe=make_pipeline(col_trans,GBC(learning_rate=0.03, max_iter=500,l2_regularization=1,max_depth=5, random_state=10))

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
cross_val_score(pipe,X_new,y,cv=10,scoring='roc_auc').mean()

0.8569244686324404

In [17]:
pipe.fit(X_new,y)

In [18]:
test_data=pd.read_csv('test_set_features.csv')

In [19]:
test=test_data.drop(['health_insurance','employment_industry','employment_occupation','respondent_id','hhs_geo_region'],axis=1)

In [20]:
prob=pipe.predict_proba(test)[:,1]

In [21]:
pred=pipe.predict(test)

In [22]:
seasonal=pd.DataFrame(prob,columns=['seasonal_vaccine'])

In [23]:
seasonal['prediction']=pred

In [25]:
seasonal.to_csv('seasonal_vaccine.csv')

In [26]:
seasonal

Unnamed: 0,seasonal_vaccine,prediction
0,0.203956,0
1,0.041381,0
2,0.809632,1
3,0.902201,1
4,0.383811,0
...,...,...
26703,0.522583,1
26704,0.445025,0
26705,0.156215,0
26706,0.356973,0
