In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
%matplotlib inline

In [None]:
# Load the training features from CSV; the bare name on the last line displays the frame.
X = pd.read_csv('training_set_features.csv')
X

In [None]:
# Load the training labels from CSV; the bare name on the last line displays the frame.
Y = pd.read_csv('training_set_labels.csv')
Y

In [None]:
# Build the label frame without respondent_id using a non-mutating drop.
# `Z = Y` would only alias Y, so an in-place drop would also strip the id from Y
# and make this cell fail with KeyError when it is run a second time.
Z = Y.drop(columns=['respondent_id'])

# Place features and labels side by side; rows are aligned on the index.
data = pd.concat([X, Z], axis=1)

In [None]:
# Number of missing values in each column of the merged frame.
data.isnull().sum()

In [None]:
# Remove three columns from the merged frame by rebinding `data` to the reduced frame.
data = data.drop(
    columns=['health_insurance', 'employment_industry', 'employment_occupation']
)

# EDA & cleaning

In [None]:
# Print the class counts of each target column in turn.
for target in ('xyz_vaccine', 'seasonal_vaccine'):
    print(Y[target].value_counts())

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=1)

# One horizontal count plot per target, stacked vertically.
for axis, target in zip(ax, ['xyz_vaccine', 'seasonal_vaccine']):
    sns.countplot(Y, y=target, ax=axis)

plt.tight_layout()

In [None]:
# Distinct values present in the hhs_geo_region column.
data['hhs_geo_region'].unique()

In [None]:
# Names of the object-dtype columns; the label-encoding cell iterates over them.
non_numeric_col = data.select_dtypes(include=['object']).columns

# Preview the first rows of just those columns.
data.loc[:, non_numeric_col].head()

In [None]:
# Fill missing values column by column: median for numeric columns,
# mode for object columns (the largest value when several modes tie).
# The mask is built from `data` itself rather than from the raw frame `X`,
# so it stays valid for every column of `data`, including ones absent from `X`.
for col in data.columns:
    missing = data[col].isnull()
    if not missing.any():
        continue
    if data[col].dtypes != 'object':
        fill_value = data[col].median()
    else:
        fill_value = data[col].mode().max()
    data.loc[missing, col] = fill_value

In [None]:
# Re-count missing values per column after the imputation cell.
data.isnull().sum()

In [None]:
# A single encoder instance is refit for every object column, so only the
# integer codes are kept; the per-column class lists are not retained.
LE = LabelEncoder()
for name in non_numeric_col:
    encoded = LE.fit_transform(data[name])
    data[name] = encoded

In [None]:
# Pairwise correlations between all columns of the encoded frame.
corr = data.corr()

# Draw the annotated heatmap, then enlarge the figure it was drawn on.
g = sns.heatmap(
    corr,
    annot=True,
    linewidths=1,
    square=True,
)
g.figure.set_size_inches(30, 25)

plt.show()

In [None]:
# Behavioural columns that make up the cleanliness score.
behavior_cols = [
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
]

# Built-in sum adds the Series left to right, exactly like a chain of `+`.
data['cleanliness'] = sum(data[name] for name in behavior_cols)

In [None]:
# Opinion columns that make up the opinion score.
opinion_cols = [
    'opinion_xyz_vacc_effective',
    'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]

# Built-in sum adds the Series left to right, exactly like a chain of `+`.
data['opinion'] = sum(data[name] for name in opinion_cols)

In [None]:
# Box plot of the opinion score for each value of the cleanliness score.
sns.boxplot(x='cleanliness', y='opinion', data=data)

In [None]:
# Remove the columns that are not used as model inputs (several of them were
# already folded into the cleanliness / opinion sums).
data = data.drop(
    columns=[
        'race',
        'child_under_6_months',
        'opinion_xyz_sick_from_vacc',
        'opinion_seas_sick_from_vacc',
        'household_adults',
        'behavioral_antiviral_meds',
        'behavioral_large_gatherings',
        'behavioral_outside_home',
        'marital_status',
        'behavioral_avoidance',
        'behavioral_face_mask',
        'income_poverty',
        'hhs_geo_region',
        'employment_status',
        'education',
        'census_msa',
    ]
)
data.head()

In [None]:
# Recompute the correlations on the reduced column set.
corr = data.corr()

# Draw the annotated heatmap, then enlarge the figure it was drawn on.
g = sns.heatmap(
    corr,
    annot=True,
    linewidths=1,
    square=True,
)
g.figure.set_size_inches(30, 25)

plt.show()

In [None]:
# Keep only the two target columns of the label frame.
Y_label = Y.loc[:, ['xyz_vaccine', 'seasonal_vaccine']]

In [None]:
# Strip the id and both targets, then append the targets again so that
# they end up as the final two columns of the frame.
data = data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
frames = [data, Y_label]
data = pd.concat(frames, axis=1)

data.head()

In [None]:
# The targets are the trailing columns of `data`; everything before them is a feature.
labels = ['xyz_vaccine', 'seasonal_vaccine']
features = data.columns[:-len(labels)]
print(features)

# Model

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffled split, and therefore the reported score, reproducible across runs.
X_train, X_eval, y_train, y_eval = train_test_split(
    data[features],
    data[labels],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Column names of the training split.
X_train.columns

In [None]:
# Display the labels of the training split.
y_train

In [None]:
def transform_test(Test_X):
    """Apply the training-side cleaning and feature engineering to a test frame.

    Steps, in order: drop the id and the three columns that were also removed
    from the training data, impute missing values (median for numeric columns,
    mode for object columns), label-encode the object columns, add the
    ``cleanliness`` and ``opinion`` sums, and drop the columns that are not
    used as model inputs.

    Parameters
    ----------
    Test_X : pandas.DataFrame
        Raw test features. The frame is not modified, so the function can be
        called repeatedly on the same input.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the model features.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``Test_X``.
    """
    # A non-mutating drop returns a new frame, so every later assignment stays
    # local to this function instead of altering the caller's data.
    data = Test_X.drop(
        columns=['respondent_id', 'health_insurance', 'employment_industry', 'employment_occupation']
    )

    # Captured before encoding, while the string columns still have object dtype.
    str_cols = data.select_dtypes(include='object').columns

    # NOTE(review): fill values and encodings are computed from this frame alone
    # rather than reused from the training data; the integer codes match the
    # training ones only if both frames contain the same category values — confirm.
    for col in data.columns:
        missing = data[col].isnull()
        if not missing.any():
            continue
        if data[col].dtypes != 'object':
            fill_value = data[col].median()
        else:
            # Largest value when several modes tie.
            fill_value = data[col].mode().max()
        data.loc[missing, col] = fill_value

    LE = LabelEncoder()
    for col in str_cols:
        data[col] = LE.fit_transform(data[col])

    data['cleanliness'] = data['behavioral_antiviral_meds'] + data['behavioral_avoidance'] + \
                          data['behavioral_face_mask'] + data['behavioral_wash_hands'] + \
                          data['behavioral_large_gatherings'] + data['behavioral_outside_home'] + \
                          data['behavioral_touch_face']
    data['opinion'] = data['opinion_xyz_vacc_effective'] + data['opinion_xyz_risk'] + \
                      data['opinion_xyz_sick_from_vacc'] + data['opinion_seas_vacc_effective'] + \
                      data['opinion_seas_risk'] + data['opinion_seas_sick_from_vacc']

    # Same column set that the training notebook cell removes.
    return data.drop(
        columns=[
            'race', 'child_under_6_months', 'opinion_xyz_sick_from_vacc',
            'opinion_seas_sick_from_vacc', 'household_adults', 'behavioral_antiviral_meds',
            'behavioral_large_gatherings', 'behavioral_outside_home', 'marital_status',
            'behavioral_avoidance', 'behavioral_face_mask', 'income_poverty',
            'hhs_geo_region', 'employment_status', 'education', 'census_msa',
        ]
    )

In [None]:
# Load the test features from CSV; the bare name on the last line displays the frame.
Test_X = pd.read_csv('test_set_features.csv')
Test_X

In [None]:
# Run the test features through transform_test, then count the missing values left per column.
Test_X = transform_test(Test_X)
Test_X.isnull().sum()

In [None]:
# Column names of the transformed test frame.
Test_X.columns

In [None]:
# One logistic regression per target column, wrapped so that both are fit together.
estimators = MultiOutputClassifier(LogisticRegression())

# Standardise the features before they reach the classifiers.
pipe = make_pipeline(
    StandardScaler(),
    estimators,
)

In [None]:
# Fit the scaler and the per-target classifiers on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Predicted class probabilities for the hold-out rows.
pred = pipe.predict_proba(X_eval)
pred

In [None]:
# Column 1 of each per-target probability array, keyed by target name and
# indexed like the hold-out labels.
target_names = ["xyz_vaccine", "seasonal_vaccine"]
positive_proba = {
    name: pred[position][:, 1]
    for position, name in enumerate(target_names)
}
y_preds = pd.DataFrame(positive_proba, index=y_eval.index)

print(y_preds.shape)
y_preds.head()

In [None]:
# ROC AUC of the predicted probabilities against the hold-out labels.
print(roc_auc_score(y_eval, y_preds))

In [None]:
# Select the columns by the training feature list so that the test frame is
# guaranteed to have the same columns, in the same order, as the data the
# pipeline was fitted on.
test_pred = pipe.predict_proba(Test_X[features])
test_pred

In [None]:
# Load submission_format.csv; a later cell overwrites its prediction columns.
submission_df = pd.read_csv('submission_format.csv')

In [None]:
# The training labels in this notebook are named xyz_vaccine, so write to that
# column when the submission file has it and fall back to h1n1_vaccine otherwise.
# Assigning to a name that is not already a column would silently add a new
# column and leave the existing one unfilled.
first_target = "xyz_vaccine" if "xyz_vaccine" in submission_df.columns else "h1n1_vaccine"

# Column 1 of each per-target probability array is written for each test row.
submission_df[first_target] = test_pred[0][:, 1]
submission_df["seasonal_vaccine"] = test_pred[1][:, 1]

submission_df.head()


In [None]:
# Write the filled submission frame to CSV without the index column.
submission_df.to_csv('my_submission.csv',index=False)