In [133]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [134]:

# Load the four competition CSV files from the working directory.
#
# Column notes carried over from the original annotations:
#   training_set_features.csv / test_set_features.csv:
#       respondent_id, xyz_concern, xyz_knowledge, behavioral_antiviral_meds,
#       behavioral_avoidance, behavioral_face_mask, behavioral_wash_hands,
#       behavioral_large_gatherings, behavioral_outside_home, behavioral_touch_face,
#       doctor_recc_xyz, doctor_recc_seasonal, chronic_med_condition,
#       child_under_6_months, health_worker, health_insurance,
#       opinion_xyz_vacc_effective, opinion_xyz_risk, opinion_xyz_sick_from_vacc,
#       opinion_seas_vacc_effective, opinion_seas_risk, opinion_seas_sick_from_vacc,
#       age_group, education, race, sex, income_poverty, marital_status, rent_or_own,
#       employment_status, hhs_geo_region, census_msa, household_adults,
#       household_children, employment_industry, employment_occupation
#   training_set_labels.csv: respondent_id, xyz_vaccine, seasonal_vaccine
#   submission_format.csv:   respondent_id, h1n1_vaccine, seasonal_vaccine
(train_data_features,
 train_data_labels,
 test_data_features,
 submission) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

In [135]:
# Print the first 5 rows of the training features
head_rows = train_data_features.head()
print(head_rows)

# Print the (rows, columns) shape of the training features
n_rows_n_cols = train_data_features.shape
print(n_rows_n_cols)

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [136]:
# Drop the respondent_id column so it is not picked up as a numeric feature below.
# errors='ignore' keeps this cell re-runnable: each frame is reassigned to its own
# result, so without it a second run raises KeyError (the column is already gone).
train_data_features = train_data_features.drop(columns=['respondent_id'], errors='ignore')
train_data_labels = train_data_labels.drop(columns=['respondent_id'], errors='ignore')
test_data_features = test_data_features.drop(columns=['respondent_id'], errors='ignore')

# Object columns with this many or more distinct values are not one-hot encoded.
MAX_ONEHOT_CARDINALITY = 10

# Identify numerical and categorical columns.
# Any column that lands in neither list (e.g. an object column at or above the
# cardinality limit) is discarded by the ColumnTransformer below, because its
# default is remainder='drop'.
numerical_cols = [
    cname for cname in train_data_features.columns
    if train_data_features[cname].dtype in ['int64', 'float64']
]
categorical_cols = [
    cname for cname in train_data_features.columns
    if train_data_features[cname].nunique() < MAX_ONEHOT_CARDINALITY
    and train_data_features[cname].dtype == "object"
]

# Preprocessing for numerical data: impute missing values with the column mean,
# then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: impute missing values with the most frequent
# category, then one-hot encode. handle_unknown='ignore' means a category seen only
# at transform time does not raise.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps (columns are selected by name)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocess the features: fit on the training features only, then apply the same
# fitted transformation to the test features.
# NOTE(review): the preprocessor is fit on every labelled row. If part of X_train is
# later held out for evaluation, the imputation/scaling statistics have already seen
# those rows, so hold-out scores are slightly optimistic.
X_train = preprocessor.fit_transform(train_data_features)
X_test = preprocessor.transform(test_data_features)


In [137]:
# Model Selection and Training
# Rebuild the full labelled feature matrix from the already-fitted preprocessor
# instead of splitting X_train in place. Splitting X_train into itself makes the cell
# fail on re-run: the shortened X_train no longer matches the length of
# train_data_labels and train_test_split raises an inconsistent-length error.
X_labelled = preprocessor.transform(train_data_features)

# NOTE: from here on X_test / y_test are a 20% hold-out of the *training* data.
# X_test no longer refers to the transformed test_set_features matrix; use
# preprocessor.transform(test_data_features) when that one is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X_labelled, train_data_labels, test_size=0.2, random_state=42
)

# Train the model: MultiOutputClassifier fits one LogisticRegression per label column.
# (Fitted once; a second identical fit call added nothing.)
model = MultiOutputClassifier(LogisticRegression())
model.fit(X_train, y_train)

# Evaluate the model.
# For a multi-output classifier this score counts a row as correct only when every
# label in that row is predicted correctly, so it is stricter than per-label accuracy.
score = model.score(X_test, y_test)
print('Accuracy:', score)

# Make predictions: predict_proba returns one (n_samples, n_classes) array per label
y_pred_proba = model.predict_proba(X_test)

# Prepare the predictions dataframe, keeping column 1 of each array
# (presumably the probability of class 1 - confirm against model.classes_).
predictions_df = pd.DataFrame({
    'h1n1_vaccine': y_pred_proba[0][:, 1],
    'seasonal_vaccine': y_pred_proba[1][:, 1]
})


Accuracy: 0.6765256458255335


In [138]:
# ROC AUC on the hold-out split: computed per label column from the predicted
# probabilities, then macro-averaged (the default averaging, spelled out here).
roc_auc_score(y_true=y_test, y_score=predictions_df, average='macro')

0.8420105146862404

In [139]:
# Hyperparameter Tuning
# Candidate settings for the LogisticRegression wrapped inside the
# MultiOutputClassifier; the "estimator__" prefix routes each value to it.
param_grid = dict(
    estimator__C=[0.1, 1, 10],
    estimator__solver=['lbfgs', 'liblinear'],
)

# Grid Search: every combination is evaluated with 5-fold cross-validation on the
# training split and ranked by ROC AUC.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best parameters, followed by the best mean cross-validated score
best_params, best_cv_score = grid_search.best_params_, grid_search.best_score_
print(best_params)
print(best_cv_score)

{'estimator__C': 0.1, 'estimator__solver': 'lbfgs'}
0.8391443318916452


In [140]:
# Train the final model.
# With GridSearchCV's default refit=True, best_estimator_ has already been refit on
# the data passed to grid_search.fit using the best parameters, so no extra fit call
# is needed here.
final_model = grid_search.best_estimator_

# Make predictions on the real competition test set.
# X_test cannot be used for this: the model-selection cell reassigned it to a hold-out
# slice of the training data. Transform the test features again with the fitted
# preprocessor (it selects columns by name, so the result matches what the model was
# trained on).
X_submission = preprocessor.transform(test_data_features)
final_predictions_proba = final_model.predict_proba(X_submission)

# Respondent ids are read from the test features file itself so that every id is
# guaranteed to sit on the same row as its prediction (respondent_id was dropped from
# test_data_features during preprocessing).
test_respondent_ids = pd.read_csv(
    'test_set_features.csv', usecols=['respondent_id']
)['respondent_id']

# Prepare the submission file.
# Target column names are taken from the loaded submission format so the header
# matches what submission_format.csv specifies (annotated at load time as
# respondent_id, h1n1_vaccine, seasonal_vaccine). Prediction order follows the label
# columns the model was trained on - assumed to be the same order; TODO confirm.
first_target_col, second_target_col = submission.columns[1:3]
submission = pd.DataFrame({
    'respondent_id': test_respondent_ids,
    first_target_col: final_predictions_proba[0][:, 1],
    second_target_col: final_predictions_proba[1][:, 1]
})
submission.to_csv('submission.csv', index=False)