In [26]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


In [27]:

# Importing the dataset
train_data_features = pd.read_csv('training_set_features.csv')
train_data_labels = pd.read_csv('training_set_labels.csv')
test_data_features = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')


In [28]:

# Print the first 5 rows of the dataset
print(train_data_features.head())


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [29]:

# Checking the shape of the dataset
print(train_data_features.shape)


(26707, 36)


In [30]:

# Store respondent_ids separately
train_respondent_ids = train_data_features['respondent_id']
test_respondent_ids = test_data_features['respondent_id']

# Drop the respondent_id column
train_data_features = train_data_features.drop(columns=['respondent_id'])
train_data_labels = train_data_labels.drop(columns=['respondent_id'])
test_data_features = test_data_features.drop(columns=['respondent_id'])


In [31]:

# Identify numerical and categorical columns
numerical_cols = [cname for cname in train_data_features.columns if train_data_features[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in train_data_features.columns if train_data_features[cname].dtype == "object"]

# Preprocessing for numerical data: impute missing values and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: impute missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [32]:

# Preprocess the features
X_train = preprocessor.fit_transform(train_data_features)
X_test = preprocessor.transform(test_data_features)


In [33]:

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train, train_data_labels, test_size=0.2, random_state=42)

# Model Selection and Training
model = MultiOutputClassifier(LogisticRegression())
model.fit(X_train, y_train)


In [34]:

# Evaluate the model
y_val_pred_proba = model.predict_proba(X_val)

# Prepare the predictions dataframe
predictions_df = pd.DataFrame({
    'xyz_vaccine': y_val_pred_proba[0][:, 1],
    'seasonal_vaccine': y_val_pred_proba[1][:, 1]
})


In [35]:

# Calculate ROC AUC score
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], predictions_df['xyz_vaccine'])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], predictions_df['seasonal_vaccine'])
print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')


ROC AUC for xyz_vaccine: 0.8344622191967325
ROC AUC for seasonal_vaccine: 0.8564267812388112


In [36]:

# Hyperparameter Tuning
param_grid = {
    'estimator__C': [0.1, 1, 10],
    'estimator__solver': ['lbfgs', 'liblinear']
}

# Grid Search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)


In [37]:

# Best parameters
print(grid_search.best_params_)


{'estimator__C': 0.1, 'estimator__solver': 'liblinear'}


In [38]:

# Best score
print(grid_search.best_score_)


0.8429928427601675


In [39]:

# Train the final model
final_model = grid_search.best_estimator_
final_model.fit(X_train, y_train)

# Make predictions on the test set
final_predictions_proba = final_model.predict_proba(X_test)


In [40]:

# Prepare the submission file
submission = pd.DataFrame({
    'respondent_id': test_respondent_ids,
    'xyz_vaccine': final_predictions_proba[0][:, 1],
    'seasonal_vaccine': final_predictions_proba[1][:, 1]
})
submission.to_csv('submission.csv', index=False)
