In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [4]:
# Load datasets
# Every input CSV lives in the same folder. The path is relative, so it is
# resolved against the notebook's working directory; change DATA_DIR here
# (one place) if the notebook is launched from somewhere else.
DATA_DIR = Path('Downloads/dataset and all')

train_features = pd.read_csv(DATA_DIR / 'training_set_features.csv')
train_labels = pd.read_csv(DATA_DIR / 'training_set_labels.csv')
test_features = pd.read_csv(DATA_DIR / 'test_set_features.csv')
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv')

In [5]:
# Merge the features and labels for training.
# Rows are paired on respondent_id with an inner join (the pandas default,
# spelled out here). validate='one_to_one' makes pandas raise a MergeError if
# respondent_id is duplicated in either frame, instead of silently multiplying
# rows in the training set.
train_data = pd.merge(
    train_features,
    train_labels,
    on='respondent_id',
    how='inner',
    validate='one_to_one',
)

In [6]:
# Separate the model inputs from the two label columns.
# respondent_id is only a row key, so it is removed from both feature frames.
target_columns = ['xyz_vaccine', 'seasonal_vaccine']

y_train = train_data[target_columns]
X_train = train_data.drop(columns=['respondent_id', *target_columns])
X_test = test_features.drop(columns=['respondent_id'])

In [7]:
# Identify categorical and numerical columns.
# The two groups are complements of each other: anything pandas considers
# numeric (any int/float width, including nullable numeric dtypes) is numeric,
# and every remaining column is treated as categorical. Selecting by exact
# dtype names ('int64', 'float64', 'object') would leave columns of any other
# dtype in neither group, and the ColumnTransformer built from these lists
# does not set `remainder`, so such columns would be dropped without warning.
numerical_cols = X_train.select_dtypes(include='number').columns
categorical_cols = X_train.select_dtypes(exclude='number').columns

In [8]:
# Preprocessing definition (nothing is fitted in this cell).

# Numeric branch: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [9]:
# Full pipeline: preprocessing followed by the classifier.
# MultiOutputClassifier wraps a single random-forest template so that both
# label columns can be predicted from one fit() call; the fixed random_state
# keeps the forests reproducible between runs.
forest_template = RandomForestClassifier(random_state=42)
model = MultiOutputClassifier(forest_template)

pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', model),
]
clf = Pipeline(steps=pipeline_steps)

# Hold out 20% of the labelled rows for validation (seeded for repeatability).
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing + classifier on the remaining 80% only.
clf.fit(X_train_split, y_train_split)

# Class probabilities for the held-out rows; the result is indexed per target
# first (y_val_pred[0], y_val_pred[1]) by the scoring code that consumes it.
y_val_pred = clf.predict_proba(X_val_split)

In [10]:
# Validation ROC AUC, one score per target plus their average.
# y_val_pred holds one probability array per target, in the column order of
# the label frame; column index 1 is taken as the score for the positive
# class (assumes the labels are encoded 0/1 - confirm against the label file).
xyz_scores = y_val_pred[0][:, 1]
seasonal_scores = y_val_pred[1][:, 1]

roc_auc_xyz = roc_auc_score(y_val_split['xyz_vaccine'], xyz_scores)
roc_auc_seasonal = roc_auc_score(y_val_split['seasonal_vaccine'], seasonal_scores)

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {(roc_auc_xyz + roc_auc_seasonal) / 2}")

# Predict on the test set.
# `clf` was fitted on only the 80% training split so that the validation
# metrics are honest. For the submission, an unfitted copy of the same
# pipeline is trained on ALL labelled rows so the held-out 20% is not wasted.
# clone() leaves `clf` itself untouched, so the validation results above still
# describe the model they were computed from.
final_clf = clone(clf)
final_clf.fit(X_train, y_train)

# One probability array per target, in the column order of y_train.
y_test_pred = final_clf.predict_proba(X_test)

# Prepare the submission file: respondent id plus the column-1 probability
# for each target (taken as the positive class; assumes 0/1 labels).
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_test_pred[0][:, 1],
    'seasonal_vaccine': y_test_pred[1][:, 1]
})

# Save the submission file (index=False keeps the row index out of the CSV).
submission.to_csv('Downloads/dataset and all/submission.csv', index=False)

ROC AUC for xyz_vaccine: 0.864173999277244
ROC AUC for seasonal_vaccine: 0.8570519011081396
Mean ROC AUC: 0.8606129501926918
