In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier

In [2]:
# Read the three input CSVs (relative paths, so they are resolved against the
# current working directory): training labels, training features and the
# unlabeled test features, in that order.
train_labels, train_features, test_features = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_labels.csv',
        'training_set_features.csv',
        'test_set_features.csv',
    )
)

In [3]:
# Attach the label columns to each respondent's feature row by joining the two
# training frames on their shared 'respondent_id' key (pandas' default inner join).
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [4]:
# Targets: the two label columns, kept together as a two-column frame.
y_train = train_data.loc[:, ['xyz_vaccine', 'seasonal_vaccine']]
# Predictors: everything except the respondent identifier and the two targets.
X_train = train_data.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)

In [5]:
# Test predictors: remove the identifier column so it is not used as a feature.
# test_features itself is left untouched and still carries 'respondent_id'.
X_test = test_features.drop('respondent_id', axis=1)

In [6]:
# Partition the predictor columns by dtype so each group can be routed to its
# own preprocessing pipeline.
#
# Any column that lands in neither list is not passed to a transformer, and a
# ColumnTransformer built with the default remainder='drop' discards such
# columns without warning. The selectors are therefore kept broad:
#   * 'object' plus 'category' for the categorical group;
#   * 'number' (all numeric dtypes, not just int64/float64) for the numeric
#     group, so e.g. int32/float32 columns are not silently lost.
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
numerical_features = X_train.select_dtypes(include='number').columns

In [7]:
# Numeric preprocessing, applied in order:
#   1. 'imputer' - replace missing values with the column median;
#   2. 'scaler'  - standardize the imputed columns with StandardScaler.
median_imputer = SimpleImputer(strategy='median')
standard_scaler = StandardScaler()
numerical_pipeline = Pipeline(steps=[('imputer', median_imputer), ('scaler', standard_scaler)])

In [8]:
# Categorical preprocessing, applied in order:
#   1. 'imputer' - replace missing values with the most frequent value;
#   2. 'onehot'  - one-hot encode; handle_unknown='ignore' keeps transform from
#                  raising on categories that were not seen during fit.
mode_imputer = SimpleImputer(strategy='most_frequent')
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_pipeline = Pipeline(steps=[('imputer', mode_imputer), ('onehot', onehot_encoder)])

In [9]:
# Route each column group through its matching sub-pipeline. Each entry is
# (name, transformer, columns); the transformed groups are concatenated by the
# ColumnTransformer. No `remainder` is given, so the library default applies to
# any column not listed in either group.
column_routes = [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [10]:
# End-to-end model: preprocessing followed by a random forest wrapped in
# MultiOutputClassifier, which lets the single-target forest be fitted against
# the two-column target frame. random_state is fixed for reproducible forests.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
per_target_forest = MultiOutputClassifier(forest)
model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', per_target_forest)])

In [11]:
# Hold out 20% of the labelled rows for validation; the fixed random_state makes
# the split repeatable across runs.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

# Fit on the remaining 80%. Left as the cell's last expression so the notebook
# still displays the fitted pipeline.
model.fit(X_train_split, y_train_split)

In [12]:
# For the two-target model, predict_proba returns one probability array per
# target, in the column order of y_train ('xyz_vaccine', then 'seasonal_vaccine').
xyz_val_proba, seasonal_val_proba = model.predict_proba(X_val_split)

# Keep only column 1 of each array (presumably the positive class, assuming 0/1
# labels - confirm against the label file) and place the two vectors side by
# side as an (n_rows, 2) matrix.
y_val_pred_proba = np.column_stack([xyz_val_proba[:, 1], seasonal_val_proba[:, 1]])

In [13]:
# Score each target separately: column 0 of the probability matrix is matched
# with 'xyz_vaccine' and column 1 with 'seasonal_vaccine'.
roc_auc_1, roc_auc_2 = (
    roc_auc_score(y_val_split[target], y_val_pred_proba[:, position])
    for position, target in enumerate(['xyz_vaccine', 'seasonal_vaccine'])
)

# Headline metric: the unweighted mean of the two per-target ROC AUC scores.
mean_roc_auc = np.mean([roc_auc_1, roc_auc_2])

print(f'Mean ROC AUC on validation set: {mean_roc_auc}')

Mean ROC AUC on validation set: 0.8345856455585976


In [14]:
# Refit the same pipeline object on the full labelled set (this replaces the fit
# on the 80% split), then predict on the test predictors. Pipeline.fit returns
# the pipeline itself, so the two calls can be chained. The result holds one
# probability array per target.
y_test_pred_proba = model.fit(X_train, y_train).predict_proba(X_test)

In [15]:
# Collapse the per-target probability arrays into one (n_rows, 2) matrix holding
# column 1 of each target's array (first target, then second).
# NOTE: this rebinds y_test_pred_proba from the per-target sequence to the
# matrix, so the cell cannot be re-run without first re-running the prediction.
y_test_pred_proba = np.column_stack([y_test_pred_proba[0][:, 1], y_test_pred_proba[1][:, 1]])

In [18]:
# Assemble the submission table: one row per test respondent, with one
# predicted-probability column per target.
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_test_pred_proba[:, 0],
    'seasonal_vaccine': y_test_pred_proba[:, 1]
})

# Write the file before touching anything Colab-specific, so the CSV is produced
# in every environment.
submission.to_csv('submission.csv', index=False)

# google.colab is only importable inside Google Colab. Guard the import so the
# cell still completes elsewhere (local Jupyter, CI, ...) instead of raising
# before the CSV is written; the browser download is only attempted in Colab.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; 'submission.csv' was written to the working directory.")
else:
    files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>