In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score


In [11]:
train_features = pd.read_csv('training_set_features.csv')
test_features = pd.read_csv('test_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')

In [12]:
train_data = pd.merge(train_features, train_labels, on='respondent_id')

feature_columns = train_features.columns
target_columns = ['xyz_vaccine', 'seasonal_vaccine']

In [13]:
X = train_data[feature_columns]
y = train_data[target_columns]

categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

In [14]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])


In [16]:
model = MultiOutputClassifier(LogisticRegression(max_iter=1000), n_jobs=-1)

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

clf.fit(X_train, y_train)

In [17]:
y_val_pred = clf.predict_proba(X_val)
y_val_pred = np.column_stack([pred[:, 1] for pred in y_val_pred])

roc_auc_lr = roc_auc_score(y_val, y_val_pred, average="macro")
print(f'Validation ROC AUC Score for Logistic Regression: {roc_auc_lr}')

test_probs = clf.predict_proba(test_features)
test_probs = np.column_stack([prob[:, 1] for prob in test_probs])


Validation ROC AUC Score for Logistic Regression: 0.8454018690727083


In [10]:
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_probs[:, 0],
    'seasonal_vaccine': test_probs[:, 1]
})

submission.to_csv('submission_logreg.csv', index=False)