In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [4]:
training_set = pd.read_csv('training_set_features.csv')
training_set_labels = pd.read_csv('training_set_labels.csv')
test_set = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')

In [5]:
training_data = training_set.merge(training_set_labels, on='respondent_id')

In [6]:
X = training_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_xyz = training_data['xyz_vaccine']
y_seasonal = training_data['seasonal_vaccine']

In [7]:
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test_imputed = pd.DataFrame(imputer.transform(test_set.drop(columns=['respondent_id'])), columns=X.columns)

In [8]:
categorical_cols = X_imputed.select_dtypes(include=['object']).columns

In [9]:
encoder = OneHotEncoder(sparse=False, drop='first')
encoded_cols = pd.DataFrame(encoder.fit_transform(X_imputed[categorical_cols]))
encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)



In [10]:
X_encoded = X_imputed.drop(columns=categorical_cols).reset_index(drop=True)
X_encoded = pd.concat([X_encoded, encoded_cols], axis=1)

In [11]:
X_train, X_val, y_train_xyz, y_val_xyz = train_test_split(X_encoded, y_xyz, test_size=0.2, random_state=0)
_, _, y_train_seasonal, y_val_seasonal = train_test_split(X_encoded, y_seasonal, test_size=0.2, random_state=0)

In [12]:
model_xyz = RandomForestClassifier(random_state=0)
model_seasonal = RandomForestClassifier(random_state=0)

In [13]:
model_xyz.fit(X_train, y_train_xyz)
model_seasonal.fit(X_train, y_train_seasonal)

In [14]:
y_val_pred_xyz = model_xyz.predict_proba(X_val)[:, 1]
y_val_pred_seasonal = model_seasonal.predict_proba(X_val)[:, 1]

In [15]:
roc_auc_xyz = roc_auc_score(y_val_xyz, y_val_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_val_seasonal, y_val_pred_seasonal)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

In [16]:
print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc}")

ROC AUC for xyz_vaccine: 0.8174066425744279
ROC AUC for seasonal_vaccine: 0.852225051323828
Mean ROC AUC: 0.8348158469491279


In [17]:
test_encoded_cols = pd.DataFrame(encoder.transform(test_imputed[categorical_cols]))
test_encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
X_test_encoded = test_imputed.drop(columns=categorical_cols).reset_index(drop=True)
X_test_encoded = pd.concat([X_test_encoded, test_encoded_cols], axis=1)

In [18]:
test_pred_xyz = model_xyz.predict_proba(X_test_encoded)[:, 1]
test_pred_seasonal = model_seasonal.predict_proba(X_test_encoded)[:, 1]

In [19]:
submission = pd.DataFrame({
    'respondent_id': test_set['respondent_id'],
    'xyz_vaccine': test_pred_xyz,
    'seasonal_vaccine': test_pred_seasonal
})

In [20]:
submission.to_csv('submission.csv', index=False)