# In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the labelled training data (features and labels live in separate files)
# and the unlabelled test features.
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Pair every feature row with its labels. Joining on the respondent key is
# safer than a positional concat: it cannot silently misalign rows when the
# two files are ordered differently, and it keeps a single `respondent_id`
# column instead of two. `validate` makes pandas raise if the key is
# duplicated on either side.
if 'respondent_id' in train_labels.columns:
    data = train_features.merge(
        train_labels,
        on='respondent_id',
        how='inner',
        validate='one_to_one',
    )
else:
    # No key column in the labels file: fall back to aligning by row index.
    data = pd.concat([train_features, train_labels], axis=1)

# The two vaccine columns are the labels; every feature column except the
# respondent identifier is used as a predictor.
target = ['xyz_vaccine', 'seasonal_vaccine']
features = train_features.drop(columns=['respondent_id']).columns

X = data.loc[:, features]
y = data.loc[:, target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# Columns routed to the numerical transformer.
numerical_features = [
    'xyz_concern',
    'xyz_knowledge',
    'opinion_xyz_vacc_effective',
    'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]

# Every remaining predictor is treated as categorical (original column order
# is preserved).
numeric_lookup = set(numerical_features)
categorical_features = [name for name in features if name not in numeric_lookup]


# Numerical columns: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each sub-pipeline to its own group of columns.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


# A seeded random forest wrapped in MultiOutputClassifier so that both target
# columns are handled by one estimator; n_jobs=-1 requests all available cores.
base_model = RandomForestClassifier(random_state=42)
model = MultiOutputClassifier(estimator=base_model, n_jobs=-1)

# Chain preprocessing and the classifier so both are fitted on the training
# split only and re-applied consistently at prediction time.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

pipeline.fit(X_train, y_train)


# Score the held-out split. predict_proba yields one probability array per
# target; column 1 of each array is taken as the score for that target.
y_pred_prob = pipeline.predict_proba(X_test)

xyz_vaccine_prob, seasonal_vaccine_prob = (
    per_target[:, 1] for per_target in y_pred_prob
)

# ROC AUC is computed per target and then averaged into one summary figure.
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], xyz_vaccine_prob)
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], seasonal_vaccine_prob)
mean_roc_auc = sum((roc_auc_xyz, roc_auc_seasonal)) / 2

print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)


# Predict on the unlabelled test set with the fitted pipeline and keep
# column 1 of each per-target probability array.
test_pred_prob = pipeline.predict_proba(test_features)

test_xyz_vaccine_prob, test_seasonal_vaccine_prob = (
    per_target[:, 1] for per_target in test_pred_prob
)

# One row per test respondent: identifier plus the two predicted probabilities.
results = test_features[['respondent_id']].assign(
    xyz_vaccine_prob=test_xyz_vaccine_prob,
    seasonal_vaccine_prob=test_seasonal_vaccine_prob,
)

# Write the predictions without the DataFrame index column.
results.to_csv('vaccine_predictions.csv', index=False)




# Output of the cell above:
# ROC AUC for xyz_vaccine: 0.8272415937607678
# ROC AUC for seasonal_vaccine: 0.8509937634784637
# Mean ROC AUC: 0.8391176786196157
