In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Load the feature and label datasets
df_features = pd.read_csv('training_set_features.csv')
df_labels = pd.read_csv('training_set_labels.csv')

# Merge datasets on respondent_id
df = pd.merge(df_features, df_labels, on='respondent_id')

# Separate features and target variables
X = df.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = df[['xyz_vaccine', 'seasonal_vaccine']]

# Fill missing values in categorical columns with 'missing'
categorical_cols = X.select_dtypes(include=['object']).columns
X[categorical_cols] = X[categorical_cols].fillna('missing')

# Fill missing values in numerical columns with the mean of the column
numerical_cols = X.select_dtypes(exclude=['object']).columns
X[numerical_cols] = X[numerical_cols].fillna(X[numerical_cols].mean())

# Ensure all numerical columns are of type float
X[numerical_cols] = X[numerical_cols].astype(float)

# Define preprocessing for numerical data
numerical_transformer = StandardScaler()

# Define preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a preprocessing and modeling pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred_prob = model.predict_proba(X_test)

# Extract probabilities for each target
y_pred_prob_xyz = np.array([prob[:, 1] for prob in y_pred_prob]).T[:, 0]
y_pred_prob_seasonal = np.array([prob[:, 1] for prob in y_pred_prob]).T[:, 1]

# Calculate ROC AUC scores for each target
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], y_pred_prob_xyz)
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], y_pred_prob_seasonal)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC Score for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC Score for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC Score: {mean_roc_auc}')

# Load the new test set features
new_test_set_features = pd.read_csv('test_set_features.csv')

# Preprocess the new test set features
categorical_cols_new = new_test_set_features.select_dtypes(include=['object']).columns
new_test_set_features[categorical_cols_new] = new_test_set_features[categorical_cols_new].fillna('missing')

numerical_cols_new = new_test_set_features.select_dtypes(exclude=['object']).columns
new_test_set_features[numerical_cols_new] = new_test_set_features[numerical_cols_new].fillna(new_test_set_features[numerical_cols_new].mean())
new_test_set_features[numerical_cols_new] = new_test_set_features[numerical_cols_new].astype(float)

# Use the trained model to predict probabilities for the new test set
y_pred_prob_new = model.predict_proba(new_test_set_features)

# Extract probabilities for each target
y_pred_prob_new_xyz = np.array([prob[:, 1] for prob in y_pred_prob_new]).T[:, 0]
y_pred_prob_new_seasonal = np.array([prob[:, 1] for prob in y_pred_prob_new]).T[:, 1]

# Prepare the submission DataFrame
submission_new = pd.DataFrame({
    'respondent_id': new_test_set_features['respondent_id'],
    'xyz_vaccine': y_pred_prob_new_xyz,
    'seasonal_vaccine': y_pred_prob_new_seasonal
})

# Save the submission file
submission_new.to_csv('new_submission.csv', index=False)


ROC AUC Score for xyz_vaccine: 0.8603585205355118
ROC AUC Score for seasonal_vaccine: 0.8559791279539014
Mean ROC AUC Score: 0.8581688242447065
