In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

# Load the datasets: labelled training features, their labels, and the
# unlabelled feature file that is scored for the submission.
X = pd.read_csv('training_set_features.csv')
y = pd.read_csv('training_set_labels.csv')
z = pd.read_csv('test_set_features.csv')

# Features and labels are paired purely by row position from here on, so make
# sure both files list the same respondents in the same order before the id
# column is discarded.
if not np.array_equal(X['respondent_id'].to_numpy(), y['respondent_id'].to_numpy()):
    raise ValueError(
        "training_set_features.csv and training_set_labels.csv are not aligned "
        "on respondent_id"
    )

# Extract and drop respondent_ids (rebinding instead of inplace=True so the
# step reads as a plain transformation of the loaded frames).
respondent_ids = X['respondent_id']
X = X.drop(columns='respondent_id')
y = y.drop(columns='respondent_id')

# Identify numerical and categorical columns.
# Numeric/boolean dtypes are selected explicitly and every other column is
# treated as categorical, so text columns are still one-hot encoded when pandas
# infers them as a dedicated string dtype rather than ``object``.
num_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

# Numeric features: fill gaps with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were not present at fit time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# One XGBoost classifier is fitted per label column via MultiOutputClassifier.
xgb_clf = MultiOutputClassifier(
    XGBClassifier(eval_metric='logloss', use_label_encoder=False)
)

# Full pipeline: preprocessing followed by the multi-output classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_clf),
])

# Hyper-parameter grid for the search. The "classifier__estimator__" prefix
# addresses the XGBClassifier nested inside the 'classifier' pipeline step.
param_grid = {
    'classifier__estimator__n_estimators': [100, 200],
    'classifier__estimator__learning_rate': [0.01, 0.1],
    'classifier__estimator__max_depth': [3, 5],
    'classifier__estimator__scale_pos_weight': [1, 3, 5],
}

# Hold out 20% of the labelled rows for a final check (fixed seed so the split
# is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5-fold cross-validated grid search over param_grid, scored by ROC AUC and
# parallelised with n_jobs=-1.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only; the hold-out rows stay unseen.
grid_search.fit(X_train, y_train)

# Pipeline carrying the best-scoring parameter combination.
best_model = grid_search.best_estimator_

# predict_proba on the multi-output model gives one probability array per
# label for the hold-out rows.
y_pred_prob = best_model.predict_proba(X_test)

# Take column index 1 of every per-label array and place them side by side so
# the score matrix has one column per label, like y_test.
roc_auc = roc_auc_score(
    y_test,
    np.column_stack([label_prob[:, 1] for label_prob in y_pred_prob]),
)
print(f"ROC AUC Score: {roc_auc}")

# Extract respondent IDs from test set features
respondent_ids_test = z['respondent_id']

# Drop respondent_ids from test set features so the columns match what the
# pipeline was fitted on (rebinding instead of inplace=True).
z = z.drop(columns='respondent_id')

# Predict probabilities using the best model on the test set features
y_test_pred_prob = best_model.predict_proba(z)

# predict_proba returns one array per label, in the order of the label columns
# the model was trained on. Look each label up by name instead of trusting
# fixed positions, so a reordered labels file cannot silently swap the two
# vaccine columns (a missing label raises ValueError here).
label_order = list(y_train.columns)

# Prepare submission with probabilities rounded to one decimal place.
# NOTE(review): rounding to one decimal creates many tied scores, which can
# lower a ranking metric such as ROC AUC — confirm the submission format really
# requires it.
submission = pd.DataFrame({
    'respondent_id': respondent_ids_test,
    # Column index 1 of each per-label array is the value that gets submitted.
    'xyz_vaccine': np.round(
        y_test_pred_prob[label_order.index('xyz_vaccine')][:, 1], 1
    ),
    'seasonal_vaccine': np.round(
        y_test_pred_prob[label_order.index('seasonal_vaccine')][:, 1], 1
    ),
})

# Save to CSV
submission.to_csv('submission_Xgboost_fnl.csv', index=False)


ROC AUC Score: 0.8507081915014071


In [7]:
# Report the hyper-parameter combination selected by the grid search.
print("Best parameters found by GridSearchCV:", grid_search.best_params_, sep="\n")

Best parameters found by GridSearchCV:
{'classifier__estimator__learning_rate': 0.1, 'classifier__estimator__max_depth': 3, 'classifier__estimator__n_estimators': 200, 'classifier__estimator__scale_pos_weight': 1}


In [6]:
# Inspect the raw predictions for the unlabelled test features: a list holding
# one two-column probability array per label; the submission cell takes column
# index 1 of each array.
y_test_pred_prob

[array([[0.9477094 , 0.05229063],
        [0.9757871 , 0.02421291],
        [0.5236005 , 0.47639948],
        ...,
        [0.89288515, 0.10711483],
        [0.95115113, 0.04884888],
        [0.4975807 , 0.5024193 ]], dtype=float32),
 array([[0.73893166, 0.26106837],
        [0.9674314 , 0.03256859],
        [0.2898041 , 0.7101959 ],
        ...,
        [0.8198638 , 0.1801362 ],
        [0.6463791 , 0.35362086],
        [0.38615024, 0.61384976]], dtype=float32)]