In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
# Step 1: Load training data
train_features = pd.read_csv('/Users/divyanshsinghsolanki/Downloads/dataset and all 2/training_set_features.csv')
train_labels = pd.read_csv('/Users/divyanshsinghsolanki/Downloads/dataset and all 2/training_set_labels.csv')

# Step 2: Preprocessing pipeline
# Define preprocessing steps for numerical and categorical columns
numerical_cols = train_features.select_dtypes(include=['number']).columns.tolist()
categorical_cols = train_features.select_dtypes(include=['object']).columns.tolist()

# Create preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Step 3: Define models and train them
models = {}
for target_col in ['xyz_vaccine', 'seasonal_vaccine']:
    # Split data into train and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(train_features, train_labels[target_col], random_state=0)

    # Create pipeline with preprocessing and model
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=0))  # Example model: Random Forest
    ])

    # Train the model
    model_pipeline.fit(X_train, y_train)

    # Evaluate on validation set
    y_pred_valid = model_pipeline.predict_proba(X_valid)[:, 1]  # Predict probabilities for positive class
    roc_auc = roc_auc_score(y_valid, y_pred_valid)
    print(f'ROC AUC for {target_col}: {roc_auc}')

    # Save the model
    models[target_col] = model_pipeline

# Step 4: Generate predictions for test set
test_data = pd.read_csv('/Users/divyanshsinghsolanki/Downloads/dataset and all 2/test_set_features.csv')

# Generate predictions for each model
predictions = {}
for target_col, model in models.items():
    probabilities = model.predict_proba(test_data)[:, 1]  # Predict probabilities for positive class
    predictions[target_col] = probabilities

# Step 5: Create submission file
submission_df = pd.DataFrame({
    'respondent_id': test_data['respondent_id'],
    'xyz_vaccine': predictions['xyz_vaccine'],
    'seasonal_vaccine': predictions['seasonal_vaccine']
})

submission_df.to_csv('test_set_labels.csv', index=False)

ROC AUC for xyz_vaccine: 0.8244443240514759
ROC AUC for seasonal_vaccine: 0.8550034190902512
