In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier

In [47]:
# Read the two training CSVs from the working directory: features first, then labels
features, labels = (
    pd.read_csv(csv_name)
    for csv_name in ('training_set_features.csv', 'training_set_labels.csv')
)

In [48]:
# Show which columns each of the two training tables provides
print(f"Features column names: {features.columns}")
print(f"Labels column names: {labels.columns}")

Features column names: Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
Labels column names: Index(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object')


In [49]:
# Build the modelling table: validate the targets, join features to labels,
# impute missing predictor values, then split into inputs (X) and targets (y).

# Ensure the target columns exist in the labels dataset
if 'xyz_vaccine' not in labels.columns or 'seasonal_vaccine' not in labels.columns:
    raise ValueError("Target columns 'xyz_vaccine' and 'seasonal_vaccine' not found in the labels dataset.")

# Merge features and labels on 'respondent_id' (pd.merge defaults to an inner join)
data = pd.merge(features, labels, on='respondent_id')

# Only predictor columns are imputed. The id and the targets are deliberately
# left untouched so that a missing label is never replaced by a made-up value.
non_feature_columns = ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']
feature_frame = data.drop(columns=non_feature_columns)

# Simple imputation strategy: median for numeric columns, mode for every other column.
numeric_columns = feature_frame.select_dtypes(include=['number']).columns
fill_values = {}
for col in feature_frame.columns:
    if col in numeric_columns:
        fill_value = feature_frame[col].median()
    else:
        modes = feature_frame[col].mode()
        # mode() is empty when the column has no non-null values at all
        fill_value = modes.iloc[0] if not modes.empty else None
    if pd.notna(fill_value):
        fill_values[col] = fill_value

# Assign the result instead of calling `data[col].fillna(..., inplace=True)`:
# the chained inplace form acts on a temporary Series, which pandas 2.x warns
# about and which leaves `data` unchanged when Copy-on-Write is active.
data = data.fillna(value=fill_values)

# Separate features and target variables
X = data.drop(columns=non_feature_columns)
y = data[['xyz_vaccine', 'seasonal_vaccine']]

In [50]:
# Define the column groups and build the preprocessing step.
#
# `numerical_features` and `categorical_features` were not defined anywhere in
# this notebook, so this cell raised NameError on a fresh kernel. They are now
# derived from the dtypes of X: numeric columns are scaled, every other column
# is one-hot encoded.
# NOTE(review): the original lists presumably came from a cell that was removed;
# if they were a hand-picked subset, the recorded AUC values further down will
# not match a rerun that uses every column - confirm the intended feature set.
numerical_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['number']).columns.tolist()

# Check if the specified categorical features are present in the dataset
# (kept as a guard in case the lists above are replaced by explicit names)
for feature in categorical_features:
    if feature not in X.columns:
        raise ValueError(f"Categorical feature '{feature}' not found in the dataset.")

# Check if the specified numerical features are present in the dataset
for feature in numerical_features:
    if feature not in X.columns:
        raise ValueError(f"Numerical feature '{feature}' not found in the dataset.")

# Preprocessor for numerical and categorical data.
# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not seen during fit instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [51]:
# One seeded 100-tree random forest, wrapped so it can be fitted on multiple target columns
model = MultiOutputClassifier(
    RandomForestClassifier(random_state=42, n_estimators=100)
)

In [52]:
# Chain preprocessing and the classifier so both are fitted and applied together
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

In [53]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [54]:
# Fit the whole pipeline on the training split
pipeline.fit(X=X_train, y=y_train)

In [55]:
# Class probabilities for the held-out validation rows
y_pred_proba = pipeline.predict_proba(X=X_val)

In [56]:
# y_pred_proba is indexed by target first; keep column 1 of each target's array
xyz_pred_proba, seasonal_pred_proba = (
    y_pred_proba[0][:, 1],
    y_pred_proba[1][:, 1],
)

In [57]:
# Score each target separately with ROC AUC on the validation split, then report both
xyz_auc = roc_auc_score(y_true=y_val['xyz_vaccine'], y_score=xyz_pred_proba)
seasonal_auc = roc_auc_score(y_true=y_val['seasonal_vaccine'], y_score=seasonal_pred_proba)
print('XYZ Vaccine AUC:', xyz_auc)
print('Seasonal Vaccine AUC:', seasonal_auc)

XYZ Vaccine AUC: 0.5734506971232635
Seasonal Vaccine AUC: 0.6484781411267908


In [58]:
# Read the test-set features from the working directory
test_features = pd.read_csv(filepath_or_buffer='test_set_features.csv')

In [59]:
# Handle missing values in test data.
#
# Fill values come from the training features (X) whenever the column exists
# there, so test rows are imputed with the same statistics the model was
# fitted on. Columns that X does not have fall back to their own statistics.
# Strategy: median for numeric columns, mode for every other column.
test_fill_values = {}
for col in test_features.columns:
    if col == 'respondent_id':
        # identifier, not a predictor - never imputed
        continue
    reference = X[col] if col in X.columns else test_features[col]
    if pd.api.types.is_numeric_dtype(reference):
        fill_value = reference.median()
    else:
        modes = reference.mode()
        # mode() is empty when the column has no non-null values at all
        fill_value = modes.iloc[0] if not modes.empty else None
    if pd.notna(fill_value):
        test_fill_values[col] = fill_value

# Assign the result instead of calling `test_features[col].fillna(..., inplace=True)`:
# the chained inplace form acts on a temporary Series, which pandas 2.x warns
# about and which leaves the frame unchanged when Copy-on-Write is active.
test_features = test_features.fillna(value=test_fill_values)

In [60]:
# Class probabilities for the test rows; the id column is dropped before predicting
test_pred_proba = pipeline.predict_proba(
    test_features.drop('respondent_id', axis='columns')
)

In [61]:
# Assemble the submission table: respondent id plus column 1 of each target's probabilities
submission = pd.DataFrame({'respondent_id': test_features['respondent_id']}).assign(
    xyz_vaccine=test_pred_proba[0][:, 1],
    seasonal_vaccine=test_pred_proba[1][:, 1],
)

In [62]:
# Write the submission table to disk without the DataFrame index column
submission.to_csv(path_or_buf='submission.csv', index=False)