In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

In [7]:
# Read the three competition CSVs from the working directory.
features_df = pd.read_csv('training_set_features.csv')
labels_df = pd.read_csv('training_set_labels.csv')
test_data = pd.read_csv('test_set_features.csv')

# Attach the labels to their feature rows; the two training files are joined
# on the shared respondent_id column (default inner join).
data = features_df.merge(labels_df, on='respondent_id')

In [8]:
# Preprocess the training and test frames. Every statistic used here (modes,
# label encodings, scaling parameters) is learned from the training frame and
# then applied unchanged to the test frame, so both go through the same
# transformation.
# NOTE(review): this cell runs before the train/validation split, so the
# validation rows also contribute to these statistics; the validation score
# may therefore be slightly optimistic.

# Impute missing values with the per-column mode of the training data. The
# test set reuses the training modes instead of computing its own, which keeps
# imputation consistent with the encoders and scaler fitted below. Entries of
# `train_modes` for columns the test frame does not have are simply not used.
train_modes = data.mode().iloc[0]
data = data.fillna(train_modes)
test_data = test_data.fillna(train_modes)

# Identify categorical variables
categorical_features = ['age_group', 'education', 'race', 'sex', 'income_poverty', 
                        'marital_status', 'rent_or_own', 'employment_status', 
                        'hhs_geo_region', 'census_msa', 'employment_industry', 
                        'employment_occupation']

# Apply Label Encoding: each encoder is fit on the training column and reused
# for the test column so identical categories map to identical integer codes.
# LabelEncoder.transform raises ValueError if the test column contains a
# category that never appears in the training column.
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    test_data[feature] = le.transform(test_data[feature])

# Standardize numerical features
numerical_features = ['xyz_concern', 'xyz_knowledge', 'opinion_xyz_vacc_effective', 
                      'opinion_xyz_risk', 'opinion_xyz_sick_from_vacc', 
                      'opinion_seas_vacc_effective', 'opinion_seas_risk', 
                      'opinion_seas_sick_from_vacc', 'household_adults', 
                      'household_children']

# The scaler's mean/variance come from the training frame only; the test frame
# is transformed with those same parameters.
scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])
test_data[numerical_features] = scaler.transform(test_data[numerical_features])

# Define target variables
target = ['xyz_vaccine', 'seasonal_vaccine']

# Define features: every column except the row identifier and the two targets.
features = data.columns.drop(['respondent_id'] + target)

In [9]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# partition repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    data[features],
    data[target],
    random_state=42,
    test_size=0.2,
)

In [10]:
# Base learner: a 100-tree random forest with a fixed seed.
rf = RandomForestClassifier(random_state=42, n_estimators=100)

# Wrap the forest so that one copy is fitted per target column; n_jobs=-1 is
# passed to the wrapper, not to the forest itself.
multi_rf = MultiOutputClassifier(estimator=rf, n_jobs=-1)

# Fit on the training split (kept as the last expression so the fitted
# estimator is still shown as the cell output).
multi_rf.fit(X_train, y_train)

In [11]:
# Score the validation split. The result is indexed per target below:
# position 0 is scored against xyz_vaccine and position 1 against
# seasonal_vaccine, taking column 1 of each probability array.
y_pred_proba = multi_rf.predict_proba(X_val)
xyz_scores = y_pred_proba[0][:, 1]
seasonal_scores = y_pred_proba[1][:, 1]

# ROC AUC for each target separately
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], xyz_scores)
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], seasonal_scores)

# Report the unweighted mean of the two per-target scores
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])
print(f'Mean ROC AUC: {mean_roc_auc:.4f}')

Mean ROC AUC: 0.8405


In [12]:
# Score the test set with the fitted model, using the same feature columns
# the model was trained on.
test_pred_proba = multi_rf.predict_proba(test_data[features])

# Build the submission frame: start from the respondent ids and append one
# probability column per target (column 1 of each per-target array).
submission = test_data[['respondent_id']].assign(
    xyz_vaccine=test_pred_proba[0][:, 1],
    seasonal_vaccine=test_pred_proba[1][:, 1],
)

# Write the submission without the index column
submission.to_csv('submission.csv', index=False)