In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier

In [2]:
# Read the three competition CSVs (training features, training labels,
# test features) in one pass; the order of the paths matches the targets.
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/training_set_features.csv',
        '/content/training_set_labels.csv',
        '/content/test_set_features.csv',
    )
)

In [3]:
# Join features and labels into one frame, matching rows on respondent_id.
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [4]:
# Columns the model predicts.
target_vars = ['xyz_vaccine', 'seasonal_vaccine']  # Updated target variable name
# Every remaining column is a feature, except the respondent_id join key.
features = [
    name
    for name in train_data.columns
    if name != 'respondent_id' and name not in target_vars
]


In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data[features],
    train_data[target_vars],
    test_size=0.2,
    random_state=42,
)


In [6]:
# Numeric branch: columns with int64/float64 dtype get median imputation
# followed by scaling.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [7]:
# Categorical branch: object/category columns get most-frequent imputation
# followed by one-hot encoding; categories unseen during fit are ignored at
# transform time instead of raising.
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [8]:
# Route each column group through its own transformer. Columns that belong to
# neither group are not passed on (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


In [9]:
# Full model: preprocessing followed by a multi-output wrapper around a
# seeded random forest, so both targets are handled by one estimator object.
model = Pipeline(
    [
        ('preprocessor', preprocessor),
        (
            'classifier',
            MultiOutputClassifier(estimator=RandomForestClassifier(random_state=42)),
        ),
    ]
)


In [10]:
# Fit the preprocessing steps and the classifier on the training split.
model.fit(X=X_train, y=y_train)


In [11]:
# Class probabilities for the validation split; indexed below as
# y_val_pred[target_position].
y_val_pred = model.predict_proba(X=X_val)

In [12]:
# Score each target separately with ROC AUC.
# y_val_pred[i] holds the probabilities for target i; column 1 is used as the
# score (assumes the second class is the positive one - TODO confirm via
# the fitted estimators' classes_).
roc_auc_h1n1 = roc_auc_score(y_val['xyz_vaccine'], y_val_pred[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_val_pred[1][:, 1])

print('ROC AUC for xyz_vaccine:', roc_auc_h1n1)
print('ROC AUC for seasonal_vaccine:', roc_auc_seasonal)


ROC AUC for xyz_vaccine: 0.8294325525888947
ROC AUC for seasonal_vaccine: 0.8518072872366175


In [13]:
# Class probabilities for the competition test set. test_features is passed
# whole; the preprocessor picks the columns it was fitted on by name.
y_test_pred = model.predict_proba(X=test_features)


In [14]:
# Assemble the submission: one row per test respondent with the column-1
# probability for each target (y_test_pred[0] -> xyz, y_test_pred[1] -> seasonal).
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_test_pred[0][:, 1],
    'seasonal_vaccine': y_test_pred[1][:, 1],
})

In [15]:
# Save the submission file.
# The path is held in one variable so the confirmation message always names
# the file that was actually written (the message previously said
# 'submission.csv' while the file went to submission_format.csv).
submission_path = '/content/submission_format.csv'
submission.to_csv(submission_path, index=False)

print('Submission file created:', submission_path)

Submission file created: submission.csv


In [None]:
# Fill missing values: numeric columns with the mean, remaining columns with
# the mode.
#
# X_test is not defined by any earlier cell, so it is derived here from the
# loaded test features (respondent_id is kept; a later cell needs it).
X_test = test_features.copy()

# Fill values are computed from the training split only, so the test set is
# imputed with the same values the model sees during training.
numeric_fill = X_train.mean(numeric_only=True)
mode_fill = X_train.mode().iloc[0]

# Reassign instead of fillna(inplace=True): X_train came out of
# train_test_split, and in-place edits on such a frame can trigger
# SettingWithCopyWarning and make the cell non-idempotent on re-run.
X_train = X_train.fillna(numeric_fill).fillna(mode_fill)
X_test = X_test.fillna(numeric_fill).fillna(mode_fill)

In [None]:
# Keep the test respondent ids for the submission, then remove the id column
# from the feature frames.
respondent_id_test = X_test['respondent_id']
# X_train is built from `features`, which already excludes respondent_id, so a
# strict drop would raise KeyError; errors='ignore' makes it a no-op there.
X_train = X_train.drop(columns=['respondent_id'], errors='ignore')
X_test = X_test.drop(columns=['respondent_id'])

In [None]:
# One-hot encode the categorical columns of both frames, dropping the first
# level of each to avoid a redundant indicator column.
X_train, X_test = (
    pd.get_dummies(frame, drop_first=True)
    for frame in (X_train, X_test)
)

In [None]:
# Give the test frame exactly the training columns, in the same order:
# columns missing from the test set are added as 0, extra ones are dropped.
X_test = X_test.reindex(X_train.columns, axis='columns', fill_value=0)


In [None]:
# Train a multi-output classifier that wraps a seeded 100-tree random forest.
# NOTE: this rebinds `model`, replacing the Pipeline assigned to that name in
# an earlier cell.
model = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_model = MultiOutputClassifier(model, n_jobs=-1)
# The label columns are the ones listed in target_vars ('xyz_vaccine',
# 'seasonal_vaccine'); y_train has no 'h1n1_vaccine' column, so selecting it
# raised KeyError.
multi_target_model.fit(X_train, y_train[target_vars])
