In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np

# Load the dataset
data = pd.read_csv('training_set_features.csv')

# The label columns are read from the same frame as the features, so they have
# to be removed from X explicitly. Leaving them in lets the model see its own
# targets as inputs, which yields a meaningless, perfect validation score.
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
missing_targets = [col for col in target_columns if col not in data.columns]
if missing_targets:
    raise KeyError(f"Target columns missing from the training data: {missing_targets}")

# Separate features and target variables
X = data.drop(columns=['respondent_id'] + target_columns)
y = data[target_columns].astype(int)

# Handle missing values and encode categorical variables.
# Columns are routed by dtype: int64/float64 -> numeric branch, object -> categorical branch.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# NOTE(review): the preprocessor is fitted on every row, i.e. before any
# train/validation split, so imputation and scaling statistics also reflect
# rows that are later used for validation. Fitting on the training rows only
# would give a stricter performance estimate.
X_preprocessed = preprocessor.fit_transform(X)

print("Data preprocessing complete.")


Data preprocessing complete.


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# same rows land in each partition on every run.
split_parts = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

# Build the neural network model
def build_model(input_dim):
    """Create and compile a small feed-forward network with two sigmoid outputs.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    Sequential
        A compiled model with hidden layers of 64 and 32 ReLU units and a
        2-unit sigmoid output layer, using the Adam optimizer and binary
        cross-entropy loss, and tracking accuracy.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input object instead of the
    # legacy `input_dim` keyword on the first Dense layer, which newer Keras
    # versions warn about.
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    # One sigmoid unit per label: each output is an independent probability.
    model.add(Dense(2, activation='sigmoid'))  # Two outputs for multilabel classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialisation and batch shuffling are repeatable between
# runs and the reported metrics can be reproduced.
tf.keras.utils.set_random_seed(42)

input_dim = X_train.shape[1]
model = build_model(input_dim)

# Train the model, reporting validation loss/accuracy after every epoch
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

# Validate the model: one probability column per output unit
y_pred = model.predict(X_val)

# Evaluate the model. Column 0 of y_pred is scored against 'xyz_vaccine' and
# column 1 against 'seasonal_vaccine', matching the column order of y.
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred[:, 0])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred[:, 1])

print(f'ROC AUC for XYZ Vaccine: {roc_auc_xyz}')
print(f'ROC AUC for Seasonal Vaccine: {roc_auc_seasonal}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
ROC AUC for XYZ Vaccine: 1.0
ROC AUC for Seasonal Vaccine: 1.0


In [None]:
# Perform cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
roc_auc_scores_xyz = []
roc_auc_scores_seasonal = []

# Take the feature count from the matrix that is actually being split, so this
# cell does not rely on a variable left behind by another cell.
n_features = X_preprocessed.shape[1]

for train_index, val_index in kf.split(X_preprocessed):
    # A fresh model is built for every fold; reset Keras' global state first so
    # state from earlier folds does not pile up in memory.
    tf.keras.backend.clear_session()

    X_train_cv, X_val_cv = X_preprocessed[train_index], X_preprocessed[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    model_cv = build_model(n_features)
    model_cv.fit(X_train_cv, y_train_cv, epochs=50, batch_size=32, verbose=0)

    # verbose=0 keeps the per-fold prediction progress bars out of the output,
    # consistent with the silent fit above.
    y_pred_cv = model_cv.predict(X_val_cv, verbose=0)
    roc_auc_xyz_cv = roc_auc_score(y_val_cv['xyz_vaccine'], y_pred_cv[:, 0])
    roc_auc_seasonal_cv = roc_auc_score(y_val_cv['seasonal_vaccine'], y_pred_cv[:, 1])

    roc_auc_scores_xyz.append(roc_auc_xyz_cv)
    roc_auc_scores_seasonal.append(roc_auc_seasonal_cv)

# Report the mean ROC AUC over the five folds for each label
print(f'Cross-validated ROC AUC for XYZ Vaccine: {np.mean(roc_auc_scores_xyz)}')
print(f'Cross-validated ROC AUC for Seasonal Vaccine: {np.mean(roc_auc_scores_seasonal)}')


Cross-validated ROC AUC for XYZ Vaccine: 1.0
Cross-validated ROC AUC for Seasonal Vaccine: 1.0


In [None]:
# Read the test features and set the id column aside for the submission file
test_data = pd.read_csv('test_set_features.csv')  # Adjust path if necessary
test_ids = test_data['respondent_id']
X_test = test_data.drop('respondent_id', axis=1)

# Apply the already-fitted preprocessor (transform only, no re-fitting)
X_test_preprocessed = preprocessor.transform(X_test)

# Predict one probability column per label
test_pred = model.predict(X_test_preprocessed)

# Assemble the submission: id first, then one probability column per label,
# in the same column order as the model outputs
submission_columns = {
    'respondent_id': test_ids,
    'xyz_vaccine': test_pred[:, 0],
    'seasonal_vaccine': test_pred[:, 1],
}
submission = pd.DataFrame(submission_columns)

# Write without the DataFrame index so the file holds only the three columns
submission.to_csv('submission.csv', index=False)
print("Submission file created.")


Submission file created.
