In [1]:
import pandas as pd

# Read the three CSV inputs in a fixed order:
# training features, training labels, test features.
train_features, train_labels, test_features = map(
    pd.read_csv,
    (
        '/content/training_set_features.csv',
        '/content/training_set_labels.csv',
        '/content/test_set_features.csv',
    ),
)

# Preview the head of every frame, one after another (same text as three
# separate print calls: each preview is followed by a newline).
print(
    train_features.head(),
    train_labels.head(),
    test_features.head(),
    sep='\n',
)


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [3]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns: every object-dtype column of the training features.
categorical_cols = train_features.select_dtypes(include=['object']).columns

# Integer-encode each categorical column, keeping the fitted encoder per column
# in `label_encoders` so the mapping can be inspected or inverted later.
label_encoders = {}
for col in categorical_cols:
    # Values are cast to str before encoding, for both frames.
    train_values = train_features[col].astype(str)
    test_values = test_features[col].astype(str)

    le = LabelEncoder()
    # Fit on the combined train + test values. LabelEncoder.transform raises
    # ValueError for a label it was not fitted on, so fitting on the training
    # column alone fails as soon as the test set contains a category that
    # never occurs in train. Only the set of category names is shared here
    # (no target information); when the test set adds no new category the
    # resulting codes are identical to fitting on train alone, because the
    # encoder sorts its classes.
    le.fit(train_values.tolist() + test_values.tolist())

    train_features[col] = le.transform(train_values)
    test_features[col] = le.transform(test_values)
    label_encoders[col] = le

# Handle missing values (if any): replace what is still missing with the
# sentinel -1. Reassignment is used instead of inplace=True so the step reads
# as an explicit transformation of each frame.
train_features = train_features.fillna(-1)
test_features = test_features.fillna(-1)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Assemble the feature matrix (identifier column removed) and the two targets.
# NOTE(review): labels are paired with features by row position, so this
# assumes train_labels is in the same row order as train_features — confirm.
X = train_features.drop(columns=['respondent_id'])
y_xyz = train_labels['xyz_vaccine']
y_seasonal = train_labels['seasonal_vaccine']

# Split the features and BOTH targets in a single call so that all three are
# partitioned on exactly the same rows. Previously the split was performed
# twice and the second call overwrote X_train / X_val; the xyz targets only
# stayed aligned with those features because both calls happened to use the
# same random_state and the same number of rows.
(
    X_train, X_val,
    y_train_xyz, y_val_xyz,
    y_train_seasonal, y_val_seasonal,
) = train_test_split(X, y_xyz, y_seasonal, test_size=0.2, random_state=42)

# Train the model for xyz_vaccine and score the validation rows.
# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (presumably the positive class, 1 — verify the label coding).
model_xyz = RandomForestClassifier(random_state=42)
model_xyz.fit(X_train, y_train_xyz)
y_pred_xyz = model_xyz.predict_proba(X_val)[:, 1]

# Train the model for seasonal_vaccine on the same training rows.
model_seasonal = RandomForestClassifier(random_state=42)
model_seasonal.fit(X_train, y_train_seasonal)
y_pred_seasonal = model_seasonal.predict_proba(X_val)[:, 1]

# Evaluate both models on the held-out validation split with ROC AUC.
roc_auc_xyz = roc_auc_score(y_val_xyz, y_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_val_seasonal, y_pred_seasonal)
print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')


ROC AUC for xyz_vaccine: 0.8633481876475977
ROC AUC for seasonal_vaccine: 0.8561853137827957


In [5]:
# Score the test set with both fitted models. The identifier column is
# dropped first, mirroring how the training matrix was built.
test_X = test_features.drop(columns=['respondent_id'])

# Keep column 1 of each predict_proba result as that target's score.
test_predictions_xyz = model_xyz.predict_proba(test_X)[:, 1]
test_predictions_seasonal = model_seasonal.predict_proba(test_X)[:, 1]

# Build the submission table: respondent ids first, then one probability
# column per target, in that order.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=test_predictions_xyz,
    seasonal_vaccine=test_predictions_seasonal,
)

# Write the table without the index column and show the first rows.
submission.to_csv('/content/submission_format.csv', index=False)
print(submission.head())


   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707         0.18              0.37
1          26708         0.08              0.04
2          26709         0.28              0.75
3          26710         0.60              0.90
4          26711         0.34              0.52
