In [10]:
import pandas as pd

# Read the four competition CSV files from the working directory, in a fixed order.
train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

# Preview the first rows of each frame; one frame per print block, same order as loaded.
print(
    train_features.head(),
    train_labels.head(),
    test_features.head(),
    submission_format.head(),
    sep='\n',
)


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [11]:
from sklearn.preprocessing import LabelEncoder

# Merge train features and labels into one frame keyed by respondent_id.
train_data = train_features.merge(train_labels, on='respondent_id')

# Handle missing values: fill every column with its most frequent value
# (first row of DataFrame.mode()). Each frame uses its own modes.
# The imputed test frame is bound to a NEW name (`test_data`) so the raw
# `test_features` frame is never overwritten and this cell can be re-run
# without feeding already-encoded integers back into freshly fitted encoders.
train_data = train_data.fillna(train_data.mode().iloc[0])
test_data = test_features.fillna(test_features.mode().iloc[0])

# Encode categorical (object-dtype) columns as integer codes.
# Encoders are fitted on the training data only and kept in `label_encoders`
# (column name -> fitted LabelEncoder) for later inspection or inverse mapping.
label_encoders = {}
categorical_columns = train_data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    # Map test values through the classes learned on the training data.
    # Known categories get the same code as le.transform would give; a category
    # that never occurred in training becomes -1 rather than raising ValueError.
    test_codes = pd.Categorical(test_data[col], categories=le.classes_).codes
    test_data[col] = test_codes.astype('int64')
    label_encoders[col] = le

# Split features and labels. respondent_id is an identifier, not a feature,
# so it is dropped from both feature matrices.
X_train = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_train_xyz = train_data['xyz_vaccine']
y_train_seasonal = train_data['seasonal_vaccine']
X_test = test_data.drop(columns=['respondent_id'])


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Split the training data for validation.
# One call splits the features and BOTH targets together, so every output
# shares the same row partition by construction. Splitting twice and
# overwriting X_train_split / X_val only stays aligned while both calls happen
# to use an identical seed and test size.
(
    X_train_split, X_val,
    y_train_split_xyz, y_val_xyz,
    y_train_split_seasonal, y_val_seasonal,
) = train_test_split(
    X_train, y_train_xyz, y_train_seasonal, test_size=0.2, random_state=42
)

# Train one random forest per target on the 80% training portion.
model_xyz = RandomForestClassifier(n_estimators=100, random_state=42)
model_xyz.fit(X_train_split, y_train_split_xyz)

model_seasonal = RandomForestClassifier(n_estimators=100, random_state=42)
model_seasonal.fit(X_train_split, y_train_split_seasonal)

# Predict on validation set: keep column 1 of predict_proba, i.e. the
# probability of the second entry in each model's classes_
# (presumably the positive label 1 -- confirm against the label encoding).
y_pred_val_xyz = model_xyz.predict_proba(X_val)[:, 1]
y_pred_val_seasonal = model_seasonal.predict_proba(X_val)[:, 1]

# Evaluate models with ROC AUC on the held-out 20%.
roc_auc_xyz = roc_auc_score(y_val_xyz, y_pred_val_xyz)
roc_auc_seasonal = roc_auc_score(y_val_seasonal, y_pred_val_seasonal)

print(f'Validation ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'Validation ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')


Validation ROC AUC for xyz_vaccine: 0.8285427644572189
Validation ROC AUC for seasonal_vaccine: 0.8524193529039108


In [13]:
# Score the test rows with both fitted models. Column 1 of predict_proba is the
# probability of the second entry in each model's classes_.
test_pred_xyz, test_pred_seasonal = (
    clf.predict_proba(X_test)[:, 1] for clf in (model_xyz, model_seasonal)
)

# Assemble the submission: respondent ids first, then one probability column
# per target, in that order.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=test_pred_xyz,
    seasonal_vaccine=test_pred_seasonal,
)

# Write the result without the DataFrame index.
submission.to_csv('submission.csv', index=False)
