In [12]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


data = pd.read_csv('training_set_features.csv')


print(data.head())


print(data.isnull().sum())

X = data.drop(['xyz_vaccine', 'seasonal_vaccine'], axis=1)
y_xyz = data['xyz_vaccine']
y_seasonal = data['seasonal_vaccine']

numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()


numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])




X_train, X_test, y_xyz_train, y_xyz_test, y_seasonal_train, y_seasonal_test = train_test_split(
    X, y_xyz, y_seasonal, test_size=0.2, random_state=42)


model_xyz = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', RandomForestClassifier(random_state=42))])

model_seasonal = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(random_state=42))])

model_xyz.fit(X_train, y_xyz_train)
model_seasonal.fit(X_train, y_seasonal_train)


y_xyz_pred_proba = model_xyz.predict_proba(X_test)[:, 1]
y_seasonal_pred_proba = model_seasonal.predict_proba(X_test)[:, 1]

roc_auc_xyz = roc_auc_score(y_xyz_test, y_xyz_pred_proba)
roc_auc_seasonal = roc_auc_score(y_seasonal_test, y_seasonal_pred_proba)

print(f'ROC AUC Score for XYZ Vaccine: {roc_auc_xyz}')
print(f'ROC AUC Score for Seasonal Vaccine: {roc_auc_seasonal}')


test_data = pd.read_csv('test_set_features.csv')  
xyz_predictions = model_xyz.predict_proba(test_data)[:, 1]
seasonal_predictions = model_seasonal.predict_proba(test_data)[:, 1]


predictions = pd.DataFrame({
    'xyz_vaccine_proba': xyz_predictions,
    'seasonal_vaccine_proba': seasonal_predictions
})

predictions.to_csv('vaccine_predictions.csv', index=False)



   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  



ROC AUC Score for XYZ Vaccine: 0.8610516397270808
ROC AUC Score for Seasonal Vaccine: 0.8546619942502236




Predictions saved to vaccine_predictions.csv
