In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from  sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputClassifier


In [2]:
# Load the four CSV files this notebook works with (paths are relative to the
# working directory): training features, training labels, test features and
# the submission format file.
train_feature, train_label, test_feature, submission_format = (
    pd.read_csv(csv_name)
    for csv_name in (
        "training_set_features.csv",
        "training_set_labels.csv",
        "test_set_features.csv",
        "submission_format.csv",
    )
)


In [3]:
# Quick sanity check: show the first rows of every loaded table, one table
# after another, as plain text.
print(
    train_feature.head(),
    train_label.head(),
    test_feature.head(),
    submission_format.head(),
    sep="\n",
)

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [4]:
# Build the preprocessing step applied to the features before modelling.
#
# respondent_id looks like a row identifier (it mirrors the row index in the
# preview above), so it is excluded from the numeric features: left unscaled it
# would dominate the distance computation of the RBF SVM trained later.
numerical_cols = (
    train_feature
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('respondent_id', errors='ignore')
)
categorical_cols = train_feature.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the median, then standardise so that every
# feature contributes on a comparable scale (matters for SVC and helps the
# lbfgs solver of LogisticRegression converge).
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical columns: make missing values an explicit 'missing' category
# before one-hot encoding; categories unseen during fit are encoded as all
# zeros (handle_unknown='ignore').
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

# Columns not listed in either group (e.g. respondent_id) are dropped by the
# ColumnTransformer default remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])

In [5]:
# Fit the preprocessor on the training features and transform them, then apply
# the already-fitted preprocessor to the test features (no re-fitting on test).
# NOTE(review): the fit uses every training row, and the validation split is
# carved out of X_train in a later cell, so validation rows influence the
# fitted preprocessing statistics. Consider fitting after the split.
X_train = preprocessor.fit_transform(train_feature)
X_test = preprocessor.transform(test_feature)


In [6]:
# Keep only the two target columns as the multi-output label matrix.
# NOTE(review): assumes train_label rows are in the same order as
# train_feature rows — confirm (e.g. by comparing respondent_id).
y_train = train_label.loc[:, ['xyz_vaccine', 'seasonal_vaccine']]

In [7]:
# Hold out 20% of the training rows as a validation set; the fixed seed makes
# the split reproducible across runs.
X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Base estimator for the SVM model: RBF kernel with a fixed seed.
# probability=True is set because predict_proba is called on the fitted model
# in the evaluation cell below.
svm_classifier = SVC(kernel='rbf',probability = True, random_state=42)  


In [None]:
# Wrap the SVC in MultiOutputClassifier so both target columns are handled by
# one model object, then fit on the training portion of the split.
svm_model = MultiOutputClassifier(svm_classifier)
svm_model.fit(X_train_train, y_train_train)

In [None]:
# Evaluate the SVM on the held-out validation split.
# The probabilities must come from X_train_val (not X_test): they are scored
# against y_train_val, and the test set has no labels and a different length.
svm_val_proba = svm_model.predict_proba(X_train_val)

# predict_proba on a MultiOutputClassifier returns one (n_samples, n_classes)
# array per target, in the column order of the y used for fitting. Column 1 is
# taken as the positive-class probability (assumes 0/1 labels — TODO confirm
# via svm_model.classes_).
y_val_pred = pd.DataFrame(
    {
        'xyz_vaccine': svm_val_proba[0][:, 1],
        'seasonal_vaccine': svm_val_proba[1][:, 1],
    },
    index=y_train_val.index,  # keep predictions aligned with the true labels
)

# Calculate ROC AUC score per target and their mean
roc_auc_xyz = roc_auc_score(y_train_val['xyz_vaccine'], y_val_pred['xyz_vaccine'])
roc_auc_seasonal = roc_auc_score(y_train_val['seasonal_vaccine'], y_val_pred['seasonal_vaccine'])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

In [None]:
# Define the Logistic Regression classifier.
# LogisticRegression exposes predict_proba natively and accepts no
# `probability` argument (that is an SVC option), so it is not passed here.
# `multi_class` is omitted as well: 'auto' was the default and the parameter
# is deprecated in recent scikit-learn releases.
logreg_classifier = LogisticRegression(random_state=42, solver='lbfgs', max_iter=10000)

# Wrap Logistic Regression in MultiOutputClassifier (one model per target)
logreg_model = MultiOutputClassifier(logreg_classifier)

# Fit the model on the training portion of the split
logreg_model.fit(X_train_train, y_train_train)

# Predict probabilities on the validation set: a list with one
# (n_samples, n_classes) array per target, in the column order of y_train_train
y_val_pred_proba = logreg_model.predict_proba(X_train_val)

# Calculate ROC AUC score from THIS model's positive-class probabilities
# (column 1; assumes 0/1 labels — TODO confirm via logreg_model.classes_)
roc_auc_xyz = roc_auc_score(y_train_val['xyz_vaccine'], y_val_pred_proba[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_train_val['seasonal_vaccine'], y_val_pred_proba[1][:, 1])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')