In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
train_features = pd.read_csv(r"...\SummerAnalytics\dataset and all\training_set_features.csv")
train_labels = pd.read_csv(r"...\SummerAnalytics\dataset and all\training_set_labels.csv")
test_features = pd.read_csv(r"...\SummerAnalytics\dataset and all\test_set_features.csv")

In [4]:
train_features.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [5]:
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.2)

In [6]:
numerical_columns = [col for col in X_train.columns if X_train[col].dtype != 'object']
for col in numerical_columns:
    X_train[col].fillna(X_train[col].mean(), inplace=True)
    X_val[col].fillna(X_train[col].mean(), inplace=True)
    test_features[col].fillna(X_train[col].mean(), inplace=True)

In [7]:
train_features.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough' 
)

X_train_encoded = preprocessor.fit_transform(X_train)
X_val_encoded = preprocessor.transform(X_val)
X_test_encoded = preprocessor.transform(test_features)


In [9]:
train_features.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_val_scaled = scaler.transform(X_val_encoded)
#X_test_scaled = scaler.transform(test_features.drop('respondent_id', axis=1))

svm_model = SVC(probability=True)

for target in ['xyz_vaccine', 'seasonal_vaccine']:
    svm_model.fit(X_train_scaled, y_train[target])
    val_probs = svm_model.predict_proba(X_val_scaled)[:, 1]
    auc_score = roc_auc_score(y_val[target], val_probs)
    print(f'ROC AUC for {target}: {auc_score}')

In [None]:
test_probs_xyz = svm_model.predict_proba(X_test_scaled)[:, 1]
test_probs_seasonal = svm_model.predict_proba(X_test_scaled)[:, 1]