In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [2]:
# Read the three input tables from CSV files in the current working directory:
# training features, training labels, and the features to predict on.
trainFeatures = pd.read_csv(filepath_or_buffer='training_set_features.csv')
trainLabels = pd.read_csv(filepath_or_buffer='training_set_labels.csv')
testFeatures = pd.read_csv(filepath_or_buffer='test_set_features.csv')

In [3]:
# Features and labels are paired purely by row position once `respondent_id`
# is dropped, so verify that both tables list the same ids in the same order
# before discarding the key. A mismatch would otherwise silently train every
# model on wrongly paired rows.
if not trainFeatures['respondent_id'].equals(trainLabels['respondent_id']):
    raise ValueError(
        'training features and training labels are not aligned on respondent_id'
    )

# `respondent_id` is an identifier, not a predictor or a target: remove it
# from both tables. New frames are produced; the raw tables are untouched.
features = trainFeatures.drop(columns=['respondent_id'])
labels = trainLabels.drop(columns=['respondent_id'])

In [4]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=21,
)

In [5]:
# Preprocessing: partition the feature columns by dtype so each group can be
# routed to its own transformer.
#   numeric_cols -> columns stored as int64 or float64
#   cat_cols     -> columns stored as object
numeric_cols = features.select_dtypes(include=['int64', 'float64']).columns
cat_cols = features.select_dtypes(include='object').columns

In [6]:
# Preprocessor definition: one branch per column group.
#   'num' -> fill missing numeric values with the column median.
#   'cat' -> fill missing categorical values with the literal 'missing', then
#            one-hot encode; categories unseen during fit are ignored at
#            transform time instead of raising.
preprocessor = ColumnTransformer(
    [
        ('num', SimpleImputer(strategy='median'), numeric_cols),
        (
            'cat',
            Pipeline(
                [
                    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                    ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ]
            ),
            cat_cols,
        ),
    ]
)

In [7]:
# Model definition for the `xyz_vaccine` target: preprocess -> scale -> SVC.
xyzModel = Pipeline(steps=[
    # Pipeline fits its steps in place (it does not copy them), so this model
    # gets its own unfitted copy of the preprocessor. Sharing one instance
    # between several pipelines would let fitting one model overwrite the
    # preprocessing state of the others.
    ('preprocessor', clone(preprocessor)),
    # with_mean=False scales without centering -- presumably chosen because
    # the one-hot encoded output can be sparse; confirm before changing.
    ('scaler', StandardScaler(with_mean=False)),
    # probability=True is required for predict_proba; the seed keeps the
    # probability calibration repeatable.
    ('classifier', SVC(probability=True, random_state=21)),
])

In [8]:
# Fit the xyz pipeline on the training split, using the `xyz_vaccine` column
# of the training labels as the target.
xyzModel.fit(X=X_train, y=y_train['xyz_vaccine'])

In [9]:
# Validation-set probabilities for the xyz target.
# Keeps column index 1 of predict_proba's output -- assumes the labels are
# 0/1 so that this column is the positive class; TODO confirm.
xyzProb = xyzModel.predict_proba(X_val)[:, 1]

In [10]:
# Model definition for the `seasonal_vaccine` target: preprocess -> scale -> SVC.
seasonalModel = Pipeline(steps=[
    # Pipeline fits its steps in place (it does not copy them), so this model
    # gets its own unfitted copy of the preprocessor. Sharing one instance
    # between several pipelines would let fitting one model overwrite the
    # preprocessing state of the others.
    ('preprocessor', clone(preprocessor)),
    # with_mean=False scales without centering -- presumably chosen because
    # the one-hot encoded output can be sparse; confirm before changing.
    ('scaler', StandardScaler(with_mean=False)),
    # probability=True is required for predict_proba; the seed keeps the
    # probability calibration repeatable.
    ('classifier', SVC(probability=True, random_state=21)),
])

In [11]:
# Fit the seasonal pipeline on the training split, using the
# `seasonal_vaccine` column of the training labels as the target.
seasonalModel.fit(X=X_train, y=y_train['seasonal_vaccine'])

In [12]:
# Validation-set probabilities for the seasonal target.
# Keeps column index 1 of predict_proba's output -- assumes the labels are
# 0/1 so that this column is the positive class; TODO confirm.
seasonalProb = seasonalModel.predict_proba(X_val)[:, 1]

In [13]:
# ROC AUC on the validation split, one score per target, plus their
# unweighted mean.
rocXYZ = roc_auc_score(y_true=y_val['xyz_vaccine'], y_score=xyzProb)
rocSeasonal = roc_auc_score(y_true=y_val['seasonal_vaccine'], y_score=seasonalProb)
meanScore = (rocXYZ + rocSeasonal) / 2

# Report the three scores, one per line.
for report_line in (
    f'XYZ Score: {rocXYZ}',
    f'Seasonal Score: {rocSeasonal}',
    f'Mean Score: {meanScore}',
):
    print(report_line)

XYZ Score: 0.8186037448439426
Seasonal Score: 0.8491947465427638
Mean Score: 0.8338992456933532


In [15]:
# The validation scores are good, so train the final models on the full
# training set and predict probabilities for the test set.
#
# The final models are unfitted clones (same hyperparameters, same seed)
# rather than in-place refits of xyzModel / seasonalModel. The validated
# models therefore stay fitted on X_train only, and re-running the validation
# cells afterwards still scores models that have never seen X_val.

# Build the test inputs once; both targets predict on the same frame.
testInputs = testFeatures.drop(columns=['respondent_id'])

xyzFinalModel = clone(xyzModel).fit(features, labels['xyz_vaccine'])
# Column index 1 of predict_proba's output, as in the validation cells.
XYZProbab = xyzFinalModel.predict_proba(testInputs)[:, 1]

seasonalFinalModel = clone(seasonalModel).fit(features, labels['seasonal_vaccine'])
SeasonalProbab = seasonalFinalModel.predict_proba(testInputs)[:, 1]

In [16]:
# Assemble the submission table: the test respondent ids next to the
# predicted probability for each of the two targets.
fd = pd.DataFrame(
    {
        'respondent_id': testFeatures['respondent_id'],
        'xyz_vaccine': XYZProbab,
        'seasonal_vaccine': SeasonalProbab,
    }
)

In [17]:
# Write the submission table to disk; index=False leaves the row index out
# of the file.
fd.to_csv(path_or_buf='submission.csv', index=False)