In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier




In [34]:

# Training inputs come as two CSVs: one with the feature columns (`fea`) and
# one with the target labels (`data`).
fea = pd.read_csv('training_set_features.csv')
data = pd.read_csv('training_set_labels.csv')


In [35]:


# Model inputs: every feature column except the respondent identifier.
X = fea.drop(columns=['respondent_id'])

# Targets: look each feature row's labels up by respondent_id rather than
# trusting that the two CSVs list respondents in the same order. A missing id
# raises KeyError and a duplicated id makes the index assignment below fail,
# so a mismatch between the files surfaces here instead of corrupting training.
y = data.set_index('respondent_id').loc[
    fea['respondent_id'], ['xyz_vaccine', 'seasonal_vaccine']
]
y.index = fea.index  # keep X and y on the same row index

# Split the columns by dtype so each group gets its own preprocessing.
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ])


In [36]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
def evaluate_model(model, X_train, y_train, X_test, y_test,
                   target_cols=('xyz_vaccine', 'seasonal_vaccine')):
    """Fit `model` once per target and return the mean ROC AUC on the test split.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``MultiOutputClassifier``.
    X_train, y_train :
        Training features and targets passed to ``fit``.
    X_test, y_test :
        Evaluation features and targets; ``y_test`` must be indexable by the
        names in ``target_cols``.
    target_cols : sequence of str, optional
        Target columns to score. Defaults to the two vaccine labels.

    Returns
    -------
    float
        Unweighted mean of the per-target ROC AUC scores.

    Raises
    ------
    ValueError
        If a name in ``target_cols`` is not among the columns of ``y_train``.
    """
    clf = MultiOutputClassifier(model, n_jobs=-1)
    clf.fit(X_train, y_train)
    probas = clf.predict_proba(X_test)

    # predict_proba yields one array per target, in the column order seen by
    # fit. Pair arrays with targets by name so a differently ordered y_train
    # cannot silently score one target against the other's probabilities.
    # When y_train carries no column names, fall back to positional order.
    fitted_order = list(getattr(y_train, 'columns', target_cols))

    aucs = []
    for col in target_cols:
        positive_proba = probas[fitted_order.index(col)][:, 1]
        aucs.append(roc_auc_score(y_test[col], positive_proba))
    return sum(aucs) / len(aucs)



In [38]:

# Candidate models. Each one is the shared preprocessing step followed by a
# different classifier, so raw feature frames can be passed straight to fit().
#
# The random forest and the SVC's probability calibration both draw random
# numbers; seeding them (same seed as the train/test split) makes the model
# comparison and the later grid search repeatable across kernel restarts.
lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # probability=True is required so predict_proba is available for ROC AUC.
    ('classifier', SVC(probability=True, random_state=42)),
])

nb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB()),
])

rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])


In [39]:
# Display name -> pipeline for every candidate, plus an empty container that
# the evaluation loop fills with one score per name.
results = {}
models = {
    'Logistic Regression': lr,
    'SVM': svm,
    'Naive Bayes': nb,
    'Random Forest': rf,
}

In [40]:
# Score every candidate on the same train/test split and record the result
# under its display name.
for model_name, model in models.items():
    results[model_name] = evaluate_model(model, X_train, y_train, X_test, y_test)

In [41]:
# Show the score recorded for each candidate model (name -> value returned by evaluate_model).
print(results)

{'Logistic Regression': 0.841380631350225, 'SVM': 0.8500231971740572, 'Naive Bayes': 0.7266679868936973, 'Random Forest': 0.8575810453720376}


In [42]:
# Hyper-parameter grid for the random forest. The `classifier__` prefix routes
# each setting to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid with 3-fold cross-validation on the training
# split, scored by ROC AUC and parallelised across all cores.
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [43]:
# Take the winning pipeline from the search and score it on the hold-out split
# with the same helper used for the other candidates.
best_model = grid_search.best_estimator_
best_auc = evaluate_model(best_model, X_train, y_train, X_test, y_test)
print('Best Random Forest AUC: {}'.format(best_auc))

Best Random Forest AUC: 0.8620260328553757


In [44]:
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been refit on all of X_train / y_train with the winning parameters.
# Reuse that fitted pipeline instead of training the same model a second time.
final_model = best_model
final_model

In [45]:
# Class-probability predictions for the hold-out rows. Later cells index the
# result as y_pred[i][:, 1], i.e. one array per target with the positive class
# in column 1.
y_pred = final_model.predict_proba(X_test)

In [46]:
# Hold-out predictions in submission layout. Respondent ids are read from the
# feature frame, because X_test's index labels come from it, and the column
# names match the label names used for training and for the final submission.
submission = pd.DataFrame({
    'respondent_id': fea.loc[X_test.index, 'respondent_id'],
    'xyz_vaccine': y_pred[0][:, 1],
    'seasonal_vaccine': y_pred[1][:, 1]
})

In [49]:
# Write the hold-out predictions to disk; index=False keeps the row index out of the CSV.
# NOTE(review): 'submission_format.csv' reads like the name of a template file —
# confirm that overwriting it with hold-out predictions is intended.
submission.to_csv('submission_format.csv', index=False)

In [51]:
# Load the test-set feature table that the final predictions are made for.
fin = pd.read_csv('test_set_features.csv')

In [53]:
# Drop the identifier column, mirroring how the training features were prepared.
sub = fin.drop(columns=['respondent_id'])

In [54]:
# Class-probability predictions for the test-set features. This rebinds
# `y_pred`, replacing the hold-out predictions computed earlier.
y_pred = final_model.predict_proba(sub)

In [59]:
# Final submission table: start from the respondent ids of the test rows and
# attach the positive-class probability (column 1) for each target.
submit = fin.loc[sub.index, ['respondent_id']].assign(
    xyz_vaccine=y_pred[0][:, 1],
    seasonal_vaccine=y_pred[1][:, 1],
)

In [60]:
# Sanity check: (rows, columns) of the submission frame.
submit.shape

(26708, 3)

In [61]:
# Write the test-set predictions to disk; index=False keeps the row index out of the CSV.
submit.to_csv('tosubmit.csv', index=False)