In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

# Input CSV locations (relative to the working directory).
training_features = 'training_set_features.csv'
training_labels = 'training_set_labels.csv'
test_features = 'test_set_features.csv'

# Read the three files in the same order as the paths above.
training_features_df, training_labels_df, test_features_df = map(
    pd.read_csv,
    (training_features, training_labels, test_features),
)

# Merging features and labels on respondent_id.
# validate='one_to_one' makes pandas raise MergeError when an id appears more
# than once in either frame, instead of silently multiplying rows in the
# training data.
data = pd.merge(
    training_features_df,
    training_labels_df,
    on='respondent_id',
    validate='one_to_one',
)

# Cols1 holds the model inputs: everything except the id and the two targets.
# Cols2 holds the two target columns.
non_feature_cols = ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']
target_cols = ['xyz_vaccine', 'seasonal_vaccine']
Cols1 = data.drop(columns=non_feature_cols)
Cols2 = data[target_cols]

# Object-dtype columns are treated as categorical; every other dtype is
# treated as numerical.
categorical_cols = Cols1.select_dtypes(include='object').columns
numerical_cols = Cols1.select_dtypes(exclude='object').columns

# Numerical columns: fill gaps with the column median, then standardize.
numerical_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' lets transform accept categories that were
# not present when the encoder was fitted.
categorical_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline.
numerical_branch = ('numerical', numerical_trans, numerical_cols)
categorical_branch = ('categorial', categorical_trans, categorical_cols)
preprocessor = ColumnTransformer([numerical_branch, categorical_branch])

# Hold out 20% of the labelled rows for validation; the split is seeded with
# random_state=42.
X_train, X_test, Y_train, Y_test = train_test_split(
    Cols1,
    Cols2,
    test_size=0.2,
    random_state=42,
)

# The preprocessor is fitted on the training part only and then applied to
# the held-out part (the tuple is evaluated left to right, so the fit comes
# first).
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

# One independent logistic regression per target, each with the iteration
# limit set to 1000. fit() returns the estimator, so it can be chained.
xyz = LogisticRegression(max_iter=1000).fit(X_train, Y_train['xyz_vaccine'])
seasonal = LogisticRegression(max_iter=1000).fit(X_train, Y_train['seasonal_vaccine'])

# Column 1 of predict_proba is the probability of the second class
# (the positive label, assuming the targets are coded 0/1).
y_pred_prob_xyz, y_pred_prob_seasonal = (
    model.predict_proba(X_test)[:, 1] for model in (xyz, seasonal)
)

# Score each model on the held-out rows with ROC AUC, then average the two.
auc_xyz, auc_seasonal = (
    roc_auc_score(Y_test[target], scores)
    for target, scores in (
        ('xyz_vaccine', y_pred_prob_xyz),
        ('seasonal_vaccine', y_pred_prob_seasonal),
    )
)
mean_auc = np.array([auc_xyz, auc_seasonal]).mean()

print(f'AUC for XYZ Vaccine: {auc_xyz}')
print(f'AUC for Seasonal Vaccine: {auc_seasonal}')
print(f'Mean AUC: {mean_auc}')

# Run the unlabeled test features through the already-fitted preprocessor.
X_test_new = preprocessor.transform(test_features_df)

# Second-class probability from each fitted model for the new rows.
new_pred_prob_xyz, new_pred_prob_seasonal = (
    model.predict_proba(X_test_new)[:, 1] for model in (xyz, seasonal)
)

# Assemble the output table: respondent id plus one probability column per
# target.
solution_columns = {
    'respondent_id': test_features_df['respondent_id'],
    'xyz_vaccine': new_pred_prob_xyz,
    'seasonal_vaccine': new_pred_prob_seasonal,
}
solution_df = pd.DataFrame(solution_columns)

# Show the table, then write it to disk without the index column.
print(solution_df)
solution_df.to_csv('solution.csv', index=False)

AUC for XYZ Vaccine: 0.8313516375463279
AUC for Seasonal Vaccine: 0.8560635216059745
Mean AUC: 0.8437075795761513
       respondent_id  xyz_vaccine  seasonal_vaccine
0              26707     0.050004          0.297041
1              26708     0.046350          0.046415
2              26709     0.366987          0.514891
3              26710     0.514147          0.881483
4              26711     0.150119          0.457555
...              ...          ...               ...
26703          53410     0.344451          0.539056
26704          53411     0.093758          0.285241
26705          53412     0.135915          0.196219
26706          53413     0.059891          0.361090
26707          53414     0.581369          0.536006

[26708 rows x 3 columns]
