In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [4]:
# Load data
features_data = pd.read_csv("/content/drive/MyDrive/DataHack/training_set_features.csv")
labels_data = pd.read_csv("/content/drive/MyDrive/DataHack/training_set_labels.csv")
test_data = pd.read_csv("/content/drive/MyDrive/DataHack/test_set_features.csv")

In [5]:
labels_data.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [62]:
print(labels_data['xyz_vaccine'].value_counts())
print(labels_data['seasonal_vaccine'].value_counts())

xyz_vaccine
0    21033
1     5674
Name: count, dtype: int64
seasonal_vaccine
0    14272
1    12435
Name: count, dtype: int64


In [6]:
# Select desired features (exclude unwanted features)
desired_features = [col for col in features_data.columns if col not in [
    "age_group", "education", "race", "sex", "income_poverty", "marital_status",
    "rent_or_own", "employment_status", "hhs_geo_region", "census_msa",
    "household_adults", "household_children", "employment_industry",
    "employment_occupation"
]]

In [7]:
# Merge features and labels based on respondent_id
data = labels_data.merge(features_data[desired_features], on="respondent_id")


In [16]:
# Separate features and target variables
features = data[desired_features]
target_xyz = data["xyz_vaccine"]  # Separate target for xyz vaccine
target_seasonal = data["seasonal_vaccine"]  # Separate target for seasonal vaccine

In [17]:
# Convert categorical features to numerical (if needed)
categorical_features = [col for col in features.columns if features[col].dtype == "object"]
for col in categorical_features:
    features[col] = features[col].astype("category").cat.codes

In [87]:
# Split data into training and testing sets for xyz vaccine
X_train, X_test, y_train_xyz, y_test_xyz = train_test_split(features, target_xyz, test_size=0.2)


In [88]:
#print(labels_data['xyz_vaccine'].value_counts())
print(y_test_xyz.value_counts())

xyz_vaccine
0    4186
1    1156
Name: count, dtype: int64


In [19]:
# Create LGBM dataset for xyz vaccine
lgb_train_xyz = lgb.Dataset(X_train, label=y_train_xyz)
lgb_eval_xyz = lgb.Dataset(X_test, label=y_test_xyz, reference=lgb_train_xyz)

In [121]:
# Train the model for xyz vaccine
params1 = {'task': 'train',
         'boosting_type': 'gbdt',
          'objective': 'binary',
         'metric' : 'auc' ,
         'is_imbalance': 1/4,
         'num_leaves': 1000,
         #'learning_rate':0.1,
         #'num_iterations': 1000
         }

In [122]:
model_xyz = lgb.train(params1, lgb_train_xyz, valid_sets=[lgb_eval_xyz])


[LightGBM] [Info] Number of positive: 4544, number of negative: 16821
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 21365, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212684 -> initscore=-1.308820
[LightGBM] [Info] Start training from score -1.308820


In [123]:
# Split data into training and testing sets for seasonal vaccine
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(features, target_seasonal, test_size=0.2)

In [124]:
#print(labels_data['xyz_vaccine'].value_counts())
print(y_test_seasonal.value_counts())

seasonal_vaccine
0    2820
1    2522
Name: count, dtype: int64


In [125]:
# Create LGBM dataset for seasonal vaccine
lgb_train_seasonal = lgb.Dataset(X_train, label=y_train_seasonal)
lgb_eval_seasonal = lgb.Dataset(X_test, label=y_test_seasonal, reference=lgb_train_seasonal)

In [126]:
# Train the model for seasonal vaccine
params2 = {"objective": "binary",
    "metric": "auc",  # Use AUC for multi-label evaluation
    "num_leaves": 31,
    #"learning_rate": 0.1,
    #"feature_fraction": 0.9,
    #"bagging_fraction": 0.8,
    #"bagging_freq": 5,
    "early_stopping_rounds": 10,
    "is_unbalance": True,
    "seed": 42
         }

In [127]:
# Train the model for seasonal vaccine
model_seasonal = lgb.train(params2, lgb_train_seasonal, valid_sets=[lgb_eval_seasonal])

[LightGBM] [Info] Number of positive: 9913, number of negative: 11452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003882 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 21365, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.463983 -> initscore=-0.144317
[LightGBM] [Info] Start training from score -0.144317
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.839041


In [128]:
# Make predictions on the testing set (for each vaccine model)
y_pred_xyz = model_xyz.predict(X_test)
y_pred_seasonal = model_seasonal.predict(X_test)

In [129]:
print("Shape of y_pred_xyz:", y_pred_xyz.shape)

Shape of y_pred_xyz: (5342,)


In [130]:
# Ensure y_test_xyz contains valid binary labels (0 or 1)
print(f"Unique values in y_test_xyz: {y_test_xyz.unique()}")


Unique values in y_test_xyz: [0 1]


In [131]:
# Calculate ROC AUC score using the first column
auc_xyz = roc_auc_score(y_test_xyz, y_pred_xyz[:], average="macro")
print(f"ROC AUC (xyz vaccine): {auc_xyz:.4f}")

ROC AUC (xyz vaccine): 0.5165


In [132]:
# Print the shape of y_pred_seasonal (should be (n_samples,))
print("Shape of y_pred_seasonal:", y_pred_seasonal.shape)

Shape of y_pred_seasonal: (5342,)


In [133]:
auc_seasonal = roc_auc_score(y_test_seasonal, y_pred_seasonal[:], average="macro")
print(f"ROC AUC (seasonal vaccine): {auc_seasonal:.4f}")

ROC AUC (seasonal vaccine): 0.8390


In [134]:
mean_auc = (auc_xyz + auc_seasonal) / 2
print(f"Mean ROC AUC: {mean_auc:.4f}")

Mean ROC AUC: 0.6778


In [135]:
# Use the trained model to predict probabilities on new data
new_data = test_data[desired_features]  # Select desired features from test data


In [136]:
# Predict probabilities for xyz vaccine using the trained model_xyz
xyz_proba = model_xyz.predict(new_data)[:]

# Predict probabilities for seasonal vaccine using the trained model_seasonal
seasonal_proba = model_seasonal.predict(new_data)[:]

In [138]:
# Create submission dataframe
submission = pd.DataFrame({
    "respondent_id": test_data["respondent_id"],
    "xyz_vaccine": xyz_proba,
    "seasonal_vaccine": seasonal_proba
})

submission.to_csv("/content/drive/MyDrive/DataHack/submission_format.csv", index=False)