In [19]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier


In [20]:
# Load the data
X_train_df = pd.read_csv('Desktop/training_set_features.csv')
y_labels_df = pd.read_csv('Desktop/training_set_labels.csv')
X_test_df = pd.read_csv('Desktop/test_set_features.csv')
submission_df = pd.read_csv('Desktop/submission_format.csv')

In [21]:
df=pd.merge(X_train_df,y_labels_df,how='outer',on='respondent_id')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [24]:
missing_percent = ((df.isnull().sum()/len(df))*100).round(decimals=2)
missing_percent

respondent_id                   0.00
xyz_concern                     0.34
xyz_knowledge                   0.43
behavioral_antiviral_meds       0.27
behavioral_avoidance            0.78
behavioral_face_mask            0.07
behavioral_wash_hands           0.16
behavioral_large_gatherings     0.33
behavioral_outside_home         0.31
behavioral_touch_face           0.48
doctor_recc_xyz                 8.09
doctor_recc_seasonal            8.09
chronic_med_condition           3.64
child_under_6_months            3.07
health_worker                   3.01
health_insurance               45.96
opinion_xyz_vacc_effective      1.46
opinion_xyz_risk                1.45
opinion_xyz_sick_from_vacc      1.48
opinion_seas_vacc_effective     1.73
opinion_seas_risk               1.92
opinion_seas_sick_from_vacc     2.01
age_group                       0.00
education                       5.27
race                            0.00
sex                             0.00
income_poverty                 16.56
m

In [25]:
df.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [26]:
df.drop(["health_insurance", "employment_industry", "employment_occupation"], axis = 1, inplace= True)

In [27]:
df = df.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [28]:
df.isnull().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_region                 0
census_msa

In [29]:
# Preprocess the data
# Drop respondent_id as it is not a feature
X = df.drop(columns=['respondent_id'])
y = df[['xyz_vaccine', 'seasonal_vaccine']]

In [30]:
X

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,xyz_vaccine,seasonal_vaccine
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,0,0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,0,0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,"MSA, Not Principle City",0.0,0.0,0,1
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,0,0


In [31]:
y

Unnamed: 0,xyz_vaccine,seasonal_vaccine
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0
...,...,...
26702,0,0
26703,0,0
26704,0,1
26705,0,0


In [32]:
# One-hot encode categorical variables
categorical_columns = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
#  'employment_industry', 'employment_occupation'

In [33]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [35]:
# Initialize and train the model
model = MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss'))


In [36]:
# Train models
model.fit(X_train, y_train)


In [37]:
# Make predictions
y_pred_proba = model.predict_proba(X_test)


In [38]:
# Predictions to dataframe
y_pred_proba_df = pd.DataFrame({col: probs[:, 1] for col, probs in zip(y.columns, y_pred_proba)})


In [39]:
# Evaluate the model
roc_auc_macro = roc_auc_score(y_test, y_pred_proba_df, average="macro")

print(f'Macro ROC AUC: {roc_auc_macro}')


Macro ROC AUC: 1.0


In [40]:
# Prepare the submission file
X_test_df = pd.read_csv('Desktop/test_set_features.csv')
X_submit = X_test_df.drop(columns=['respondent_id'])
X_submit = pd.get_dummies(X_submit, columns=categorical_columns, drop_first=True)


In [41]:
# Ensure the training and submission data have the same columns
missing_cols = set(X.columns) - set(X_submit.columns)
for c in missing_cols:
    X_submit[c] = 0
X_submit = X_submit[X.columns]

X_submit = scaler.transform(X_submit)
# X_submit = selector.transform(X_submit)

In [42]:
# Make predictions for the submission set
submit_pred_proba = model.predict_proba(X_submit)
submit_pred_proba_df = pd.DataFrame({col: probs[:, 1] for col, probs in zip(y.columns, submit_pred_proba)})
submission = pd.DataFrame({'respondent_id': X_test_df['respondent_id'], 'xyz_vaccine': submit_pred_proba_df['xyz_vaccine'], 'seasonal_vaccine': submit_pred_proba_df['seasonal_vaccine']})


In [43]:
# Save the submission file
submission.to_csv('submission.csv', index=False)