In [41]:
import zipfile
import os

zip_path = '/content/dataset and all.zip'
extract_path = '/content/your_extracted_files'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Files extracted to: {extract_path}")

Files extracted to: /content/your_extracted_files


# **Load Datasets**

In [42]:
import pandas as pd
train_feature_df = pd.read_csv('/content/your_extracted_files/training_set_features.csv')
test_feature_df = pd.read_csv('/content/your_extracted_files/test_set_features.csv')
train_labels_df = pd.read_csv('/content/your_extracted_files/training_set_labels.csv')

In [43]:
train_feature_df.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [46]:
train_feature_df.shape

(26707, 36)

In [45]:
test_feature_df.isnull().sum()

respondent_id                      0
xyz_concern                       85
xyz_knowledge                    122
behavioral_antiviral_meds         79
behavioral_avoidance             213
behavioral_face_mask              19
behavioral_wash_hands             40
behavioral_large_gatherings       72
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            932
child_under_6_months             813
health_worker                    789
health_insurance               12228
opinion_xyz_vacc_effective       398
opinion_xyz_risk                 380
opinion_xyz_sick_from_vacc       375
opinion_seas_vacc_effective      452
opinion_seas_risk                499
opinion_seas_sick_from_vacc      521
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4497
m

In [47]:
test_feature_df.shape

(26708, 36)

# **Nominal & Categorical features seperation**

In [48]:
numerical_cols = train_feature_df.select_dtypes(include=['number']).columns
categorical_cols = train_feature_df.select_dtypes(include=['object']).columns

# **Numerical missing values are filled with the mean of corresponding features**

In [49]:
train_feature_df[numerical_cols] = train_feature_df[numerical_cols].fillna(train_feature_df[numerical_cols].mean())

In [50]:
test_feature_df[numerical_cols] = test_feature_df[numerical_cols].fillna(train_feature_df[numerical_cols].mean())

# **Categorical features are filled with mode**

In [51]:
train_feature_df[categorical_cols] = train_feature_df[categorical_cols].fillna(train_feature_df[categorical_cols].mode().iloc[0])

In [52]:
test_feature_df[categorical_cols] = test_feature_df[categorical_cols].fillna(test_feature_df[categorical_cols].mode().iloc[0])

In [53]:
train_data = pd.concat([train_feature_df, train_labels_df], axis=1)


In [54]:
train_data.shape

(26707, 39)

In [55]:
# Transpose the DataFrame, drop duplicate columns, and transpose back for removing duplicate columns
train_data = train_data.T.drop_duplicates().T
train_data.shape

(26707, 38)

In [56]:
test_feature_df.shape

(26708, 36)

# **Encoding of categorical features**

In [57]:
train_data = pd.get_dummies(train_data, columns=['age_group','education','race','sex','income_poverty','marital_status','rent_or_own','employment_status','census_msa'])

In [58]:
test_feature_df = pd.get_dummies(test_feature_df, columns=['age_group','education','race','sex','income_poverty','marital_status','rent_or_own','employment_status','census_msa'])

In [59]:
train_data.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,marital_status_Married,marital_status_Not Married,rent_or_own_Own,rent_or_own_Rent,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,False,True,True,False,False,True,False,False,False,True
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,False,True,False,True,True,False,False,True,False,False
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,True,True,False,True,False,False,True,False,False
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,False,True,False,True,False,True,False,False,True,False
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,True,False,True,False,True,False,False,True,False,False


In [60]:
test_feature_df.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,marital_status_Married,marital_status_Not Married,rent_or_own_Own,rent_or_own_Rent,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,False,True,False,True,True,False,False,True,False,False
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,True,True,False,False,False,False,True
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,True,False,True,False,True,False,False,False,False,True
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,True,False,False,True,False,True,False,False
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,False,True,True,False,True,False,False,False,False,True


In [62]:
categorical_cols

Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa', 'employment_industry', 'employment_occupation'],
      dtype='object')

In [None]:
!pip install category_encoders

In [64]:
import category_encoders as ce

In [65]:
encoder = ce.BinaryEncoder(cols=['hhs_geo_region','employment_industry','employment_occupation'])

# Fit and transform the data
train_data_encoded = encoder.fit_transform(train_data)
test_data_encoded = encoder.fit_transform(test_feature_df)

In [66]:
train_data_encoded.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,marital_status_Married,marital_status_Not Married,rent_or_own_Own,rent_or_own_Rent,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,False,True,True,False,False,True,False,False,False,True
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,False,True,False,True,True,False,False,True,False,False
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,True,True,False,True,False,False,True,False,False
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,False,True,False,True,False,True,False,False,True,False
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,True,False,True,False,True,False,False,True,False,False


In [67]:
train_data_encoded.shape

(26707, 68)

In [68]:
test_data_encoded.shape

(26708, 66)

In [69]:
columns_to_remove= ['xyz_vaccine','seasonal_vaccine']
train_data_encoded = train_data_encoded.drop(columns=columns_to_remove)

In [70]:
train_data_encoded = pd.concat([train_data_encoded, train_labels_df], axis=1)


In [71]:
train_data_encoded.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own_Rent,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA,respondent_id.1,xyz_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,False,False,True,False,False,False,True,0,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,True,True,False,False,True,False,False,1,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,2,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,True,False,True,False,False,True,False,3,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,False,True,False,False,True,False,False,4,0,0


In [72]:
train_data_encoded = train_data_encoded.T.drop_duplicates().T
train_data_encoded.sample()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own_Own,rent_or_own_Rent,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA,xyz_vaccine,seasonal_vaccine
19349,19349,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,True,False,False,True,False,False,True,False,1,1


# **Train-Test-Split**

In [91]:
from sklearn.model_selection import train_test_split

X = train_data_encoded.drop(columns=['respondent_id','xyz_vaccine','seasonal_vaccine'])
y = train_data_encoded[['xyz_vaccine', 'seasonal_vaccine']]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [92]:
X

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,marital_status_Married,marital_status_Not Married,rent_or_own_Own,rent_or_own_Rent,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,True,True,False,False,True,False,False,False,True
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,False,True,False,True,True,False,False,True,False,False
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.220312,...,False,True,True,False,True,False,False,True,False,False
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,False,True,False,True,False,True,False,False,True,False
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,True,False,True,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,False,True,True,False,False,True,False,False,False,True
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,False,True,False,True,True,False,False,False,True,False
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,False,True,True,False,True,False,False,True,False,False
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.677264,0.0,...,True,False,False,True,True,False,False,False,False,True


# **Train the model**

In [93]:
print(type(y_train))
if not isinstance(y_train, pd.DataFrame):
    y_train = pd.DataFrame(y_train)

<class 'pandas.core.frame.DataFrame'>


In [94]:
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for column in y_train.columns:
    if y_train[column].dtype == 'object' or not np.issubdtype(y_train[column].dtype, np.number):
        le = LabelEncoder()
        y_train[column] = le.fit_transform(y_train[column])
        label_encoders[column] = le

In [95]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

In [96]:
import numpy as np

In [97]:
label_encoders = {}
for column in y_val.columns:
    if y_val[column].dtype == 'object' or not np.issubdtype(y_val[column].dtype, np.number):
        le = LabelEncoder()
        y_val[column] = le.fit_transform(y_val[column])
        label_encoders[column] = le

In [98]:
from sklearn.metrics import roc_auc_score

# Predict probabilities
y_val_pred = model.predict_proba(X_val)

# Calculate ROC AUC score for each target
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_val_pred[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_val_pred[1][:, 1])

# Mean ROC AUC score
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2
print(f'Mean ROC AUC: {mean_roc_auc}')

Mean ROC AUC: 0.8585198086610649


In [99]:
label_encoders = {}
for column in test_data_encoded.columns:
    if test_data_encoded[column].dtype == 'object' or not np.issubdtype(test_data_encoded[column].dtype, np.number):
        le = LabelEncoder()
        test_data_encoded[column] = le.fit_transform(test_data_encoded[column])
        label_encoders[column] = le

In [100]:
test_preds = model.predict_proba(test_data_encoded.drop(columns=['respondent_id']))

# Create a submission dataframe
submission = pd.DataFrame({
    'respondent_id': test_data_encoded['respondent_id'],
    'seasonal_vaccine': test_preds[1][:, 1],
    'xyz_vaccine': test_preds[0][:, 1]
})

# Save the submission file
submission.to_csv('submission.csv', index=False)