### Exploring Data

In [12]:
# %pip install scikit-learn  # uncomment and run once if scikit-learn is not installed

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer


In [15]:

# Locations of the training inputs, relative to the notebook's working directory.
features_path, labels_path = 'training_set_features.csv', 'training_set_labels.csv'

# Read the survey features into a DataFrame.
features = pd.read_csv(filepath_or_buffer=features_path)

In [16]:

# Read the two vaccination targets into a DataFrame.
labels = pd.read_csv(filepath_or_buffer=labels_path)

In [17]:

# Preview the first five feature rows.
features.head(n=5)

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [18]:
# Preview the first five label rows.
labels.head(n=5)


Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [7]:
# Count missing values per feature column.
features.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [8]:
# Count missing values per label column.
labels.isnull().sum()

respondent_id       0
xyz_vaccine         0
seasonal_vaccine    0
dtype: int64

#### Preprocessing

In [19]:
# X and y are paired purely by row position below, so first confirm that both
# files list the same respondents in the same order; otherwise the model would
# silently be trained against the wrong labels.
assert features['respondent_id'].equals(labels['respondent_id']), \
    "features and labels are not aligned on respondent_id"

# respondent_id is a row identifier, so it is excluded from the model inputs.
X = features.drop(['respondent_id'], axis=1)
# Two binary targets, predicted jointly by the multi-output classifier.
y = labels[['xyz_vaccine', 'seasonal_vaccine']]

# Split the columns by dtype so each group can get its own preprocessing:
# int64/float64 columns are treated as numeric, object columns as categorical.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


In [21]:
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# One random forest per target column, wrapped so both targets are fitted together.
# The fixed random_state keeps the forests reproducible.
multi_target_forest = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Full pipeline: preprocessing followed by the multi-output classifier.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', multi_target_forest),
])


#### Model Training

In [22]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit preprocessing and classifier on the training portion only.
model.fit(X_train, y_train)

# The multi-output model returns one probability array per target,
# in the column order of y (xyz_vaccine, then seasonal_vaccine).
y_pred_proba = model.predict_proba(X_test)

# Keep column 1 of each array, i.e. the predicted probability of class 1.
xyz_pred_proba, seasonal_pred_proba = (proba[:, 1] for proba in y_pred_proba)


#### ROC AUC Score

In [23]:
# ROC AUC on the hold-out split, computed separately for each target.
xyz_auc, seasonal_auc = (
    roc_auc_score(y_test[target], scores)
    for target, scores in (
        ('xyz_vaccine', xyz_pred_proba),
        ('seasonal_vaccine', seasonal_pred_proba),
    )
)

# Unweighted mean of the two per-target scores.
mean_auc = (xyz_auc + seasonal_auc) / 2

# Show all three values as the cell output.
xyz_auc, seasonal_auc, mean_auc

(0.8625760574506888, 0.8543647818233574, 0.8584704196370231)

#### Model Testing

In [24]:
test_features_path = 'test_set_features.csv'
test_features = pd.read_csv(test_features_path)

# The pipeline was fitted on features with respondent_id removed, so pass it the
# same set of columns at prediction time instead of depending on the
# ColumnTransformer to discard the extra identifier column.
test_X = test_features.drop(['respondent_id'], axis=1)

# One probability array per target, in the order the targets were fitted.
test_pred_proba = model.predict_proba(test_X)

# Column 1 of each array is the predicted probability of class 1.
test_xyz_pred_proba = test_pred_proba[0][:, 1]
test_seasonal_pred_proba = test_pred_proba[1][:, 1]

# Re-attach the identifiers from the original test frame so every prediction
# row can be traced back to its respondent.
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_xyz_pred_proba,
    'seasonal_vaccine': test_seasonal_pred_proba
})

submission.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.21,0.4
1,26708,0.03,0.03
2,26709,0.29,0.76
3,26710,0.66,0.83
4,26711,0.21,0.48


#### Saving the Submission to a CSV File

In [26]:
# Write the predictions without the DataFrame index, so the file contains only
# the columns of `submission`.
submission_path = 'submission.csv'
submission.to_csv(path_or_buf=submission_path, index=False)

# Echo the output location as the cell result.
submission_path


'submission.csv'