# Model for the problem statement

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

In [2]:
train_data_features = pd.read_csv('training_set_features.csv')
test_data = pd.read_csv('test_set_features.csv')
train_data_labels=pd.read_csv('training_set_labels.csv')

In [3]:
train_data_features.shape

(26707, 36)

In [4]:
test_data.shape

(26708, 36)

In [5]:
train_data_features.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [6]:
test_data.isnull().sum()

respondent_id                      0
xyz_concern                       85
xyz_knowledge                    122
behavioral_antiviral_meds         79
behavioral_avoidance             213
behavioral_face_mask              19
behavioral_wash_hands             40
behavioral_large_gatherings       72
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            932
child_under_6_months             813
health_worker                    789
health_insurance               12228
opinion_xyz_vacc_effective       398
opinion_xyz_risk                 380
opinion_xyz_sick_from_vacc       375
opinion_seas_vacc_effective      452
opinion_seas_risk                499
opinion_seas_sick_from_vacc      521
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4497
m

### handling missing data and dropping columns

In [8]:
# have to drop columns who have more than 50% missing value.To find that made threshold value.
threshold = 0.5 * len(train_data_features)
cols_to_drop = [col for col in train_data_features.columns if train_data_features[col].isnull().sum() > threshold]
train_data_features = train_data_features.drop(columns=cols_to_drop)
test_data = test_data.drop(columns=cols_to_drop)


categorising the dataset


In [None]:
numerical_cols = train_data_features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_data_features.select_dtypes(include=['object']).columns

numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

train_data_features[numerical_cols] = numerical_imputer.fit_transform(train_data_features[numerical_cols])
train_data_features[categorical_cols] = categorical_imputer.fit_transform(train_data_features[categorical_cols])

test_data[numerical_cols] = numerical_imputer.transform(test_data[numerical_cols])
test_data[categorical_cols] = categorical_imputer.transform(test_data[categorical_cols])

In [10]:
train_data = train_data_features.join(train_data_labels.set_index('respondent_id'), on='respondent_id')

In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  float64
 1   xyz_concern                  26707 non-null  float64
 2   xyz_knowledge                26707 non-null  float64
 3   behavioral_antiviral_meds    26707 non-null  float64
 4   behavioral_avoidance         26707 non-null  float64
 5   behavioral_face_mask         26707 non-null  float64
 6   behavioral_wash_hands        26707 non-null  float64
 7   behavioral_large_gatherings  26707 non-null  float64
 8   behavioral_outside_home      26707 non-null  float64
 9   behavioral_touch_face        26707 non-null  float64
 10  doctor_recc_xyz              26707 non-null  float64
 11  doctor_recc_seasonal         26707 non-null  float64
 12  chronic_med_condition        26707 non-null  float64
 13  child_under_6_mo

In [12]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  float64
 1   xyz_concern                  26708 non-null  float64
 2   xyz_knowledge                26708 non-null  float64
 3   behavioral_antiviral_meds    26708 non-null  float64
 4   behavioral_avoidance         26708 non-null  float64
 5   behavioral_face_mask         26708 non-null  float64
 6   behavioral_wash_hands        26708 non-null  float64
 7   behavioral_large_gatherings  26708 non-null  float64
 8   behavioral_outside_home      26708 non-null  float64
 9   behavioral_touch_face        26708 non-null  float64
 10  doctor_recc_xyz              26708 non-null  float64
 11  doctor_recc_seasonal         26708 non-null  float64
 12  chronic_med_condition        26708 non-null  float64
 13  child_under_6_mo

combing the test dataset with train 

In [13]:
test_data = test_data.join(train_data_labels.set_index('respondent_id'), on='respondent_id')

In [14]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  float64
 1   xyz_concern                  26708 non-null  float64
 2   xyz_knowledge                26708 non-null  float64
 3   behavioral_antiviral_meds    26708 non-null  float64
 4   behavioral_avoidance         26708 non-null  float64
 5   behavioral_face_mask         26708 non-null  float64
 6   behavioral_wash_hands        26708 non-null  float64
 7   behavioral_large_gatherings  26708 non-null  float64
 8   behavioral_outside_home      26708 non-null  float64
 9   behavioral_touch_face        26708 non-null  float64
 10  doctor_recc_xyz              26708 non-null  float64
 11  doctor_recc_seasonal         26708 non-null  float64
 12  chronic_med_condition        26708 non-null  float64
 13  child_under_6_mo

### training data

In [16]:
from sklearn.multioutput import MultiOutputClassifier
X = train_data.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

# categorical and numerical columns
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns

#  preprocessor with column transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Scale features 
scaler = StandardScaler()

# Create a pipeline for preprocessing and scaling
preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', scaler)
])

# preprocessing pipeline to the features
X_processed = preprocessing_pipeline.fit_transform(X)

# Train the Logistic Regression model
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Logistic Regression with MultiOutputClassifier for multilabel problem
logistic_model = MultiOutputClassifier(LogisticRegression(solver='liblinear'))
logistic_model.fit(X_train, y_train)

# Make predictions
y_pred_proba = logistic_model.predict_proba(X_test)

# Extract the probability predictions for each label
y_pred_proba_xyz = y_pred_proba[0][:, 1]
y_pred_proba_seasonal = y_pred_proba[1][:, 1]


### Predict and evaluate

In [21]:
#Evaluate the model using ROC AUC score
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], y_pred_proba_xyz)
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], y_pred_proba_seasonal)

# Calculate the mean ROC AUC score
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

roc_auc_xyz_rounded = round(roc_auc_xyz, 3)
roc_auc_seasonal_rounded = round(roc_auc_seasonal, 3)
mean_roc_auc_rounded = round(mean_roc_auc, 3) 

print(f'ROC AUC Score for xyz_vaccine: {roc_auc_xyz_rounded}')
print(f'ROC AUC Score for seasonal_vaccine: {roc_auc_seasonal_rounded}')
print(f'Mean ROC AUC Score: {mean_roc_auc_rounded}')

ROC AUC Score for xyz_vaccine: 0.835
ROC AUC Score for seasonal_vaccine: 0.856
Mean ROC AUC Score: 0.845


In [19]:
from sklearn.model_selection import cross_val_score
# Evaluate the model using cross-validation
cross_val_scores = cross_val_score(MultiOutputClassifier(LogisticRegression(solver='liblinear')), X_processed, y, cv=5, scoring='roc_auc')

# Calculate the mean cross-validation score
mean_cross_val_score = np.mean(cross_val_scores)

print(f'Mean cross-validation ROC AUC score: {mean_cross_val_score}')

Mean cross-validation ROC AUC score: 0.843679545701144


In [27]:
import os
submission = pd.DataFrame({
    'respondent_id': train_data['respondent_id'],
    'xyz_vaccine': logistic_model.predict_proba(X_processed)[0][:, 1],
    'seasonal_vaccine': logistic_model.predict_proba(X_processed)[1][:, 1]
})

# Saving submission file
submission.to_csv('submission.csv', index=False)
file_exists = os.path.isfile('submission.csv')
print(f"File saved: {file_exists}")

# first few rows
if file_exists:
    saved_submission = pd.read_csv('submission.csv')
    print(saved_submission.head())

#path
file_path = os.path.join(current_directory, 'submission.csv')
print(f"File Path: {file_path}")


File saved: True
   respondent_id  xyz_vaccine  seasonal_vaccine
0            0.0     0.026849          0.040245
1            1.0     0.176353          0.110810
2            2.0     0.028422          0.089313
3            3.0     0.076435          0.927187
4            4.0     0.040042          0.057644
File Path: C:\Users\divya\miniconda3\submission.csv
