<a href="https://colab.research.google.com/github/AzizAlbeshri/CSC4260-fluShotLearning/blob/SaifAlthubaiti511/fluShotLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Libraries**

In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
# Import SimpleImputer
from sklearn.impute import SimpleImputer
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

**Loading the Datasets**

In [None]:
# Directory that holds the competition CSV files. '/content' is the value the
# notebook originally hard-coded (the Colab working directory); change this
# single constant to run the notebook somewhere else.
DATA_DIR = '/content'

# Load the training features and the training labels into separate DataFrames
df_features = pd.read_csv(f'{DATA_DIR}/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv')
df_labels = pd.read_csv(f'{DATA_DIR}/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

**Data Inspection**

In [None]:
# Peek at the top rows of both tables; the cell's value is a (features, labels) tuple
features_preview = df_features.head()
labels_preview = df_labels.head()
features_preview, labels_preview

(   respondent_id  h1n1_concern  h1n1_knowledge  behavioral_antiviral_meds  \
 0              0           1.0             0.0                        0.0   
 1              1           3.0             2.0                        0.0   
 2              2           1.0             1.0                        0.0   
 3              3           1.0             1.0                        0.0   
 4              4           2.0             1.0                        0.0   
 
    behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
 0                   0.0                   0.0                    0.0   
 1                   1.0                   0.0                    1.0   
 2                   1.0                   0.0                    0.0   
 3                   1.0                   0.0                    1.0   
 4                   1.0                   0.0                    1.0   
 
    behavioral_large_gatherings  behavioral_outside_home  \
 0                          0.

In [None]:
# Feature table: print the leading rows, then the dtype of every column
print(df_features.head(), df_features.dtypes, sep='\n')

   respondent_id  h1n1_concern  h1n1_knowledge  behavioral_antiviral_meds  \
0              0           1.0             0.0                        0.0   
1              1           3.0             2.0                        0.0   
2              2           1.0             1.0                        0.0   
3              3           1.0             1.0                        0.0   
4              4           2.0             1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0               

In [None]:
# Label table: print the leading rows, then the dtype of every column
for labels_view in (df_labels.head(), df_labels.dtypes):
    print(labels_view)

   respondent_id  h1n1_vaccine  seasonal_vaccine
0              0             0                 0
1              1             0                 1
2              2             0                 0
3              3             0                 1
4              4             0                 0
respondent_id       int64
h1n1_vaccine        int64
seasonal_vaccine    int64
dtype: object


**Identifying Column Types:**
Numerical and categorical columns are identified to treat them appropriately during preprocessing. 'respondent_id' is removed from the numerical columns as it's not relevant for modeling.

In [None]:
# Split the feature columns by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical
numerical_columns = list(df_features.select_dtypes(include=['int64', 'float64']).columns)
categorical_columns = list(df_features.select_dtypes(include=['object']).columns)

# 'respondent_id' is a row identifier rather than a modeling feature, so it is
# taken out of the numerical list (raises ValueError if the column is absent)
numerical_columns.remove('respondent_id')

**Imputing Missing Values:**
Imputation is done separately for numerical and categorical features using SimpleImputer. For numerical columns, the median is used to fill missing values, while the most frequent category fills gaps in categorical columns. This step addresses missing data, ensuring the model has a complete dataset to learn from.

In [None]:
# NOTE: both steps below overwrite columns of df_features in place, so the raw
# (un-imputed) values are no longer available after this cell has run.
# NOTE(review): the imputers are fitted on the whole frame; if a train/test
# split happens later, consider fitting on the training rows only -- confirm.

# Numerical features: learn each column's median, then fill the gaps with it
numerical_imputer = SimpleImputer(strategy='median')
numerical_imputer.fit(df_features[numerical_columns])
df_features[numerical_columns] = numerical_imputer.transform(df_features[numerical_columns])

# Categorical features: learn each column's most frequent value, then fill the gaps with it
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_imputer.fit(df_features[categorical_columns])
df_features[categorical_columns] = categorical_imputer.transform(df_features[categorical_columns])

**Encoding Categorical Variables:**
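One-hot encoding is applied to the categorical features: each distinct category becomes its own binary (0/1) column, turning text labels into a numeric format that machine learning algorithms can consume. No artificial ordering is imposed on the categories, which keeps the encoded features easy to interpret.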


In [None]:
# One-hot encode the categorical features.
#
# Dense output is requested so the result can be wrapped in a DataFrame
# directly. The keyword for that was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 and the old spelling was removed in 1.4, so the current
# name is tried first and the old one is used only on older releases.
# handle_unknown='ignore' makes categories that were not seen during fit
# encode as all zeros at transform time instead of raising.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# fit_transform learns the categories present in each categorical column and
# returns the corresponding one-hot matrix
encoded_categorical = encoder.fit_transform(df_features[categorical_columns])

# Every unique category of every feature became its own binary column;
# retrieve the generated names of those columns
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Reuse df_features' index so the encoded rows stay aligned with their source rows
df_encoded_categorical = pd.DataFrame(
    encoded_categorical,
    columns=encoded_feature_names,
    index=df_features.index,
)



**Combining Features:**
Merging encoded categorical and numerical features into a single DataFrame prepares the final dataset for model training. This step involves concatenating the preprocessed numerical and one-hot encoded categorical features, ensuring the model receives all relevant information.

In [None]:
# Build the final feature matrix: numerical columns first, one-hot columns after.
# Both pieces get a fresh 0..n-1 index so they are joined row by row (by position).
feature_blocks = [
    df_features[numerical_columns].reset_index(drop=True),
    df_encoded_categorical.reset_index(drop=True),
]
df_features_preprocessed = pd.concat(feature_blocks, axis=1)

In [None]:
# Sanity check: show the first five rows of the combined, fully numeric feature table
df_features_preprocessed.head(5)

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
