<a href="https://colab.research.google.com/github/AzizAlbeshri/CSC4260-fluShotLearning/blob/SaifAlthubaiti511/fluShotLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Libraries**

In [16]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
# Import SimpleImputer
from sklearn.impute import SimpleImputer
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

**Loading the Datasets**

In [17]:
# Load the datasets
features_df = pd.read_csv('/content/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv')
labels_df = pd.read_csv('/content/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

# Combine into one dataframe
df = pd.concat([features_df, labels_df], axis=1)

**Data Inspection**

In [18]:
# Display the first few rows of each dataset to understand their structure
df.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,respondent_id.1,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,1,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,2,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,3,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,4,0,0


In [19]:
#The first few rows and data types of df_features
print(features_df.head())
print(features_df.dtypes)

   respondent_id  h1n1_concern  h1n1_knowledge  behavioral_antiviral_meds  \
0              0           1.0             0.0                        0.0   
1              1           3.0             2.0                        0.0   
2              2           1.0             1.0                        0.0   
3              3           1.0             1.0                        0.0   
4              4           2.0             1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0               

In [20]:
#The first few rows and data types of labels_df
print(labels_df.head())
print(labels_df.dtypes)

   respondent_id  h1n1_vaccine  seasonal_vaccine
0              0             0                 0
1              1             0                 1
2              2             0                 0
3              3             0                 1
4              4             0                 0
respondent_id       int64
h1n1_vaccine        int64
seasonal_vaccine    int64
dtype: object


# **Data preprocessing**

Upon inspection, the datasets we noticed:

1-Significant missing values in many columns: notable Columns with High Missing Rates: The employment_occupation, employment_industry, and health_insurance columns have a substantial proportion of missing values, exceeding 45% in some cases. Such a high level of missingness can introduce biases or inaccuracies in predictive models if not addressed appropriately.

2- Diverse Range of Feature Types our dataset comprises a mix of numerical, categorical, and binary features, reflecting a broad spectrum of factors that could influence vaccine uptake.

3-Given the nature of vaccine uptake data, there may be imbalances in the target variables (e.g., more individuals not receiving a vaccine than those who do). This can affect model performance, particularly for classification algorithms.

**Identifying Column Types:**
Numerical and categorical columns are identified to treat them appropriately during preprocessing. 'respondent_id' is removed from the numerical columns as it's not relevant for modeling.

In [21]:
# Identifying numerical and categorical columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

# Removing 'respondent_id' as it's not a feature for modeling
numerical_columns.remove('respondent_id')

**Imputing Missing Values:**
Imputation is done separately for numerical and categorical features using SimpleImputer. For numerical columns, the median is used to fill missing values, while the most frequent category fills gaps in categorical columns. This step addresses missing data, ensuring the model has a complete dataset to learn from.

In [22]:
# Imputing missing values for numerical features
numerical_imputer = SimpleImputer(strategy='median')
df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

# For categorical features
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])

**Encoding Categorical Variables:**
One-hot encoding is applied to categorical features to convert them into a format that can be provided to machine learning algorithms, enhancing model accuracy and interpretability.


In [23]:
# Encoding categorical variables
# One-hot encoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# The encoder first learns the categories present in each categorical feature (fit part) and then transforms these categorical features into one-hot encoded vectors (transform part).
encoded_categorical = encoder.fit_transform(df[categorical_columns])
# After encoding, each unique category within a feature becomes its own binary feature (column) in the output. Retrieves the names of these new binary features.
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
df_encoded_categorical = pd.DataFrame(encoded_categorical, columns=encoded_feature_names)

**Combining Features:**
Merging encoded categorical and numerical features into a single dataframe prepares the final dataset for model training. This step involves concatenating the preprocessed numerical and one-hot encoded categorical features, ensuring the model receives all relevant information.

In [24]:
# Combining encoded categorical features with numerical features
df_features_preprocessed = pd.concat([df[numerical_columns].reset_index(drop=True), df_encoded_categorical.reset_index(drop=True)], axis=1)

In [25]:
df_features_preprocessed.head(25)

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


**Split data**

In [26]:
# Get X features
X = df.drop(['h1n1_vaccine', 'seasonal_vaccine'], axis=1)
# Get y labels
y = df[['h1n1_vaccine','seasonal_vaccine']]

30% test data | 70% train data

In [27]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=68)