# 1 Business Understanding

# 2. Data Understanding 

# 3. Data Preparation

## 3.1 loading and exploring the dataset

In [4]:
# importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import statsmodels.api as sm
from scipy.stats import skew, kurtosis

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

### 3.1.1 Training_set_features

In [5]:
# Read the training-set features from the CSV next to the notebook
training_features_csv = 'training_set_features.csv'
training_set = pd.read_csv(training_features_csv)

In [6]:
# Preview the top of the features dataframe (5 rows is also head()'s default)
print("The first 5 rows of the training_set dataframe are:")
training_set.head(n=5)

The first 5 rows of the training_set dataframe are:


Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [None]:
# Check the first 5 rows of the dataframe
# NOTE(review): this repeats the preview cell directly above; possibly tail() was intended here — confirm
print("The first 5 rows of the training_set dataframe are:")
training_set.head()

In [None]:
# Display the dtype of every column in training_set (the bare last expression is rendered as the cell output)
print("The data types of the columns in the training_set dataframe are:")
training_set.dtypes

In [None]:
# Report the dataframe's dimensions as a (rows, columns) tuple
print(f"The shape of the training_set dataframe is: {training_set.shape}")

In [None]:
# Count the missing entries per column (isnull is an alias of isna)
print("The number of missing values in each column of the training_set dataframe is:")
training_set.isnull().sum()

In [None]:
# Summary statistics of training_set; describe() reports on the numeric columns by default,
# so the string-valued columns are not included in this table
print("The summary statistics of the numerical columns in the training_set dataframe are:")
training_set.describe()

### 3.1.2 Cleaning training_set_features

In [None]:
# Imputing the numeric survey columns.
# NOTE: despite the variable names, not every column in binary_columns_1 is a 0/1 flag: the
# preview of training_set above shows h1n1_concern = 3.0, h1n1_knowledge = 2.0 and
# household_adults = 2.0.
# Both lists are reused later when the test set is cleaned, so they must stay in sync with it.
binary_columns_1 = ['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'household_adults', 'household_children']
# presumably opinion ratings on an ordinal scale rather than 0/1 flags — TODO confirm against the data dictionary
binary_columns_2 = ['opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

# Replacing missing values in binary_columns_1 with 0
# (after this step a 0 can mean either a recorded 0 or a missing answer)
training_set[binary_columns_1] = training_set[binary_columns_1].fillna(0)

# Replacing missing values in binary_columns_2 with 1
# (after this step a 1 can mean either a recorded 1 or a missing answer)
training_set[binary_columns_2] = training_set[binary_columns_2].fillna(1)

In [None]:
# Replace missing entries in the string-valued columns with an explicit placeholder label
missing_strings = [
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'education',
    'employment_industry',
    'employment_occupation',
]

training_set[missing_strings] = training_set[missing_strings].fillna(value="Not Available")

In [None]:
# Re-count the missing entries per column after imputation (isna is an alias of isnull)
missing_values = training_set.isna().sum()
print("Number of missing values in each column: \n", missing_values)

In [None]:
# Check for duplicate rows: True if at least one row is an exact repeat of an earlier row
has_duplicates = training_set.duplicated().any()
has_duplicates

### 3.2.1 training_set_labels

In [None]:
# Read the training-set labels from the CSV next to the notebook
training_labels_csv = 'training_set_labels.csv'
training_label = pd.read_csv(training_labels_csv)

In [None]:
# Preview the top of the labels dataframe (5 rows is also head()'s default)
print("The first 5 rows of the training_label dataframe are:")
training_label.head(n=5)

In [None]:
# Check the last 5 rows of the dataframe
# (the message previously said "first 5 rows" although tail() returns the last rows)
print("The last 5 rows of the training_label dataframe are:")
training_label.tail()

In [None]:
# Display the dtype of every column in training_label (the bare last expression is rendered as the cell output)
print("The data types of the columns in the training_label dataframe are:")
training_label.dtypes

In [None]:
# Check the shape of the labels dataframe as a (rows, columns) tuple
# (the message previously named training_set although training_label.shape is printed)
print("The shape of the training_label dataframe is:", training_label.shape)

In [None]:
# Count the missing entries per column of the labels (isnull is an alias of isna)
print("The number of missing values in each column of the training_label dataframe is:")
training_label.isnull().sum()

In [None]:
# Check the summary statistics of the numerical columns in the labels dataframe
# (the message previously named training_set although training_label is described)
print("The summary statistics of the numerical columns in the training_label dataframe are:")
training_label.describe()

In [None]:
# Count the rows of training_label that exactly repeat an earlier row
training_label.duplicated().sum()

### 3.3.1 test_set_features

In [None]:
# Importing and parsing the test_set_features dataset
test_set = pd.read_csv('test_set_features.csv')

In [None]:
# Preview the top of the test features (5 rows is also head()'s default)
print("The first 5 rows of the test_set dataframe are:")
test_set.head(n=5)

In [None]:
# Preview the bottom of the test features (5 rows is also tail()'s default)
print("The last 5 rows of the test_set dataframe are:")
test_set.tail(n=5)

In [None]:
# Print the dtype of every column in test_set (plain-text output via print)
print("The data types of the columns in the test_set dataframe are:")
print(test_set.dtypes)

In [None]:
# Report the test dataframe's dimensions as a (rows, columns) tuple
print(f"The shape of the test_set dataframe is: {test_set.shape}")


In [None]:
# Count the missing entries per column of the test features (isnull is an alias of isna)
print("The number of missing values in each column of the test_set dataframe is:")
print(test_set.isnull().sum())

In [None]:
# Summary statistics of test_set; describe() reports on the numeric columns by default,
# so the string-valued columns are not included in this table
print("The summary statistics of the numerical columns in the test_set dataframe are:")
test_set.describe()

In [None]:
# Count the rows of test_set that exactly repeat an earlier row
test_duplicate_count = test_set.duplicated().sum()
print("The number of duplicated values in the test_set dataframe is:")
print(test_duplicate_count)

### 3.3.2 Cleaning test_set_features

In [None]:
#Removing the columns associated with the seasonal flu vaccine in the test dataframe
##rearranged_test_set = test_set.drop(['opinion_seas_sick_from_vacc', 'opinion_seas_risk', 'opinion_seas_vacc_effective', 'doctor_recc_seasonal'], axis=1)
#rearranged_test_set.head()

In [None]:
# Imputing the numeric survey columns of the test set with the same constants used for the
# training features: binary_columns_2 -> 1 and binary_columns_1 -> 0.
# Both column lists are defined in the training-set cleaning cell, so that cell must run first.
test_set[binary_columns_2] = test_set[binary_columns_2].fillna(1)
test_set[binary_columns_1] = test_set[binary_columns_1].fillna(0)

In [None]:
# Handling missing values in the test set's string-valued columns.
# The placeholder must be the same label that is used for the training features
# ("Not Available"). Filling with a different label (previously 'N/A') would give the
# test set a category that never occurs in the training data once the columns are encoded.
categorical_features = [
'income_poverty',
'marital_status',
'rent_or_own',
'employment_status',
'hhs_geo_region',
'education',
'employment_industry',
'employment_occupation'
]

test_set[categorical_features] = test_set[categorical_features].fillna("Not Available")

In [None]:
# Re-count the missing entries per column of the test set after imputation (isnull is an alias of isna)
missing_values = test_set.isnull().sum()
print("Number of missing values in each column: \n", missing_values)

## 3.4 Joining dataframes

In [None]:
# Attach the labels to the features by respondent_id; a left join keeps every feature row
joined_df = pd.merge(
    training_set,
    training_label,
    how='left',
    on='respondent_id',
)

# Preview the top of the merged dataframe (5 rows is also head()'s default)
print("The first 5 rows of the merged dataframe:")
joined_df.head(n=5)

In [None]:
# Check the last 5 rows of the dataframe
# (the message previously said "first 5 rows" although tail() returns the last rows)
print("The last 5 rows of the joined_df dataframe are:")
joined_df.tail()

In [None]:
# Report the merged dataframe's dimensions as a (rows, columns) tuple
print(f"The shape of the merged dataframe: {joined_df.shape}")


In [None]:
# Display the dtype of every column in joined_df (the bare last expression is rendered as the cell output)
print("The data types of the columns in the joined_df dataframe are:")
joined_df.dtypes

In [None]:
# Count the missing entries per column of the merged dataframe (isna is an alias of isnull)
missing = joined_df.isna().sum()
print("The number of missing values in each column:\n", missing)

In [None]:
# Summary statistics of joined_df; describe() reports on the numeric columns by default,
# so the string-valued columns are not included in this table
print("Statistical information of the numerical columns:")
joined_df.describe()

In [None]:
# Checking the number of unique values in each column
#print("Number of unique values in each column:")
#print(joined_df.nunique())

In [None]:
# Encoding categorical columns with low cardinality (less than or equal to 5 unique values)
#from sklearn.preprocessing import LabelEncoder
#for col in joined_df.columns:
 #   if joined_df[col].nunique() <= 5:
 #       le = LabelEncoder()
 #       joined_df[col] = le.fit_transform(joined_df[col])

# One-hot encoding categorical columns with high cardinality (more than 5 unique values)
#joined_df = pd.get_dummies(joined_df, columns=joined_df.columns, prefix=joined_df.columns, drop_first=True)

# Checking the updated number of unique values in each column
#print("Number of unique values in each column after encoding:")
#joined_df.nunique()


In [None]:
# Count the rows of the merged dataframe that exactly repeat an earlier row
duplicates = joined_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

## 4 Exploratory Data analysis

In [None]:
# Preview the first rows of the merged features + labels frame before the analysis
joined_df.head()

In [None]:
# Summary statistics with one row per column: transposing keeps the table readable
# when the frame has many columns

joined_df.describe().T


# 4.1.0 Univariate Analysis


In [None]:
# Global seaborn styling for the plots in this section
sns.set(rc={'figure.figsize': (12, 10)})
sns.set_palette("viridis")

# h1n1_vaccine is a 0/1 label, so show the number of respondents in each class.
# sns.distplot is deprecated and fits a density curve, which is not meaningful for a
# two-valued variable; countplot (already used elsewhere in this notebook) shows the
# class balance directly.
univariate_plot = sns.countplot(x='h1n1_vaccine', data=joined_df)
univariate_plot.set_title("Distribution of H1N1 Vaccine Uptake")
univariate_plot.set_xlabel("h1n1_vaccine")
univariate_plot.set_ylabel("Count")

plt.show()


In [None]:
# Skewness and kurtosis of the target column, computed with scipy.stats using its defaults
vaccine_skewness = skew(joined_df['h1n1_vaccine'])
print(f"The h1n1_vaccine variable has a skewness of {vaccine_skewness}.")
vaccine_kurtosis = kurtosis(joined_df['h1n1_vaccine'])
print(f"The h1n1_vaccine variable has a kurtosis of {vaccine_kurtosis}.")

The results indicate that the distribution of the h1n1_vaccine variable is positively skewed, meaning it has a long right tail. A positive skew is evident when the mean is greater than the median. Because h1n1_vaccine is a binary (0/1) label, this skew simply reflects class imbalance: fewer respondents are vaccinated (1) than unvaccinated (0).

The kurtosis of -0.0233 is close to 0, which is referred to as mesokurtic; the slightly negative sign indicates a distribution marginally flatter than a normal distribution. For a continuous variable this would suggest a shape close to normal, but a variable that takes only two values cannot be normally distributed, so this figure should not be read as evidence of normality for h1n1_vaccine.

# 4.1.2 Checking the distribution of Age groups

In [None]:
# Number of respondents in each age group
ax = sns.countplot(data=joined_df, x='age_group', palette='viridis')
ax.set_title('Distribution of Age Groups')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
plt.show()

# 4.2 Bivariate Analysis

In [None]:
# One histogram per numeric column, drawn on a shared 20x15 figure with 50 bins each
joined_df.hist(figsize=(20, 15), bins=50)
# tighten the layout so neighbouring subplot titles and tick labels do not overlap
plt.tight_layout()
plt.show()

The histograms above provide insight into the distribution of each column's data. By examining them, we can identify the range of values, the presence of outliers, and the overall shape of each column's distribution.
There are a lot of predictor variables in this dataset. To get the most information out of the EDA process, a correlation matrix is computed next so that the analysis can focus on the predictor variables that are related to the target variable.

In [None]:
# Correlation heatmap with viridis color palette.
# Only numeric columns can be correlated: the frame also holds string-valued columns
# (e.g. marital_status), and DataFrame.corr() raises on those in pandas >= 2.0.
# The matrix is computed once and reused for both the mask and the heatmap.
numeric_corr = joined_df.select_dtypes(include='number').corr()

# Hide the upper triangle (including the diagonal) so every pair is shown only once
mask = np.triu(np.ones_like(numeric_corr, dtype=bool))

plt.figure(figsize=(15,7))
sns.heatmap(numeric_corr, annot=True, cmap="viridis", fmt=".2f", mask=mask);
plt.title('Correlation between the columns')
plt.show()

doctor_recc_seasonal and doctor_recc_h1n1 are the most highly correlated

## 6.2.1 Does age_group  affect the intake of H1N1 vaccine ?

In [None]:
# Age groups in ascending order for the x-axis
order = ['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years']

# Overall share (in %, 2 decimals) of each vaccine label across all respondents;
# these figures are shown in the legend and are not per-age-group rates
vaccine_counts = joined_df['h1n1_vaccine'].value_counts()
total = len(joined_df)
vaccine_proportions = (vaccine_counts / total * 100).round(2)

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=joined_df, x='age_group', hue='h1n1_vaccine', order=order, palette='viridis', ax=ax)
ax.set_title("Age Group and H1N1 Vaccine Relationship")
ax.set_xlabel("Age Group")
ax.set_ylabel("Count")
ax.legend(
    title='Vaccine Taken',
    labels=[f'No ({vaccine_proportions[0]}%)', f'Yes ({vaccine_proportions[1]}%)'],
)
plt.show()


The plot shows the distribution of vaccinated and unvaccinated individuals in different age groups. The age group of 65+ Years has the highest number of both unvaccinated (6%) and vaccinated (20%) individuals, followed closely by age groups 18 - 34 Years, 45 - 54 Years, and 55 - 64 Years, each with 4,200 (16%) unvaccinated individuals. The age group of 35 - 44 Years has the lowest number of both vaccinated and unvaccinated individuals, with 3,100 (11.5%) vaccinated and 800 (3%) unvaccinated respondents.

## 6.2.2 Does education affect the intake of H1N1 vaccine?

In [None]:
# Respondent counts per education level, split by H1N1 vaccination status
ax = sns.countplot(data=joined_df, x='education', hue='h1n1_vaccine', palette='viridis')
ax.set_xlabel('Education')
ax.set_ylabel('Vaccination Count')
ax.set_title('H1N1 Vaccination Rate by Education')
plt.show()

The relationship between education and H1N1 vaccine uptake can be seen in the bar plot. However, it cannot be concluded from the plot alone whether education has a significant impact on the uptake of the H1N1 vaccine. College graduates and respondents with some college have higher vaccination counts than respondents with 12 years of education or fewer.

## 6.2.3 Does Insurance  affect the intake of H1N1 vaccine ?

In [None]:
# Respondent counts per health-insurance value, split by H1N1 vaccination status
ax = sns.countplot(data=joined_df, x='health_insurance', hue='h1n1_vaccine', palette='viridis')
ax.set_xlabel('Health Insurance')
ax.set_ylabel('Vaccination Count')
ax.set_title('H1N1 Vaccination Rate by Health Insurance')
plt.show()

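The data suggests that H1N1 vaccine uptake is higher among individuals with health insurance than among those without. Note that missing health_insurance answers were replaced with 0 during cleaning, so the "no insurance" group also contains respondents whose insurance status is unknown.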

## 6.2.4 Does a region  affect the intake of H1N1 vaccine ?

In [None]:
# Respondent counts per HHS geo region, split by H1N1 vaccination status
ax = sns.countplot(data=joined_df, x='hhs_geo_region', hue='h1n1_vaccine', palette='viridis')
ax.set_xlabel('HHS Geo Region')
ax.set_ylabel('Vaccination Count')
ax.set_title('H1N1 Vaccination Rate by HHS Geo Region')
# rotate the region codes so the tick labels do not overlap
plt.xticks(rotation=90)
plt.show()


It can be observed from the bar plot of vaccination rate by HHS geo-region that there is variation in the uptake of H1N1 vaccine across different geo-regions. The exact extent and nature of this variation can be determined by further analysis of the data.

# 6.2.5 Does  Race  affect the intake of H1N1 vaccine ?

In [None]:
# Plotting respondent counts per race, split by H1N1 vaccination status
sns.countplot(x='race', hue='h1n1_vaccine', data=joined_df, palette='viridis')
plt.xlabel('Race')
# countplot puts the number of respondents on the y-axis, not the h1n1_vaccine value
plt.ylabel('Count')
plt.title('Race by h1n1_vaccine')
plt.show()


The plot depicts that the racial distribution of H1N1 vaccine uptake is diverse. The largest group of both those who did and did not receive the vaccine are white respondents, with 62% not being vaccinated and 17% being vaccinated. For black respondents, the plot shows that 7% have not been vaccinated and 1% have. Among Hispanic individuals, 5% have not been vaccinated and 1.5% have. The remaining 6.5% of individuals from other races are comprised of 5% who have not been vaccinated and 1.5% who have been vaccinated against H1N1.

# 6.2.5 Does  employment_status  affect the intake of H1N1 vaccine ?

In [None]:
# Plotting respondent counts per employment status, split by H1N1 vaccination status
sns.countplot(x='employment_status', hue='h1n1_vaccine', data=joined_df, palette='viridis')
# Label the figure so it can be read on its own, like the other plots in this section
plt.xlabel('Employment Status')
plt.ylabel('Count')
plt.title('Employment Status by h1n1_vaccine')
plt.show()

The bar plot depicts the relationship between employment status and H1N1 vaccine uptake. Among the respondents, those who are employed had the greatest number of individuals who received the vaccine (2,900 or 11%) and those who did not (10,600 or 40%). The next highest group is made up of those who are not part of the labor force, with 8,000 (30%) unvaccinated and 2,300 (8.5%) vaccinated. Lastly, there is a relatively smaller group of unemployed individuals, with 1,200 (4.5%) unvaccinated and 250 (1%) vaccinated.

# 6.2.6 How does opinion on H1N1 vaccine Effectiveness vary by Age group

In [None]:
# Opinion on H1N1 vaccine effectiveness, counted per age group.
# The legend text is bound to the rating VALUES instead of being passed positionally to
# plt.legend(labels=...): a positional list of three labels is applied to the first three
# hue levels in order, which mislabels the ratings and drops any further levels.
# NOTE(review): the 1-5 wording below follows the survey's data dictionary — confirm.
# Missing ratings were imputed as 1 during cleaning, so level 1 also contains them.
opinion_labels = {
    1: 'Not at all effective',
    2: 'Not very effective',
    3: "Don't know",
    4: 'Somewhat effective',
    5: 'Very effective',
}
opinion_plot_df = joined_df.assign(
    opinion_label=joined_df['opinion_h1n1_vacc_effective'].map(opinion_labels)
)

plt.figure(figsize=(11,10))
sns.countplot(
    x='age_group',
    hue='opinion_label',
    hue_order=list(opinion_labels.values()),
    data=opinion_plot_df,
)
plt.title("Opinion on H1N1 Vaccine Effectiveness by Age Group", fontsize=15)
plt.xlabel("Age Group", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.legend(title='Opinion on H1N1 Vaccine Effectiveness')
plt.show()

In [7]:
# Scatter plot of h1n1_concern vs h1n1_knowledge using seaborn.
# Requires joined_df, which is created in the "Joining dataframes" section: run the
# notebook from the top, otherwise this cell fails with a NameError.
# No palette is passed: without a hue variable seaborn has nothing to colour by, so the
# argument was ignored and only triggered a warning.
sns.scatterplot(x='h1n1_concern', y='h1n1_knowledge', data=joined_df)

# Add axis labels
plt.xlabel('h1n1_concern')
plt.ylabel('h1n1_knowledge')

# Show the plot
plt.show()

NameError: name 'joined_df' is not defined