# H1N1 Flu Prediction

In [45]:
#Importing the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

sns.set(style="white", context="notebook", palette="deep")
pd.options.display.max_rows = 200

In [2]:
#Importing the Data
train_X = pd.read_csv(r'...\training_set_features.csv')
train_Y = pd.read_csv(r'...\training_set_labels.csv')
test_X = pd.read_csv(r'...\test_set_features.csv')

dataset = pd.concat((train_X, test_X), axis = 0, ignore_index=True)

#### First we need to clean the data and remove all the null values

In [3]:
dataset.isnull().sum()

respondent_id                      0
h1n1_concern                     177
h1n1_knowledge                   238
behavioral_antiviral_meds        150
behavioral_avoidance             421
behavioral_face_mask              38
behavioral_wash_hands             82
behavioral_large_gatherings      159
behavioral_outside_home          164
behavioral_touch_face            256
doctor_recc_h1n1                4320
doctor_recc_seasonal            4320
chronic_med_condition           1903
child_under_6_months            1633
health_worker                   1593
health_insurance               24502
opinion_h1n1_vacc_effective      789
opinion_h1n1_risk                768
opinion_h1n1_sick_from_vacc      770
opinion_seas_vacc_effective      914
opinion_seas_risk               1013
opinion_seas_sick_from_vacc     1058
age_group                          0
education                       2814
race                               0
sex                                0
income_poverty                  8920
m

#### Now we convert the Categorical Variables into groups of Dummy Variables

In [4]:
dataset = pd.get_dummies(dataset, columns=['age_group',
                                           'education',
                                           'race',
                                           'sex',
                                           'income_poverty',
                                           'marital_status',
                                           'rent_or_own',
                                           'employment_status',
                                           'employment_industry',
                                           'employment_occupation',
                                           'hhs_geo_region',
                                           'census_msa',
                                           'health_insurance'],dummy_na=True)
dataset = dataset.drop(['respondent_id'], axis=1)

#### None of the continuous variables have more than 5% null values. Thus imputing with the statistical mode will not change the distribution of the data

In [5]:
mf_imputer = SimpleImputer(strategy="most_frequent")
dataset = pd.DataFrame(mf_imputer.fit_transform(dataset))

In [6]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,109,110,111,112,113,114,115,116,117,118
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
scaler = StandardScaler()
dataset = scaler.fit_transform(dataset)

#### We can reduce the dimensionality by applying PCA over the dataset

In [8]:
pca = PCA(n_components = 118)
pca.fit(dataset)
print(pca.explained_variance_ratio_.cumsum())

[0.05465351 0.09414192 0.12807185 0.16017798 0.18331442 0.20374225
 0.22328407 0.2422337  0.25982166 0.27632395 0.29222597 0.30792893
 0.32300316 0.3375437  0.35157603 0.36550629 0.37922303 0.39268312
 0.40565064 0.41840927 0.43101092 0.44357903 0.45578206 0.46767296
 0.47941967 0.49109893 0.50268177 0.5139817  0.52520192 0.53627986
 0.54724308 0.55803837 0.56861704 0.57898631 0.58920534 0.59928817
 0.609234   0.61898105 0.62867256 0.63827606 0.6477823  0.65719543
 0.66654443 0.67583922 0.68496408 0.69397371 0.70297955 0.71186518
 0.72072551 0.72951992 0.73829713 0.74697341 0.75561167 0.76421213
 0.77269002 0.78107353 0.78942957 0.79753646 0.80549505 0.81341088
 0.82117843 0.828882   0.83639265 0.84372596 0.85094066 0.85813176
 0.86522977 0.87213572 0.87878126 0.88523624 0.89152453 0.89763404
 0.90372361 0.90947155 0.91514905 0.92060775 0.9260516  0.93131944
 0.93651532 0.94157989 0.94637283 0.95113068 0.95542948 0.95964374
 0.96344348 0.96716896 0.97077485 0.97432847 0.97781335 0.9810

#### We aim to keep 98% of the variance

In [9]:
pca_fin = PCA(n_components=90)
dataset = pca_fin.fit_transform(dataset)

In [10]:
dataset = pd.DataFrame(dataset)

In [11]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,2.477269,-0.939399,-2.810101,1.67745,0.960905,-0.907742,-0.122672,-1.947278,0.287389,-1.336138,...,-0.1265,0.082598,0.109705,-0.122186,-0.062488,-0.842833,-1.009889,-0.677297,0.411287,-0.074
1,-1.099628,-0.248041,1.754716,2.819024,-0.476688,-0.533966,-1.517075,1.717396,0.083797,2.123076,...,-0.948365,0.338174,-0.107095,-0.371576,-0.091988,-1.310901,-0.907987,-0.857585,-0.251214,-0.124386
2,-2.151864,1.75396,-2.422692,1.855613,0.260078,-0.102172,-0.52067,0.288324,-0.006329,-0.491008,...,0.367216,-0.129235,0.259698,0.339762,0.640877,-0.641961,0.007404,-0.071285,-0.504422,-0.059059
3,3.564117,-2.121177,0.11136,1.503419,1.101802,0.880085,-0.830469,0.791518,-0.262625,-0.586774,...,1.021343,-0.184607,-1.845998,-0.350741,0.215428,0.849324,1.044665,0.488705,0.803333,0.131655
4,-2.207815,0.12845,0.54758,-0.965854,-0.269713,-0.799702,-3.623682,-2.019973,2.790949,-0.874182,...,-0.628074,0.439784,0.549139,-0.047243,0.257834,-0.336099,1.493621,0.6103,-0.997458,-0.236407


In [12]:
train_X = dataset.iloc[:26707,:]
test_X = dataset.iloc[26707:,:]

#### We should now split the dataset into training and cross validation sets.

In [23]:
X_train, X_cv, y_train, y_cv = train_test_split(train_X, train_Y, test_size=0.3)

#### Random Forest Classifiers have been used as a model. For a better fit we can increase the number of trees - "n_estimators"

In [35]:
h1n1_class = RandomForestClassifier(n_estimators = 100, n_jobs=2)

In [36]:
h1n1_class.fit(X_train, y_train['h1n1_vaccine'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [37]:
seasonal_class = RandomForestClassifier(n_estimators = 100, n_jobs=2)

In [38]:
seasonal_class.fit(X_train, y_train['seasonal_vaccine'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [39]:
y_pred_h1n1 = h1n1_class.predict(X_cv)
y_pred_seasonal = seasonal_class.predict(X_cv)

#### Now the accuracy of the models can be checked

In [41]:
print(accuracy_score(y_cv["h1n1_vaccine"], y_pred_h1n1))

0.8315237738674653


In [42]:
print(accuracy_score(y_cv["seasonal_vaccine"], y_pred_seasonal))

0.7461624859603145


#### The accuracies arent in the 90s as I would like them to be. Improvements can be made in Feature Engineering and Data Preprocessing