In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


from sklearn.model_selection import GridSearchCV


from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix



In [2]:
training = pd.read_csv('training_set_features.csv')  # Training set features

In [3]:
for i, row in training[training['employment_occupation'].isna()].iterrows():
    conditions = (training["employment_status"]=="Unemployed")
    if conditions.any():
        training.loc[i, 'employment_occupation'] = 'unemployed_occupation'

In [4]:
for i, row in training[training['employment_occupation'].isna()].iterrows():
    conditions = (training["employment_status"]=="Not in Labor Force")
    if conditions.any():
        training_data.loc[i, 'employment_occupation'] = 'Not_in_laborforce'

In [5]:
for i, row in training[training['employment_occupation'].isna()].iterrows():
    conditions = (training["employment_status"]=="Employed")
    if conditions.any():
        training_data.loc[i, 'employment_occupation'] = 'other_occupation'

In [6]:
for i, row in training[training['employment_industry'].isna()].iterrows():
    conditions = (training["employment_status"]=="Unemployed")
    if conditions.any():
        training.loc[i, 'employment_industry'] = 'unemployed_in_industry'

In [7]:
for i, row in training[training['employment_industry'].isna()].iterrows():
    conditions = (training["employment_status"]=="Not in Labor Force")
    if conditions.any():
        training.loc[i, 'employment_industry'] = 'Not_in_laborforce'

In [8]:
for i, row in training[training['employment_industry'].isna()].iterrows():
    conditions = (training["employment_status"]=="Employed")
    if conditions.any():
        training.loc[i, 'employment_industry'] = 'other_industry'

In [9]:
income_poverty_nan = 'refused_or_unknown'
marital_status_nan = 'refused_or_other_category'
employment_status_nan = 'refused_or_other_category'
rent_or_own_nan = 'other_or_refused'
education_nan = 'refused_or_unknown'


training['income_poverty'].fillna(income_poverty_nan, inplace=True)
training['marital_status'].fillna(marital_status_nan, inplace=True)
training['employment_status'].fillna(employment_status_nan, inplace=True)
training['rent_or_own'].fillna(rent_or_own_nan, inplace=True)
training['education'].fillna(education_nan, inplace=True)

In [10]:
numeric_columns = training.select_dtypes('number').columns

category_columns = ['race', 'sex', 'marital_status', 'rent_or_own',  'hhs_geo_region',
       'census_msa', 'employment_industry', 'employment_occupation']

ordinal_columns = ['age_group', 'education', 'income_poverty', 'employment_status']

In [11]:
training[category_columns].isna().sum()

race                     0
sex                      0
marital_status           0
rent_or_own              0
hhs_geo_region           0
census_msa               0
employment_industry      0
employment_occupation    0
dtype: int64

In [12]:
le = LabelEncoder()

In [13]:
for features in ordinal_columns:
    training[features] = le.fit_transform(training[features])

In [14]:
training[ordinal_columns].head()

Unnamed: 0,age_group,education,income_poverty,employment_status
0,3,1,2,1
1,1,0,2,0
2,0,2,0,0
3,4,0,2,1
4,2,3,0,0


In [15]:
training = pd.get_dummies(training, columns=category_columns, drop_first=True)

In [16]:
training_minus = training.copy()


In [17]:
training_minus.fillna(-1, inplace=True)

In [18]:
training_minus.isna().sum(axis=0)

respondent_id                     0
h1n1_concern                      0
h1n1_knowledge                    0
behavioral_antiviral_meds         0
behavioral_avoidance              0
                                 ..
employment_occupation_vlluhbov    0
employment_occupation_xgwztkwe    0
employment_occupation_xqwwgdyp    0
employment_occupation_xtkaffoo    0
employment_occupation_xzmlyyjv    0
Length: 91, dtype: int64

In [19]:
labels = pd.read_csv('training_set_labels.csv')  # Training set features 

In [24]:
df_minus = pd.merge(training_minus, labels, on='respondent_id')

In [25]:
df_minus = df_minus.drop(['respondent_id'],axis=1)

In [26]:
X = df_minus.drop(['h1n1_vaccine','seasonal_vaccine'],axis=1)

In [27]:
y1 = df_mode[['h1n1_vaccine']]

In [28]:
y2 = df_mode[['seasonal_vaccine']]

In [29]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.2, random_state=101)

In [30]:
scaler = StandardScaler()

In [31]:
scaled_X_train1 = scaler.fit_transform(X_train1)

scaled_X_test1 = scaler.transform(X_test1)

In [32]:
svm_h1n1 = SVC()

svm_h1n1.fit(scaled_X_train1, y_train1.values.ravel())


svm_pred_h1n1 = svm_h1n1.predict(scaled_X_test1)


print(classification_report(y_test1, svm_pred_h1n1))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91      4199
           1       0.74      0.41      0.52      1143

    accuracy                           0.84      5342
   macro avg       0.80      0.68      0.71      5342
weighted avg       0.83      0.84      0.82      5342



In [33]:
cm_h1n1 = confusion_matrix(y_test1, svm_pred_h1n1)
print(cm_h1n1)

[[4036  163]
 [ 680  463]]


In [34]:
print(classification_report(y_test1, svm_pred_h1n1))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91      4199
           1       0.74      0.41      0.52      1143

    accuracy                           0.84      5342
   macro avg       0.80      0.68      0.71      5342
weighted avg       0.83      0.84      0.82      5342



In [35]:
from sklearn.metrics import accuracy_score

svm_acuuracy_h1n1 = accuracy_score(y_test1, svm_pred_h1n1)

svm_acuuracy_h1n1

0.8421939348558592

In [45]:
from sklearn.metrics import roc_auc_score

svm_decision_h1n1 = svm_h1n1.decision_function(scaled_X_test1)

auc_roc_h1n1 = roc_auc_score(y_test1, svm_decision_h1n1)

print('AUC-ROC score for tuned SVM H1n1 vaccine:', auc_roc_h1n1)

AUC-ROC score for tuned SVM H1n1 vaccine: 0.8412028694079351


In [37]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=101)

In [38]:
scaler = StandardScaler()

scaled_X_train2 = scaler.fit_transform(X_train2)
scaled_X_test2 = scaler.transform(X_test2)

In [39]:
svm_seasonal = SVC()

svm_seasonal.fit(scaled_X_train2, y_train2.values.ravel())


svm_pred_seasonal = svm_seasonal.predict(scaled_X_test2)



In [40]:
cm2 = confusion_matrix(y_test2, svm_pred_seasonal)
print(cm2)

[[2321  502]
 [ 749 1770]]


In [41]:
print(classification_report(y_test1, svm_pred_seasonal))

              precision    recall  f1-score   support

           0       0.88      0.65      0.75      4199
           1       0.35      0.69      0.46      1143

    accuracy                           0.66      5342
   macro avg       0.62      0.67      0.60      5342
weighted avg       0.77      0.66      0.69      5342



In [42]:
from sklearn.metrics import accuracy_score

svm_acuuracy_seasonal = accuracy_score(y_test2, svm_pred_seasonal)

svm_acuuracy_seasonal

0.7658180456757768

In [44]:
from sklearn.metrics import roc_auc_score

svm_decision_seasonal = svm_seasonal.decision_function(scaled_X_test2)

auc_roc_seasonal = roc_auc_score(y_test2, svm_decision_seasonal)

print('AUC-ROC score for tuned SVM seasonal vaccine:', auc_roc_seasonal)

AUC-ROC score for tuned SVM seasonal vaccine: 0.8442526138928277


In [46]:
import joblib


In [None]:
joblib.dump(svm_h1n1, 'svm_h1n1_minus.pkl')
joblib.dump(svm_seasonal, 'svm_seasonal_minus.pkl')
