# Import libraries:

In [1]:
import pickle
import pandas as pd
import numpy as np

# Import Models:

In [4]:
# Load the two pickled models used below, one per target column of the submission.
# NOTE(review): pickle.load can execute arbitrary code, so only unpickle files from a trusted source.
with open('Models/model_h1n1.pkl', 'rb') as h1n1_model_file:
    h1n1_best = pickle.load(h1n1_model_file)

with open('Models/model_seasonal.pkl', 'rb') as seasonal_model_file:
    seasonal_best = pickle.load(seasonal_model_file)


# Data preprocessing with submission dataframe:
## Import the submission DataFrame and the submission format:

In [5]:
# Read the test-set features and the submission template, both indexed by respondent_id.
predictions_df = pd.read_csv('data/test_set_features.csv', index_col='respondent_id')
submissions_df = pd.read_csv('Submissions/submission_format.csv', index_col='respondent_id')

# Print both shapes so the row counts of the two frames can be compared.
for frame in (predictions_df, submissions_df):
    print(frame.shape)

predictions_df.head()

(26708, 35)
(26708, 2)


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


## NaN's

In [6]:
# Categorical features whose missing values are replaced by an explicit "Missing" level.
multiple_categorical = [
    "employment_occupation",
    "employment_industry",
    "employment_status",
]

# Two-level categorical features; they are converted to integer flags and then
# dropped further down. Kept at top level (not inside the loop below) so the list
# is defined exactly once, regardless of how many times the loop body runs.
single_categorical = [
    "rent_or_own",
    "marital_status",
]

# Replace the NaN's with "Missing"
for col in multiple_categorical:
    predictions_df[col] = predictions_df[col].fillna(value="Missing")

def convert_own(dataframe):
    """Encode one row's ``rent_or_own`` value as an integer flag.

    Parameters
    ----------
    dataframe : row-like object (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must support ``dataframe["rent_or_own"]``.

    Returns
    -------
    int
        1 for ``"Own"``, 0 for ``"Rent"``, and -1 for any other value
        (which includes missing values).
    """
    tenure = dataframe["rent_or_own"]
    for label, code in (("Own", 1), ("Rent", 0)):
        if tenure == label:
            return code
    # Anything unrecognised (e.g. NaN) gets the sentinel.
    return -1

# Apply convert_own to every row and store the result in the new "own_home" column.
predictions_df["own_home"] = predictions_df.apply(convert_own, axis="columns")


def convert_marr(dataframe):
    """Encode one row's ``marital_status`` value as an integer flag.

    Parameters
    ----------
    dataframe : row-like object (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must support ``dataframe["marital_status"]``.

    Returns
    -------
    int
        1 for ``"Married"``, 0 for ``"Not Married"``, and -1 for any other value
        (which includes missing values).
    """
    status = dataframe["marital_status"]
    for label, code in (("Married", 1), ("Not Married", 0)):
        if status == label:
            return code
    # Anything unrecognised (e.g. NaN) gets the sentinel.
    return -1

# Apply convert_marr to every row and store the result in the new "married" column.
predictions_df["married"] = predictions_df.apply(convert_marr, axis="columns")

# Removing old features: the raw two-level columns are no longer needed once
# their integer-flag counterparts ("own_home", "married") exist.
predictions_df.drop(columns=single_categorical, inplace=True)
    
# Ordinal features stored as text; they are mapped to integer levels in the
# "Ordinal Encoding" step and dropped afterwards.
ordinal_non_num = ["income_poverty", "education"]

# Columns whose missing values are filled with the sentinel -1 below.
numerical = [
    "health_insurance",
    "doctor_recc_h1n1",
    "doctor_recc_seasonal",
    "chronic_med_condition",
    "child_under_6_months",
    "health_worker",
    "opinion_seas_sick_from_vacc",
    "opinion_seas_risk",
    "opinion_seas_vacc_effective",
    "opinion_h1n1_sick_from_vacc",
    "opinion_h1n1_vacc_effective",
    "opinion_h1n1_risk",
    "household_adults",
    "household_children",
    "behavioral_avoidance",
    "behavioral_touch_face",
    "h1n1_knowledge",
    "h1n1_concern",
    "behavioral_large_gatherings",
    "behavioral_outside_home",
    "behavioral_antiviral_meds",
    "behavioral_wash_hands",
    "behavioral_face_mask",
]

# Fill all listed columns in one assignment instead of looping column by column.
predictions_df[numerical] = predictions_df[numerical].fillna(value=-1)
    

### Ordinal Encoding:

In [7]:
def convert_income(dataframe):
    """Map one row's ``income_poverty`` label to an ordinal integer.

    Parameters
    ----------
    dataframe : row-like object (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must support ``dataframe["income_poverty"]``.

    Returns
    -------
    int
        2 for ``"> $75,000"``, 1 for ``"<= $75,000, Above Poverty"``,
        0 for ``"Below Poverty"``, and -1 for any other value
        (which includes missing values).
    """
    income = dataframe["income_poverty"]
    levels = (
        ("> $75,000", 2),
        ("<= $75,000, Above Poverty", 1),
        ("Below Poverty", 0),
    )
    for label, code in levels:
        if income == label:
            return code
    # Anything unrecognised (e.g. NaN) gets the sentinel.
    return -1

# Apply convert_income to every row and store the result in the new "income_lvl" column.
predictions_df["income_lvl"] = predictions_df.apply(convert_income, axis="columns")


def convert_edu(dataframe):
    """Map one row's ``education`` label to an ordinal integer.

    Parameters
    ----------
    dataframe : row-like object (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must support ``dataframe["education"]``.

    Returns
    -------
    int
        3 for ``"College Graduate"``, 2 for ``"Some College"``,
        1 for ``"12 Years"``, 0 for ``"< 12 Years"``, and -1 for any other
        value (which includes missing values).
    """
    schooling = dataframe["education"]
    levels = (
        ("College Graduate", 3),
        ("Some College", 2),
        ("12 Years", 1),
        ("< 12 Years", 0),
    )
    for label, code in levels:
        if schooling == label:
            return code
    # Anything unrecognised (e.g. NaN) gets the sentinel.
    return -1

# Apply convert_edu to every row and store the result in the new "edu_lvl" column.
predictions_df["edu_lvl"] = predictions_df.apply(convert_edu, axis="columns")

# Removing old features: the text-valued ordinal columns are replaced by
# "income_lvl" and "edu_lvl".
predictions_df.drop(columns=ordinal_non_num, inplace=True)

### One hot encoding:

In [8]:
# Every column that is still non-numeric at this point gets one-hot encoded.
fts_to_onehot = predictions_df.select_dtypes(exclude="number").columns

# drop_first=True omits the first level of each encoded feature; dtype=int gives 0/1 integers.
# NOTE(review): the dummy columns are derived from this file alone — confirm they
# match the columns (and order) the pickled models were trained on.
predictions_df = pd.get_dummies(
    predictions_df,
    columns=fts_to_onehot,
    dtype=int,
    drop_first=True,
)
predictions_df.head()

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Dropping a highly correlated feature:

In [9]:
# Remove the one dummy column flagged as highly correlated.
predictions_df.drop(columns=["employment_occupation_dcjcmpih"], inplace=True)

### Final checks:

In [10]:
# Make sure there are no NaN values left anywhere in the frame
assert not predictions_df.isna().any().any()

In [11]:
# Make sure all columns are numeric, i.e. no non-numeric column remains
assert predictions_df.select_dtypes(exclude="number").shape[1] == 0

# Predicting:
## H1N1 Vaccine:
**Note**: We use the expression *"1 - predict_proba"* to obtain the probability that the label is 1, which is what the submission format expects.

In [12]:
# Store the complement of the model's predict_proba output as the H1N1 prediction.
# NOTE(review): a standard scikit-learn predict_proba returns one column per class;
# assigning it to a single column presumes this pickled model returns one value
# per row — confirm against the training notebook.
submissions_df["h1n1_vaccine"] = 1 - h1n1_best.predict_proba(predictions_df)

## Seasonal Vaccine:

In [13]:
# Store the complement of the model's predict_proba output as the seasonal prediction.
# NOTE(review): presumes predict_proba returns one value per row for this pickled
# model — confirm against the training notebook.
submissions_df["seasonal_vaccine"] = 1 - seasonal_best.predict_proba(predictions_df)

## Saving predictions:

In [14]:
# Write the predictions to disk, keeping respondent_id (the index) as the first column.
submissions_df.to_csv('Submissions/submission_ml_2.csv', index=True)

This submission has a ROC AUC score of **0.851** (the best one so far is *0.8658*).

# With Stacking Classifier:
## Loading models:

In [28]:
# Load the pickled stacking-classifier models, one per target.
# NOTE(review): the recorded output of this cell is a ValueError
# ("... MT19937 ... is not a known BitGenerator module"), so h1n1_st / seasonal_st
# were not loaded in the saved run. Presumably the pickles were written with a
# different NumPy version than the one used here — confirm and re-export the models.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle trusted files.
with open('Models/model_h1n1_st.pkl', 'rb') as st_h1n1:
    h1n1_st = pickle.load(st_h1n1)

with open('Models/model_seas_st.pkl', 'rb') as st_seas:
    seasonal_st = pickle.load(st_seas)

ValueError: <class 'numpy.random._mt19937.MT19937'> is not a known BitGenerator module.

## Predictions:

In [None]:
# Overwrite both submission columns with the stacking models' predictions.
# NOTE(review): this cell has no execution count; it depends on h1n1_st and
# seasonal_st, whose loading cell failed in the saved run.
# NOTE(review): presumes predict_proba returns one value per row — confirm.
submissions_df["h1n1_vaccine"] = 1 - h1n1_st.predict_proba(predictions_df)
submissions_df["seasonal_vaccine"] = 1 - seasonal_st.predict_proba(predictions_df)

## Saving Predictions:

In [None]:
# Write the stacking-classifier predictions to disk, keeping respondent_id (the index).
submissions_df.to_csv('Submissions/submission_ml_3.csv', index=True)

# Neural Networks:
## Import models:

In [17]:
# Load the pickled neural-network models, one per target.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle trusted files.
with open('Models/nn_model_h1n1.pkl', 'rb') as nn_h1n1_file:
    nn_h1n1 = pickle.load(nn_h1n1_file)

with open('Models/nn_model_seasonal.pkl', 'rb') as nn_seas_file:
    nn_seas = pickle.load(nn_seas_file)


## Predictions:

In [27]:
# Keep column index 1 of each network's output as the submitted value.
# NOTE(review): assumes predict() returns a 2-D array whose second column is the
# probability of the positive class — confirm against the training notebook.
submissions_df["h1n1_vaccine"] = nn_h1n1.predict(predictions_df)[:,1]
submissions_df["seasonal_vaccine"] = nn_seas.predict(predictions_df)[:,1]

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 580us/step
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 595us/step


## Saving Predictions:

In [19]:
# Write the neural-network predictions to disk, keeping respondent_id (the index).
submissions_df.to_csv('Submissions/submission_nn_1.csv', index=True)

# Catboost Classifier:
## Import models:

In [20]:
# Load the pickled CatBoost models, one per target.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle trusted files.
with open('Models/model_h1n1_cb.pkl', 'rb') as cat_h1n1_file:
    cat_h1n1 = pickle.load(cat_h1n1_file)

with open('Models/model_seas_cb.pkl', 'rb') as cat_seas_file:
    cat_seas = pickle.load(cat_seas_file)

## Predictions:

In [21]:
# Overwrite both submission columns with the complement of the CatBoost models' output.
# NOTE(review): presumes predict_proba returns one value per row for these pickled
# models — confirm against the training notebook.
submissions_df["h1n1_vaccine"] = 1 - cat_h1n1.predict_proba(predictions_df)
submissions_df["seasonal_vaccine"] = 1 - cat_seas.predict_proba(predictions_df)

## Save predictions:

In [22]:
# Write the CatBoost predictions to disk, keeping respondent_id (the index).
submissions_df.to_csv('Submissions/submission_ml_4.csv', index=True)

# Hybrid method:
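As the CatBoost Classifier has proved to be quite robust, we combine its predictions for one target with the neural network's predictions for the other target.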

In [23]:
# Hybrid 1: neural network for the H1N1 column, CatBoost for the seasonal column.
# NOTE(review): assumes nn_h1n1.predict() returns a 2-D array with the positive
# class in column 1, and that cat_seas.predict_proba returns one value per row — confirm.
submissions_df["h1n1_vaccine"] = nn_h1n1.predict(predictions_df)[:,1]
submissions_df["seasonal_vaccine"] = 1 - cat_seas.predict_proba(predictions_df)

# Save this combination under its own file name.
submissions_df.to_csv('Submissions/submission_hy_1.csv', index=True)

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 575us/step


In [24]:
# Hybrid 2: CatBoost for the H1N1 column, neural network for the seasonal column.
# NOTE(review): assumes cat_h1n1.predict_proba returns one value per row, and that
# nn_seas.predict() returns a 2-D array with the positive class in column 1 — confirm.
submissions_df["h1n1_vaccine"] = 1 - cat_h1n1.predict_proba(predictions_df)
submissions_df["seasonal_vaccine"] = nn_seas.predict(predictions_df)[:,1]

# Save this combination under its own file name.
submissions_df.to_csv('Submissions/submission_hy_2.csv', index=True)

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 617us/step


# Neural Network Test:

In [25]:
# Load the alternative pickled H1N1 neural network used for this test submission.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle trusted files.
with open('Models/nn_hf_model_h1n1.pkl', 'rb') as hf_nn_h1n1_file:
    hf_nn_h1n1 = pickle.load(hf_nn_h1n1_file)

In [26]:
# H1N1 column from the alternative network; seasonal column from the earlier nn_seas model.
# NOTE(review): assumes both predict() calls return a 2-D array with the positive
# class in column 1 — confirm against the training notebook.
submissions_df["h1n1_vaccine"] = hf_nn_h1n1.predict(predictions_df)[:,1]
submissions_df["seasonal_vaccine"] = nn_seas.predict(predictions_df)[:,1]

# Save this combination under its own file name.
submissions_df.to_csv('Submissions/submission_nn_2.csv', index=True)

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 623us/step
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 581us/step
