In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_train = pd.read_csv('training_set_features.csv')
df_test = pd.read_csv('test_set_features.csv')
y_train = pd.read_csv('training_set_labels.csv')

In [3]:
df_train.head(3)

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo


In [4]:
df_train.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [5]:
df_train.dtypes

respondent_id                    int64
xyz_concern                    float64
xyz_knowledge                  float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_xyz                float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_xyz_vacc_effective     float64
opinion_xyz_risk               float64
opinion_xyz_sick_from_vacc     float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [6]:
df_test.dtypes

respondent_id                    int64
xyz_concern                    float64
xyz_knowledge                  float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_xyz                float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_xyz_vacc_effective     float64
opinion_xyz_risk               float64
opinion_xyz_sick_from_vacc     float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [7]:
fill_with_unavailable = ["education","income_poverty","employment_industry","employment_occupation","employment_status","marital_status","rent_or_own"]
                

for col in fill_with_unavailable:
    df_train[col] = df_train[col].fillna(value="Unavailable")
    df_test[col] = df_test[col].fillna(value="Unavailable")

In [8]:
ages = {"18 - 34 Years" :0 ,
                     "35 - 44 Years" : 1,
                     "45 - 54 Years" : 2,
                     "55 - 64 Years": 3,
                     "65+ Years" : 4}
    
education = {"< 12 Years" :1 ,
                     "12 Years" :2,
                     "Some College" : 3,
                     "College Graduate": 4,
                     "Unknown" : 0}

income_poverty =  {"Unknown" :0,
                    "Below Poverty" :1 ,
                     "<= $75,000" :2,
                     "Above Poverty" : 3,
                     "$75,000": 4 }


df_train['age_group'] = df_train.age_group.map(ages)
df_test['age_group'] = df_test.age_group.map(ages)

df_train['education'] = df_train.education.map(education)
df_test['education'] = df_test.education.map(education)

df_train['income_poverty'] = df_train.income_poverty.map(income_poverty)
df_test['income_poverty'] = df_test.income_poverty.map(income_poverty)

In [9]:
def fill_na(value):
    if value.dtype is np.dtype(float):
        return value.fillna(value.mean())
    elif value.dtype is np.dtype(object):
        return value.fillna(value.mode())
    else:
        return value

In [10]:
df_train = df_train.apply(fill_na)
df_test = df_test.apply(fill_na)

In [11]:
df_train.isnull().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [12]:
from category_encoders import CountEncoder

In [13]:
df_train.select_dtypes('object')

Unnamed: 0,race,sex,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,White,Female,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,Unavailable,Unavailable
1,White,Male,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",pxcmvdjn,xgwztkwe
2,White,Male,Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",rucpziij,xtkaffoo
3,White,Female,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",Unavailable,Unavailable
4,White,Female,Married,Own,Employed,qufhixun,"MSA, Not Principle City",wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...
26702,White,Female,Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,Unavailable,Unavailable
26703,White,Male,Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",fcxhlnwr,cmhcxjea
26704,White,Female,Not Married,Own,Unavailable,lzgpxyit,"MSA, Not Principle City",Unavailable,Unavailable
26705,Hispanic,Female,Married,Rent,Employed,lrircsnp,Non-MSA,fcxhlnwr,haliazsg


In [14]:
categorical_cols = df_train.select_dtypes('object').columns

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
X, y = df_train.drop(columns='respondent_id'), y_train.drop(columns='respondent_id')
X_test = df_test.drop(columns='respondent_id')
y1, y2 = y['xyz_vaccine'], y['seasonal_vaccine']

In [17]:
from sklearn.pipeline import Pipeline

In [18]:
rfc_1_pipeline = Pipeline(steps=[
    ('encoder', CountEncoder()),
    ('rfc', RandomForestClassifier())
])


rfc_1_pipeline.fit(X, y1)

In [19]:
from sklearn.metrics import (
    roc_auc_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.model_selection import cross_val_score

In [20]:
cv_rfc_1_scores_from_pipe = cross_val_score(rfc_1_pipeline, X, y1, scoring='roc_auc', cv=10, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    7.4s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.1s finished


In [21]:
np.mean(cv_rfc_1_scores_from_pipe)*100

85.9693887869821

In [22]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from scipy.stats import uniform, randint

In [23]:
distr_params = {
    'ml__max_features': randint(5, 20),
    'ml__min_samples_leaf':randint(2, 6),
    'ml__max_depth': randint(5, 20)
}

ml_pipeline = Pipeline(steps=[
    ('encoder', CountEncoder()),
    ('ml', RandomForestClassifier())
])


randomized_rfc_1 = RandomizedSearchCV(ml_pipeline, distr_params,cv=10,n_iter=60, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=21,).fit(X,y1)
randomized_rfc_2 = RandomizedSearchCV(ml_pipeline, distr_params,cv=10,n_iter=60, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=21).fit(X,y2)

Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits


In [30]:
randomized_rfc_1.best_score_

0.8661883045144638

In [27]:
best_rfc_1 = randomized_rfc_1.best_estimator_
best_rfc_1

In [31]:
best_rfc_2 = randomized_rfc_2.best_estimator_
best_rfc_2.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('encoder',
                 CountEncoder(cols=['race', 'sex', 'marital_status',
                                    'rent_or_own', 'employment_status',
                                    'hhs_geo_region', 'census_msa',
                                    'employment_industry',
                                    'employment_occupation'],
                              combine_min_nan_groups=True)),
                ('ml',
                 RandomForestClassifier(max_depth=11, max_features=9,
                                        min_samples_leaf=4))])>

In [32]:
y_test1 = best_rfc_1.predict_proba(X_test)[:, 1]
y_test2 = best_rfc_2.predict_proba(X_test)[:, 1]

In [35]:
y_test_df = pd.DataFrame(y_test1, columns =['xyz_vaccine'], dtype = float) 
arr = np.arange(26707, 53415)
y_test_df['respondent_id'] = arr.tolist()
y_test_df['seasonal_vaccine'] = y_test2.tolist()
y_test_df['seasonal_vaccine'] = y_test_df['seasonal_vaccine'].astype(float)
y_test_df = y_test_df[['respondent_id','xyz_vaccine','seasonal_vaccine']]
y_test_df.to_csv("Data_Hack_Baibhav_Malviya.csv", index=False)

In [34]:
y_test_df

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.192122,0.168032
1,26708,0.026998,0.053303
2,26709,0.211099,0.718724
3,26710,0.639603,0.899835
4,26711,0.334273,0.492342
...,...,...,...
26703,53410,0.317901,0.473978
26704,53411,0.101284,0.332443
26705,53412,0.181131,0.257602
26706,53413,0.039712,0.404748
