In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline

In [None]:
# Load the raw files. Each feature frame gets a 'label' marker column so the
# stacked frame can be split back into its train and test parts later.
test = pd.read_csv('test_set_features.csv')
test['label'] = 'test'
train = pd.read_csv('training_set_features.csv')
train['label'] = 'train'

# Target columns for the two models.
labels = pd.read_csv('training_set_labels.csv')
h1n1 = labels['h1n1_vaccine']
seasonal = labels['seasonal_vaccine']

# Keep the test respondent ids for the submission file before the id column is dropped.
ID = test['respondent_id']

# Stack train on top of test so every preprocessing step sees both sets.
# ignore_index=True gives the stacked frame a fresh, unique 0..n-1 index; without
# it the row labels of train and test are both kept and collide.
combine = pd.concat([train, test], axis = 0, ignore_index = True)
combine = combine.drop(['respondent_id'], axis = 1)

# Training targets and training features side by side.
full_train = pd.concat([labels, train], axis = 1)

In [None]:
# Column indexes of the stacked feature frame and of the label frame.
combine_cols, labels_cols = combine.columns, labels.columns

In [None]:
# Show the feature column names, then the label column names.
print(combine_cols, labels_cols, sep='\n')

In [None]:
# (rows, columns) of each frame, one per line: train, test, labels, combine.
for frame in (train, test, labels, combine):
    print(frame.shape)

In [None]:
# Summary statistics of the label frame (the response variables).
labels.describe()

In [None]:
#26707 total rows
#21.2% got h1n1_vaccine
#46.6% got seasonal vaccine

In [None]:
# Summary statistics of the stacked train + test feature frame.
combine.describe()

In [None]:
# Data type of every column in the stacked feature frame.
combine.dtypes

In [None]:
# Share of missing values per column (a fraction between 0 and 1). Columns with
# nothing missing are left out; the rest are listed largest first, at most 40.
null_share = combine.isnull().sum() / len(combine)
all_data_na = null_share[null_share != 0].sort_values(ascending=False).head(40)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(40)

In [None]:
# Blank employment_industry / employment_occupation values appear to mark respondents
# who are not in the work force rather than unknown answers, so each column gets
# its own placeholder category instead of being imputed.
for column, placeholder in (('employment_industry', 'abcde'),
                            ('employment_occupation', 'fghij')):
    combine[column] = combine[column].fillna(placeholder)

In [None]:
# Features that are integer-encoded with LabelEncoder.
label_enc = ('age_group', 'education', 'income_poverty', 'household_adults', 'household_children')
# Features that are expanded into one indicator column per category.
# NOTE: the feature is spelled 'marital_status'; the misspelling 'martial_status'
# makes combine[one_hot] raise a KeyError.
one_hot = ('race', 'sex', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa')

In [None]:
# Integer-encode every feature listed in label_enc, replacing the column in place.

from sklearn.preprocessing import LabelEncoder

# NOTE(review): nothing is imputed before this step, so missing values in these
# columns presumably receive an integer code of their own -- confirm that this is
# intended, since household_adults and household_children are also listed for
# mode imputation further down.
for col in label_enc:
    raw_values = list(combine[col].values)
    combine[col] = LabelEncoder().fit_transform(raw_values)

# Shape of the stacked frame after encoding.
print(f'Shape all_data: {combine.shape}')

In [None]:
# Process columns: one-hot encode the nominal (unordered) categorical features and
# write the result back into `combine`.
from sklearn.preprocessing import OneHotEncoder

# Columns to encode: every name in `one_hot` that exists in the frame, plus any other
# column that is still non-numeric at this point (for example the placeholder-filled
# employment columns). 'label' is skipped because it is the train/test marker,
# not a feature.
listed_cols = [c for c in one_hot if c in combine.columns]
text_cols = [c for c in combine.select_dtypes(exclude='number').columns if c != 'label']
nominal_cols = list(dict.fromkeys(listed_cols + text_cols))

# Guard so that re-running the cell after the columns were already encoded is a no-op.
if nominal_cols:
    # Give missing values an explicit category so that no NaN reaches the encoder.
    nominal_values = combine[nominal_cols].fillna('missing').astype(str)

    # The encoder returns a sparse matrix by default; .toarray() densifies it without
    # relying on the `sparse` / `sparse_output` constructor argument, whose name
    # differs between scikit-learn releases.
    OH_encoder = OneHotEncoder(handle_unknown='ignore')
    OH_matrix = OH_encoder.fit_transform(nominal_values).toarray()

    # One readable column name per (feature, category) pair.
    OH_names = ['{}_{}'.format(col, cat)
                for col, cats in zip(nominal_cols, OH_encoder.categories_)
                for cat in cats]
    OH_cols = pd.DataFrame(OH_matrix, columns=OH_names)

    # Swap the categorical columns for their indicator columns. Both parts carry a
    # fresh 0..n-1 index so the column-wise concat lines rows up by position.
    remaining = combine.drop(columns=nominal_cols).reset_index(drop=True)
    combine = pd.concat([remaining, OH_cols], axis=1)

print('Shape all_data: {}'.format(combine.shape))

In [None]:
# Preview the first rows of the stacked frame after the encoding steps.
combine.head()

In [None]:
# Recompute the per-column share of missing values (fraction between 0 and 1),
# keep only columns that still have gaps, largest first, at most 40 entries.
n_rows = len(combine)
null_counts = combine.isnull().sum()
all_data_na = (null_counts / n_rows).loc[lambda share: share != 0]
all_data_na = all_data_na.sort_values(ascending=False).iloc[:40]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(40)

In [None]:
# Bar chart of the percentage of missing values per feature.
fig, ax = plt.subplots(figsize = (15,12))
# all_data_na holds fractions between 0 and 1; scale to percent so the bars agree
# with the axis label and the title.
sns.barplot(x=all_data_na.index, y=all_data_na * 100, ax=ax)
# The rotation must be a number (the string '90' is rejected by newer matplotlib),
# and it is applied after plotting so the final tick labels are the ones rotated.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)

In [None]:
# Features with under 2% missing values are simply filled with their most frequent value.
cols = ('opinion_seas_sick_from_vacc',
        'opinion_seas_risk',
        'opinion_seas_vacc_effective',
        'opinion_h1n1_vacc_effective',
        'opinion_h1n1_sick_from_vacc',
        'opinion_h1n1_risk',
        'household_adults',
        'household_children',
        'behavioral_avoidance',
        'behavioral_touch_face',
        'h1n1_knowledge',
        'h1n1_concern',
        'behavioral_outside_home',
        'behavioral_large_gatherings',
        'behavioral_antiviral_meds',
        'behavioral_wash_hands',
        'behavioral_face_mask')

# First mode of each listed column, then one fillna call restricted to those columns.
mode_by_column = {name: combine[name].mode()[0] for name in cols}
combine = combine.fillna(value=mode_by_column)
    

In [None]:
# Recompute the share of missing values per column (fraction between 0 and 1),
# keeping only columns that still have gaps, largest first, at most 40 entries.
all_data_na = (combine.isnull().sum()/len(combine))
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:40]

# Bar chart of what is still missing after the mode imputation.
fig, ax = plt.subplots(figsize = (15,12))
# Scale the fractions to percent so the bars agree with the axis label and the title.
sns.barplot(x=all_data_na.index, y=all_data_na * 100, ax=ax)
# The rotation must be a number (the string '90' is rejected by newer matplotlib),
# and it is applied after plotting so the final tick labels are the ones rotated.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)

In [None]:
# Correlation heatmap of the numeric features. Non-numeric columns (such as the
# 'train'/'test' marker in 'label') are excluded explicitly, because recent pandas
# raises on them instead of skipping them silently.
corrmat = combine.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)

In [None]:
# Binary features that still contain missing values.
cols = ['doctor_recc_seasonal', 'doctor_recc_h1n1', 'chronic_med_condition', 'child_under_6_months',
                           'health_worker', 'health_insurance']

# For each of them, record the share of 1s among the rows where the value is 0 or 1
# (rows with a missing value fall in neither count).
before = []
for c in cols:
    flags = combine[c]
    n_yes = int((flags == 1).sum())
    n_no = int((flags == 0).sum())
    share_yes = n_yes / (n_yes + n_no)
    before.append(share_yes)
    print('percentage with', c, share_yes)

In [None]:
#build model to fill values
# Oversample and plot imbalanced dataset with SMOTE
#from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
#from imblearn.under_sampling import RandomUnderSampler
from numpy import where
from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn import metrics
from sklearn.model_selection import cross_val_score
from numpy import mean

#try a decision tree classifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Impute the remaining binary features one at a time: a decision tree is trained on
# the rows where the feature is known and predicts it for the rows where it is missing.

# Mean cross-validated ROC AUC per imputed feature, in the order of `cols`.
roc = []

cols = ['doctor_recc_seasonal', 'doctor_recc_h1n1', 'chronic_med_condition', 'child_under_6_months',
                           'health_worker', 'health_insurance']

# Predictors: everything except the features being imputed (they still contain NaN)
# and the 'label' train/test marker, which is text and carries no information.
features = combine.drop(columns = cols + ['label'])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

for a in cols:
    known = combine[a].notnull()
    W = features[known]
    y = combine[a][known]

    # SMOTE lives inside the pipeline so that, during cross-validation, synthetic
    # samples are generated from the training folds only. Oversampling before the
    # split would let them leak into the validation folds and inflate the score.
    # Fixed seeds make the imputed values reproducible.
    model = Pipeline(steps=[('smote', SMOTE(random_state=1)),
                            ('tree', DecisionTreeClassifier(random_state=1))])

    # evaluate pipeline
    scores = cross_val_score(model, W, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    roc.append(mean(scores))
    print('mean roc', a, mean(scores))

    # fit on every known row (the sampler is applied during fit only)
    model.fit(W, y)

    # keep observed values, use the prediction only where the value is missing
    combine[a] = np.where(known, combine[a], model.predict(features))

In [None]:
# True if any column of the stacked frame still contains a missing value.
combine.isnull().sum().any()

In [None]:
#all NAs are gone

In [None]:
#create baseline model with logistic regression
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the stacked frame back into its two parts. 'label' holds the strings
# 'train' / 'test' assigned when the files were loaded, so it has to be compared
# with those strings; comparing with 1 / 0 matches no row and yields empty frames.
train = combine[combine['label'] == 'train'].drop(['label'], axis = 1)
test = combine[combine['label'] == 'test'].drop(['label'], axis = 1)

# The targets are matched to the training rows by position, so the counts must agree.
assert len(train) == len(h1n1) == len(seasonal), 'training rows and targets differ in length'

print(train.shape)
print(test.shape)

In [None]:
# Hold out 30% of the training rows (fixed seed) and fit a logistic regression
# with default settings on the remaining rows for the h1n1 target.
X_train, X_test, y_train, y_test = train_test_split(train, h1n1, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# Class probabilities of the h1n1 model on the held-out rows; the two columns
# follow the order in which predict_proba returns the classes.
h1n1_holdout_proba = logreg.predict_proba(X_test)
y_pred_h1n1 = pd.DataFrame(h1n1_holdout_proba, columns = ['first_class', 'second_class'])

# Hold-out accuracy of the h1n1 model.
h1n1_holdout_accuracy = logreg.score(X_test, y_test)
print(f'Accuracy of logistic regression classifier on test set: {h1n1_holdout_accuracy:.2f}')

In [None]:
# Class probabilities of the h1n1 model for the competition test rows.
h1n1_test_proba = logreg.predict_proba(test)
y_pred_h1n1_final = pd.DataFrame(h1n1_test_proba, columns = ['first_class', 'second_class'])
y_pred_h1n1_final.head()

In [None]:
# Same 30% hold-out split (same seed), this time with the seasonal target, and a
# second logistic regression fitted on it. The split variables from the h1n1
# model are overwritten here.
X_train, X_test, y_train, y_test = train_test_split(train, seasonal, test_size=0.3, random_state=0)
logreg2 = LogisticRegression()
logreg2.fit(X_train, y_train)

In [None]:
# Class probabilities of the seasonal model on the held-out rows; the two columns
# follow the order in which predict_proba returns the classes.
y_pred_seasonal = logreg2.predict_proba(X_test)
y_pred_seasonal = pd.DataFrame(y_pred_seasonal, columns = ['first_class', 'second_class'])

# Score the seasonal model (logreg2). Scoring `logreg` here would measure the h1n1
# model against the seasonal targets.
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg2.score(X_test, y_test)))

In [None]:
# Class probabilities of the seasonal model for the competition test rows.
seasonal_test_proba = logreg2.predict_proba(test)
y_pred_seasonal_final = pd.DataFrame(seasonal_test_proba, columns = ['first_class', 'second_class'])
y_pred_seasonal_final.head()

In [None]:
# Submission frame: respondent id plus the 'second_class' probability of each model.
response = pd.DataFrame({
    'respondent_id': ID,
    'h1n1_vaccine': y_pred_h1n1_final['second_class'],
    'seasonal_vaccine': y_pred_seasonal_final['second_class'],
})

response.shape

In [None]:
# Write the submission frame to predictions.csv without the row index.
response.to_csv('predictions.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
response.head()

In [None]:
# Score: 0.8258
# 410th place out of 1903.