In [1]:
import numpy as np
import os
import math
import scipy
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.linear_model import LogisticRegression
import category_encoders as ce
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
pd.set_option('mode.chained_assignment', None)

%matplotlib inline

In [2]:
train_features_path = "../Data/Train/training_set_features.csv"
train_labels_path = "../Data/Train//training_set_labels.csv"
test_features_path = "../Data/Test/test_set_features.csv"

In [3]:
train_features = pd.read_csv(train_features_path)
train_labels = pd.read_csv(train_labels_path)
test_features = pd.read_csv(test_features_path)

In [21]:
test_features.head(3)

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,seasonal_vaccine,h1n1_vaccine
0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1,0,7,0,1.0,0.0,1,7,0,0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,2,3.0,0.0,1,20,0,0
2,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0,0,5,2,1.0,0.0,10,12,1,1


In [5]:
train_labels.head(3)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0


In [6]:
#let's merge those two dataframes on respondent_id column
train_data = train_features.merge(train_labels,on = "respondent_id")
train_data.set_index('respondent_id',inplace = True)
test_features.set_index('respondent_id',inplace = True)

In [7]:
# train_data.head()
test_features.index = range(len(test_features))
test_features.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [8]:
print(train_data.shape)
print(test_features.shape)

(26707, 37)
(26708, 35)


In [9]:
# Creating a DataFrame of missing values of each column of both train and test set
null_values_train = pd.DataFrame(train_data.isnull().sum(),columns = ['Train'])
null_values_test  = pd.DataFrame(test_features.isnull().sum(),columns = ['Test'])

null_values = null_values_train.join(null_values_test)
null_values

Unnamed: 0,Train,Test
h1n1_concern,92,85.0
h1n1_knowledge,116,122.0
behavioral_antiviral_meds,71,79.0
behavioral_avoidance,208,213.0
behavioral_face_mask,19,19.0
behavioral_wash_hands,42,40.0
behavioral_large_gatherings,87,72.0
behavioral_outside_home,82,82.0
behavioral_touch_face,128,128.0
doctor_recc_h1n1,2160,2160.0


<i>
    So we see from the above dataframe created that we have a missing value even in test data and that the count of missing values for a a particular column is almost same in both test and train data so what i am thinking here is to fill those missing places, not by an conventional method but by using machine learning instead and check out what's going to work or not
</i>

In [10]:
def get_not_null_col(data):
    '''
    This function takes dataframe as input and returns the column name which
    doesnt contain missing values in it
    '''
    return data[(data.Train == 0) & (data.Test == 0)].index.tolist()
not_null = get_not_null_col(null_values)

null_values.drop(not_null,axis = 0,inplace = True)
null_values.drop(['h1n1_vaccine','seasonal_vaccine'],axis = 0,inplace = True)

In [11]:
# train_data.isnull()
def twice_null_check(data):
#     null = data.iloc[3].health_insurance
    '''
    This function takes the dataframe as input and checks that
    whether more than two column of the same row are missing or
    not
    '''
    null_index = []
    for col in null_values.index:
        for val,index in zip(data[col],data.index):
#             print(val)
            if pd.isna(val):
                null_index.append(index)
#     print(null_index)          
    from collections import Counter
    
    count_index = Counter(null_index)
    
    cnt = 0
    for i in count_index.values():
        if i == 2 or i ==3:
            cnt += 1
    print(cnt)
    max_ = float("-inf")
    for val in count_index:
        if max_ < val:
            max_ = val
            
    if max_ == 1:
        return True
    else:
        return False
print(twice_null_check(train_data))
print(twice_null_check(test_features))

11343
False
11284
False


<i>
    So we saw that there many rows where there are more than 2 missing values in a row both in train and test set, the only idea I am getting now is we will use the columns which doen't have the missing values to find the missing values in the columns and check out, because I am not getting any further idea....
</i>

In [12]:
def convert_numerical(data,not_null):
    '''
    Takes the list of the column name which doesn't contain the missing value
    and if the that column is object datatype then it Label Encodes it,even though
    we can use other good method like categorical_encoders firstly we will stick with
    this after wards let's see what will happen
    '''
    label_enc = LabelEncoder()
    dummy_data = data[not_null]
    for col in dummy_data.columns:
        if data[col].dtype == object:
            data[col] = label_enc.fit_transform(data[col])
convert_numerical(train_data,not_null)
convert_numerical(test_features,not_null)
# train_data.head()

In [13]:
def train_not_null(list_not,data,null_feature):
    '''
    What this function mainly does it takes the list of column name from the dataset
    which doesn't contain any missing value in it, don't include the Labels part as 
    it can lead to the leakage of information and then the data and the missing value
    column name which need to be filled with algorithm.
    
        list_not    ------> list of the column name which doesn't contain missing value
          data      ------> data in the form dataframe
       null_feature ------> Feature/Column name which need to be filled with the algorithm
    '''
    not_null_features = data[list_not]
    null_labels = data[null_feature]
    
    index = [ind for val,ind in zip(null_labels,null_labels.index) if pd.isna(val)]
#     print(index)
    train_index = [ind for ind in not_null_features.index if ind not in index]
#     print(train_index)
    not_null_train = not_null_features.iloc[train_index]
    null_labels_train = null_labels.iloc[train_index]
#     print(null_labels_train.head())
    if null_labels_train.dtypes == object:
        null_labels_train = LabelEncoder().fit_transform(null_labels_train)

    not_null_test = not_null_features.iloc[index]
    
    classifier = LogisticRegression(C = 0.01,max_iter = 500,solver = 'lbfgs').fit(not_null_train,null_labels_train)
    data[null_feature].iloc[index] = classifier.predict(not_null_test)
    data[null_feature].iloc[train_index] = null_labels_train
    print("Finished the column ----> {}".format(null_feature))
    
for col in null_values.index:
    train_not_null(list_not = not_null,data = train_data,null_feature = col)
for col in null_values.index:
    train_not_null(list_not = not_null,data = test_features,null_feature = col)

Finished the column ----> h1n1_concern
Finished the column ----> h1n1_knowledge
Finished the column ----> behavioral_antiviral_meds
Finished the column ----> behavioral_avoidance
Finished the column ----> behavioral_face_mask
Finished the column ----> behavioral_wash_hands
Finished the column ----> behavioral_large_gatherings
Finished the column ----> behavioral_outside_home
Finished the column ----> behavioral_touch_face
Finished the column ----> doctor_recc_h1n1
Finished the column ----> doctor_recc_seasonal
Finished the column ----> chronic_med_condition
Finished the column ----> child_under_6_months
Finished the column ----> health_worker
Finished the column ----> health_insurance
Finished the column ----> opinion_h1n1_vacc_effective
Finished the column ----> opinion_h1n1_risk
Finished the column ----> opinion_h1n1_sick_from_vacc
Finished the column ----> opinion_seas_vacc_effective
Finished the column ----> opinion_seas_risk
Finished the column ----> opinion_seas_sick_from_vacc
Fi

In [14]:
def train_test_data_split(data,feature):
    '''
    This function takes the dataframe as input and the target label and based
    on the target label it divides the dataframe firstly into features and
    labels and then divides those into train and test split with test size
    being 0.3 proportion
    '''
    if feature == 'seasonal_vaccine':
        features = data.drop([feature,'h1n1_vaccine'],axis = 1)
        labels   = data[[feature]]
    else:
        features = data.drop([feature],axis = 1)
        labels   = data[[feature]]
        
    features_train,features_test,labels_train,labels_test = train_test_split(features,labels,test_size = 0.3)
    
    return features_train,features_test,labels_train,labels_test

x_train,x_test,y_train,y_test = train_test_data_split(train_data,feature = 'seasonal_vaccine')
x_train1,x_test1,y_train1,y_test1 = train_test_data_split(train_data,feature = 'h1n1_vaccine')

In [15]:
def baseline_model(xtrain,xtest,ytrain,ytest):
    '''
    This function just creates the baseline model by taking the
    train and test set and then those are fed into the LogisticRegression
    model, using sklearn framework and calculates the accuracy
    and roc(Reciever Operating Characteristics) score
    '''
    classifier = LogisticRegression(C = 1,penalty = 'l2').fit(xtrain,ytrain)
    predict = classifier.predict(xtest)
    
    print('Accuracy of the Model is {}'.format(accuracy_score(ytest,predict)))
    print('ROC Score of the Model is {}'.format(roc_auc_score(ytest,predict)))
    
baseline_model(x_train,x_test,y_train,y_test)
baseline_model(x_train1,x_test1,y_train1,y_test1)

  y = column_or_1d(y, warn=True)


Accuracy of the Model is 0.7761138150505429
ROC Score of the Model is 0.7733548760077433


  y = column_or_1d(y, warn=True)


Accuracy of the Model is 0.8543616622987645
ROC Score of the Model is 0.7387926324230891


In [40]:
def final_submission(train,test):
    '''
    This function helps to get us the output for the competition
    test data runs the model and returns the probability
    '''
    feature1_train = train.drop(['seasonal_vaccine','h1n1_vaccine'],axis = 1)
    feature2_train = train.drop('h1n1_vaccine',axis =  1)
    
    label1_train = train[['seasonal_vaccine']]
    label2_train = train[['h1n1_vaccine']]
    
    classifier1 = LogisticRegression().fit(feature1_train,label1_train)
    classifier2 = LogisticRegression().fit(feature2_train,label2_train)
    
    copy_test = test.copy()
    
    del copy_test['seasonal_vaccine']
    del copy_test['h1n1_vaccine']
#     print(copy_test.head())
    
    prob_seasonal = classifier1.predict_proba(copy_test)
    copy_test['seasonal_vaccine'] = classifier1.predict(copy_test)
    prob_h1n1 = classifier2.predict_proba(copy_test)
    copy_test['h1n1_vaccine'] = classifier2.predict(copy_test)
#     print(classifier2.classes_)
    return copy_test,prob_h1n1,prob_seasonal

x,prob1,prob2 = final_submission(train_data,test_features)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[0 1]


In [44]:
prob1[:,1]

(array([0.03397362, 0.03695052, 0.58623448, ..., 0.06877285, 0.01817328,
        0.70651023]), array([[0.96602638, 0.03397362],
        [0.96304948, 0.03695052],
        [0.41376552, 0.58623448],
        [0.43084785, 0.56915215]]))

In [56]:
def submit_to_webpage(probability1,probability2,filename):
    '''
    This function takes the probability returned from the previous
    function and takes the filename as also input and creates that
    file as per the submission format in the Submission folder in the
    comma separated Values(csv) format
    '''
    test = pd.read_csv('../Data/Test/test_set_features.csv')
#     print(test.index)

    file_dataframe = pd.DataFrame(probability1[:,1],
                                 columns = ['h1n1_vaccine'])
    
    file_dataframe['seasonal_vaccine'] = probability2[:,1]
    file_dataframe['respondent_id'] = test['respondent_id']
    file_dataframe.set_index('respondent_id',inplace = True)
    
    file_dataframe.to_csv('../Submission/'+ filename +'.csv')
    return file_dataframe

y = submit_to_webpage(prob1,prob2,"first_sub")
y.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.033974,0.253301
26708,0.036951,0.046552
26709,0.586234,0.699814
26710,0.569152,0.880344
26711,0.408647,0.500382


<i>
    So my first submission was made to the site I got a roc mean score of <b>0.8232</b> and has been <b>ranked 21</b> in the globe for this <b>Flu based Competition</b>, basically not much of a step is done here so after seeing the result it's clear that filling those missing values was one of the most important task in this competition and my approach to fill those missing values was an accurate one.</i><hr>
    <i>Now as of now my next step is to implement the feature engineering methods make some further analysis,and check whether that is better than the baseline model. And also I think some of the features like the people where they work and there medical condition also has a big impact as per the guidelines given on the H1N1 flue webpage so also need to exploit those columns and make a better model than this, also to have a rank below 10
     </i>

In [None]:
# def census_msa():
#     percentage_non_msa = len(train_data[(train_data['census_msa'] == 'Non-MSA') & (train_data['h1n1_vaccine'] == 1)])/len(train_data[train_data['census_msa'] == 'Non-MSA'])
#     print("Percentage of People who took H1N1 Vaccine in Non-MSA City is {}".format(percentage_non_msa))

#     percentage_msa = len(train_data[(train_data['census_msa'] == 'MSA, Not Principle  City') & (train_data['h1n1_vaccine'] == 1)])/len(train_data[train_data['census_msa'] == 'MSA, Not Principle  City'])
#     print("Percentage of People who took H1N1 Vaccine in MSA, Not Principle City is {}".format(percentage_non_msa))

#     percentage_non_msa = len(train_data[(train_data['census_msa'] == 'MSA, Principle City') & (train_data['h1n1_vaccine'] == 1)])/len(train_data[train_data['census_msa'] == 'MSA, Principle City'])
#     print("Percentage of People who took H1N1 Vaccine in MSA, Principle City is {}".format(percentage_non_msa))

#     print("-"*95)
    
#     percentage_non_msa = len(train_data[(train_data['census_msa'] == 'Non-MSA') & (train_data['seasonal_vaccine'] == 1)])/len(train_data[train_data['census_msa'] == 'Non-MSA'])
#     print("Percentage of People who took H1N1 Vaccine in Non-MSA City is {}".format(percentage_non_msa))
    
#     percentage_msa = len(train_data[(train_data['census_msa'] == 'MSA, Not Principle  City') & (train_data['seasonal_vaccine'] == 1)])/len(train_data[train_data['census_msa'] == 'MSA, Not Principle  City'])
#     print("Percentage of People who took H1N1 Vaccine in MSA, Not Principle City is {}".format(percentage_non_msa))
    
#     percentage_non_msa = len(train_data[(train_data['census_msa'] == 'MSA, Principle City') & (train_data['seasonal_vaccine'] == 1)])/len(train_data[train_data['census_msa'] == 'MSA, Principle City'])
#     print("Percentage of People who took H1N1 Vaccine in MSA, Principle City is {}".format(percentage_non_msa))
# # census_msa()

In [22]:
# test = pd.read_csv('../Data/Test/test_set_features.csv')
# train_data.dtypes

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


<i>Here we can see that proportion of people taking either of the vaccines in all three types of city remains almost the same, so we can clearly say that the type of city entirely doesn't matter for vaccination to be taken by it's citizen so this variable is not required at all, so let's remove that variable</i>