# Import Packages

# Strategy


> For each categorical variable:

1) Separate into train and test

2) Determine the mean value of the target within each label of the categorical variable using the train set

3) Use that mean target value per label as the prediction (using the test set) and calculate the roc-auc



> For each numerical variable:

1) Separate into train and test

2) Divide the variable into 100 quantiles

3) Calculate the mean target within each quantile using the training set 

4) Use that mean target value / bin as the prediction (using the test set) and calculate the roc-auc





In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [20]:
data = pd.read_csv('/content/titanic.csv')


In [21]:
# Variable preprocessing:
# then I will narrow down the different cabins by selecting only the
# first letter, which represents the deck in which the cabin was located
# captures first letter of string (the letter of the cabin)
data['Cabin'] = data['Cabin'].str[0]
data['Cabin'].unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

#Feature selection on categorical variables

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    data[['Pclass', 'Sex', 'Embarked', 'Cabin', 'Survived']],
    data['Survived'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((623, 5), (268, 5))

In [25]:
X_train.info()
categorical=data.select_dtypes(include ='object').columns
categorical
#'pclass', 'sex', 'embarked', 'cabin'

<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 857 to 684
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Pclass    623 non-null    int64 
 1   Sex       623 non-null    object
 2   Embarked  621 non-null    object
 3   Cabin     152 non-null    object
 4   Survived  623 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 29.2+ KB


Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [None]:
data.select_dtypes(exclude ='object')

In [27]:
def mean_encoding(df_train, df_test, categorical_vars):
     # temporary copy of the original dataframes
    df_train_temp = df_train.copy()
    df_test_temp = df_test.copy()
    for col in categorical_vars:
       target_mean_dict =data.groupby([col])['Survived'].mean().to_dict()
       # replace the categories by the mean of the target
       df_train_temp[col] = df_train[col].map(target_mean_dict)
       df_test_temp[col] = df_test[col].map(target_mean_dict)
    df_train_temp.drop(['Survived'], axis=1, inplace=True)
    df_test_temp.drop(['Survived'], axis=1, inplace=True)

    return df_train_temp, df_test_temp   

In [29]:
categorical_vars = ['Pclass', 'Sex', 'Embarked', 'Cabin']

X_train_enc, X_test_enc = mean_encoding(X_train, X_test, categorical_vars)

X_train_enc.head()

Unnamed: 0,Pclass,Sex,Embarked,Cabin
857,0.62963,0.188908,0.336957,0.75
52,0.62963,0.742038,0.553571,0.757576
386,0.242363,0.188908,0.336957,
124,0.62963,0.188908,0.336957,0.757576
578,0.242363,0.742038,0.553571,


In [31]:
roc_values = []

for feature in categorical_vars:
    #X_train[feat].fillna(0).to_frame()
    roc_values.append(roc_auc_score(y_test, X_test_enc[feature].fillna(0).to_frame())) 

In [32]:
m1 = pd.Series(roc_values)
m1.index = categorical_vars
m1.sort_values(ascending=False)

Sex         0.771667
Pclass      0.680476
Cabin       0.644673
Embarked    0.577500
dtype: float64

In [36]:
m1
selected_features = m1[m1 > 0.6].index
selected_features

Index(['Pclass', 'Sex', 'Cabin'], dtype='object')