## Feature selection
4. Fisher's score and Chi2 Contingency

In [79]:
## importing the libraries

import pandas as pd
import numpy as np
import scipy.stats as stats

In [80]:
## Loading the dataset

import seaborn as sns
data=sns.load_dataset('titanic')

In [81]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [82]:
## Missing values. 

data.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [83]:
### We will take all our categorical features 

Cat_features= [x for x in data.columns if data[x].dtype=='O']

In [84]:
Cat_features

['sex', 'embarked', 'who', 'embark_town', 'alive']

In [85]:
data[Cat_features].isnull().sum()

sex            0
embarked       2
who            0
embark_town    2
alive          0
dtype: int64

In [86]:
for x in Cat_features:
    print(x,data[x].unique())

sex ['male' 'female']
embarked ['S' 'C' 'Q' nan]
who ['man' 'woman' 'child']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' nan]
alive ['no' 'yes']


In [87]:
## Filling nan values with highes categorical feature 


a = data['embarked'].value_counts().index[0]

data['embarked'] = data['embarked'].fillna(a)

b = data['embark_town'].value_counts().index[0]

data['embark_town'] = data['embark_town'].fillna(b)

In [88]:
data[Cat_features].isnull().sum()

sex            0
embarked       0
who            0
embark_town    0
alive          0
dtype: int64

In [89]:
## Dependent and indenpendent features(## We consider only categorical features for Chi2 test.)

X = data[Cat_features]

y= data['survived']

In [90]:
X['alive'] = np.where(X.loc[:,'alive']=='yes',1,0)
X['sex'] = np.where(X.loc[:,'sex']=='male',1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [91]:
X.head(20)

Unnamed: 0,sex,embarked,who,embark_town,alive
0,1,S,man,Southampton,0
1,0,C,woman,Cherbourg,1
2,0,S,woman,Southampton,1
3,0,S,woman,Southampton,1
4,1,S,man,Southampton,0
5,1,Q,man,Queenstown,0
6,1,S,man,Southampton,0
7,1,S,child,Southampton,0
8,0,S,woman,Southampton,1
9,0,C,child,Cherbourg,1


In [92]:
### let's perform label encoding on embarked and embarked_town and who 
ordinal_label = {k: i for i, k in enumerate(X['embarked'].unique(), 0)}
X['embarked'] = X['embarked'].map(ordinal_label)

ordinal_label2 = {k: i for i, k in enumerate(X['embark_town'].unique(), 0)}
X['embark_town'] = X['embark_town'].map(ordinal_label2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [94]:
ordinal_label3 = {k: i for i, k in enumerate(X['who'].unique(), 0)}
X['who'] = X['who'].map(ordinal_label3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [95]:
X.head()

Unnamed: 0,sex,embarked,who,embark_town,alive
0,1,0,0,0,0
1,0,1,1,1,1
2,0,0,1,0,1
3,0,0,1,0,1
4,1,0,0,0,0


In [96]:
## Train-test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2)

### Fisher Score- Chisquare Test For Feature Selection
Compute chi-squared stats between each non-negative feature and class.

This score should be used to evaluate categorical variables in a classification task.
This score can be used to select the n_features features with the highest values for the test chi-squared statistic from X, which must contain only non-negative features such as booleans or frequencies (e.g., term counts in document classification), relative to the classes.

Recall that the chi-square test measures dependence between stochastic variables, so using this function “weeds out” the features that are the most likely to be independent of class and therefore irrelevant for classification. The Chi Square statistic is commonly used for testing relationships between categorical variables.

It compares the observed distribution of the different classes of target Y among the different categories of the feature, against the expected distribution of the target classes, regardless of the feature categories.

In [97]:


## Perform chi2 test
### chi2 returns 2 values
### Fisher score and the pvalue
from sklearn.feature_selection import chi2
f_p_values=chi2(X_train,y_train)

In [98]:
f_p_values

(array([ 70.66614278,   4.88198002, 134.51265157,   4.88198002,
        389.        ]),
 array([4.23091518e-17, 2.71384711e-02, 4.22023462e-31, 2.71384711e-02,
        1.36640417e-86]))

In [99]:
p_values=pd.Series(f_p_values[1])
p_values.index=X_train.columns
p_values

sex            4.230915e-17
embarked       2.713847e-02
who            4.220235e-31
embark_town    2.713847e-02
alive          1.366404e-86
dtype: float64

In [105]:

p_values.sort_values(ascending=False)

embarked       2.713847e-02
embark_town    2.713847e-02
sex            4.230915e-17
who            4.220235e-31
alive          1.366404e-86
dtype: float64

## As per Chi2 cotingency we have the highest values for features shown above

## Fisher's score

In [104]:
f_score =pd.Series(f_p_values[0])
f_score.index=X_train.columns
f_score.sort_values(ascending=False)

embarked         4.881980
embark_town      4.881980
sex             70.666143
who            134.512652
alive          389.000000
dtype: float64

## As per Fisher's score we have the highest values for features shown above