# Importance of features selection 
### 1. Model accuracy
### 2. Reduces chances of overfitting
### 3. Save data collection cost
### 4. Simple models

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# 1. Correlation Coefficient Technique

In [2]:
iris = sns.load_dataset('iris')

In [3]:
iris.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
lb = LabelEncoder()

In [6]:
iris['species'] = lb.fit_transform(iris['species'])

In [7]:
iris.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [8]:
correleation_matrix = iris.corr()

In [9]:
correleation_matrix

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
sepal_length,1.0,-0.11757,0.871754,0.817941,0.782561
sepal_width,-0.11757,1.0,-0.42844,-0.366126,-0.426658
petal_length,0.871754,-0.42844,1.0,0.962865,0.949035
petal_width,0.817941,-0.366126,0.962865,1.0,0.956547
species,0.782561,-0.426658,0.949035,0.956547,1.0


In [10]:
correleation_with_target = correleation_matrix['species'].sort_values(ascending=False)

In [11]:
correleation_with_target

species         1.000000
petal_width     0.956547
petal_length    0.949035
sepal_length    0.782561
sepal_width    -0.426658
Name: species, dtype: float64

In [12]:
selected_feature = correleation_with_target[abs(correleation_with_target)>0.5].index

In [13]:
selected_feature

Index(['species', 'petal_width', 'petal_length', 'sepal_length'], dtype='object')

In [14]:
new_df = iris[selected_feature]

In [15]:
new_df

Unnamed: 0,species,petal_width,petal_length,sepal_length
0,0,0.2,1.4,5.1
1,0,0.2,1.4,4.9
2,0,0.2,1.3,4.7
3,0,0.2,1.5,4.6
4,0,0.2,1.4,5.0
...,...,...,...,...
145,2,2.3,5.2,6.7
146,2,1.9,5.0,6.3
147,2,2.0,5.2,6.5
148,2,2.3,5.4,6.2


# 2. Chi square test techniques

In [16]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import KBinsDiscretizer

In [17]:
data = load_breast_cancer()

In [18]:
df = pd.DataFrame(data.data,columns=data.feature_names)

In [19]:
df['target'] = data.target

In [20]:
df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [21]:
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

In [22]:
df_discretized = pd.DataFrame(discretizer.fit_transform(df.iloc[:,:-1]), columns=df.columns[:-1])

In [23]:
df_discretized.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,2.0,0.0,2.0,1.0,2.0,3.0,3.0,3.0,3.0,3.0,...,3.0,0.0,3.0,2.0,3.0,3.0,2.0,4.0,2.0,2.0
1,3.0,1.0,3.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,...,3.0,1.0,2.0,2.0,1.0,0.0,0.0,3.0,1.0,1.0
2,3.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,1.0,...,2.0,1.0,2.0,1.0,2.0,1.0,1.0,4.0,2.0,1.0


In [24]:
df_discretized['target'] = df['target']

In [25]:
df_discretized.shape

(569, 31)

In [26]:
df_discretized['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [27]:
x = df_discretized.drop('target',axis=1)
y = df_discretized['target']

In [28]:
chi2_selector =  SelectKBest(chi2,k=10)

In [29]:
X_kbest = chi2_selector.fit_transform(x,y)

In [30]:
selected_features = x.columns[chi2_selector.get_support()]

In [31]:
selected_features

Index(['mean perimeter', 'mean area', 'mean concavity', 'mean concave points',
       'worst radius', 'worst perimeter', 'worst area', 'worst compactness',
       'worst concavity', 'worst concave points'],
      dtype='object')

In [32]:
new_df = df[selected_features]
new_df['target'] = df['target']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['target'] = df['target']


In [33]:
new_df.head()

Unnamed: 0,mean perimeter,mean area,mean concavity,mean concave points,worst radius,worst perimeter,worst area,worst compactness,worst concavity,worst concave points,target
0,122.8,1001.0,0.3001,0.1471,25.38,184.6,2019.0,0.6656,0.7119,0.2654,0
1,132.9,1326.0,0.0869,0.07017,24.99,158.8,1956.0,0.1866,0.2416,0.186,0
2,130.0,1203.0,0.1974,0.1279,23.57,152.5,1709.0,0.4245,0.4504,0.243,0
3,77.58,386.1,0.2414,0.1052,14.91,98.87,567.7,0.8663,0.6869,0.2575,0
4,135.1,1297.0,0.198,0.1043,22.54,152.2,1575.0,0.205,0.4,0.1625,0


In [34]:
new_df['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

# 3. ANOVA

In [35]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

In [36]:
data = load_iris()

In [37]:
df = pd.DataFrame(data.data,columns=data.feature_names)

In [38]:
df['target'] = data.target

In [39]:
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [40]:
x = df.drop('target',axis=1)
y = df['target']

In [41]:
anova_selector = SelectKBest(f_classif,k=2)

In [42]:
X_kbest = anova_selector.fit_transform(x,y)

In [43]:
selected_features = x.columns[anova_selector.get_support()]

In [44]:
new_df = df[selected_features]
new_df['target'] = df['target']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['target'] = df['target']


In [45]:
new_df.head(3)

Unnamed: 0,petal length (cm),petal width (cm),target
0,1.4,0.2,0
1,1.4,0.2,0
2,1.3,0.2,0


# 4. Mutual Information

In [46]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest,mutual_info_classif

In [56]:
data = load_breast_cancer()

In [57]:
df = pd.DataFrame(data.data,columns=data.feature_names)
df['target'] = data.target
df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [66]:
df.shape

(569, 31)

In [58]:
x = df.drop('target',axis=1)
y = df['target']

In [67]:
mi_selector = SelectKBest(mutual_info_classif,k=10) #  k =10 means its select top 10 features

In [60]:
x_kbest = mi_selector.fit_transform(x,y)

In [61]:
selected_features = x.columns[mi_selector.get_support()]

In [62]:
selected_features

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

In [63]:
new_df = df[selected_features]
new_df['target'] = df['target']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['target'] = df['target']


In [64]:
new_df.head()

Unnamed: 0,mean radius,mean perimeter,mean area,mean concavity,mean concave points,area error,worst radius,worst perimeter,worst area,worst concave points,target
0,17.99,122.8,1001.0,0.3001,0.1471,153.4,25.38,184.6,2019.0,0.2654,0
1,20.57,132.9,1326.0,0.0869,0.07017,74.08,24.99,158.8,1956.0,0.186,0
2,19.69,130.0,1203.0,0.1974,0.1279,94.03,23.57,152.5,1709.0,0.243,0
3,11.42,77.58,386.1,0.2414,0.1052,27.23,14.91,98.87,567.7,0.2575,0
4,20.29,135.1,1297.0,0.198,0.1043,94.44,22.54,152.2,1575.0,0.1625,0


In [65]:
new_df.shape

(569, 11)

# 5. Variance Threshold

In [68]:
from sklearn.datasets import load_iris

In [69]:
data = load_iris()
df = pd.DataFrame(data.data,columns=data.feature_names)
df['target'] = data.target

In [70]:
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [72]:
variences = df.var()
variences

sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
target               0.671141
dtype: float64

In [73]:
from sklearn.feature_selection import VarianceThreshold

In [74]:
x = df.drop('target',axis=1)
y = df['target']

In [75]:
selector = VarianceThreshold(threshold=0.2)

In [76]:
x_transformed = selector.fit_transform(x)

In [77]:
selected_features = x.columns[selector.get_support()]

In [78]:
new_df = df[selected_features]
new_df['target'] = df['target']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['target'] = df['target']


In [79]:
new_df.head(3)

Unnamed: 0,sepal length (cm),petal length (cm),petal width (cm),target
0,5.1,1.4,0.2,0
1,4.9,1.4,0.2,0
2,4.7,1.3,0.2,0
