<a href="https://colab.research.google.com/github/BhavikDudhrejiya/Feature-Selection-Methods/blob/main/Feature_selection_by_mutual_information_gain_on_classification_%26_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature selection by mutual information gain on classification

In [72]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, mutual_info_regression, SelectKBest, SelectPercentile
from sklearn.datasets import load_breast_cancer

In [73]:
#Loading the breast cancer dataset through scikit-learn's load_breast_cancer helper
data = load_breast_cancer()

In [74]:
#Listing the attributes exposed by the loaded dataset object
data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [75]:
#Target vector y and feature matrix X (a DataFrame labelled with the feature names)
y = data.target
X = pd.DataFrame(data.data, columns=data.feature_names)

#Displaying the shapes of X and y
X.shape, y.shape

((569, 30), (569,))

In [76]:
#Holding out 20% of the rows as a test set, stratified on the class labels;
#random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

#Displaying the shapes of the four splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

# Mutual information

In [78]:
#Selecting features according to a percentile of the highest scores.
#mutual_info_classif is a randomized estimator (it perturbs continuous features
#with small noise), so its random_state is pinned here; without it the scores,
#and therefore the selected columns, can differ from one run to the next.
sel_cls = SelectPercentile(
    lambda features, target: mutual_info_classif(features, target, random_state=42),
    percentile=10,
)
sel_cls.fit(X_train, y_train)

SelectPercentile(percentile=10,
                 score_func=<function mutual_info_classif at 0x7f999db154d0>)

In [79]:
#Boolean mask over the input columns: True marks a feature kept by the selector
sel_cls.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
       False, False, False])

In [80]:
#Names of the features kept by the selector (columns where the support mask is True)
X_train.columns[sel_cls.get_support()]

Index(['worst radius', 'worst perimeter', 'worst area'], dtype='object')

In [81]:
#Reducing both splits to the columns kept by the fitted selector
X_train_mi, X_test_mi = (sel_cls.transform(split) for split in (X_train, X_test))

#Displaying the shapes of the reduced splits
X_train_mi.shape, X_test_mi.shape

((455, 3), (114, 3))

In [84]:
#Helper that trains a random forest classifier and reports its test accuracy
def classifier(X_train, X_test, y_train, y_test):
  """Fit a random forest on the training split and score it on the test split.

  Parameters
  ----------
  X_train, y_train : training features and labels handed to ``fit``.
  X_test, y_test : held-out features and labels used for scoring.

  Returns
  -------
  dict
      ``{'Accuracy': value}`` where ``value`` is
      ``accuracy_score(y_test, predictions)``.
  """
  # Fixed seed and 100 trees so repeated calls on the same data are comparable.
  forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
  predictions = forest.fit(X_train, y_train).predict(X_test)
  return {'Accuracy': accuracy_score(y_test, predictions)}

In [85]:
%%time
#Accuracy when training only on the features kept by the mutual information selector
classifier(X_train_mi, X_test_mi, y_train, y_test)

CPU times: user 195 ms, sys: 40.9 ms, total: 235 ms
Wall time: 342 ms


{'Accuracy': 0.9122807017543859}

In [86]:
%%time
#Baseline: accuracy when training on all of the original features
classifier(X_train, X_test, y_train, y_test)

CPU times: user 280 ms, sys: 29.4 ms, total: 309 ms
Wall time: 346 ms


{'Accuracy': 0.956140350877193}

# Feature selection by mutual information gain on regression

In [61]:
#Importing libraries
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [45]:
#Loading the Boston housing data (this rebinds `data`, replacing the dataset loaded earlier)
#NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
#so this cell presumably needs an older scikit-learn -- confirm the pinned version
data = load_boston()

In [46]:
#Listing the attributes exposed by the loaded dataset object
data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [47]:
#Target vector y and feature matrix X (a DataFrame labelled with the feature names)
y = data.target
X = pd.DataFrame(data.data, columns=data.feature_names)

#Displaying the shapes of X and y
X.shape, y.shape

((506, 13), (506,))

In [48]:
#Holding out 20% of the rows as a test set; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#Displaying the shapes of the four splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((404, 13), (102, 13), (404,), (102,))

In [53]:
#Select features according to the k highest scores.
#mutual_info_regression is a randomized estimator (it perturbs continuous features
#with small noise), so its random_state is pinned here; without it the scores,
#and therefore the k selected columns, can differ from one run to the next.
sel_reg = SelectKBest(
    lambda features, target: mutual_info_regression(features, target, random_state=42),
    k=9,
)
sel_reg.fit(X_train, y_train)

SelectKBest(k=9, score_func=<function mutual_info_regression at 0x7f999db15440>)

In [54]:
#Boolean mask over the input columns: True marks a feature kept by the selector
sel_reg.get_support()

array([ True, False,  True, False,  True,  True,  True,  True, False,
        True,  True, False,  True])

In [55]:
#Names of the features kept by the selector (columns where the support mask is True)
X_train.columns[sel_reg.get_support()]

Index(['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')

In [63]:
#Reducing both splits to the columns kept by the fitted selector
X_train_mi1, X_test_mi1 = (sel_reg.transform(split) for split in (X_train, X_test))

#Displaying the shapes of the reduced splits
X_train_mi1.shape, X_test_mi1.shape

((404, 9), (102, 9))

In [69]:
#Helper that trains a random forest regressor and prints its test R2 and RMSE
def regressor(X_train, X_test, y_train, y_test):
  """Fit a random forest regressor and print its R2 and RMSE on the test split.

  Parameters
  ----------
  X_train, y_train : training features and targets handed to ``fit``.
  X_test, y_test : held-out features and targets used for scoring.

  Returns
  -------
  None
      The metrics are only printed, as ``R2 <r2> RMSE <rmse>``.
  """
  # Fixed seed and 100 trees so repeated calls on the same data are comparable.
  forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
  predictions = forest.fit(X_train, y_train).predict(X_test)
  # RMSE is the square root of the mean squared error.
  rmse = np.sqrt(mean_squared_error(y_test, predictions))
  r2 = r2_score(y_test, predictions)
  print('R2',r2, 'RMSE',rmse)

In [70]:
%%time
#R2 and RMSE when training only on the features kept by the mutual information selector
regressor(X_train_mi1, X_test_mi1, y_train, y_test)

R2 0.8873076693301188 RMSE 2.8747443262273733
CPU times: user 375 ms, sys: 26 ms, total: 401 ms
Wall time: 346 ms


In [71]:
%%time
#Baseline: R2 and RMSE when training on all of the original features
regressor(X_train, X_test, y_train, y_test)

R2 0.8896648705127477 RMSE 2.844519724098308
CPU times: user 429 ms, sys: 24.5 ms, total: 453 ms
Wall time: 446 ms
