<a href="https://colab.research.google.com/github/AhmedKaramDev/Features-Selection-Tutorial/blob/master/Chi_square_features_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Selection: Filter Methods - Supervised Learning - Chi Square 

### Import Needed Packages

In [None]:
import pandas as pd # to read data
from sklearn.feature_selection import SelectKBest # the function that select the features
from sklearn.feature_selection import chi2 # the features will be filtered based on chi square function 

In [None]:
# Read the mobile-phone dataset from the CSV file next to this notebook,
# then preview its first five rows.
data = pd.read_csv(filepath_or_buffer="mobile_data.csv")
data.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [None]:
# Count the rows per price_range class to see how balanced the target is.
data["price_range"].value_counts()

3    500
2    500
1    500
0    500
Name: price_range, dtype: int64

In [None]:
# Split the frame into predictors and label by column NAME instead of by
# position, so the selection stays correct if the CSV gains, loses or
# reorders columns.
X = data.drop(columns="price_range")  # select the features (every column except the label)
y = data["price_range"]               # select the target column (price range)

# ----------------- Explain How Chi Square works (optional)----------------

In [None]:
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from sklearn.utils.extmath import safe_sparse_dot

In [None]:
# Build by hand the two tables that the chi-square score compares.

# One-hot encode the target: one row per sample, one column per class.
Y = LabelBinarizer().fit_transform(y)
if Y.shape[1] == 1:
    # A two-class target comes back as a single column; prepend its
    # complement so that each class gets its own column.
    Y = np.append(1 - Y, Y, axis=1)

# Observed table: for every class, the total of each feature.
observed = safe_sparse_dot(Y.T, X)  # n_classes * n_features

# X.sum(axis=0) is a pandas Series when X is a DataFrame, and Series has no
# .reshape(); convert to a NumPy array before shaping it into a row vector.
feature_count = np.asarray(X.sum(axis=0)).reshape(1, -1)  # 1 * n_features
class_prob = Y.mean(axis=0).reshape(1, -1)                # 1 * n_classes

# Expected table if features were independent of the class:
# class prior * overall feature total.
expected = np.dot(class_prob.T, feature_count)            # n_classes * n_features

In [None]:
# Recompute the one-hot encoding of the target on its own so the following
# cells can inspect it step by step.
Y = LabelBinarizer().fit_transform(y)

In [None]:
# Display the encoded labels: each row has a single 1 marking the sample's class.
Y

array([[0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       ...,
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1]])

In [None]:
# Observed table: row i holds the per-feature totals over the samples of class i.
safe_sparse_dot(Y.T, X)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,558451.0,243.0,775.1,250.0,2042.0,259.0,15587.0,245.1,70276.0,2300.0,4787.0,268204.0,575135.0,392657.0,6162.0,2841.0,5306.0,373.0,262.0,248.0
1,614434.0,245.0,744.3,255.0,2170.0,262.0,16058.0,262.0,70255.0,2149.0,4962.0,333446.0,625954.0,839745.0,6106.0,2772.0,5681.0,378.0,261.0,252.0
2,614160.0,243.0,764.9,249.0,2249.0,247.0,15460.0,245.4,71807.0,2341.0,5009.0,316142.0,617023.0,1291408.0,6005.0,2857.0,5486.0,387.0,235.0,252.0
3,689992.0,259.0,760.2,265.0,2158.0,275.0,16988.0,251.0,68160.0,2251.0,5075.0,372424.0,684919.0,1724616.0,6340.0,3064.0,5549.0,385.0,248.0,262.0


In [None]:
# Cross-check the 'blue' column of the observed table: total the feature
# separately for each of the four price_range classes.
for price_class in range(4):
    rows_in_class = data[data.price_range == price_class]
    print(rows_in_class['blue'].sum())

243
245
243
259


In [None]:
# Overall total of every feature as a 1 x n_features row vector.
np.array(X).sum(axis=0).reshape(1, -1) # same totals as the labelled pandas Series shown in the next cell

array([[2.477037e+06, 9.900000e+02, 3.044500e+03, 1.019000e+03,
        8.619000e+03, 1.043000e+03, 6.409300e+04, 1.003500e+03,
        2.804980e+05, 9.041000e+03, 1.983300e+04, 1.290216e+06,
        2.503031e+06, 4.248426e+06, 2.461300e+04, 1.153400e+04,
        2.202200e+04, 1.523000e+03, 1.006000e+03, 1.014000e+03]])

In [None]:
# Column totals again, this time as a pandas Series indexed by feature name.
X.sum(axis=0)

battery_power    2477037.0
blue                 990.0
clock_speed         3044.5
dual_sim            1019.0
fc                  8619.0
four_g              1043.0
int_memory         64093.0
m_dep               1003.5
mobile_wt         280498.0
n_cores             9041.0
pc                 19833.0
px_height        1290216.0
px_width         2503031.0
ram              4248426.0
sc_h               24613.0
sc_w               11534.0
talk_time          22022.0
three_g             1523.0
touch_screen        1006.0
wifi                1014.0
dtype: float64

In [None]:
# Column sums of the one-hot matrix = number of samples in each class.
Y.sum(axis=0)

array([500, 500, 500, 500])

In [None]:
# Column means of the one-hot matrix = fraction of samples in each class,
# shaped as a 1 x n_classes row vector.
Y.mean(axis=0).reshape(1, -1)

array([[0.25, 0.25, 0.25, 0.25]])

In [None]:
# Expected table under independence: every class prior multiplied by every
# feature's overall total, giving an n_classes x n_features array.
np.dot(class_prob.T, feature_count) # 0.25 is each class's prior probability here, not a critical value

# -----------------------End Optional Part--------------------------------

In [None]:
# Score every feature against the target with the chi-square statistic and
# configure the selector to keep the 10 highest-scoring ones.
# NOTE(review): the selector is fitted on the full X / y, i.e. before the
# train/test split done later in this notebook, so rows that end up in the
# test set also influence the scores.
bestfeatures = SelectKBest(chi2, k=10)
fit = bestfeatures.fit(X, y)

In [None]:
# Wrap the feature names and their chi-square scores in two single-column
# DataFrames so they can be placed side by side.
dfcolumns = pd.DataFrame(data=X.columns)
dfscores = pd.DataFrame(data=fit.scores_)

In [None]:
# Put feature names and scores side by side for easier reading.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
# Both inputs carry the default column label 0, so the concatenated frame
# would have two columns with the same label; name them right away so the
# frame is unambiguous in every later cell.
featureScores.columns = ['Specs', 'Score']

In [None]:
# Preview the first rows of the name/score table.
featureScores.head()

Unnamed: 0,0,0.1
0,battery_power,14129.866576
1,blue,0.723232
2,clock_speed,0.648366
3,dual_sim,0.631011
4,fc,10.135166


In [None]:
# Label the two columns, then print the 10 rows with the highest chi-square score.
featureScores.columns = ['Specs', 'Score']
print(featureScores.nlargest(n=10, columns='Score'))

            Specs          Score
13            ram  931267.519053
11      px_height   17363.569536
0   battery_power   14129.866576
12       px_width    9810.586750
8       mobile_wt      95.972863
6      int_memory      89.839124
15           sc_w      16.480319
16      talk_time      13.236400
4              fc      10.135166
14           sc_h       9.614878


In [None]:
# Names only of the 10 best-scoring features, in descending score order.
featureScores.nlargest(n=10, columns='Score').loc[:, 'Specs']

13              ram
11        px_height
0     battery_power
12         px_width
8         mobile_wt
6        int_memory
15             sc_w
16        talk_time
4                fc
14             sc_h
Name: Specs, dtype: object

## Apply the selected features with an ML model and compare the difference

### With Normal Data

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report
from datetime import datetime
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a quarter of the rows for testing (train_test_split's default
# fraction, written out here); the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=200,
)

In [None]:
# Baseline: train and predict with an SVM on every feature, recording the
# wall-clock time just before and just after.
normal_start_time = datetime.now()
classifer = svm.SVC().fit(X_train, y_train)  # fit() returns the fitted estimator itself
normal_predicted_y = classifer.predict(X_test)
normal_end_time = datetime.now()

### With Feature Selection

In [None]:
# Look up the names of the 10 best-scoring features once, BEFORE starting the
# timer: the original did this lookup twice inside the timed region, which
# added work unrelated to the model to the feature-selection timing.
selected_features = featureScores.nlargest(10, 'Score')['Specs'].tolist()

# Train and predict with an SVM restricted to the selected columns, recording
# the wall-clock time just before and just after.
fs_start_time = datetime.now()
clf_f = svm.SVC()
clf_f.fit(X_train[selected_features], y_train)
fs_predicted_y = clf_f.predict(X_test[selected_features])
fs_end_time = datetime.now()

### Evaluate the Data

In [None]:
# Per-class precision / recall / F1 of the model trained on all features.
print(classification_report(y_true=y_test, y_pred=normal_predicted_y))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       131
           1       0.95      0.94      0.95       133
           2       0.95      0.90      0.92       118
           3       0.93      0.97      0.95       118

    accuracy                           0.95       500
   macro avg       0.95      0.95      0.95       500
weighted avg       0.95      0.95      0.95       500



In [None]:
# Per-class precision / recall / F1 of the model trained on the selected features only.
print(classification_report(y_true=y_test, y_pred=fs_predicted_y))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       131
           1       0.95      0.94      0.95       133
           2       0.95      0.90      0.92       118
           3       0.93      0.97      0.95       118

    accuracy                           0.95       500
   macro avg       0.95      0.95      0.95       500
weighted avg       0.95      0.95      0.95       500



In [None]:
# Compare how long each model took to fit and predict.
# timedelta.microseconds is only the sub-second component (0..999999) of the
# duration, so it gives a wrong comparison as soon as a run lasts a second or
# more; total_seconds() returns the whole elapsed time as a float.
all_f_time = (normal_end_time - normal_start_time).total_seconds()
fs_time = (fs_end_time - fs_start_time).total_seconds()
if fs_time < all_f_time:
    print("Features selected model take less time")
else:
    print("All Features model take less time")

Features selected model take less time
