### Feature Engineering -- 3 Techniques
Feature selection, one part of feature engineering, is the process of selecting those features from your dataset that contribute the most to the prediction variable.

Dataset --> cancer.csv

#### Univariate Feature Selection -- 1st Technique

In [1]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import os

import pandas as pd
import seaborn as sns
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#For regression: f_regression, mutual_info_regression
#For classification: chi2, f_classif, mutual_info_classif

In [2]:
#Loading the Dataset
# The CSV location can be overridden through the CANCER_CSV environment variable so the
# notebook can run on other machines; the original local path remains the default.
DATA_PATH = os.environ.get("CANCER_CSV", "C://Users/Akaash/Downloads/cancer.csv")
Dataset = pd.read_csv(DATA_PATH)
Dataset.head()  # preview the first rows

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
#Separating the Input / Output variable
# The first column of the raw array becomes the output (Y); every remaining
# column becomes an input feature (X).
array = Dataset.to_numpy()
X, Y = array[:, 1:], array[:, 0]

In [4]:
# feature extraction - using chi2
# Score each input column against the target with the chi-squared test;
# k=20 asks the selector to keep the 20 highest-scoring features.
test = SelectKBest(chi2, k=20)
fit = test.fit(X, Y)

In [5]:
# summarize scores
features = fit.transform(X)      # reduce X to the columns the selector kept
set_printoptions(precision=3)    # show the scores with 3 digits of precision
print(fit.scores_)

[2.661e+02 9.390e+01 2.011e+03 5.399e+04 1.499e-01 5.403e+00 1.971e+01
 1.054e+01 2.574e-01 7.431e-05 3.468e+01 9.794e-03 2.506e+02 8.759e+03
 3.266e-03 6.138e-01 1.045e+00 3.052e-01 8.036e-05 6.371e-03 4.917e+02
 1.744e+02 3.665e+03 1.126e+05 3.974e-01 1.931e+01 3.952e+01 1.349e+01
 1.299e+00 2.315e-01]


Inference: Univariate feature selection with the chi2 test produced the scores above, one per feature in column order. The features with the highest scores are the best ones.

#### Recursive Feature Elimination -- 2nd Technique

In [6]:
#Importing the Required libraries
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

import warnings
# Suppress all warnings for the rest of the session; note that this also hides
# any warnings raised while the models below are being fitted.
warnings.filterwarnings("ignore")  #--to ignore warnings

In [7]:
#Loading the Dataset
# The CSV location can be overridden through the CANCER_CSV environment variable so the
# notebook can run on other machines; the original local path remains the default.
DATA_PATH = os.environ.get("CANCER_CSV", "C://Users/Akaash/Downloads/cancer.csv")
Dataset = pd.read_csv(DATA_PATH)
Dataset.head()  # preview the first rows

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
#Separating the Input / Output variable
# The first column of the raw array becomes the output (Y); every remaining
# column becomes an input feature (X).
array = Dataset.to_numpy()
X, Y = array[:, 1:], array[:, 0]

In [9]:
# feature extraction -- Using LogisticRegression
# RFE repeatedly fits the estimator and drops the weakest features until the
# requested number remain. The count is passed by keyword because
# scikit-learn 1.0 and later reject it as a positional argument (TypeError).
model = LogisticRegression(max_iter=400)
rfe = RFE(model, n_features_to_select=20)
fit = rfe.fit(X, Y)

In [10]:
# Num Features: how many features the fitted RFE selector kept
fit.n_features_

20

In [11]:
# Feature Ranking: one rank per input column, in column order; rank 1 marks a selected feature
fit.ranking_

array([ 1,  1,  1, 10,  1,  1,  1,  1,  1,  7,  2,  1,  1,  1,  6, 11,  8,
        3,  4,  9,  1,  1,  1,  5,  1,  1,  1,  1,  1,  1])

In [12]:
# Selected Features: boolean mask in column order, True where RFE kept the feature
fit.support_

array([ True,  True,  True, False,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True, False, False, False, False,
       False, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True])

Inference: Recursive Feature Elimination with Logistic Regression produced the boolean mask above; the features marked True are the selected (best) features.

#### Feature Importance using Decision Tree -- 3rd Technique

In [13]:
#Importing the Required Libraries
import pandas as pd
from sklearn.tree import  DecisionTreeClassifier

In [14]:
#Loading the Dataset
# The CSV location can be overridden through the CANCER_CSV environment variable so the
# notebook can run on other machines; the original local path remains the default.
DATA_PATH = os.environ.get("CANCER_CSV", "C://Users/Akaash/Downloads/cancer.csv")
Dataset = pd.read_csv(DATA_PATH)
Dataset.head()  # preview the first rows

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [15]:
#Separating the Input / Output variable
# The first column of the raw array becomes the output (Y); every remaining
# column becomes an input feature (X).
array = Dataset.to_numpy()
X, Y = array[:, 1:], array[:, 0]

In [16]:
# feature extraction -- Using the DecisionTreeClassifier()
# A fixed random_state makes the importances reproducible: without it the tree
# may resolve ties between equally good splits differently on each run, so the
# printed scores can change between executions.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, Y)
print(model.feature_importances_)

[0.    0.042 0.    0.    0.    0.    0.009 0.    0.006 0.    0.006 0.
 0.006 0.002 0.001 0.038 0.    0.    0.    0.    0.696 0.046 0.    0.011
 0.014 0.    0.009 0.107 0.    0.007]


Inference: The DecisionTreeClassifier produced the importance scores above, one per feature in column order; the features with the highest scores are the best features.

#### Inference: From all three techniques, choose the features that are selected by all three.