## Feature Selection for sklearn in-built dataset.

In [11]:
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,f1_score,classification_report
from sklearn.neighbors import KNeighborsClassifier

In [13]:
df=load_wine()
X=df.data
y=df.target
feature_names=df.feature_names
target=df.target

In [15]:
print(df.DESCR)
# print(df.feature_names)
# print(df.target_names)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

:Number of Instances: 178
:Number of Attributes: 13 numeric, predictive attributes and the class
:Attribute Information:
    - Alcohol
    - Malic acid
    - Ash
    - Alcalinity of ash
    - Magnesium
    - Total phenols
    - Flavanoids
    - Nonflavanoid phenols
    - Proanthocyanins
    - Color intensity
    - Hue
    - OD280/OD315 of diluted wines
    - Proline
    - class:
        - class_0
        - class_1
        - class_2

:Summary Statistics:

                                Min   Max   Mean     SD
Alcohol:                      11.0  14.8    13.0   0.8
Malic Acid:                   0.74  5.80    2.34  1.12
Ash:                          1.36  3.23    2.36  0.27
Alcalinity of Ash:            10.6  30.0    19.5   3.3
Magnesium:                    70.0 162.0    99.7  14.3
Total Phenols:                0.98  3.88    2.29  0.63
Flavanoids:                   0.34  5.08    2.03  1.00

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.33,random_state=1)
st=StandardScaler()
X_train_new=st.fit_transform(X_train)
X_test_new=st.transform(X_test)
model=KNeighborsClassifier()
model.fit(X_train_new,y_train)
pred_train=model.predict(X_train_new)
pred_test=model.predict(X_test_new)
print('Accuracy Score of Training:- ',accuracy_score(y_train,pred_train))
print('Accuracy Score of Testing:- ',accuracy_score(y_test,pred_test))

Accuracy Score of Training:-  0.9915966386554622
Accuracy Score of Testing:-  0.9491525423728814


## Feature Selection.

In [20]:
from sklearn.feature_selection import f_classif

In [38]:
df=load_wine()
X=df.data
y=df.target
feature_names=df.feature_names
target=df.target
f_value,p_value=f_classif(X,y)
z=[i for i,j in enumerate(f_value) if j>90]
X_new=X[:,z]
X_train,X_test,y_train,y_test=train_test_split(X_new,y,test_size=.33,random_state=1)
st=StandardScaler()
X_train_new=st.fit_transform(X_train)
X_test_new=st.transform(X_test)
model=KNeighborsClassifier()
model.fit(X_train_new,y_train)
pred_train=model.predict(X_train_new)
pred_test=model.predict(X_test_new)
print('Accuracy Score of Training:- ',accuracy_score(y_train,pred_train))
print('Accuracy Score of Testing:- ',accuracy_score(y_test,pred_test))

Accuracy Score of Training:-  0.9915966386554622
Accuracy Score of Testing:-  0.9661016949152542


In [40]:
confusion_matrix(y_train,pred_train)

array([[35,  0,  0],
       [ 0, 48,  1],
       [ 0,  0, 35]], dtype=int64)

In [42]:
p1=35/35
p2=48/48
p3=35/36
print(p1,'\n',p2,'\n',p3)
print(precision_score(y_train,pred_train,average=None))
r1=35/35
r2=48/49
r3=35/35
print(r1,'\n',r2,'\n',r3)
print(recall_score(y_train,pred_train,average=None))
f1_1=(2*p1*r1)/(p1+r1)
f1_2=(2*p2*r2)/(p2+r2)
f1_3=(2*p3*r3)/(p3+r3)
print(f1_1,'\n',f1_2,'\n',f1_3)
print(f1_score(y_train,pred_train,average=None))
acc_score=(35+48+35)/(35+49+35)
print(accuracy_score(y_train,pred_train))

1.0 
 1.0 
 0.9722222222222222
[1.         1.         0.97222222]
1.0 
 0.9795918367346939 
 1.0
[1.         0.97959184 1.        ]
1.0 
 0.9896907216494846 
 0.9859154929577464
[1.         0.98969072 0.98591549]
0.9915966386554622


In [44]:
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      0.98      0.99        49
           2       0.97      1.00      0.99        35

    accuracy                           0.99       119
   macro avg       0.99      0.99      0.99       119
weighted avg       0.99      0.99      0.99       119



In [46]:
confusion_matrix(y_test,pred_test)

array([[24,  0,  0],
       [ 2, 20,  0],
       [ 0,  0, 13]], dtype=int64)

In [53]:
p1=24/26
p2=20/20
p3=13/13
print(p1,'\n',p2,'\n',p3)
print(precision_score(y_test,pred_test,average=None))
r1=24/24
r2=20/22
r3=13/13
print(r1,'\n',r2,'\n',r3)
print(recall_score(y_test,pred_test,average=None))
f1_1=(2*p1*r1)/(p1+r1)
f1_2=(2*p2*r2)/(p2+r2)
f1_3=(2*p3*r3)/(p3+r3)
print(f1_1,'\n',f1_2,'\n',f1_3)
print(f1_score(y_test,pred_test,average=None))
acc_score=(24+20+13)/(24+2+20+13)
print(accuracy_score(y_test,pred_test))

0.9230769230769231 
 1.0 
 1.0
[0.92307692 1.         1.        ]
1.0 
 0.9090909090909091 
 1.0
[1.         0.90909091 1.        ]
0.9600000000000001 
 0.9523809523809523 
 1.0
[0.96       0.95238095 1.        ]
0.9661016949152542


In [55]:
print(classification_report(y_test,pred_test))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        24
           1       1.00      0.91      0.95        22
           2       1.00      1.00      1.00        13

    accuracy                           0.97        59
   macro avg       0.97      0.97      0.97        59
weighted avg       0.97      0.97      0.97        59

