# S04T02: Recursive feature elimination (RFE) com Random Forest

## The one with libs

In [None]:
import pickle
import numpy as np
import time
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score,cohen_kappa_score, confusion_matrix

## The one who loads the train and test sets 

In [None]:
def read_pickle(name):
    with (open(name, 'rb')) as openfile:
        while True:
            try:
                one_instance = pickle.load(openfile)
            except EOFError:
                break
    one_instance = np.asanyarray(one_instance)
    return one_instance

In [None]:
path = "/content/drive/My Drive/Arquivos 10 classes/Handcrafted/58_carac/"

In [None]:
ls "/content/drive/My Drive/Arquivos 10 classes/Handcrafted/58_carac"

handcrafted_features_Tests_.pickle  handcrafted_labels_Tests_.pickle
handcrafted_features_Train_.pickle  handcrafted_labels_Train_.pickle


In [None]:
x_test = read_pickle(path+'handcrafted_features_Tests_.pickle')
x_train = read_pickle(path+'handcrafted_features_Train_.pickle')
y_test = read_pickle(path+'handcrafted_labels_Tests_.pickle')
y_train = read_pickle(path+'handcrafted_labels_Train_.pickle')

## The one who captures the best features according to accuracy

In [None]:
initial_time = time.time()
classifier = RandomForestClassifier(n_estimators=100)

# Cria o RFE que avalaia as features conforme acurácia
rfecv = RFECV(estimator=classifier, step=1, scoring='accuracy')

# Fit RFE  
rfecv.fit(x_train, y_train)

# RFE 
rfecv.transform(x_train)

# Números das melhores features
print(rfecv.n_features_)
finally_time = time.time()

print("Time of execution:", (finally_time - initial_time)/60)

58
Time of execution: 33.49832484324773


In [None]:
print(rfecv.support_)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]


In [None]:
#Dropando as colunas menos relevantes
df_train = pd.DataFrame(x_train)
df_train_selected = df_train.loc[:, rfecv.support_]
print(df_train_selected.shape)

df_test = pd.DataFrame(x_test)
df_test_selected = df_test.loc[:, rfecv.support_]
print(df_test_selected.shape)

(15394, 58)
(3756, 58)


## The one who trains the model with selected features

In [None]:
classifier.fit(df_train_selected,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## The one who tests the NEW model

In [None]:
pred = classifier.predict(df_test_selected)

## The one who shows the results

In [None]:
print(pred.shape)

(3756,)


In [None]:
print('Acuracia: ',accuracy_score(pred,y_test))
print('Kappa: ', cohen_kappa_score(pred,y_test))
print('Matriz de confusão da rede = \n', confusion_matrix(pred,y_test))

Acuracia:  0.5298189563365282
Kappa:  0.469821718643182
Matriz de confusão da rede = 
 [[219  19  89  13  42  29  60  17   0   0]
 [ 17 270   9   6  89  67  11  51  19  15]
 [ 35   2 163   4  28   5  54   1   0   0]
 [  0   0   0   0   0   0   0   0   0   0]
 [ 32  45  97  20 253  19  54  30   6   2]
 [  2  30   0   0   7 183   3  12  11   9]
 [ 98   2  87   4  29  12 239   0   0   0]
 [ 11  46  20   6  10  30   6 280  37  12]
 [  0  14   0   0   1   9   0  16  75  19]
 [  7  45   0   0   4  78   2  11  89 308]]


# Save pickles train and test

In [None]:
path_save = "/content/drive/My Drive/Arquivos 10 classes/Handcrafted/New RFE/"

In [None]:
pickle.dump(df_train_selected, open(path_save+"handcrafted_rfe_train.pickle", "wb"))
pickle.dump(df_test_selected, open(path_save+"handcrafted_rfe_test.pickle", "wb"))

# Lendo os arquivos pickle treino e test

In [None]:
x_test_rfe = read_pickle(path_save+'handcrafted_no_aug_with_select_rfe_test_1.pickle')
x_train_rfe = read_pickle(path_save+'handcrafted_no_aug_with_select_rfe_train_1.pickle')

# Mostrando os dados

In [None]:
for i,date in enumerate(x_test_rfe):
  print(i, date)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
 -5.90026283e+00 -5.88227797e+00 -7.85259438e+00  1.15281567e-01
 -1.90845740e+00 -3.74499261e-02  6.48338985e+00  6.09179258e+00
 -3.49530578e+00 -1.31542206e+00  1.08575380e+00 -1.12640977e+00
  4.09979969e-01  1.28210000e+04  4.27506000e-01  4.90084082e-01
  2.69337230e-01  2.15209919e+03  2.12281562e+03  1.86963142e+01
  1.47012440e+01  1.78309547e+01  1.78692549e+01  1.98562176e+01
  1.78028180e+01  3.51593247e+01  1.82918869e-02  4.32991842e+03
  1.18316420e+00  1.51491441e-03  1.02357355e-01]
2892 [-3.90314140e+01  9.54645920e+01 -2.31641617e+01  4.72764816e+01
  5.28654873e-01  2.46155109e+01 -1.05101604e+01  1.59151430e+01
 -6.34211826e+00  1.42284460e+01 -9.29748535e+00  1.53889532e+01
 -7.51600552e+00  4.97177410e+00 -1.07814388e+01  8.69756317e+00
 -4.97716761e+00  8.74536991e+00 -1.42135823e+00  8.53596020e+00
 -5.67926264e+00  1.00117326e+00 -5.16193533e+00 -4.21599060e-01
 -3.23662519e-01 -6.9254193

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
for i,date in enumerate(x_train_rfe):
  print(i, date)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
  4.75427485e+00 -3.85348916e+00  2.62048554e+00 -7.47986078e+00
 -3.42543077e+00 -5.71864665e-01 -1.26073763e-01  6.02943420e-01
 -9.32834721e+00  6.83500000e+03  3.37701768e-01  3.99924666e-01
  2.38422371e-01  1.77915703e+03  2.09817950e+03  2.93951026e+01
  1.83128769e+01  2.07816620e+01  2.13273264e+01  2.15290996e+01
  1.99935853e+01  3.47179838e+01  6.20016083e-03  3.96146139e+03
  9.46565998e-01  9.73099211e-03  1.23752296e-01]
15325 [-2.37885925e+02  1.57584503e+02  4.20634232e+01  2.60593758e+01
 -5.40761089e+00  2.96372223e+01 -1.43983698e+01  7.64580297e+00
 -2.30853996e+01  8.48939323e+00 -1.02885513e+01  1.01524901e+00
 -1.12607412e+01  2.10343218e+00 -7.11502409e+00  2.85668588e+00
 -8.67667580e+00  4.04907417e+00 -6.66119003e+00  6.49753511e-01
 -1.39149780e+01  3.09707642e+00 -4.72041225e+00 -5.69673204e+00
  2.12543917e+00 -1.56916738e+00 -6.65108347e+00  1.80572081e+00
  4.06265354e+00 -1.336714