# **Seleção de features (características/atributos)**

**Seleção de features baseada em modelo**

In [None]:
# Load the diabetes dataset bundled with scikit-learn and print its description.
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
# X: feature matrix, y: target vector, as exposed by the loaded dataset object.
X, y = diabetes.data, diabetes.target
print(diabetes.DESCR)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import RidgeCV

# Fit a ridge regression whose alpha is picked by RidgeCV among 5 log-spaced
# candidates between 1e-6 and 1e6, then use |coefficient| as feature importance.
ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X, y)
importance = np.abs(ridge.coef_)
feature_names = np.array(diabetes.feature_names)
# Bar chart: one bar per feature, height = absolute coefficient.
plt.figure(figsize=(10,8))
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via coefficients")
plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel
from time import time

# The threshold sits 0.01 above the third-largest importance, so at most the
# two features with the largest |coefficient| can pass it.
threshold = np.sort(importance)[-3] + 0.01

# Time the selection step (tic/toc are wall-clock timestamps in seconds).
tic = time()
sfm = SelectFromModel(ridge, threshold=threshold).fit(X, y)
toc = time()
print(f"Features selected by SelectFromModel: {feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic:.3f}s")

In [None]:
importance

In [None]:
threshold

**Eliminação recursiva de features com cross-validation**

In [None]:
from sklearn.datasets import make_classification

# Synthetic 8-class problem: 500 samples and 15 features, of which only 3 are
# informative and 2 are redundant; the fixed random_state makes it repeatable.
# NOTE: this rebinds the notebook-level X and y used by the cells above.
X, y = make_classification(
    n_samples=500,
    n_features=15,
    n_informative=3,
    n_redundant=2,
    n_repeated=0,
    n_classes=8,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=42,
)

**RFECV - Recursive feature elimination with cross-validation**

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

min_features_to_select = 1  # Minimum number of features to consider
clf = LogisticRegression()
cv = StratifiedKFold(5)

# Recursive feature elimination: drop one feature per step (step=1) and score
# each subset size by 5-fold stratified cross-validated accuracy.
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

In [None]:
import matplotlib.pyplot as plt

# Plot mean cross-validated accuracy (with its standard deviation as error
# bars) against the number of features kept by RFECV.
n_scores = len(rfecv.cv_results_["mean_test_score"])
plt.figure(figsize=(10,8))
plt.xlabel("Number of features selected")
plt.ylabel("Mean test accuracy")
plt.errorbar(
    # x axis starts at min_features_to_select and has one point per score
    range(min_features_to_select, n_scores + min_features_to_select),
    rfecv.cv_results_["mean_test_score"],
    yerr=rfecv.cv_results_["std_test_score"],
)
plt.title("Recursive Feature Elimination \nwith correlated features")
plt.show()

**Select from Model**

In [None]:
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
X, y = load_iris(return_X_y=True)
# NOTE: only the last expression of a cell is displayed, so this shape is not shown.
X.shape

# L1-penalised linear SVM, already fitted (prefit=True), used as the selector
# model; X_new holds only the columns SelectFromModel keeps.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
X_new.shape

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
X, y = load_iris(return_X_y=True)
# NOTE: only the last expression of a cell is displayed, so this shape is not shown.
X.shape

# Seed the forest: extra-trees are randomised, so without a fixed random_state
# the importances (and therefore the selected columns) change between runs.
clf = ExtraTreesClassifier(n_estimators=50, random_state=42)
clf = clf.fit(X, y)
clf.feature_importances_

# Keep only the features the fitted forest ranks above SelectFromModel's
# default threshold.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape

In [None]:
clf.feature_importances_

In [None]:
# Wine dataset, 13 features
from sklearn.datasets import load_wine

# Rebinds the notebook-level X and y to the wine data.
X, y = load_wine(return_X_y=True)
X.shape

In [None]:
# Seed the forest so the importances, the features selected from them and the
# KNN comparison further down are reproducible across runs.
clf = ExtraTreesClassifier(n_estimators=50, random_state=42)
clf = clf.fit(X, y)
clf.feature_importances_

In [None]:
# Use the already-fitted forest (prefit=True) as the selector and keep only the
# columns it retains; X_new is the reduced feature matrix.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Hold out 30% of the full-feature data for testing (fixed seed).
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Default k-nearest-neighbours classifier, fitted on the training split.
knn = KNeighborsClassifier()
knn.fit(X_treino, y_treino)

# Accuracy (in percent) on the training split.
acc_treino = knn.score(X_treino, y_treino)*100
print(f'Acurácia no treino: {acc_treino:.2f}%')

# Accuracy (in percent) on the held-out split.
acc_teste = knn.score(X_teste, y_teste)*100
print(f'Acurácia no teste: {acc_teste:.2f}%')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Same 70/30 split and seed as the full-feature run, but on the reduced
# matrix X_new produced by SelectFromModel.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X_new, y, test_size=0.3, random_state=1
)

# Default k-nearest-neighbours classifier, fitted on the training split.
knn = KNeighborsClassifier()
knn.fit(X_treino, y_treino)

# Accuracy (in percent) on the training split.
acc_treino = knn.score(X_treino, y_treino)*100
print(f'Acurácia no treino: {acc_treino:.2f}%')

# Accuracy (in percent) on the held-out split.
acc_teste = knn.score(X_teste, y_teste)*100
print(f'Acurácia no teste: {acc_teste:.2f}%')

**Sequential feature selection**

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier

# Load iris once and take both the arrays and the feature names from it.
iris = load_iris()
X, y = iris.data, iris.target

feature_names = np.array(iris.feature_names)
# Seeded so the cross-validated scores driving the selection are repeatable.
clf = ExtraTreesClassifier(n_estimators=50, random_state=42)

# Forward selection: start from no features and greedily add until 2 are chosen.
sfs_forward = SequentialFeatureSelector(clf, n_features_to_select=2,
                                        direction='forward').fit(X,y)
feature_names[sfs_forward.get_support()]

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier

# Load iris once and take both the arrays and the feature names from it.
iris = load_iris()
X, y = iris.data, iris.target

feature_names = np.array(iris.feature_names)
# Seeded so the cross-validated scores driving the selection are repeatable.
clf = ExtraTreesClassifier(n_estimators=50, random_state=42)

# Backward selection: start from all features and greedily remove until 2 remain.
bfs_backward = SequentialFeatureSelector(clf, n_features_to_select=2,
                                        direction='backward').fit(X,y)
feature_names[bfs_backward.get_support()]

# **Balanceamento de classes e geração de dados artificiais** -> Ex. TCC



## Dataset no tempo - Carregamento

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
dados = pd.read_csv('dados_voice.csv')

In [None]:
dados.head()

In [None]:
dados.shape

In [None]:
dados.columns

In [None]:
# Drop the leftover CSV index column. Rebinding instead of mutating in place,
# together with errors='ignore', keeps this cell safe to re-run: the in-place
# version raises KeyError on a second execution because the column is gone.
dados = dados.drop(columns='Unnamed: 0', errors='ignore')
dados.head()

In [None]:
# Drop the listed non-signal columns and the label, leaving the numeric attributes.
X = dados.drop(
    columns=['idade','target','fuma_1','fuma_2','fuma_3','qtde/dia','val_genero']
)
y = dados['target']

# NumPy view of the features: one row per record.
X_np = X.to_numpy()
X_np

In [None]:
y

In [None]:
plt.plot(X_np[0])

In [None]:
plt.plot(X_np[100])

## Transformação dos dados para PSD - Gerando novo dataset

**Visualizando registros manualmente**

In [None]:
import scipy
from scipy.signal import welch

# Welch PSD estimate for three hand-picked records (rows 0, 1 and 7), with an
# 8 kHz sampling rate and 4096-sample segments.
(f1, S1)= scipy.signal.welch(X_np[0,:], fs=8000.0, nperseg=4096)
(f2, S2)= scipy.signal.welch(X_np[1,:], fs=8000.0, nperseg=4096)
(f3, S3)= scipy.signal.welch(X_np[7,:], fs=8000.0, nperseg=4096)

# Overlay the three spectra on a logarithmic y axis.
plt.figure(figsize=(5,3))
plt.semilogy(f1, S1,'b')
plt.semilogy(f2, S2,'r')
plt.semilogy(f3, S3,'g')
plt.xlabel('frequency [Hz]')
plt.ylabel('PSD [V²/Hz]')
# NOTE(review): 'destination_path.eps' looks like a placeholder file name — confirm.
plt.savefig('destination_path.eps', format='eps', bbox_inches ="tight")
plt.show()

In [None]:
S1.shape

**Iterando registros e transformação de escala (V²/Hz para dB)**

In [None]:
def psd(signals, fs=8000.0, nperseg=4*1024, plot=True):
  """Convert every row of ``signals`` into a log-scaled Welch PSD estimate.

  Parameters
  ----------
  signals : 2-D array-like
      One time-domain signal per row.
  fs : float
      Sampling frequency forwarded to ``scipy.signal.welch``.
  nperseg : int
      Segment length forwarded to ``scipy.signal.welch``.
  plot : bool
      When True (default) the spectra of rows 0, 1 and 7 are plotted, saved to
      ``psd_classes.eps`` and shown. Rows that do not exist are skipped.

  Returns
  -------
  pandas.DataFrame
      One row per input signal and one column per frequency bin returned by
      ``welch``. An empty frame is returned for an empty input.
  """
  signals = np.asarray(signals)
  n = signals.shape[0]
  if n == 0:
    return pd.DataFrame()

  spectra = []
  for i in range(n):
    # Forward the caller's fs/nperseg instead of re-hard-coding them.
    freqs, S = scipy.signal.welch(signals[i, :], fs=fs, nperseg=nperseg)
    # NOTE(review): a power quantity is normally converted with 10*log10;
    # the factor 20 is kept so existing downstream results do not change.
    spectra.append(20*np.log10(S))
  # Width follows whatever welch returned rather than a fixed 2049 bins.
  tmp = np.vstack(spectra)

  if plot:
    fig = plt.figure(figsize=(6,5))
    # (row, colour, legend label); the class labels are taken from the original
    # legend — presumably rows 0, 1 and 7 belong to these classes, verify.
    exemplos = ((0, 'b', 'Disfonia'), (1, 'r', 'Saudável'), (7, 'g', 'Laringite'))
    handles, labels = [], []
    for row, color, label in exemplos:
      if row < n:
        handles.append(plt.plot(freqs, tmp[row, :], color)[0])
        labels.append(label)
    plt.xlabel('Frequência [Hz]')
    plt.ylabel('PSD [dB]')
    plt.legend(handles, labels)
    plt.savefig('psd_classes.eps', format='eps', bbox_inches ="tight")
    plt.show()

  df = pd.DataFrame(tmp)
  return df

In [None]:
X_psd = psd(X_np) #Dataframe da PSD


In [None]:
# NOTE(review): this repeats the call made in the previous cell, so the figure
# is drawn and 'psd_classes.eps' is written a second time.
X_psd = psd(X_np) # PSD DataFrame
X_psd

## Smote (PSD)

In [None]:
# problema de desbalanceamento
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np

In [None]:
strategy = {0:113, 1:113, 2:113}  #Estratégia para balanceamento

In [None]:
# Split BEFORE oversampling. SMOTE interpolates between neighbouring samples,
# so resampling the whole dataset first lets synthetic points derived from
# test recordings leak into the training set and inflates the test scores.
# The split is stratified so every class is present in both parts.
X_treino_orig, X_teste, y_treino_orig, y_teste = train_test_split(
    X_psd, y, test_size=0.2, random_state=1, stratify=y
)

# Balance only the training part; the test set keeps real recordings only.
# NOTE(review): each class needs more training samples than SMOTE's
# k_neighbors (default 5) — confirm for the smallest class.
sm = SMOTE(random_state=42,sampling_strategy=strategy)
X_res, y_res = sm.fit_resample(X_treino_orig, y_treino_orig)  # balanced training data

X_treino, y_treino = X_res, y_res

In [None]:
y_res.value_counts()

In [None]:
y_treino.value_counts()

In [None]:
y_teste.value_counts()

## Normalização

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Fit the standardisation on the training split only, then apply it to that split.
scaler = StandardScaler()
scaler.fit(X_treino)
X_norm_treino = scaler.transform(X_treino)

In [None]:
X_norm_treino = pd.DataFrame(X_norm_treino)

In [None]:
X_norm_treino

In [None]:
# Apply the scaler fitted on the training split to the test split (no refit).
X_norm_teste = scaler.transform(X_teste)
X_norm_teste

## Random Forest - Smote

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

rf = RandomForestClassifier(random_state=42)
kf = KFold(n_splits=5)
param_rand = {'n_estimators':[50,100,150,200,250,500,700,800],
              'criterion':['gini', 'entropy']} # remove log_loss

# 10 of the 16 candidate settings are sampled at random; seeding the search
# makes the sampled candidates (and so best_params_) repeatable across runs.
rs = RandomizedSearchCV(estimator=rf, param_distributions=param_rand, cv=kf,
                  scoring='accuracy', n_iter=10, random_state=42)

rf_randomcv = rs.fit(X_treino, y_treino)

In [None]:
rf_randomcv.best_params_

In [None]:
rf_randomcv.best_score_

In [None]:
# Rebuild the tuned forest with the same random_state used during the search;
# a bare RandomForestClassifier() would drop the seed, so the final model would
# differ from the one that was cross-validated and change on every run.
rf_otm = RandomForestClassifier(random_state=42, **rf_randomcv.best_params_)

rf_otm.fit(X_treino, y_treino)
# Accuracy on the training split.
rf_otm.score(X_treino, y_treino)

In [None]:
rf_otm.score(X_teste, y_teste)

In [None]:
y_treino_pred = rf_otm.predict(X_treino)

In [None]:
from sklearn.metrics import classification_report

# Per-class report for the random-forest predictions on the training split.
# NOTE(review): assumes labels 0, 1, 2 map to these names in this order — confirm.
target_names = ['Disfonia', 'Saudável', 'Laringite']
print(classification_report(y_treino, y_treino_pred, target_names=target_names))

In [None]:
y_teste_pred = rf_otm.predict(X_teste)

In [None]:
from sklearn.metrics import classification_report

# Per-class report for the random-forest predictions on the test split.
# NOTE(review): assumes labels 0, 1, 2 map to these names in this order — confirm.
target_names = ['Disfonia', 'Saudável', 'Laringite']
print(classification_report(y_teste, y_teste_pred, target_names=target_names))

## SVC - Smote

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

svc = SVC(random_state=42)
kf = KFold(n_splits=5)
param_rand = {'C':[0.25,0.5,0.75,1],
              'kernel':['linear','poly','rbf','sigmoid']}

# 10 of the 16 candidate settings are sampled at random; seeding the search
# makes the sampled candidates (and so best_params_) repeatable across runs.
rs = RandomizedSearchCV(estimator=svc, param_distributions=param_rand, cv=kf,
                  scoring='accuracy', n_iter=10, random_state=42)

svc_randomcv = rs.fit(X_treino, y_treino)

In [None]:
svc_randomcv.best_params_

In [None]:
svc_randomcv.best_score_

In [None]:
# Instantiate the SVC directly with the best hyper-parameters found by the search.
svc_otm = SVC(**svc_randomcv.best_params_)

svc_otm.fit(X_treino, y_treino)
# Accuracy on the training split.
svc_otm.score(X_treino, y_treino)

In [None]:
svc_otm.score(X_teste, y_teste)

In [None]:
y_treino_pred = svc_otm.predict(X_treino)

In [None]:
from sklearn.metrics import classification_report

# Per-class report for the SVC predictions on the training split.
# NOTE(review): assumes labels 0, 1, 2 map to these names in this order — confirm.
target_names = ['Disfonia', 'Saudável', 'Laringite']
print(classification_report(y_treino, y_treino_pred, target_names=target_names))

In [None]:
y_teste_pred = svc_otm.predict(X_teste)

In [None]:
from sklearn.metrics import classification_report

# Per-class report for the SVC predictions on the test split.
# NOTE(review): assumes labels 0, 1, 2 map to these names in this order — confirm.
target_names = ['Disfonia', 'Saudável', 'Laringite']
print(classification_report(y_teste, y_teste_pred, target_names=target_names))

## MLP - Normalizado + Smote

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Same standardisation as in the "Normalização" section: fit on the training
# split only, then transform that split.
scaler = StandardScaler()
scaler.fit(X_treino)
X_norm_treino = scaler.transform(X_treino)

In [None]:
X_norm_treino = pd.DataFrame(X_norm_treino)

In [None]:
# Scale the test split with the scaler fitted on the training split, then wrap
# the resulting array in a DataFrame.
X_norm_teste = scaler.transform(X_teste)
X_norm_teste = pd.DataFrame(X_norm_teste)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

mlp = MLPClassifier(random_state=42, verbose=True, tol=0.00001, n_iter_no_change=20)
kf = KFold(n_splits=5)
param_rand = {'hidden_layer_sizes':[(100,),(50,),(25,),(200,)],
              'max_iter':[1000, 1500, 2000],
              'activation': ['tanh', 'relu']}

# 10 of the 24 candidate settings are sampled at random; seeding the search
# makes the sampled candidates (and so best_params_) repeatable across runs.
rs = RandomizedSearchCV(estimator=mlp, param_distributions=param_rand, cv=kf,
                  scoring='accuracy', n_iter=10, random_state=42)

# The MLP is trained on the standardised training split.
mlp_randomcv = rs.fit(X_norm_treino, y_treino)

In [None]:
mlp_randomcv.best_params_

In [None]:
mlp_randomcv.best_score_

In [None]:
# Rebuild the tuned network with the same fixed settings the search used
# (random_state, tol, n_iter_no_change). A bare MLPClassifier() would fall back
# to the library defaults for those, so the final model would not be the
# configuration that was cross-validated and would vary between runs.
mlp_otima = MLPClassifier(random_state=42, tol=0.00001, n_iter_no_change=20,
                          **mlp_randomcv.best_params_)

mlp_otima.fit(X_norm_treino, y_treino)
# Accuracy on the standardised training split.
mlp_otima.score(X_norm_treino, y_treino)

In [None]:
mlp_otima.score(X_norm_teste, y_teste)

In [None]:
y_treino_pred = mlp_otima.predict(X_norm_treino)

In [None]:
from sklearn.metrics import classification_report

# Per-class report for the MLP predictions on the training split.
# NOTE(review): assumes labels 0, 1, 2 map to these names in this order — confirm.
target_names = ['Disfonia', 'Saudável', 'Laringite']
print(classification_report(y_treino, y_treino_pred, target_names=target_names))

In [None]:
y_teste_pred = mlp_otima.predict(X_norm_teste)

In [None]:
from sklearn.metrics import classification_report

# Per-class report for the MLP predictions on the test split.
# NOTE(review): assumes labels 0, 1, 2 map to these names in this order — confirm.
target_names = ['Disfonia', 'Saudável', 'Laringite']
print(classification_report(y_teste, y_teste_pred, target_names=target_names))