In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.lines import Line2D
import seaborn as sns

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


## Tratamento de dados

In [3]:
df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/MASS/biopsy.csv',index_col=0)
df.head()

Unnamed: 0_level_0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,class
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1000025,5,1,1,1,2,1.0,3,1,1,benign
2,1002945,5,4,4,5,7,10.0,3,2,1,benign
3,1015425,3,1,1,1,2,2.0,3,1,1,benign
4,1016277,6,8,8,1,3,4.0,3,7,1,benign
5,1017023,4,1,1,3,2,1.0,3,1,1,benign


In [4]:
df.isnull().sum() / len(df)

ID       0.00000
V1       0.00000
V2       0.00000
V3       0.00000
V4       0.00000
V5       0.00000
V6       0.02289
V7       0.00000
V8       0.00000
V9       0.00000
class    0.00000
dtype: float64

In [5]:
df['V6'] = df[['V6']].fillna(df['V6'].mean())
df.isnull().sum() / len(df)

ID       0.0
V1       0.0
V2       0.0
V3       0.0
V4       0.0
V5       0.0
V6       0.0
V7       0.0
V8       0.0
V9       0.0
class    0.0
dtype: float64

## Previsão para o percentil

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

X = df.drop(columns=['ID','class'])
y = df['class']

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

clf = KNeighborsClassifier(n_neighbors = 9)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print( y_pred[0:10], '...' )
print( clf.score(X_test,y_test) )

['benign' 'benign' 'malignant' 'benign' 'benign' 'malignant' 'benign'
 'benign' 'benign' 'benign'] ...
0.9380952380952381


In [8]:
# Se considerarmos 0.93 um bom resultado podemos então aplicar o modelo para novos casos. Por exemplo, podemos fazer predição considerando pacientes hipotéticos valores das medidas v1-v9 dos tumores nos percentis  [0.10,0.25,0.75,0.90] 

X_new = pd.DataFrame( df.drop(columns=['ID','class']).quantile([0.10, 0.25, 0.75, 0.90]) ).reset_index(drop=True)
display(X_new)
X_new_scaled = scaler.transform(X_new)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9
0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
1,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
2,6.0,5.0,5.0,4.0,4.0,5.0,5.0,4.0,1.0
3,9.0,9.0,8.0,8.0,6.0,10.0,7.0,9.0,3.0


In [9]:
clf.predict(X_new_scaled)

array(['benign', 'benign', 'malignant', 'malignant'], dtype=object)

# Escolha de Hiperparametros

Num exemplo anterior empregamos o modelo Knn com k=9, uma escolha arbitrária, e a função de distância euclidiana. Será que haveria hiperparâmetros que apresentassem um desempenho melhor?

A escolha de melhores hiperparâmetros é em geral por experimentação uma vez que não existem hiperparâmetros melhores apriori para quaisquer conjuntos de dados. A ideia, então, é criarmos os diferentes modelos e avaliarmos o desempenho de cada um para obtermos os melhores hiperparâmetros.

Podemos então adaptar o nosso código do modelo Knn anterior para, por exemplo, variar os hiperparâmetros k, no range de valores de 2 a 10 e experimentar o resultado das funções distância 'euclidean' e 'manhattan'. manhattan

###### teoria

In [10]:
## NO BRAÇO

from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/MASS/biopsy.csv',index_col=0)
df['V6'] = df[['V6']].fillna(df['V6'].mean())

X = df.drop(columns=['ID','class'])
y = df['class']

scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

for k, d in [(k,d) for k in range(2,11) for d in ['euclidean','manhattan']]:

  clf = neighbors.KNeighborsClassifier(n_neighbors = k, metric= d )

  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  print( k, d, np.round( clf.score(X_test,y_test), 4) )

2 euclidean 0.9095
2 manhattan 0.9286
3 euclidean 0.9429
3 manhattan 0.9381
4 euclidean 0.9381
4 manhattan 0.9333
5 euclidean 0.9429
5 manhattan 0.9333
6 euclidean 0.9381
6 manhattan 0.9286
7 euclidean 0.9429
7 manhattan 0.9333
8 euclidean 0.9381
8 manhattan 0.9333
9 euclidean 0.9381
9 manhattan 0.9381
10 euclidean 0.9381
10 manhattan 0.9429


Embora tendo escolhido os conjuntos de treinamento e teste de forma aleatória o resultado acima, pode depender do par (treinamento, teste) escolhido.

> *Tire o parâmetro `random_state=123` do código acima e veja que a cada nova execução diferentes valores de acuracidade são produzidos para os mesmos parâmetros. Assim, para obtermos uma medida mais efetiva dos modelos, precisamos executar sobre um grande número de diferentes conjuntos de teste.*

Para não considerarmos o resultado de uma única amostra, podemos fazer várias execuções a obter a média dos valores sobre várias amostras, o que será uma medida independente de um par específico de dados e uma melhor aproximação do resultado esperado do modelo.



In [11]:
scores_means = {}

for k, d in [(k,d) for k in range(2,9) for d in ['euclidean','manhattan']]:

  scores = []
  for i in range(0,25):

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

    clf = KNeighborsClassifier(n_neighbors = k, metric= d )

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    scores.append(clf.score(X_test,y_test))

  scores_means[(k,d)] = np.mean(scores)

scores_means = pd.DataFrame(scores_means.values(), index=scores_means.keys()).reset_index()
scores_means.columns = ['k','metric','score']

display(scores_means)
print('\nBest result:\n')
display(scores_means.nlargest(1,'score'))

Unnamed: 0,k,metric,score
0,2,euclidean,0.909524
1,2,manhattan,0.928571
2,3,euclidean,0.942857
3,3,manhattan,0.938095
4,4,euclidean,0.938095
5,4,manhattan,0.933333
6,5,euclidean,0.942857
7,5,manhattan,0.933333
8,6,euclidean,0.938095
9,6,manhattan,0.928571



Best result:



Unnamed: 0,k,metric,score
2,3,euclidean,0.942857


Desse modo, concluímos que para o nosso conjunto de dados os melhores resultados com o modelo knn são obtidos com os parâmetros k=3 e a métrica Euclidiana.

In [13]:
# para arvore de decisão
from sklearn.tree import DecisionTreeClassifier

scores_means = {}

for max_depth, criterion in [(max_depth, criterion) for max_depth in range(2,6) for criterion in ['gini','entropy']]:

  scores = []
  for i in range(0,25):

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

    clf = DecisionTreeClassifier(max_depth=max_depth, criterion=criterion)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    scores.append(clf.score(X_test,y_test))

  scores_means[(max_depth, criterion)] = np.mean(scores)

scores_means = pd.DataFrame(scores_means.values(), index=scores_means.keys()).reset_index()
scores_means.columns = ['max_depth', 'criterion' ,'score']

display(scores_means)
print('\nBest result:\n')
display(scores_means.nlargest(1,'score'))

Unnamed: 0,max_depth,criterion,score
0,2,gini,0.933333
1,2,entropy,0.92381
2,3,gini,0.94019
3,3,entropy,0.92381
4,4,gini,0.921714
5,4,entropy,0.919048
6,5,gini,0.925905
7,5,entropy,0.920952



Best result:



Unnamed: 0,max_depth,criterion,score
2,3,gini,0.94019



Aqui o melhor modelo de Árvore de Decisão apresenta um score inferior ao do modelo de K-Vizinhos mais Próximos como k=3 e métrica euclidiana e, assim, optaríamos por este último se levarmos somente o critério de acuracidade como o critério de seleção do melhor modelo, e uma alternativa comum é empregarmos o F1, e o procedimento poderia ser o mesmo.

## Pratica



O último refinamento que faremos no procedimento de busca de melhores hiperparâmetros consiste em empregarmos GridSearchCV() do scikit-learn. Essa função automatiza a busca de melhores hiperparâmetros que fizemos acima implementando de forma manual os diferentes estimadores para um espaço de hiperparâmetros em for k, d in [(k,d) for k in range(2,9) for d in ['euclidean','manhattan']]:... e podendo ser aplicada a qualquer estimador.


Qualquer parâmetro de um estimador pode ser otimizado desta maneira e para encontrar os nomes e valores dos parâmetros de um determinado estimador você pode empregar o método estimator.get_params().

In [18]:
clf = neighbors.KNeighborsClassifier()
print(clf.get_params)
# e do mesmo modo os hiperparâmetros da regressão logística são também obtidos
from sklearn.linear_model import LogisticRegression
print(LogisticRegression().get_params)


from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=3, criterion='entropy')
clf.get_params

<bound method BaseEstimator.get_params of KNeighborsClassifier()>
<bound method BaseEstimator.get_params of LogisticRegression()>


<bound method BaseEstimator.get_params of DecisionTreeClassifier(criterion='entropy', max_depth=3)>

In [19]:
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/MASS/biopsy.csv',index_col=0)
df['V6'] = df[['V6']].fillna(df['V6'].mean())

X = df.drop(columns=['ID','class'])
y = df['class']

scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

base_estimator = neighbors.KNeighborsClassifier()
param_grid = {'n_neighbors': [3,4,5,6,7,8,9,10], 'metric': ['euclidean','manhattan']}

clf = GridSearchCV(base_estimator, param_grid, cv=5, scoring='accuracy')

# Por padrão as funções de pesquisa em grade, como o GridSearchCV() empregam o score padrão do estimador como função de pontuação (no caso de classificação é o accuracy), mas deixamos explícito o parâmetro pois você poderia querer empregar uma função de pontuação dos estimadores baseada em outra métrica.

clf.fit(X_train, y_train)

# print(clf.cv_results_)
print(clf.best_estimator_)

print()
print("Detailed classification report:")
print()
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print()



KNeighborsClassifier(metric='manhattan')

Detailed classification report:

              precision    recall  f1-score   support

      benign       0.94      0.96      0.95       138
   malignant       0.91      0.89      0.90        72

    accuracy                           0.93       210
   macro avg       0.93      0.92      0.93       210
weighted avg       0.93      0.93      0.93       210




In [20]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/MASS/biopsy.csv',index_col=0)
df['V6'] = df[['V6']].fillna(df['V6'].mean())

X = df.drop(columns=['ID','class'])
y = df['class']

scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

base_estimator = RandomForestClassifier(random_state=123)
param_grid = {'n_estimators':[3,4,5,6],'criterion':['gini','entropy'],'max_depth':[2,3,4,5]}

clf = GridSearchCV(base_estimator, param_grid, cv=5, scoring='accuracy')
clf.fit(X_train, y_train)

print(clf.cv_results_)
print(clf.best_estimator_)

print()
print("Detailed classification report:")
print()
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print()





{'mean_fit_time': array([0.00652928, 0.00570097, 0.00681386, 0.00757079, 0.00458994,
       0.00600152, 0.0065135 , 0.00782247, 0.00463891, 0.00564442,
       0.02208323, 0.00770912, 0.00472965, 0.00576663, 0.00668273,
       0.00782661, 0.00458245, 0.00566788, 0.00647326, 0.00737758,
       0.00458941, 0.00573297, 0.00656924, 0.00761051, 0.00465937,
       0.00581107, 0.00669041, 0.00768285, 0.00470223, 0.00570092,
       0.00676513, 0.00773587]), 'std_fit_time': array([1.26768844e-03, 3.75386569e-04, 3.86738529e-04, 2.35699667e-04,
       6.00937044e-05, 6.92466418e-04, 3.84156185e-05, 4.38801902e-04,
       3.19320724e-05, 9.10415135e-05, 3.05477411e-02, 9.54454341e-05,
       6.68133091e-05, 1.10020810e-04, 1.67766736e-05, 1.20808499e-04,
       7.20963591e-05, 2.00688088e-04, 8.05780588e-05, 6.71696524e-05,
       8.51632499e-05, 9.61218321e-05, 4.03158979e-05, 1.21878626e-04,
       6.30807803e-05, 1.99629549e-04, 5.82230862e-05, 3.05262941e-05,
       1.14618065e-04, 3.59803891e

  _data = np.array(data, dtype=dtype, copy=copy,


* Selecionar um estimador (um classificador ou um regressor)
* Definir um espaço de hiperparâmetros que desejamos avaliar
* Definir uma função de pontuação (score function)
* Empregar um esquema de validação cruzada

# Cross Validation

acc_scores = cross_val_score(clf, X_train, y_train, cv = 10)


![imagem](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

In [14]:
from sklearn.model_selection import cross_val_score

df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/MASS/biopsy.csv',index_col=0)
df['V6'] = df[['V6']].fillna(df['V6'].mean())

X = df.drop(columns=['ID','class'])
y = df['class']

scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

clf = neighbors.KNeighborsClassifier(n_neighbors = 3, metric= 'euclidean' )

acc_scores = cross_val_score(clf, X_train, y_train, cv = 10)

print(acc_scores, '\n')
print("accuracy: %0.3f +/- %0.3f" % (acc_scores.mean(), acc_scores.std() * 2),'\n')

for metric in ['accuracy','f1_macro','precision_macro','recall_macro']:
  scores = cross_val_score(clf, X_train, y_train, cv = 4, scoring=metric)
  print(metric + ": %0.3f +/- %0.3f" % (scores.mean(), scores.std() * 2))




[0.97959184 0.93877551 1.         0.97959184 0.95918367 0.95918367
 1.         0.93877551 0.97959184 0.95833333] 

accuracy: 0.969 +/- 0.042 

accuracy: 0.971 +/- 0.008
f1_macro: 0.968 +/- 0.009
precision_macro: 0.970 +/- 0.009
recall_macro: 0.967 +/- 0.018


#### Exemplo 2


In [None]:
df = sns.load_dataset('penguins')
df.dropna(inplace=True)

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [None]:
## Treinamento e teste
# As features (X) e o alvo (y)
X = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = df['species']

# Dividindo em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [16]:
# Modelos
# Criando o dicionário de modelos que iremos testar
models = {
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier()
}


O obtenha a acuracidade média dos modelos para um cv de 5 partições. Explore a saída do estimador cross_val_score().

In [None]:
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name}, score = {scores.mean():.4f}")

RandomForest, score = 0.9760
DecisionTree, score = 0.9580


In [None]:
scores

array([0.97014925, 0.98507463, 0.91044776, 0.98484848, 0.93939394])


Acima foram empregadas partições diferentes para cada modelo. Empregue os estimadores KFold (ou alternativamente o StratifiedKFold), para fixar as partições e obtenha o novo score dos modelos fazendo a seleção agora pela métrica f1_macro.

In [15]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')
    print(f"{name}, score = {scores.mean():.4f}")

RandomForest, score = 0.9756
DecisionTree, score = 0.9666


In [17]:
# Empregue o código anterior para selecionar programaticamente o melhor modelo, treinar, aplicar ao conjunto de teste e obter a acuracidade nesse conjunto.

models_scores = {}

for name, model in models.items():
    models_scores[model] = cross_val_score(model, X, y, cv=kf, scoring='f1_macro').mean()

models_scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{RandomForestClassifier(): np.float64(0.9721484905798631),
 DecisionTreeClassifier(): np.float64(0.960080404872922),
 LogisticRegression(max_iter=1000): np.float64(0.9823596229934367),
 KNeighborsClassifier(): np.float64(0.6890947070491956)}

In [18]:
best_model = max(models_scores, key=models_scores.get)
print(best_model)


LogisticRegression(max_iter=1000)


In [19]:
model = best_model
y_pred = model.fit(X_train,y_train).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia: {accuracy:.2f}")

Acurácia: 0.99


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
model.get_params()


{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 1000,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


Empregando o `GridSearchCV`. O `GridSearchCV` permite automatizar todas essas operações.

Empregue o exemplo de código abaixo para corrigir, como fizemos antes, o uso de partições diferentes na avaliação dos modelos (use o `KFold`).

In [21]:
# Criando o pipeline com pré-processamento e modelo
pipeline = Pipeline([('model', None)])
# pipeline = Pipeline([('scaler', StandardScaler()), ('model', None)])

# Definindo o dicionário de parâmetros para o GridSearchCV (somente diferentes modelos)
param_grid = [
    {'model': [models['RandomForest']]},
    {'model': [models['DecisionTree']], 'model__max_depth': [5,6,7]},
    {'model': [models['LogisticRegression']]},
    {'model': [models['KNN']], 'model__n_neighbors': [3, 5, 7, 9]}
]

kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Usando o GridSearchCV para encontrar o melhor modelo
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='accuracy')
# grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='f1_macro')

grid_search.fit(X_train, y_train)

# Exibindo o melhor modelo encontrado
print(f"Melhor modelo: {grid_search.best_estimator_['model']}")

# Avaliando o desempenho no conjunto de teste
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia no teste: {accuracy:.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Melhor modelo: LogisticRegression(max_iter=1000)
Acurácia no teste: 0.99


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### + hiperparametros braçal

In [15]:
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/MASS/biopsy.csv',index_col=0)
df['V6'] = df[['V6']].fillna(df['V6'].mean())

X = df.drop(columns=['ID','class'])
y = df['class']

scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

scores_means = {}

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

for k, d in [(k,d) for k in range(2,9) for d in ['euclidean','manhattan']]:

  clf = neighbors.KNeighborsClassifier(n_neighbors = k, metric= d )

  acc_scores = cross_val_score(clf, X_train, y_train, cv = 5)

  scores_means[(k,d)] = acc_scores.mean()

scores_means = pd.DataFrame(scores_means.values(), index=scores_means.keys()).reset_index()
scores_means.columns = ['k','metric','score']

# display(scores_means)
print('\nBest result:\n')
display(scores_means.nlargest(1,'score'))



Best result:



Unnamed: 0,k,metric,score
7,5,manhattan,0.975489


In [16]:
# Aplicando o melhor modelo

k = scores_means.nlargest(1,'score').k.values[0]
metric = scores_means.nlargest(1,'score').metric.values[0]

clf = neighbors.KNeighborsClassifier(n_neighbors = k, metric= metric )
print(clf)

clf.fit(X_train, y_train)

print("accuracy: %0.3f" % clf.score(X_test,y_test),'\n')

KNeighborsClassifier(metric='manhattan', n_neighbors=np.int64(5))
accuracy: 0.933 

