In [None]:
import pandas as pd

uri = "https://gist.githubusercontent.com/guilhermesilveira/e99a526b2e7ccc6c3b70f53db43a87d2/raw/1605fc74aa778066bf2e6695e24d53cf65f2f447/machine-learning-carros-simulacao.csv"
dados = pd.read_csv(uri).drop(columns=["Unnamed: 0"], axis=1)
dados.head()

In [None]:

# situação horrível de "azar" onde as classes estão ordenadas por padrão

dados_azar = dados.sort_values("vendido", ascending=True)
x_azar = dados_azar[["preco", "idade_do_modelo","km_por_ano"]]
y_azar = dados_azar["vendido"]
dados_azar.head()

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
import numpy as np
SEED = 301
np.random.seed(SEED)

modelo = DummyClassifier()
results = cross_validate(modelo, x_azar, y_azar, cv = 10, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Accuracy com dummy stratified, 10 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

SEED = 301
np.random.seed(SEED)

modelo = DecisionTreeClassifier(max_depth=2)
results = cross_validate(modelo, x_azar, y_azar, cv = 10, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Accuracy com cross validation, 10 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

In [None]:
# gerando dados elatorios de modelo de carro para simulacao de agrupamento ao usar nosso estimador

np.random.seed(SEED)
dados['modelo'] = dados.idade_do_modelo + np.random.randint(-2, 3, size=10000)
dados.modelo = dados.modelo + abs(dados.modelo.min()) + 1
dados.head()

In [None]:
def imprime_resultados(results):
  media = results['test_score'].mean() * 100
  desvio = results['test_score'].std() * 100
  print("Accuracy médio %.2f" % media)
  print("Intervalo [%.2f, %.2f]" % (media - 2 * desvio, media + 2 * desvio))

In [None]:
# GroupKFold para analisar como o modelo se comporta com novos grupos

from sklearn.model_selection import GroupKFold

SEED = 301
np.random.seed(SEED)

cv = GroupKFold(n_splits = 10)
modelo = DecisionTreeClassifier(max_depth=2)
results = cross_validate(modelo, x_azar, y_azar, cv = cv, groups = dados.modelo, return_train_score=False)
imprime_resultados(results)

In [None]:
# GroupKFold em um pipeline com StandardScaler e SVC

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

SEED = 301
np.random.seed(SEED)

scaler = StandardScaler()
modelo = SVC()

pipeline = Pipeline([('transformacao',scaler), ('estimador',modelo)])

cv = GroupKFold(n_splits = 10)
results = cross_validate(pipeline, x_azar, y_azar, cv = cv, groups = dados.modelo, return_train_score=False)
imprime_resultados(results)

In [None]:
SEED = 301
np.random.seed(SEED)

cv = GroupKFold(n_splits=10)
modelo = DecisionTreeClassifier(max_depth=2)
results = cross_validate(modelo,x_azar,y_azar,cv=cv,groups= dados.modelo, return_train_score=False)
imprime_resultados(results)

In [None]:
from sklearn.tree import export_graphviz
import graphviz

modelo.fit(x_azar,y_azar)
features = x_azar.columns
dot_data = export_graphviz(modelo, out_file=None, filled=True, rounded=True,
                           class_names=["Não","Sim"],
                           feature_names= features)
graph = graphviz.Source(dot_data)
graph

Testando parametros

In [None]:
def roda_arvore_de_decisao(max_depth):
    SEED = 301
    np.random.seed(SEED)

    cv = GroupKFold(n_splits=10)
    modelo = DecisionTreeClassifier(max_depth=max_depth)
    results = cross_validate(modelo,x_azar,y_azar,cv=cv,groups= dados.modelo, return_train_score=True)
    train_score = results['train_score'].mean() * 100
    test_score = results['test_score'].mean()* 100
    print("max_depth = %d, media = %.2f , teste = %.2f" % (max_depth,test_score, train_score))
    tabela = [max_depth,test_score, train_score]
    return tabela





In [None]:
resultados = [roda_arvore_de_decisao(i) for i in range(1,33)]
resultados = pd.DataFrame(resultados,columns=["max_depth","train","test"])
resultados.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.lineplot(x= "max_depth",y= "train", data = resultados)
sns.lineplot(x= "max_depth",y="test",data = resultados)
plt.legend(["Treino","Teste"])

In [None]:
resultados.sort_values("train",ascending=False).head()

In [None]:
def roda_arvore_de_decisao(max_depth,min_samples_leaf):
    SEED = 301
    np.random.seed(SEED)

    cv = GroupKFold(n_splits=10)
    modelo = DecisionTreeClassifier(max_depth=max_depth,min_samples_leaf=min_samples_leaf)
    results = cross_validate(modelo,x_azar,y_azar,cv=cv,groups= dados.modelo, return_train_score=True)
    train_score = results['train_score'].mean() * 100
    test_score = results['test_score'].mean()* 100
    print("max_depth = %d, min_samples_leaf = %.2f , media = %.2f , teste = %.2f" % (max_depth,min_samples_leaf,test_score, train_score))
    tabela = [max_depth,min_samples_leaf,test_score, train_score]
    return tabela

In [None]:
def busca():
    resultados = []
    for max_depth in range(1,33):
        for min_samples_leaf in [32, 64 , 128 ,256]:
            tabela = roda_arvore_de_decisao(max_depth,min_samples_leaf)
            resultados.append(tabela)
    resultados = pd.DataFrame(resultados,columns=["max_depth","min_samples_leaf","train","test"])
    return resultados

resultados = busca()
resultados.head()

In [None]:
resultados.sort_values(['test'],ascending=False).head()


In [None]:
resultados.corr()

Explorando 3 dimensões de hiper parametro

In [None]:
def roda_arvore_de_decisao(max_depth,min_samples_leaf,min_samples_split):
    SEED = 301
    np.random.seed(SEED)

    cv = GroupKFold(n_splits=10)
    modelo = DecisionTreeClassifier(max_depth=max_depth,min_samples_leaf=min_samples_leaf,min_samples_split=min_samples_split)
    results = cross_validate(modelo,x_azar,y_azar,cv=cv,groups= dados.modelo, return_train_score=True)
    fit_time = results['fit_time'].mean()
    score_time = results['score_time'].mean()
    train_score = results['train_score'].mean() * 100
    test_score = results['test_score'].mean()* 100
    #print("max_depth = %d, min_samples_leaf = %.2f , min_samples_split = %d , media = %.2f , teste = %.2f" % (max_depth,min_samples_leaf,min_samples_split,test_score, train_score))
    tabela = [max_depth,min_samples_leaf,min_samples_split,test_score, train_score,fit_time,score_time]
    return tabela

In [None]:
def busca():
    resultados = []
    for max_depth in range(1,33):
        for min_samples_leaf in [32, 64 , 128 ,256]:
            for min_samples_split in [32 , 64 , 128 ,256]:
                tabela = roda_arvore_de_decisao(max_depth,min_samples_leaf,min_samples_split)
                resultados.append(tabela)
    resultados = pd.DataFrame(resultados,columns=["max_depth","min_samples_leaf","min_samples_split","train","test","fit_time","score_time"])
    return resultados

In [None]:
resultados = busca()
resultados.head()

In [None]:
corr = resultados.corr()

In [None]:
sns.set(style="white")

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool_)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
from sklearn.model_selection import GridSearchCV

SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "max_depth" : [3, 5],
    "min_samples_split": [32, 64, 128],
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"]

}

busca = GridSearchCV(DecisionTreeClassifier(),
                    espaco_de_parametros,
                    cv = GroupKFold(n_splits = 10))

busca.fit(x_azar, y_azar,groups = dados.modelo)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

In [None]:
print(busca.best_params_)
print(busca.best_score_ * 100)

In [None]:
melhor = busca.best_estimator_
melhor

In [None]:
from sklearn.metrics import accuracy_score
#evitar essa abordagem pois estara sendo otimista
predicoes = melhor.predict(x_azar)
accuracy = accuracy_score(predicoes,y_azar)
print(accuracy)

como ter uma estimativa sem esse vicio nos dados que eu ja vi?

no caso de cross validation com busca de hiper parametros , fazemos uma nova validação cruzada.
chama-se nested cross validation

In [None]:
from sklearn.model_selection import cross_val_score
#Como o Pandas não suporta nested validation com o GroupKFold, não conseguiremos prever o resultado para novos grupos.
#cross_val_score(busca,x_azar,y_azar,cv=GroupKFold(n_splits=10),groups=dados.modelo)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "max_depth" : [3, 5],
    "min_samples_split": [32, 64, 128],
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"]

}

busca = GridSearchCV(DecisionTreeClassifier(),
                    espaco_de_parametros,
                    cv = KFold(n_splits = 5, shuffle=True))

busca.fit(x_azar, y_azar)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(busca, x_azar, y_azar, cv = KFold(n_splits=5, shuffle=True))
scores

In [None]:
def imprime_score(scores):
  media = scores.mean() * 100
  desvio = scores.std() * 100
  print("Accuracy médio %.2f" % media)
  print("Intervalo [%.2f, %.2f]" % (media - 2 * desvio, media + 2 * desvio))

In [None]:
from sklearn.tree import export_graphviz
import graphviz


features = x_azar.columns
dot_data = export_graphviz(melhor, out_file=None, filled=True, rounded=True,
                          class_names=["não","sim"],
                          feature_names=features)
graph = graphviz.Source(dot_data)
graph

Busca Aleatória: RandomSearch

In [None]:
from sklearn.model_selection import RandomizedSearchCV

SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "max_depth" : [3, 5],
    "min_samples_split": [32, 64, 128],
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"]

}

busca = RandomizedSearchCV(DecisionTreeClassifier(),
                    espaco_de_parametros, 
                    n_iter = 16,
                    cv = KFold(n_splits = 5),
                          random_state = SEED)


busca.fit(x_azar, y_azar,groups = dados.modelo)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(busca, x_azar, y_azar, cv = KFold(n_splits=5, shuffle=True))
imprime_score(scores)

In [None]:
melhor = busca.best_estimator_
print(melhor)

Customaizando o espaço de hiper parametros

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "max_depth" : [3, 5 , 10,15,20,30,None],
    "min_samples_split": randint(32,128),
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"]

}

busca = RandomizedSearchCV(DecisionTreeClassifier(),
                    espaco_de_parametros, 
                    n_iter = 16,
                    cv = KFold(n_splits = 5),
                          random_state = SEED)


busca.fit(x_azar, y_azar,groups = dados.modelo)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

In [None]:
scores = cross_val_score(busca, x_azar, y_azar, cv = KFold(n_splits=5, shuffle=True))
imprime_score(scores)
melhor = busca.best_estimator_
print(melhor)

In [None]:
resultados_ordenados_pela_media = resultados.sort_values("mean_test_score")
resultados_ordenados_pela_media.sort_values("mean_test_score",inplace=True,ascending=False)
resultados_ordenados_pela_media

In [None]:
for indice,linha in resultados_ordenados_pela_media.iterrows():
    print("%.3f +-(%.3f) %s" % (linha.mean_test_score,linha.std_test_score*2,linha.params))

exploração mais a fundo de forma aleatoria

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "max_depth" : [3, 5 , 10,15,20,30,None],
    "min_samples_split": randint(32,128),
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"]

}

busca = RandomizedSearchCV(DecisionTreeClassifier(),
                    espaco_de_parametros, 
                    n_iter = 64,
                    cv = KFold(n_splits = 5),
                          random_state = SEED)


busca.fit(x_azar, y_azar,groups = dados.modelo)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

In [None]:
scores = cross_val_score(busca, x_azar, y_azar, cv = KFold(n_splits=5, shuffle=True))
imprime_score(scores)
melhor = busca.best_estimator_
print(melhor)

In [None]:
resultados_ordenados_pela_media = resultados.sort_values("mean_test_score")
resultados_ordenados_pela_media.sort_values("mean_test_score",inplace=True,ascending=False)
resultados_ordenados_pela_media
for indice,linha in resultados_ordenados_pela_media.iterrows():
    print("%.3f +-(%.3f) %s" % (linha.mean_test_score,linha.std_test_score*2,linha.params))

Comparando GridSearchCV com RandomizedSearch (1ºComparação)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import time
SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "n_estimators" : [10,100],
    "max_depth" : [3, 5],
    "min_samples_split": [32, 64, 128],
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"],
    "bootstrap": [True,False]
}

tic = time.time()
busca = GridSearchCV(RandomForestClassifier(),
                    espaco_de_parametros,
                    cv = KFold(n_splits = 5, shuffle=True))
tac = time.time()
tempo_que_passou = tac-tic


busca.fit(x_azar, y_azar)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

In [None]:
resultados_ordenados_pela_media = resultados.sort_values("mean_test_score")
resultados_ordenados_pela_media.sort_values("mean_test_score",inplace=True,ascending=False)
resultados_ordenados_pela_media
for indice,linha in resultados_ordenados_pela_media[:5].iterrows():
    print("%.3f +-(%.3f) %s" % (linha.mean_test_score,linha.std_test_score*2,linha.params))

In [None]:
scores = cross_val_score(busca, x_azar, y_azar, cv = KFold(n_splits=5, shuffle=True))
imprime_score(scores)
melhor = busca.best_estimator_
print(melhor)

In [None]:
SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "max_depth" : [3, 5 , 10,15,20,30,None],
    "min_samples_split": randint(32,128),
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"]

}
tic = time.time()
busca = RandomizedSearchCV(DecisionTreeClassifier(),
                    espaco_de_parametros, 
                    n_iter = 20,
                    cv = KFold(n_splits = 5,shuffle=True),
                          random_state = SEED)
busca.fit(x_azar, y_azar,groups = dados.modelo)
tac = time.time()
tempo_que_passou = tac - tic
print("tempo %.2f segundos" % tempo_que_passou)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

In [None]:
resultados_ordenados_pela_media = resultados.sort_values("mean_test_score")
resultados_ordenados_pela_media.sort_values("mean_test_score",inplace=True,ascending=False)
resultados_ordenados_pela_media
for indice,linha in resultados_ordenados_pela_media[:5].iterrows():
    print("%.3f +-(%.3f) %s" % (linha.mean_test_score,linha.std_test_score*2,linha.params))

In [None]:
scores = cross_val_score(busca, x_azar, y_azar, cv = KFold(n_splits=5, shuffle=True))
imprime_score(scores)
melhor = busca.best_estimator_
print(melhor)

In [79]:
from sklearn.ensemble import RandomForestClassifier
import time
SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "n_estimators" : randint(10,101),
    "max_depth" : randint(3,6),
    "min_samples_split": randint(32,129),
    "min_samples_leaf": randint(32,129),
    "criterion": ["gini", "entropy"],
    "bootstrap": [True,False]
}

tic = time.time()
busca = RandomizedSearchCV(RandomForestClassifier(),
                    espaco_de_parametros,
                    n_iter = 80,
                    cv = KFold(n_splits = 5, shuffle=True))
busca.fit(x_azar, y_azar)
tac = time.time()
tempo_que_passou = tac-tic


resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.2635,0.003925,0.013201,0.000245,False,gini,3,50,93,89,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.7715,0.762,0.775,0.7855,0.758,0.7704,0.009744,75
1,0.2261,0.003486,0.012,2e-06,True,gini,3,88,104,80,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.7715,0.7575,0.7925,0.7855,0.7575,0.7729,0.014277,63
2,0.2132,0.003655,0.0118,0.000246,True,gini,3,126,84,77,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.7685,0.7695,0.7925,0.7795,0.7575,0.7735,0.011781,59
3,0.1476,0.008704,0.0089,0.000733,True,gini,3,57,73,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.772,0.767,0.783,0.7865,0.758,0.7733,0.010429,60
4,0.241201,0.005835,0.011799,0.000401,True,gini,4,52,88,68,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.772,0.7685,0.793,0.785,0.7585,0.7754,0.01222,30


In [80]:
resultados_ordenados_pela_media = resultados.sort_values("mean_test_score")
resultados_ordenados_pela_media.sort_values("mean_test_score",inplace=True,ascending=False)
resultados_ordenados_pela_media
for indice,linha in resultados_ordenados_pela_media[:5].iterrows():
    print("%.3f +-(%.3f) %s" % (linha.mean_test_score,linha.std_test_score*2,linha.params))

0.779 +-(0.025) {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 84, 'min_samples_split': 89, 'n_estimators': 48}
0.778 +-(0.031) {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 32, 'min_samples_split': 96, 'n_estimators': 18}
0.778 +-(0.032) {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 121, 'min_samples_split': 47, 'n_estimators': 27}
0.778 +-(0.024) {'bootstrap': False, 'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 96, 'min_samples_split': 98, 'n_estimators': 11}
0.777 +-(0.029) {'bootstrap': True, 'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 63, 'min_samples_split': 88, 'n_estimators': 69}


se eu não posso ou não consigo usar cross validation

In [87]:
# 0.6 treino
# 0.2 teste       -> dev teste
# 0.2 validacao

from sklearn.model_selection import train_test_split
SEED = 301
np.random.seed(SEED)


x_treino_teste, x_validacao, y_treino_teste, y_validacao =   train_test_split(x_azar,y_azar,test_size=0.2,shuffle=True,stratify=y_azar)

print(x_treino_teste.shape)
print(y_treino_teste.shape)
print(x_validacao.shape)
print(y_validacao.shape)


(8000, 3)
(8000,)
(2000, 3)
(2000,)


In [88]:
from sklearn.model_selection import StratifiedShuffleSplit

SEED = 301
np.random.seed(SEED)

espaco_de_parametros = {
    "n_estimators" : randint(10,101),
    "max_depth" : randint(3,6),
    "min_samples_split": randint(32,129),
    "min_samples_leaf": randint(32,129),
    "criterion": ["gini", "entropy"],
    "bootstrap": [True, False]
}
split = StratifiedShuffleSplit(n_splits=1,test_size=0.25)

tic = time.time()
busca = RandomizedSearchCV(RandomForestClassifier(),
                           espaco_de_parametros,
                           n_iter=5,
                           cv=split)
busca.fit(x_azar,y_azar)
tac = time.time()
tempo_que_passou = tac = tic
print("Tempo %.2f segundos" % tempo_que_passou)

resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

Tempo 1682966822.47 segundos


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,0.302498,0.0,0.017002,0.0,False,gini,3,50,93,89,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.7836,0.7836,0.0,2
1,0.241001,0.0,0.015002,0.0,True,gini,3,88,104,80,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.7828,0.7828,0.0,3
2,0.234,0.0,0.014502,0.0,True,gini,3,126,84,77,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.7824,0.7824,0.0,4
3,0.151997,0.0,0.01,0.0,True,gini,3,57,73,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.7776,0.7776,0.0,5
4,0.2345,0.0,0.015501,0.0,True,gini,4,52,88,68,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.7844,0.7844,0.0,1


In [91]:
#validação
tic = time.time()
scores = cross_val_score(busca, x_validacao, y_validacao, cv = split)
tac = time.time()
tempo_passado = tac- tic
print("Tempo %.2f segundos " % tempo_passado)


melhor = busca.best_estimator_
print(scores)
print(melhor)

Tempo 0.52 segundos 
[0.792]
RandomForestClassifier(max_depth=4, min_samples_leaf=52, min_samples_split=88,
                       n_estimators=68)
