# Configuração Inicial

In [1]:
import pathlib
import sys
import os
requirements_filename = 'requirements_ap7.txt'
if str(pathlib.Path().resolve().name) == "notebooks":
    root = str(pathlib.Path().resolve().parent.parent)+os.sep
    sys.path.append(root)
    requirements_path = root + 'requirements/'+requirements_filename
else:
    ! git clone --branch ap7 https://github.com/Fabio-Trindade/Eng-Aprendizado-Maquina.git
    root = str(pathlib.Path().resolve())
    src_path = root + '/Eng-Aprendizado-Maquina/'
    requirements_path = src_path + 'requirements/'+requirements_filename
    sys.path.append(root + '/Eng-Aprendizado-Maquina/')

! pip install -r $requirements_path



# Imports

In [2]:
from src.utils.util_read_file import UtilReadFile
from src.constants.KPaths import KPaths
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import pickle


In [3]:
# Rebind root to the project root exposed by KPaths; this replaces the value
# computed in the setup cell and is the base for dataset/checkpoint paths below.
root = KPaths.path_root

# Considerações Iniciais

Assim como na [Atividade 6](https://github.com/Fabio-Trindade/Eng-Aprendizado-Maquina/blob/ap6/src/notebooks/3869-Fabio-AP6.ipynb), neste *notebook* a etapa de análise exploratória será feita de forma mais direta, uma vez que o passo a passo já foi feito na [Atividade 5](https://github.com/Fabio-Trindade/Eng-Aprendizado-Maquina/blob/ap5/src/notebooks/3869-Fabio-AP5.ipynb) para este mesmo *dataset* (*breastcancerwisconsin*). Além disso, será utilizado PCA com 15 componentes para transformar os dados, uma vez que esta configuração forneceu o melhor resultado nestas atividades anteriores.

# Criação e Manipulação do *dataset*

## Remoção das Colunas Irrelevantes 

In [4]:
# Load the raw CSV, then discard the 'id' and 'Unnamed: 32' columns, which
# are not used as features or as the label.
dataset_path = root + '/datasets/breastcancerwisconsin.csv'
df = UtilReadFile.read_csv_with_pandas(dataset_path).drop(columns=['id', 'Unnamed: 32'])

## Transformação da Coluna *Diagnosis*

In [5]:
# Encode the diagnosis letters as integers ('B' -> 0, 'M' -> 1) in a new
# column; any value missing from the mapping becomes NaN.
mapp = dict(B=0, M=1)
df['diagnosis_bin'] = df.diagnosis.map(mapp)

In [6]:
# Preview the first rows to confirm the new diagnosis_bin column.
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_bin
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


## Definindo Colunas de *features* e *labels*

In [7]:
label_column = 'diagnosis_bin'
# Keep the DataFrame's own column order. A set difference would yield an
# order that can change between interpreter runs, making the column order seen
# by the scaler and PCA (both pickled later) non-reproducible.
feature_columns = [col for col in df.columns if col not in ('diagnosis', label_column)]

## Aplicando PCA com 15 Componentes

### Normalizando os Dados

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column in place.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook, so test rows influence
# the scaling statistics.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[feature_columns])
df[feature_columns] = scaled_features

### Reduzindo a Dimensionalidade dos Dados

In [9]:
from sklearn.decomposition import PCA

# Single source of truth for the number of principal components.
n_components = 15

pca = PCA(n_components=n_components)
pca_scores = pca.fit_transform(df[feature_columns])

# From here on feature_columns names the PCA columns instead of the raw ones.
# Re-running this cell alone therefore fails (df has no PCA* columns); re-run
# the feature-column cell first.
feature_columns = [f'PCA{i+1}' for i in range(n_components)]

# Assemble the transformed features plus the label (rows align on the shared default index).
pca_df = pd.DataFrame(data=pca_scores, columns=feature_columns)
pca_df[label_column] = df[label_column]

In [10]:
# Display the PCA-transformed frame: the component columns plus the label.
pca_df

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15,diagnosis_bin
0,9.192837,1.948583,-1.123166,3.633731,-1.195110,1.411424,2.159370,-0.398407,-0.157118,-0.877402,0.262955,-0.859014,0.103387,-0.690803,-0.601796,1
1,2.387802,-3.768172,-0.529293,1.118264,0.621775,0.028656,0.013358,0.240988,-0.711905,1.106995,0.813120,0.157923,-0.943529,-0.653476,0.008976,1
2,5.733896,-1.075174,-0.551748,0.912083,-0.177086,0.541452,-0.668166,0.097374,0.024066,0.454275,-0.605604,0.124387,-0.410627,0.016680,0.483420,1
3,7.122953,10.275589,-3.232790,0.152547,-2.960878,3.053422,1.429911,1.059565,-1.405440,-1.116975,-1.151513,1.011316,-0.933271,-0.487418,-0.168847,1
4,3.935302,-1.948072,1.389767,2.940639,0.546747,-1.226495,-0.936213,0.636376,-0.263805,0.377704,0.651360,-0.110515,0.387948,-0.539181,0.310320,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,6.439315,-3.576817,2.459487,1.177314,-0.074824,-2.375193,-0.596130,-0.035471,0.987929,0.256989,-0.062651,0.123342,-0.051723,-0.404291,0.652751,1
565,3.793382,-3.584048,2.088476,-2.506028,-0.510723,-0.246710,-0.716326,-1.113360,-0.105207,-0.108632,0.244804,0.222753,-0.192637,0.015556,-0.069975,1
566,1.256179,-1.902297,0.562731,-2.089227,1.809991,-0.534447,-0.192758,0.341887,0.393917,0.520877,-0.840512,0.096473,0.157418,0.285691,0.090998,1
567,10.374794,1.672010,-1.877029,-2.356031,-0.033742,0.567936,0.223082,-0.280239,-0.542035,-0.089296,-0.178628,-0.697461,1.225195,0.218697,0.229590,1


# Aplicação dos Algoritmos

Para classificar a coluna *diagnosis*, serão utilizados os algoritmos *Logistic Regression* e *SVM* para treinar dois modelos. 

### Função de avaliação

In [11]:
from sklearn.metrics import accuracy_score

def evaluate(model,features,labels):
    """Return the accuracy of ``model`` on the given data.

    Args:
        model: Fitted estimator exposing ``predict``.
        features: Inputs passed to ``model.predict``.
        labels: Ground-truth labels compared against the predictions.

    Returns:
        The score computed by ``accuracy_score(labels, predictions)``.
    """
    predictions = model.predict(features)
    return accuracy_score(y_true=labels, y_pred=predictions)

### Funções de Treinamento

In [12]:
def train_with_cross_validation(model,features,labels):
    """Run 10-fold cross-validation and return the best-scoring fold estimator.

    Args:
        model: Unfitted estimator passed to ``cross_validate``.
        features: Training inputs.
        labels: Training targets.

    Returns:
        A tuple ``(test_scores, best_estimator)``: the per-fold validation
        accuracies and the estimator fitted on the fold whose validation
        accuracy is highest (first one on ties). That estimator was trained
        on its fold's training portion only, not on all of ``features``.
    """
    # Train scores are not requested: nothing here reads them, and computing
    # them only adds work.
    cv_results = cross_validate(
        model, features, labels,
        cv=10, scoring='accuracy', return_estimator=True,
    )
    test_scores = cv_results['test_score']
    best_fold = int(np.argmax(test_scores))
    return test_scores, cv_results['estimator'][best_fold]

def train_with_grid_search(model,params,features,labels):
    """Fit a 10-fold, accuracy-scored grid search and return it.

    Args:
        model: Estimator whose hyper-parameters are searched.
        params: Parameter grid handed to ``GridSearchCV``.
        features: Training inputs.
        labels: Training targets.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    searcher = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=10)
    # fit() returns the search object itself.
    fitted_searcher = searcher.fit(features, labels)
    return fitted_searcher

## Separação dos Dados em Treino e Teste

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(pca_df, test_size=0.2, random_state=42)
# Renumber rows from 0, discarding the old index instead of materialising it
# as an 'index' column and dropping it afterwards.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [14]:
# Preview the first rows of the held-out test split.
df_test.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15,diagnosis_bin
0,-0.78119,0.652849,-0.643657,0.214468,-0.43768,-0.160489,0.187235,-0.06819,0.533758,0.189562,0.086236,-0.052929,0.075326,0.010613,0.09724,0
1,2.704596,-4.437142,0.307345,0.488693,0.372495,-0.291801,0.283493,0.086264,0.327155,0.033346,-0.429226,0.205136,-0.372151,-0.179519,-0.292029,1
2,1.432069,-1.049604,-1.262011,0.533189,-0.626697,-0.785916,-0.621555,-0.214944,0.669595,0.13106,0.107123,0.201035,0.446399,0.160282,0.41825,1
3,-0.915804,2.479013,0.362607,0.261967,0.138472,-0.991236,-0.051203,-0.093964,-0.626973,0.109568,0.19702,0.097849,0.334178,0.240527,-0.388024,0
4,-1.665475,2.389618,1.502249,0.875951,0.484546,-1.189518,-0.677755,-0.146643,-0.291648,0.102733,0.460337,-0.872408,0.133983,0.292088,-0.534175,0


In [15]:
# NOTE(review): this cell repeats the previous one verbatim; df_train.head()
# may have been intended here — confirm.
df_test.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15,diagnosis_bin
0,-0.78119,0.652849,-0.643657,0.214468,-0.43768,-0.160489,0.187235,-0.06819,0.533758,0.189562,0.086236,-0.052929,0.075326,0.010613,0.09724,0
1,2.704596,-4.437142,0.307345,0.488693,0.372495,-0.291801,0.283493,0.086264,0.327155,0.033346,-0.429226,0.205136,-0.372151,-0.179519,-0.292029,1
2,1.432069,-1.049604,-1.262011,0.533189,-0.626697,-0.785916,-0.621555,-0.214944,0.669595,0.13106,0.107123,0.201035,0.446399,0.160282,0.41825,1
3,-0.915804,2.479013,0.362607,0.261967,0.138472,-0.991236,-0.051203,-0.093964,-0.626973,0.109568,0.19702,0.097849,0.334178,0.240527,-0.388024,0
4,-1.665475,2.389618,1.502249,0.875951,0.484546,-1.189518,-0.677755,-0.146643,-0.291648,0.102733,0.460337,-0.872408,0.133983,0.292088,-0.534175,0


## Criação dos Algoritmos

### Logistic Regression

In [16]:
# Logistic regression classifier created with its default hyper-parameters.
lr_class = LogisticRegression()

### SVM

In [17]:
# Support vector classifier with a linear kernel; all other hyper-parameters keep their defaults.
class_svm = svm.SVC(kernel='linear')

## Treinamento e Avaliação dos Modelos

### Logistic Regression

#### Treinamento com Validação Cruzada


In [18]:
# 10-fold cross-validation on the training split. lr_class is rebound from the
# unfitted model to the fold estimator with the highest validation accuracy.
scores, lr_class = train_with_cross_validation(lr_class,df_train[feature_columns],df_train[label_column])

In [19]:
# Report the per-fold validation accuracies and their mean.
print("Scores", scores)
print("Média dos scores:", scores.mean())

Scores [0.97826087 0.97826087 0.97826087 0.93478261 1.         1.
 0.97777778 0.97777778 0.95555556 0.97777778]
Média dos scores: 0.9758454106280194


#### Avaliação

In [20]:
# Accuracy of the selected logistic-regression estimator on the held-out test split.
evaluate(lr_class,df_test[feature_columns],df_test[label_column])

0.9912280701754386

### SVM

#### Treinamento com Validação Cruzada

In [21]:
# 10-fold cross-validation on the training split. class_svm is rebound from the
# unfitted model to the fold estimator with the highest validation accuracy;
# scores now holds the SVM results, replacing the logistic-regression ones.
scores, class_svm = train_with_cross_validation(class_svm,df_train[feature_columns],df_train[label_column])

In [22]:
# Report the per-fold validation accuracies of the SVM and their mean.
print("Scores", scores)
print("Média dos scores:", scores.mean())

Scores [0.97826087 0.97826087 0.97826087 0.93478261 1.         1.
 0.97777778 0.97777778 0.93333333 0.97777778]
Média dos scores: 0.9736231884057972


#### Avaliação

In [23]:
# Accuracy of the selected SVM estimator on the held-out test split.
evaluate(class_svm,df_test[feature_columns],df_test[label_column])

0.9912280701754386

### Ajuste de Hiper-parâmetros

Abaixo é utilizado o método Grid Search para ajustar os hiper-parâmetros.

#### Logistic Regression - Grid Search

In [24]:
# Search the regularisation strength C of an L2-penalised logistic regression
# with a 10-fold grid search on the training split, then score on the test split.
lr_param_grid = {'C': [0.1, 1, 10, 50]}
gs_lr = train_with_grid_search(
    LogisticRegression(penalty='l2'),
    lr_param_grid,
    df_train[feature_columns],
    df_train[label_column],
)
print("Melhor parâmetro:", gs_lr.best_params_)
print("Acurácia nos dados de teste", gs_lr.score(df_test[feature_columns], df_test[label_column]))

Melhor parâmetro: {'C': 1}
Acurácia nos dados de teste 0.9912280701754386


#### SVM - Grid Search

In [25]:
# Grid over C, gamma and kernel for the SVM (10-fold grid search on the training split).
# NOTE(review): gamma has no effect on the 'linear' kernel, so those
# combinations are fitted redundantly; the grid is kept as-is so the reported
# best_params_ keep the same keys.
svm_param_grid = {'C': [0.1, 1, 10, 50],
                  'gamma': [0.001, 0.01, 0.1, 1],
                  'kernel': ['linear', 'rbf', 'poly']}
# Stored under its own name so the logistic-regression search (gs_lr) is not overwritten.
gs_svm = train_with_grid_search(svm.SVC(), svm_param_grid, df_train[feature_columns], df_train[label_column])
print("Melhor parâmetro:", gs_svm.best_params_)
print("Acurácia nos dados de teste", gs_svm.score(df_test[feature_columns], df_test[label_column]))

Melhor parâmetro: {'C': 0.1, 'gamma': 0.001, 'kernel': 'linear'}
Acurácia nos dados de teste 0.9824561403508771


### Conclusão

Apesar de a acurácia do modelo treinado com o algoritmo Logistic Regression continuar a mesma após o ajuste de hiper-parâmetros, a acurácia do modelo treinado com o SVM diminuiu levemente. Considerando o melhor resultado de cada algoritmo, ambos os modelos obtiveram a mesma acurácia; portanto, o SVM na sua melhor configuração foi escolhido para ser persistido e implantado em um servidor da AWS.

# Salvando PCA, Scaler e Modelo

In [26]:
# Persist the fitted model together with the preprocessing objects it depends on.
# class_svm is the linear-kernel estimator selected by cross-validation above,
# not the grid-search result.
checkpoint_dir = root + '/checkpoints'
# Create the target directory if needed so open() does not fail on a fresh checkout.
os.makedirs(checkpoint_dir, exist_ok=True)

artifacts = {
    'svm.pkl': class_svm,
    'pca.pkl': pca,
    'scaler.pkl': scaler,
}
for filename, artifact in artifacts.items():
    with open(checkpoint_dir + '/' + filename, 'wb') as f:
        pickle.dump(artifact, f)