# 1. Importação de bibliotecas

In [2]:
import numpy as np
import pandas as pd

from google.colab import drive

from math import log, pi, sqrt

import statistics
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import pairwise_distances
from sklearn.metrics import euclidean_distances
from scipy.stats import multivariate_normal
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# 2. Leitura dos dados

In [4]:
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
path = '/content/drive/My Drive/[2020.1] APRENDIZADO DE MÁQUINA/TRABALHO/02. Arquivos/01. Dados/01. Dados para treino e teste/'
x_train = pd.read_csv(path + 'x_train.csv', index_col=False)
x_test = pd.read_csv(path + 'x_test.csv', index_col=False)
y_train = pd.read_csv(path + 'y_train.csv', index_col=False, usecols=['classification'])
y_test = pd.read_csv(path + 'y_test.csv', index_col=False, usecols=['classification'])

x_train.drop(columns=['Unnamed: 0'], inplace=True)
x_test.drop(columns=['Unnamed: 0'], inplace=True)

y_train = y_train['classification'].to_numpy()
y_test = y_test['classification'].to_numpy()

x_train = x_train.values
x_test = x_test.values

# 3. k Vizinhos mais Próximos

In [4]:
class KNN():    
    def __init__(self, k=3):
        self.k = k
        
    def distance_euclidean(self, a, b):
        return pairwise_distances(X=[a], Y=[b], metric='euclidean')

    def distance_manhattan(self, a, b):
        return pairwise_distances(X=[a], Y=[b], metric='manhattan')

    def distance_minkowski(self, a, b):
        return pairwise_distances(X=[a], Y=[b], metric='minkowski')

    def get_classes(self, y):
        return pd.array(y).unique()

    def fit(self, x_train, y_train, function = None):
        # Realizar grid search
        self.x_train = x_train
        self.y_train = y_train
            
        if function == 'euclidean':
            self.function = self.distance_euclidean
        elif function == 'manhattan':
            self.function = self.distance_manhattan
        elif function == 'minkowski':
            self.function = self.distance_minkowski    
        else:
            self.function = self.distance_euclidean

    def predict_1nn(self, x_test):
        classes = self.get_classes(self.y_train)
        distances_to_x_test = sorted([[self.function(self.x_train[i], x_test), self.y_train[i]] for i in range(len(self.x_train))]) 
        k_neighbors = [row[1] for row in distances_to_x_test][:self.k]
        return statistics.mode(k_neighbors)

    def predict(self, x_test):
        return [self.predict_1nn(xi_test) for xi_test in x_test]

## Treinamento do modelo

In [5]:
list_of_k = [3, 5, 7, 9, 11]
list_of_distances = ['euclidean', 'manhattan', 'minkowski']

for k_item in list_of_k:
    for distance_item in list_of_distances:
      knn = KNN(k=k_item)
      knn.fit(x_train, y_train, function = distance_item)
      label_prevista = knn.predict(x_test)

      print(f'Report for k={k_item} and {distance_item} distance\n')
      CR = classification_report(y_test, label_prevista)
      print(CR)
      print('\n\n')

Report for k=3 and euclidean distance

              precision    recall  f1-score   support

           0       0.64      0.71      0.67       130
           1       0.72      0.65      0.68       147

    accuracy                           0.68       277
   macro avg       0.68      0.68      0.68       277
weighted avg       0.68      0.68      0.68       277




Report for k=3 and manhattan distance

              precision    recall  f1-score   support

           0       0.65      0.73      0.69       130
           1       0.73      0.65      0.69       147

    accuracy                           0.69       277
   macro avg       0.69      0.69      0.69       277
weighted avg       0.69      0.69      0.69       277




Report for k=3 and minkowski distance

              precision    recall  f1-score   support

           0       0.64      0.71      0.67       130
           1       0.72      0.65      0.68       147

    accuracy                           0.68       277
   ma

In [6]:
report = classification_report(y_test, label_prevista, output_dict=True)

In [7]:
report

{'0': {'f1-score': 0.7535211267605634,
  'precision': 0.6948051948051948,
  'recall': 0.823076923076923,
  'support': 130},
 '1': {'f1-score': 0.7407407407407408,
  'precision': 0.8130081300813008,
  'recall': 0.6802721088435374,
  'support': 147},
 'accuracy': 0.7472924187725631,
 'macro avg': {'f1-score': 0.7471309337506521,
  'precision': 0.7539066624432478,
  'recall': 0.7516745159602303,
  'support': 277},
 'weighted avg': {'f1-score': 0.7467387558402965,
  'precision': 0.7575338283271718,
  'recall': 0.7472924187725631,
  'support': 277}}

## Salvando os resultados

In [14]:
resultados = pd.DataFrame({
    'precision': report['weighted avg']['precision'], 
    'recall': report['weighted avg']['recall'], 
    'f1score': report['weighted avg']['f1-score'], 
    'y_predict': [label_prevista],
    'y_real': [y_test]
})
resultados.to_json('/content/drive/My Drive/[2020.1] APRENDIZADO DE MÁQUINA/TRABALHO/05. Resultados/5.1. Resultados dos modelos/2. KNN/k_vizinhos_mais_proximos.json')

# AutoML

In [14]:
!apt-get install default-jre
!java -version

Reading package lists... Done
Building dependency tree       
Reading state information... Done
default-jre is already the newest version (2:1.11-68ubuntu1~18.04.1).
default-jre set to manually installed.
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
openjdk version "11.0.7" 2020-04-14
OpenJDK Runtime Environment (build 11.0.7+10-post-Ubuntu-2ubuntu218.04)
OpenJDK 64-Bit Server VM (build 11.0.7+10-post-Ubuntu-2ubuntu218.04, mixed mode, sharing)


In [15]:
!pip install h2o

Collecting h2o
[?25l  Downloading https://files.pythonhosted.org/packages/ad/5a/8328741dd6b1e9ac9345ec47e1e963d1bba56e0747063d6ce606a2813a55/h2o-3.30.0.6.tar.gz (128.4MB)
[K     |████████████████████████████████| 128.4MB 90kB/s 
Collecting colorama>=0.3.8
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.30.0.6-py2.py3-none-any.whl size=128412300 sha256=894d57ff0bf65d9a2e167b03cfd9f5d6ffea0dd29afe2b5b8676828c184e99a7
  Stored in directory: /root/.cache/pip/wheels/de/4c/dd/4813e95c4b5328b4de2e2e1aa56ca66a547f45d11fe47c3c8a
Successfully built h2o
Installing collected packages: colorama, h2o
Successfully installed colorama-0.4.3 h2o-3.30.0.6


In [16]:
import h2o
from h2o.automl import H2OAutoML

In [17]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.7" 2020-04-14; OpenJDK Runtime Environment (build 11.0.7+10-post-Ubuntu-2ubuntu218.04); OpenJDK 64-Bit Server VM (build 11.0.7+10-post-Ubuntu-2ubuntu218.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.6/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpb9q2zpnf
  JVM stdout: /tmp/tmpb9q2zpnf/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpb9q2zpnf/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.6
H2O_cluster_version_age:,19 days
H2O_cluster_name:,H2O_from_python_unknownUser_wqzzkp
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.180 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [34]:
path = '/content/drive/My Drive/[2020.1] APRENDIZADO DE MÁQUINA/TRABALHO/02. Arquivos/01. Dados/01. Dados para treino e teste/'
x_train = pd.read_csv(path + 'x_train.csv', index_col=False)
x_test = pd.read_csv(path + 'x_test.csv', index_col=False)
y_train = pd.read_csv(path + 'y_train.csv', index_col=False, usecols=['classification'])
y_test = pd.read_csv(path + 'y_test.csv', index_col=False, usecols=['classification'])

x_train.drop(columns=['Unnamed: 0'], inplace=True)
x_test.drop(columns=['Unnamed: 0'], inplace=True)

In [35]:
x_train['classification'] = y_train['classification']
x_test['classification'] = y_test['classification']

In [36]:
x_test.shape

(277, 301)

In [20]:
train = h2o.H2OFrame(x_train)
test = h2o.H2OFrame(x_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [21]:
x = train.columns
y = "classification"
x.remove(y)

In [22]:
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [23]:
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [24]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_BestOfFamily_AutoML_20200720_012255,0.920564,0.355423,0.93083,0.170044,0.337818,0.114121
GLM_1_AutoML_20200720_012255,0.920546,0.342566,0.930816,0.163746,0.335061,0.112266
DeepLearning_grid__1_AutoML_20200720_012255_model_1,0.919371,0.816107,0.928317,0.170625,0.38021,0.14456
StackedEnsemble_AllModels_AutoML_20200720_012255,0.918315,0.36059,0.929043,0.158509,0.340345,0.115835
DeepLearning_1_AutoML_20200720_012255,0.898427,0.507371,0.910437,0.17747,0.376214,0.141537
DeepLearning_grid__2_AutoML_20200720_012255_model_1,0.898291,0.517038,0.908201,0.188862,0.384239,0.14764
GBM_5_AutoML_20200720_012255,0.898054,0.396853,0.908399,0.177352,0.358366,0.128426
GBM_grid__1_AutoML_20200720_012255_model_2,0.895242,0.39862,0.907659,0.1941,0.360234,0.129768
XGBoost_grid__1_AutoML_20200720_012255_model_1,0.890815,0.429702,0.905246,0.184425,0.37001,0.136907
GBM_2_AutoML_20200720_012255,0.887837,0.414824,0.901985,0.192399,0.365586,0.133653




In [45]:
preds = aml.predict(test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [46]:
preds_as_df = h2o.as_list(preds)
test_as_df = h2o.as_list(test)

In [48]:
preds_as_df

Unnamed: 0,predict,p0,p1
0,1,0.045933,0.954067
1,0,0.953249,0.046751
2,0,0.955435,0.044565
3,1,0.028880,0.971120
4,1,0.210436,0.789564
...,...,...,...
272,0,0.901663,0.098337
273,1,0.028452,0.971548
274,0,0.931121,0.068879
275,0,0.842654,0.157346


In [49]:
y_test_2 =  preds_as_df['predict']
label_prevista_2 = test_as_df['classification']
report = classification_report(y_test_2, label_prevista_2, output_dict=True)

In [56]:
report = classification_report(y_test_2, label_prevista_2, output_dict=True)

In [55]:
print(report)

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       126
           1       0.87      0.85      0.86       151

    accuracy                           0.85       277
   macro avg       0.85      0.85      0.85       277
weighted avg       0.85      0.85      0.85       277



In [57]:
resultados = pd.DataFrame({
    'precision': report['weighted avg']['precision'], 
    'recall': report['weighted avg']['recall'], 
    'f1score': report['weighted avg']['f1-score'], 
    'y_predict': [label_prevista_2],
    'y_real': [y_test_2]
})
resultados.to_json('/content/drive/My Drive/[2020.1] APRENDIZADO DE MÁQUINA/TRABALHO/05. Resultados/5.1. Resultados dos modelos/9. AutoML/automl.json')