# Projeto Final de Ciência dos Dados (PkmnID)

## Algoritmo de predição da categoria de Pokémons por meio de suas imagens.
### O algoritmo realiza a extração e a clusterização de features de imagens por meio do método "Bag of Visual Words" (BOVW), classifica-as utilizando o método de machine learning "Random Forest" e prevê a categoria de Pokémons a partir de novas imagens.

In [124]:
!pip install opencv-contrib-python
import cv2
import os
import os.path
import numpy as np
import math
import pandas as pd
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Fix NumPy's global random seed so repeated runs draw the same numbers.
np.random.seed(0)

# Root directories of the three image sets used by this notebook.
TRAIN_DIR = 'Assets/Final/Train_base'
SUPER_TRAIN_DIR = 'Assets/Final/Train_super'
TEST_DIR = 'Assets/Final/Test'
# Entry names found directly under TRAIN_DIR, in the (arbitrary) order that
# os.listdir returns them. Presumably one sub-directory per Pokémon class.
CLASSES = os.listdir(TRAIN_DIR)

# Number of clusters requested when building the visual vocabulary.
NUM_CLUSTERS = 40



In [125]:
## Control panel: 0/1 switches that enable or skip stages of the notebook.

# Re-extract the features from the images (when 0, saved features are loaded instead).
READ_IMAGES = 0

# Save the feature dictionaries and the vocabulary to files under Assets/files.
# Only checked when READ_IMAGES is set.
UPDATE_FILES = 0

# Build a dataframe from the data in the feature dictionary.
CREATE_FEATURE_DATAFRAME = 1

# Show, for each feature, the 'n' Pokémon in which it is most prominent.
SHOW_TOP_N_FOR_FEATURES = 1

# Re-train the models.
FIT_MODELS = 1

# Plot confusion matrices for all models.
PLOT_CONFUSION_MATRIXES = 0

# Produce the precision@n metric for all models.
PRECISION_AT_N = 0

# Run the model that combines the original models (per the original note;
# the code that reads this flag is outside this section).
RUN_SUPER_MODEL = 0


## 1- Extração de features de imagens: Bag of Visual Words
### Uma vez que o dataset se trata de um conjunto de imagens de diferentes Pokémons, é necessário inicialmente extrair features dessas imagens, através do método "Bag of Visual Words".
### Com as imagens transformadas em features clusterizadas, elas são separadas em categorias de treino e teste, que serão utilizadas posteriormente pelo algoritmo de machine learning.
### O código abaixo realiza essas duas etapas:
#### Obs: Código produzido com a assistência do Prof. Fábio Ayres

In [126]:
def get_img_names(TRAIN_DIR = TRAIN_DIR, SUPER_TRAIN_DIR = SUPER_TRAIN_DIR, TEST_DIR = TEST_DIR):
    """Collect image paths and class labels for the three image sets.

    Each root directory is expected to contain one sub-directory per class;
    the sub-directory name is used as the label of every file inside it.

    The three roots are walked independently. Walking them in lockstep with
    ``zip`` would stop at the shortest listing and silently drop images
    whenever the sets have a different number of classes or of images per
    class.

    Args:
        TRAIN_DIR: root directory of the base training set.
        SUPER_TRAIN_DIR: root directory of the "super" training set.
        TEST_DIR: root directory of the test set.

    Returns:
        Tuple ``(TRAIN_IMG, TEST_IMG, SUPER_TRAIN_IMG, SUPER_TRAIN_LABEL,
        TRAIN_LABEL, TEST_LABEL)``. The ``*_IMG`` lists hold file paths and
        the ``*_LABEL`` lists hold the matching class names, index by index.
    """

    def _list_labeled_images(root):
        # Return (paths, labels) for every file under root/<label>/.
        paths, labels = [], []
        for label in os.listdir(root):
            class_dir = os.path.join(root, label)
            for file_name in os.listdir(class_dir):
                paths.append(os.path.join(class_dir, file_name))
                labels.append(label)
        return paths, labels

    TRAIN_IMG, TRAIN_LABEL = _list_labeled_images(TRAIN_DIR)
    SUPER_TRAIN_IMG, SUPER_TRAIN_LABEL = _list_labeled_images(SUPER_TRAIN_DIR)
    TEST_IMG, TEST_LABEL = _list_labeled_images(TEST_DIR)

    return TRAIN_IMG, TEST_IMG, SUPER_TRAIN_IMG, SUPER_TRAIN_LABEL, TRAIN_LABEL, TEST_LABEL

def cria_vocabulario(imagens, num_clusters):
    """Build the visual vocabulary by clustering KAZE descriptors.

    Args:
        imagens: iterable of image file paths.
        num_clusters: number of clusters (visual words) to produce.

    Returns:
        The result of ``cv2.BOWKMeansTrainer.cluster()`` over the descriptors
        of all the images.

    Raises:
        ValueError: if an image cannot be read.
    """
    km = cv2.BOWKMeansTrainer(num_clusters)
    kaze = cv2.KAZE_create()
    for p in imagens:
        img = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None instead of raising on a bad file.
            raise ValueError("Could not read image: %s" % p)
        # No mask is passed: the whole image is searched. OpenCV requires
        # masks to be 8-bit, so an all-ones float64 mask is not a valid way
        # to say "everywhere".
        _, desc = kaze.detectAndCompute(img, None)
        if desc is None or len(desc) == 0:
            # An image with no keypoints contributes nothing to the vocabulary.
            continue
        km.add(desc)
    return km.cluster()

def representa(vocab, img):
    """Describe ``img`` as a bag-of-visual-words histogram over ``vocab``.

    Args:
        vocab: visual vocabulary set on the descriptor extractor.
        img: image handed to the KAZE detector and to the extractor.

    Returns:
        Whatever ``cv2.BOWImgDescriptorExtractor.compute`` returns for the
        keypoints detected in ``img``.
    """
    detector = cv2.KAZE_create()
    extractor = cv2.BOWImgDescriptorExtractor(detector, cv2.FlannBasedMatcher())
    extractor.setVocabulary(vocab)
    keypoints = detector.detect(img)
    return extractor.compute(img, keypoints)

def transforma_imagens(imagens, vocab):
    """Turn image files into a matrix of bag-of-visual-words vectors.

    Args:
        imagens: iterable of image file paths, read as grayscale.
        vocab: visual vocabulary forwarded to ``representa``.

    Returns:
        NumPy array with one flattened ``representa`` result per image, in
        the order of ``imagens``.
    """
    histograms = [
        representa(vocab, cv2.imread(path, cv2.IMREAD_GRAYSCALE)).flatten()
        for path in imagens
    ]
    return np.array(histograms)

def show_example(path = None, plot = True):
    """Load one image, optionally display it, and return its feature vector.

    Args:
        path: image file to load. When omitted, the first entry listed in
            ``Testes/Testes/`` is used. The default is resolved at call time,
            so defining this function does not require that directory to
            exist, and the directory is prepended to the file name.
        plot: when true, show the resized grayscale image with matplotlib.

    Returns:
        The ``representa`` result for the image resized to 120x120, using
        the module-level ``vocab``.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    if path is None:
        examples_dir = "Testes/Testes/"
        path = os.path.join(examples_dir, os.listdir(examples_dir)[0])
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None instead of raising on a bad path.
        raise FileNotFoundError("Could not read image: %s" % path)
    img_resized = cv2.resize(img, dsize=(120, 120))
    if plot:
        plt.imshow(img_resized, cmap='gray', vmin=0, vmax=255)
    return representa(vocab, img_resized)

In [127]:
# Per-class feature matrices: {class name: array with one row per image}.
# Hist_Dict holds the training set and Test_Dict the test set.
# Train_Dict is kept because code outside this section may reference it.
Train_Dict = {}
Hist_Dict = {}
Test_Dict = {}
TRAIN_IMG, TEST_IMG, SUPER_TRAIN_IMG, SUPER_TRAIN_LABEL, TRAIN_LABEL, TEST_LABEL = get_img_names()

if READ_IMAGES:
    # Rebuild the vocabulary, then describe every image of every class with it.
    vocab = cria_vocabulario(TRAIN_IMG, NUM_CLUSTERS)
    for pkmn in os.listdir(TRAIN_DIR):
        pkmn_dir = os.path.join(TRAIN_DIR, pkmn)
        Hist_Dict[pkmn] = transforma_imagens([os.path.join(pkmn_dir, n) for n in os.listdir(pkmn_dir)], vocab)
    for pkmn in os.listdir(TEST_DIR):
        pkmn_dir = os.path.join(TEST_DIR, pkmn)
        Test_Dict[pkmn] = transforma_imagens([os.path.join(pkmn_dir, n) for n in os.listdir(pkmn_dir)], vocab)

    if UPDATE_FILES:
        # Create the output directory if needed; an existing one is fine.
        os.makedirs('Assets/files', exist_ok=True)
        np.save('Assets/files/Features_Train', Hist_Dict)
        np.save('Assets/files/Features_Test', Test_Dict)
        np.save('Assets/files/Bag_of_Visual_Words', vocab)

else:
    # NOTE(security): allow_pickle=True runs the pickle loader; only load
    # files produced by this notebook or from a trusted source.
    # The trailing [()] unwraps the 0-d object array that np.save creates
    # around a dict.
    Hist_Dict = np.load('Assets/files/Features_Train.npy', allow_pickle=True)[()]
    Test_Dict = np.load('Assets/files/Features_Test.npy', allow_pickle=True)[()]
    vocab = np.load('Assets/files/Bag_of_Visual_Words.npy', allow_pickle=True)[()]

In [128]:
def _as_object_array(items):
    """Pack ``items`` into a 1-D object array, one element per item.

    Filling the array element by element works even when the items have
    different lengths. Calling ``np.array`` directly on such a ragged list
    raises on recent NumPy versions.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


def _flatten_by_class(feature_dict):
    """Return ``(X, y)`` built from a ``{label: feature matrix}`` dict.

    Every row of every matrix becomes one sample. Its label is the key the
    matrix is stored under, so samples and labels cannot get out of step
    with an external class list.
    """
    rows, labels = [], []
    for pkmn, matrix in feature_dict.items():
        for feature_vector in matrix:
            rows.append(feature_vector)
            labels.append(pkmn)
    return np.array(rows), np.array(labels)


# Per-class views: element i is the feature matrix (or label vector) of the
# i-th class, in dict order.
X_train_vectors = _as_object_array(list(Hist_Dict.values()))
X_test_vectors = _as_object_array(list(Test_Dict.values()))
y_train_vectors = _as_object_array([np.array([k] * len(v)) for k, v in Hist_Dict.items()])
y_test_vectors = _as_object_array([np.array([k] * len(v)) for k, v in Test_Dict.items()])

# Flat sample/label arrays consumed by the classifiers.
X_train, y_train = _flatten_by_class(Hist_Dict)
X_test, y_test = _flatten_by_class(Test_Dict)

## 2 - Análise Exploratória:
### Para realizar a análise exploratória seguiremos alguns passos:

## 2.2 - Criar um dataframe para trabalhar melhor com o dataset:

In [129]:
if CREATE_FEATURE_DATAFRAME:
    # One row per Pokémon: the mean of each of the first NUM_CLUSTERS feature
    # columns over that Pokémon's images, accumulated in float64.
    lista = [
        np.asarray(Hist_Dict[k])[:, :NUM_CLUSTERS].mean(axis=0, dtype=np.float64)
        for k in Hist_Dict
    ]
    # Label the rows with the dict keys that produced them, so the index
    # cannot disagree with the row order.
    df_medias = pd.DataFrame(lista, index = list(Hist_Dict))
else:
    # Placeholder so later cells still find a dataframe named df_medias.
    df_medias = pd.DataFrame([[1],[1]])

## Tabela das frequências relativas médias de cada feature por pokémon:

In [130]:
# Display the table of mean feature frequencies per Pokémon.
df_medias

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
Aerodactyl,0.044075,0.03623,0.023366,0.033021,0.023688,0.10218,0.03204,0.005423,0.016188,0.060155,...,0.012572,0.024531,0.025768,0.011084,0.006088,0.02907,0.033695,0.025318,0.023531,0.102536
Alakazam,0.042203,0.071594,0.019158,0.046033,0.032938,0.054789,0.030445,0.006991,0.019175,0.041339,...,0.010976,0.035718,0.045208,0.005365,0.009104,0.027694,0.020182,0.033983,0.029512,0.046339
Arbok,0.021487,0.036324,0.050306,0.020419,0.026035,0.029829,0.025081,0.014049,0.015443,0.01943,...,0.016183,0.014349,0.020465,0.084399,0.014313,0.008742,0.013688,0.02296,0.022318,0.030259
Arcanine,0.03045,0.041339,0.037431,0.043823,0.028322,0.022084,0.023125,0.009938,0.018811,0.028991,...,0.012259,0.028663,0.036785,0.01808,0.011499,0.030481,0.026166,0.045413,0.042705,0.014914
Beedrill,0.045788,0.052127,0.015594,0.029527,0.02349,0.057362,0.020623,0.003705,0.016917,0.029218,...,0.009918,0.036671,0.0379,0.008291,0.005948,0.041655,0.036905,0.04564,0.048834,0.049458
Bellsprout,0.01019,0.025281,0.023933,0.056161,0.031274,0.027047,0.017095,0.0084,0.017391,0.029988,...,0.005172,0.049062,0.032132,0.009053,0.008097,0.056463,0.051991,0.060806,0.041321,0.020776
Bulbasaur,0.038257,0.039303,0.057396,0.036865,0.018234,0.018575,0.025118,0.006608,0.020509,0.029396,...,0.025432,0.014796,0.020394,0.047251,0.011836,0.006658,0.007208,0.020668,0.019306,0.009178
Charmander,0.03088,0.017806,0.038978,0.023512,0.035953,0.038134,0.01955,0.006893,0.03424,0.039148,...,0.018101,0.01917,0.030615,0.037461,0.008151,0.019516,0.027197,0.034926,0.032833,0.034143
Jigglypuff,0.021146,0.02988,0.025263,0.065154,0.017972,0.0166,0.022263,0.007698,0.01202,0.01917,...,0.021365,0.032932,0.035,0.018608,0.00922,0.037777,0.02542,0.036577,0.03804,0.012375
Meowth,0.034858,0.043058,0.017794,0.047876,0.028052,0.029387,0.019585,0.008609,0.014159,0.046917,...,0.008856,0.041095,0.044787,0.007415,0.011203,0.043575,0.028548,0.031875,0.034071,0.024253


## 2.3 - Calculando os valores médios dos dados:
### Nesta etapa foram calculados os valores médios dos dados e, em seguida, eles foram centralizados em torno da origem do sistema, subtraindo-se 1/NUM_CLUSTERS de cada valor.

In [131]:
# Shift every mean frequency down by the uniform level 1/NUM_CLUSTERS.
df_medias = df_medias - (1/NUM_CLUSTERS)

In [132]:
# Row sums after the shift; the values displayed are all of order 1e-7 or smaller.
df_medias.sum(axis=1)

Aerodactyl    1.480803e-07
Alakazam      1.266599e-07
Arbok         1.131557e-07
Arcanine      1.103617e-07
Beedrill     -1.140870e-08
Bellsprout    7.683411e-08
Bulbasaur    -2.281740e-08
Charmander    1.410954e-07
Jigglypuff    4.190952e-09
Meowth       -9.313226e-10
Pidgey        1.015142e-07
Squirtle      1.615845e-07
Voltorb      -1.168810e-07
dtype: float64

In [133]:
# Squared Euclidean length of each row, then divide each row by its length
# so that every Pokémon's vector ends up with unit norm.
normas = df_medias.mul(df_medias).sum(axis=1)
df_medias = df_medias.div(np.sqrt(normas), axis=0)

## 2.4 - Comparação entre os Pokémons:
### Com base nos valores calculados anteriormente, foi criada a tabela seguinte, que mostra o quanto os Pokémons são semelhantes entre si, sendo 1 a semelhança máxima, e -1 o oposto.

In [134]:
# Dot product between every pair of rows of df_medias. The rows were scaled
# to unit length in the previous step, so each entry is a cosine similarity.
df_compara = df_medias.dot(df_medias.transpose())
df_compara

Unnamed: 0,Aerodactyl,Alakazam,Arbok,Arcanine,Beedrill,Bellsprout,Bulbasaur,Charmander,Jigglypuff,Meowth,Pidgey,Squirtle,Voltorb
Aerodactyl,1.0,0.702084,0.092649,0.284312,0.706093,0.275357,-0.048073,0.421713,0.056577,0.439771,0.560301,0.208436,0.43744
Alakazam,0.702084,1.0,0.190372,0.659588,0.793147,0.445378,0.075146,0.27708,0.235179,0.698492,0.449811,0.442211,0.103099
Arbok,0.092649,0.190372,1.0,0.340027,0.095485,-0.127736,0.564565,0.345506,-0.021403,0.002739,0.131951,0.47896,0.599455
Arcanine,0.284312,0.659588,0.340027,1.0,0.735642,0.602886,0.369396,0.49743,0.540171,0.725805,0.66695,0.862296,0.090724
Beedrill,0.706093,0.793147,0.095485,0.735642,1.0,0.579947,0.107189,0.47523,0.36318,0.727374,0.743031,0.544686,0.274503
Bellsprout,0.275357,0.445378,-0.127736,0.602886,0.579947,1.0,-0.170264,0.290983,0.7075,0.734042,0.553396,0.533618,-0.017376
Bulbasaur,-0.048073,0.075146,0.564565,0.369396,0.107189,-0.170264,1.0,0.591847,0.252955,0.241852,0.361455,0.530171,0.514097
Charmander,0.421713,0.27708,0.345506,0.49743,0.47523,0.290983,0.591847,1.0,0.44475,0.543096,0.827574,0.739806,0.627
Jigglypuff,0.056577,0.235179,-0.021403,0.540171,0.36318,0.7075,0.252955,0.44475,1.0,0.698497,0.50542,0.586286,0.170074
Meowth,0.439771,0.698492,0.002739,0.725805,0.727374,0.734042,0.241852,0.543096,0.698497,1.0,0.742866,0.63966,0.145789


### Podemos observar que alguns Pokémons possuem muitas semelhanças, pois apresentam as mesmas features em abundância (na média).

In [135]:
if SHOW_TOP_N_FOR_FEATURES:
    # For each feature, list the five Pokémon with the largest value in that
    # column (alphabetically), followed by the feature's number.
    monstros = []
    for feat in range(NUM_CLUSTERS):
        top_five = df_medias.nlargest(n=5, columns=[feat])
        names = sorted(top_five.index)
        names.append(feat)
        monstros.append(names)
    # Sorting the rows puts features that share the same Pokémon side by side.
    x = sorted(monstros)
    pprint(x)

[['Aerodactyl', 'Alakazam', 'Arbok', 'Bulbasaur', 'Voltorb', 6],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Bulbasaur', 'Pidgey', 0],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Charmander', 'Voltorb', 39],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Pidgey', 'Voltorb', 5],
 ['Aerodactyl', 'Alakazam', 'Bellsprout', 'Bulbasaur', 'Voltorb', 16],
 ['Aerodactyl', 'Alakazam', 'Bulbasaur', 'Charmander', 'Squirtle', 8],
 ['Aerodactyl', 'Alakazam', 'Bulbasaur', 'Charmander', 'Voltorb', 21],
 ['Aerodactyl', 'Alakazam', 'Bulbasaur', 'Jigglypuff', 'Voltorb', 24],
 ['Aerodactyl', 'Alakazam', 'Charmander', 'Meowth', 'Pidgey', 9],
 ['Aerodactyl', 'Arbok', 'Bellsprout', 'Bulbasaur', 'Voltorb', 12],
 ['Aerodactyl', 'Arbok', 'Bellsprout', 'Bulbasaur', 'Voltorb', 27],
 ['Aerodactyl', 'Arbok', 'Bellsprout', 'Jigglypuff', 'Voltorb', 7],
 ['Aerodactyl', 'Arbok', 'Bellsprout', 'Jigglypuff', 'Voltorb', 28],
 ['Aerodactyl', 'Arbok', 'Bulbasaur', 'Jigglypuff', 'Voltorb', 34],
 ['Aerodactyl', 'Beedrill', 'Bellsprout', 

### A soma das colunas da tabela anterior mostra quais Pokémons são mais difíceis de distinguir.

In [136]:
# Total similarity of each Pokémon to all Pokémon (itself included), largest first.
df_compara.sum(axis = 1).sort_values(ascending = False)

Pidgey        7.729333
Squirtle      7.619577
Arcanine      7.375228
Meowth        7.339984
Beedrill      7.145508
Charmander    7.082016
Alakazam      6.071588
Jigglypuff    5.539187
Bellsprout    5.407731
Aerodactyl    5.136662
Voltorb       4.726126
Bulbasaur     4.390337
Arbok         3.692570
dtype: float64

## 3 - "Machine Learning" e Classificação:
### O método principal de aprendizado de máquina e classificação utilizado foi o "Random Forest Classifier"; também foram treinados os classificadores "KNearestNeighbors" e "Decision Tree" (a "Logistic Regression" foi testada, mas não é utilizada).

In [137]:
if FIT_MODELS:
    # Each estimator is created and trained in one expression; fit() returns
    # the estimator itself.

    # Random forest: 100 trees, using all available cores.
    randf = RandomForestClassifier(n_estimators = 100, random_state=0, n_jobs=-1).fit(X_train, y_train)

    # K-nearest neighbours with k = 5.
    neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

    # Single decision tree.
    tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

    # Logistic regression, not used.
    #logit = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
    #logit.fit(X_train, y_train);

    # Nearest Centroid method, not used.
    # from sklearn.neighbors.nearest_centroid import NearestCentroid
    # clf4 = NearestCentroid()
    # clf4.fit(X_train, y_train)

    # Support Vector Machine method, not used.
    # from sklearn import svm
    # clf5 = svm.SVC(gamma='scale')
    # clf5.fit(X_train, y_train)

### Abaixo armazenamos os modelos numa estrutura que nos será mais acessível.

In [138]:
# Fitted classifiers keyed by a human-readable name, so later cells can loop
# over all of them.
if FIT_MODELS:
    models = {'Random Forest': randf,
            'KNearestNeighbors': neigh,
            'Decision Tree': tree}
else:
    # Nothing was fitted: an empty dict makes every "for ... in models.items()"
    # loop a no-op, where a placeholder entry would fail on the first method call.
    models = {}

## 3.1 - Análise das classificações realizadas pelo modelo:
### A matriz de confusão abaixo mostra em mais detalhes os erros e acertos do classificador. É possível identificar que na maioria das vezes que o modelo falhou, ele identificou erroneamente o Pokémon como sendo uma "Jigglypuff" ou um "Arcanine".

#### Obs: A função *plot_confusion_matrix* abaixo não é de nossa autoria, e sua versão original pode ser encontrada no seguinte endereço: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

In [139]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=True,
                          title=None,
                          cmap=plt.cm.Blues):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    Adapted from the scikit-learn "plot_confusion_matrix" example.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        classes: class labels. They fix both the order of the matrix rows and
            columns and the tick labels, so the two always agree.
        normalize: when true, each row is divided by its total so the cells
            show the fraction of that true class; otherwise raw counts are shown.
        title: plot title. A default is chosen according to ``normalize``.
        cmap: matplotlib colormap used for the cells.

    Returns:
        The matplotlib axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Pass the labels explicitly: by default confusion_matrix orders rows by
    # the sorted labels present in y_true/y_pred, which need not match classes.
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero total; leave its row at 0
        # instead of producing NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots(figsize = (16,16))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Show one tick per class and label the ticks with the class names.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the x tick labels so long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each value in its cell, in white on dark cells and black on light ones.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

def multi_confusion_mtx(X_test, y_test, model_dict):
    """Plot and show one normalized confusion matrix per model.

    Args:
        X_test: feature matrix passed to each model's ``predict``.
        y_test: true labels for ``X_test``.
        model_dict: mapping from a display name to a fitted classifier.
    """
    for model_name in model_dict:
        fitted = model_dict[model_name]
        predictions = fitted.predict(X_test)
        plot_confusion_matrix(
            y_test,
            predictions,
            classes=fitted.classes_,
            title='Normalized confusion matrix for model %s' % model_name,
        )
        plt.show()

def precision_at_n(model, vocab = vocab, n = 3):
    """Count how often the true label is among the model's ``n`` best guesses.

    Every image in the module-level ``TEST_IMG`` is described with
    ``representa`` and scored with ``model.predict_proba``.

    Args:
        model: fitted classifier exposing ``predict_proba`` and ``classes_``.
        vocab: visual vocabulary used to describe the images.
        n: how many of the most probable classes count as a hit (default 3).

    Returns:
        Tuple ``(hits, miss, hits / (hits + miss))``. The ratio is 0.0 when
        there are no test images.
    """
    hits, miss = 0, 0
    for img, label in zip(TEST_IMG, TEST_LABEL):
        # Read as grayscale, the same way the training images were read.
        rep = representa(vocab, cv2.imread(img, cv2.IMREAD_GRAYSCALE))
        if rep is None:
            # No descriptor could be computed, so the model cannot guess.
            miss += 1
            continue
        # predict_proba columns follow model.classes_, so that is the only
        # index that pairs each probability with the right class name.
        top_n = pd.Series(model.predict_proba(rep)[0], index = model.classes_).nlargest(n)
        if label in top_n.index.tolist():
            hits += 1
        else:
            miss += 1

    total = hits + miss
    return hits, miss, (hits/total if total else 0.0)

def show_guess(path, model):
    """Rank every class by the probability ``model`` assigns to one image.

    Args:
        path: image file forwarded to ``show_example``.
        model: fitted classifier exposing ``predict_proba`` and ``classes_``.

    Returns:
        pandas Series indexed by class name, most probable class first.
    """
    histogram = show_example(path)
    probabilities = model.predict_proba(histogram)[0]
    ranking = pd.Series(probabilities, index = model.classes_)
    return ranking.sort_values(ascending = False)

# From here on, NumPy prints floating-point values with 2 digits of precision.
np.set_printoptions(precision=2)

In [140]:
# Plot the confusion matrices only when models were fitted and plots were requested.
if FIT_MODELS and PLOT_CONFUSION_MATRIXES:
    multi_confusion_mtx(X_test, y_test, model_dict=models)

In [141]:
# Print the precision_at_n result of every model, keyed by model name, when
# models were fitted and the metric was requested.
if FIT_MODELS and PRECISION_AT_N:
    pprint(dict((nome, precision_at_n(modelo)) for nome, modelo in models.items()))

In [142]:
# Print each model's score on the test set. Guarded like the other evaluation
# cells, because `models` holds fitted classifiers only when FIT_MODELS is set.
if FIT_MODELS:
    for model_name, model in models.items():
        print("%s score of: %.5f" % (model_name, model.score(X_test, y_test)))

Random Forest score of: 0.61993
KNearestNeighbors score of: 0.60424
Decision Tree score of: 0.47509


## Nada mal! Mas podemos fazer melhor?

## Stacking: Combinando os modelos em um(a) supermodelo.