# Projeto Final de Ciência dos Dados. ( PkmnID)

## Algoritmo de "machine learning", utilizando a ferramenta "RandomForest", para identificar o nome de cada pokemon a partir de suas imagens.

In [1]:
!pip install opencv-contrib-python
import cv2
import os
import os.path
import numpy as np
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from pprint import pprint

# Set random seed
np.random.seed(0)



In [19]:
TRAIN_DIR = 'Assets//Data_Train'
TEST_DIR = 'Assets//Data_Test'

NUM_CLUSTERS = 40

TRAIN_IMG = []
TEST_IMG = []
TRAIN_LABEL = []
TEST_LABEL = []

for train, test in zip(os.listdir(TRAIN_DIR), os.listdir(TEST_DIR)): #Tecnicamente são iguais, mas não custa garantir.
    for img_train, img_test in zip(os.listdir(os.path.join(TRAIN_DIR,train)), os.listdir(os.path.join(TEST_DIR,test))):
        TRAIN_IMG.append(os.path.join(TRAIN_DIR,train,img_train))
        TEST_IMG.append(os.path.join(TEST_DIR,test,img_test))
        TRAIN_LABEL.append(train)
        TEST_LABEL.append(test)


# def get_images_from_category(category, num_train, num_test, data_dir):
#     category_dir = os.path.join(DATA_DIR, category)
#     num_total = num_train + num_test
#     filenames_train = []
#     filenames_test = []
    
#     for k, filename in enumerate(os.listdir(category_dir)):
#         if k < num_train:
#             filenames_train.append(os.path.join(category_dir, filename))
#         elif k < num_total:
#             filenames_test.append(os.path.join(category_dir, filename))
#         else:
#             break
#     return filenames_train, filenames_test

# def get_images_from_category_list(category_list, num_train, num_test, data_dir):
#     filenames_train_all = []
#     target_train = []
#     filenames_test_all = []
#     target_test = []
#     for category in category_list:
#         filenames_train, filenames_test = get_images_from_category(category, num_train, num_test, data_dir)
#         filenames_train_all.extend(filenames_train)
#         target_train.extend([category] * NUM_IMAGES_TRAIN_PER_CATEGORY)
#         filenames_test_all.extend(filenames_test)
#         target_test.extend([category] * NUM_IMAGES_TEST_PER_CATEGORY)
#     return filenames_train_all, filenames_test_all, target_train, target_test

def cria_vocabulario(imagens, num_clusters):
    km = cv2.BOWKMeansTrainer(num_clusters)
    akaze = cv2.KAZE_create()
    for p in imagens:
        img = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
        mask = np.ones(img.shape)
        kp, desc = akaze.detectAndCompute(img, mask)
        km.add(desc)
    return km.cluster()

def representa(vocab, img):
    kaze = cv2.KAZE_create()
    kp = kaze.detect(img)
    bowdesc = cv2.BOWImgDescriptorExtractor(kaze, cv2.FlannBasedMatcher())
    bowdesc.setVocabulary(vocab)
    return bowdesc.compute(img, kp)

def transforma_imagens(imagens, vocab):
    X = []
    for p in imagens:
        img = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
        X.append(representa(vocab, img).flatten())
    return np.array(X)



vocab = cria_vocabulario(TRAIN_IMG, NUM_CLUSTERS)
X_train = transforma_imagens(TRAIN_IMG, vocab)
X_test = transforma_imagens(TEST_IMG, vocab)
y_train = TRAIN_LABEL
y_test = TEST_LABEL

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=-1, random_state=0, n_estimators = 100)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(X_train, y_train)
scr = clf.score(X_test, y_test)

In [20]:
def show_example(path = "Testes/Testes/9.png", Plot = True):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img_resized = cv2.resize(img, dsize=(120, 120))
    if Plot:
        plt.imshow(img_resized, cmap='gray', vmin=0, vmax=255)
    return representa(vocab, img_resized)

# clf.predict_proba(show_example()), clf.classes_

## Análise Exploratória:

## Para realizar a análise exploratória seguiremos alguns passos:

# Passo 1:
## Extrair histograma:
### O código abaixo Extrai histograma de frequências relativas de features de todas as imagens do Dataset escolhido. (Pokemons)

In [21]:
origin_dir = 'Assets//Data_Filtered_Resized'
Hist_Dict = {}
for pkmn in os.listdir(origin_dir):
    Hist_Dict[pkmn] = []
    current_dir = os.path.join(origin_dir,pkmn)
    for k, img in enumerate(os.listdir(current_dir)):
        Hist_Dict[pkmn].append(show_example(os.path.join(current_dir,img), Plot = False))
# print(Hist_Dict['Alakazam'])

# Passo 2:
## Criar um DataFrame para trabalhar melhor com o DataSet:
### Foi necessário criar um dataframe dedicado a cada pokemon com suas respectivas frequências relativas. Visando melhorar e facilitar a análise permitindo cruzar e manusear os dados com maior facilidade.

In [22]:
lista = []
lista_nomes = os.listdir('Assets/Data_Filtered_Resized')
for k in Hist_Dict:
    x = pd.Series(Hist_Dict[k]).mean()
    x = pd.Series(x[0])
    lista.append(x)
df_medias = pd.DataFrame(lista, index = lista_nomes)

## Tabela das frequências relativas médias de cada feature por pokémon:

In [23]:
df_medias.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
Aerodactyl,0.038123,0.024636,0.036961,0.020319,0.051775,0.008856,0.042713,0.027365,0.04466,0.030395,...,0.045744,0.013153,0.003213,0.00851,0.049913,0.021774,0.022891,0.003673,0.0109,0.007772
Alakazam,0.020676,0.03714,0.036753,0.038121,0.073055,0.008344,0.031288,0.037998,0.030992,0.037137,...,0.026023,0.010541,0.007122,0.006746,0.029654,0.011067,0.029014,0.004637,0.011596,0.011869
Arbok,0.012339,0.03454,0.013313,0.023675,0.074651,0.012806,0.027611,0.02732,0.014038,0.024934,...,0.013385,0.013042,0.011538,0.018673,0.031744,0.106301,0.030819,0.013263,0.024969,0.014263
Arcanine,0.010291,0.040168,0.021365,0.051638,0.075762,0.005306,0.043999,0.029923,0.023513,0.05214,...,0.017794,0.004969,0.012442,0.007626,0.018754,0.027901,0.035601,0.004488,0.006584,0.005143
Beedrill,0.026458,0.030862,0.027536,0.036588,0.07274,0.005034,0.067834,0.027491,0.021661,0.043528,...,0.021519,0.008826,0.007077,0.006141,0.037172,0.013624,0.02218,0.004155,0.0062,0.010009


# Passo 3:
## Calculando o valor médio dos dados:
### Nesta etapa foi necessário tirar o valor médio dos dados. Para assim aproxima-los do ponto (0,0), origem do sistema. Dessa forma permitindo a distinção evidente entre cada um deles.

In [24]:
df_medias = df_medias - (1/NUM_CLUSTERS) 

In [25]:
df_medias.sum(axis=1)

Aerodactyl    1.490116e-08
Alakazam      1.676381e-08
Arbok        -7.450581e-09
Arcanine      3.352761e-08
Beedrill     -1.229346e-07
Bellsprout    1.117587e-08
Bulbasaur    -3.352761e-08
Charmander   -2.980232e-08
Jigglypuff    1.303852e-08
Meowth       -3.539026e-08
Pidgey        3.818423e-08
Squirtle      0.000000e+00
Voltorb       1.862645e-08
dtype: float32

In [26]:
normas = (df_medias*df_medias).sum(axis=1)
for m in normas.index:
    df_medias.loc[m] = df_medias.loc[m]/np.sqrt(normas[m])

# Passo 4:
## Facilitando a comparação entre os pokemons:
### Tabela que mostra o quanto os pokémons são semelhantes entre si, diagonal principal é 1, pois todo pokémon é idêntico a si mesmo.

In [27]:
df_compara = df_medias.dot(df_medias.transpose())
df_compara

Unnamed: 0,Aerodactyl,Alakazam,Arbok,Arcanine,Beedrill,Bellsprout,Bulbasaur,Charmander,Jigglypuff,Meowth,Pidgey,Squirtle,Voltorb
Aerodactyl,1.0,0.742209,0.258303,0.453628,0.695164,0.406922,0.136273,0.545938,0.185705,0.528394,0.52546,0.363037,0.286706
Alakazam,0.742209,1.0,0.281,0.805565,0.853678,0.647337,0.267871,0.516498,0.530589,0.834444,0.606345,0.625544,-0.009938
Arbok,0.258303,0.281,1.0,0.453129,0.256119,0.211377,0.682441,0.51186,0.260122,0.215157,0.30991,0.587973,0.672228
Arcanine,0.453628,0.805565,0.453129,1.0,0.83553,0.784662,0.565247,0.702748,0.792591,0.881118,0.831446,0.895348,0.091817
Beedrill,0.695164,0.853678,0.256119,0.83553,1.0,0.73877,0.302266,0.61985,0.635881,0.866233,0.78357,0.677962,0.066369
Bellsprout,0.406922,0.647337,0.211377,0.784662,0.73877,1.0,0.188626,0.578429,0.713564,0.788469,0.75273,0.709532,0.028198
Bulbasaur,0.136273,0.267871,0.682441,0.565247,0.302266,0.188626,1.0,0.682232,0.526602,0.407372,0.509595,0.703985,0.512802
Charmander,0.545938,0.516498,0.51186,0.702748,0.61985,0.578429,0.682232,1.0,0.639947,0.665236,0.880795,0.853396,0.521199
Jigglypuff,0.185705,0.530589,0.260122,0.792591,0.635881,0.713564,0.526602,0.639947,1.0,0.772413,0.716891,0.742375,0.153584
Meowth,0.528394,0.834444,0.215157,0.881118,0.866233,0.788469,0.407372,0.665236,0.772413,1.0,0.801786,0.786093,-0.004914


### Aqui, podemos observar que alguns pokémons tem semelhanças muito fortes entre si, pois apresentam as mesmas features em abundância (na média).

In [28]:
monstros = []
for feat in range(NUM_CLUSTERS):
    monstros.append(sorted(df_medias.nlargest(n=5, columns=[feat]).index) + [feat])
x = sorted(monstros)
pprint(x)

[['Aerodactyl', 'Alakazam', 'Arbok', 'Charmander', 'Voltorb', 22],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Bulbasaur', 'Meowth', 2],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Charmander', 'Voltorb', 0],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Charmander', 'Voltorb', 24],
 ['Aerodactyl', 'Alakazam', 'Charmander', 'Meowth', 'Pidgey', 8],
 ['Aerodactyl', 'Alakazam', 'Charmander', 'Pidgey', 'Voltorb', 30],
 ['Aerodactyl', 'Arbok', 'Bulbasaur', 'Jigglypuff', 'Voltorb', 5],
 ['Aerodactyl', 'Arbok', 'Bulbasaur', 'Jigglypuff', 'Voltorb', 25],
 ['Aerodactyl', 'Arbok', 'Charmander', 'Squirtle', 'Voltorb', 31],
 ['Aerodactyl', 'Beedrill', 'Charmander', 'Pidgey', 'Voltorb', 34],
 ['Alakazam', 'Arbok', 'Arcanine', 'Beedrill', 'Meowth', 4],
 ['Alakazam', 'Arbok', 'Arcanine', 'Bulbasaur', 'Squirtle', 36],
 ['Alakazam', 'Arbok', 'Bellsprout', 'Bulbasaur', 'Voltorb', 39],
 ['Alakazam', 'Arbok', 'Bellsprout', 'Jigglypuff', 'Voltorb', 17],
 ['Alakazam', 'Arbok', 'Bellsprout', 'Jigglypuff', 'Voltorb', 38

In [29]:
df_compara.sum(axis = 1).sort_values(ascending = False)

Squirtle      9.113712
Arcanine      9.092827
Pidgey        8.859223
Charmander    8.718126
Meowth        8.541802
Beedrill      8.331392
Alakazam      7.701142
Jigglypuff    7.670264
Bellsprout    7.548616
Bulbasaur     6.485313
Aerodactyl    6.127738
Arbok         5.699617
Voltorb       3.909120
dtype: float32

In [30]:
scr, 1/13

(0.41051660516605165, 0.07692307692307693)

## Bibliografia:
- Modelo Bag of Visual Words, e parte da análise exploratória produzidos por/com assistência de Fábio Ayres.
- Dataset: [Pokémon Gen One](https://www.kaggle.com/thedagger/pokemon-generation-one/data) da plataforma Kaggle.com
