# Projeto Final de Ciência dos Dados. ( PkmnID)

## Algoritmo de "machine learning", utilizando a ferramenta "RandomForest", para identificar o nome de cada pokemon a partir de suas imagens.

In [31]:
!pip install opencv-contrib-python
import cv2
import os
import os.path
import numpy as np
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from pprint import pprint

# Set random seed
np.random.seed(0)



In [32]:
TRAIN_DIR = 'Assets//Data_Train'
TEST_DIR = 'Assets//Data_Test'

NUM_CLUSTERS = 50

TRAIN_IMG = []
TEST_IMG = []
TRAIN_LABEL = []
TEST_LABEL = []

for train, test in zip(os.listdir(TRAIN_DIR), os.listdir(TEST_DIR)): #Tecnicamente são iguais, mas não custa garantir.
    for img_train, img_test in zip(os.listdir(os.path.join(TRAIN_DIR,train)), os.listdir(os.path.join(TEST_DIR,test))):
        TRAIN_IMG.append(os.path.join(TRAIN_DIR,train,img_train))
        TEST_IMG.append(os.path.join(TEST_DIR,test,img_test))
        TRAIN_LABEL.append(train)
        TEST_LABEL.append(test)


# def get_images_from_category(category, num_train, num_test, data_dir):
#     category_dir = os.path.join(DATA_DIR, category)
#     num_total = num_train + num_test
#     filenames_train = []
#     filenames_test = []
    
#     for k, filename in enumerate(os.listdir(category_dir)):
#         if k < num_train:
#             filenames_train.append(os.path.join(category_dir, filename))
#         elif k < num_total:
#             filenames_test.append(os.path.join(category_dir, filename))
#         else:
#             break
#     return filenames_train, filenames_test

# def get_images_from_category_list(category_list, num_train, num_test, data_dir):
#     filenames_train_all = []
#     target_train = []
#     filenames_test_all = []
#     target_test = []
#     for category in category_list:
#         filenames_train, filenames_test = get_images_from_category(category, num_train, num_test, data_dir)
#         filenames_train_all.extend(filenames_train)
#         target_train.extend([category] * NUM_IMAGES_TRAIN_PER_CATEGORY)
#         filenames_test_all.extend(filenames_test)
#         target_test.extend([category] * NUM_IMAGES_TEST_PER_CATEGORY)
#     return filenames_train_all, filenames_test_all, target_train, target_test

def cria_vocabulario(imagens, num_clusters):
    km = cv2.BOWKMeansTrainer(num_clusters)
    akaze = cv2.KAZE_create()
    for p in imagens:
        img = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
        mask = np.ones(img.shape)
        kp, desc = akaze.detectAndCompute(img, mask)
        km.add(desc)
    return km.cluster()

def representa(vocab, img):
    kaze = cv2.KAZE_create()
    kp = kaze.detect(img)
    bowdesc = cv2.BOWImgDescriptorExtractor(kaze, cv2.FlannBasedMatcher())
    bowdesc.setVocabulary(vocab)
    return bowdesc.compute(img, kp)

def transforma_imagens(imagens, vocab):
    X = []
    for p in imagens:
        img = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
        X.append(representa(vocab, img).flatten())
    return np.array(X)



vocab = cria_vocabulario(TRAIN_IMG, NUM_CLUSTERS)
X_train = transforma_imagens(TRAIN_IMG, vocab)
X_test = transforma_imagens(TEST_IMG, vocab)
y_train = TRAIN_LABEL
y_test = TEST_LABEL

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=-1, random_state=0, n_estimators = 100)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(X_train, y_train)
scr = clf.score(X_test, y_test)

In [33]:
def show_example(path = "Testes/Testes/9.png", Plot = True):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img_resized = cv2.resize(img, dsize=(120, 120))
    if Plot:
        plt.imshow(img_resized, cmap='gray', vmin=0, vmax=255)
    return representa(vocab, img_resized)

# clf.predict_proba(show_example()), clf.classes_

## Análise Exploratória:

## Para realizar a análise exploratória seguiremos alguns passos:

# Passo 1:
## Extrair histograma:
### O código abaixo Extrai histograma de frequências relativas de features de todas as imagens do Dataset escolhido. (Pokemons)

In [34]:
origin_dir = 'Assets//Data_Filtered_Resized'
Hist_Dict = {}
for pkmn in os.listdir(origin_dir):
    Hist_Dict[pkmn] = []
    current_dir = os.path.join(origin_dir,pkmn)
    for k, img in enumerate(os.listdir(current_dir)):
        Hist_Dict[pkmn].append(show_example(os.path.join(current_dir,img), Plot = False))
# print(Hist_Dict['Alakazam'])

# Passo 2:
## Criar um DataFrame para trabalhar melhor com o DataSet:
### Foi necessário criar um dataframe dedicado a cada pokemon com suas respectivas frequências relativas. Visando melhorar e facilitar a análise permitindo cruzar e manusear os dados com maior facilidade.

In [35]:
lista = []
lista_nomes = os.listdir('Assets/Data_Filtered_Resized')
for k in Hist_Dict:
    x = pd.Series(Hist_Dict[k]).mean()
    x = pd.Series(x[0])
    lista.append(x)
df_medias = pd.DataFrame(lista, index = lista_nomes)

## Tabela das frequências relativas médias de cada feature por pokémon:

In [36]:
df_medias.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
Aerodactyl,0.00537,0.008657,0.003677,0.017769,0.02545,0.029396,0.013957,0.033435,0.029857,0.017584,...,0.007684,0.011772,0.02316,0.004911,0.006304,0.067243,0.023329,0.072332,0.034939,0.024245
Alakazam,0.005327,0.008195,0.00527,0.023065,0.014583,0.048091,0.0119,0.024753,0.056123,0.010113,...,0.008084,0.009054,0.030292,0.004303,0.004289,0.028933,0.036862,0.040554,0.038372,0.013197
Arbok,0.018193,0.025495,0.004566,0.013746,0.053167,0.033231,0.014881,0.017274,0.0327,0.010215,...,0.008181,0.009814,0.023665,0.011251,0.01185,0.023429,0.028,0.030735,0.021659,0.03492
Arcanine,0.007832,0.00589,0.002739,0.020347,0.030738,0.049283,0.005373,0.028757,0.048093,0.012536,...,0.008088,0.026238,0.032142,0.004082,0.00377,0.010771,0.046559,0.011625,0.053147,0.037954
Beedrill,0.004064,0.003796,0.004452,0.023542,0.01348,0.051968,0.010511,0.050559,0.050289,0.011758,...,0.007799,0.016905,0.023721,0.00284,0.007724,0.031719,0.041168,0.037871,0.0513,0.01982


# Passo 3:
## Calculando o valor médio dos dados:
### Nesta etapa foi necessário tirar o valor médio dos dados. Para assim aproxima-los do ponto (0,0), origem do sistema. Dessa forma permitindo a distinção evidente entre cada um deles.

In [37]:
df_medias = df_medias - (1/NUM_CLUSTERS) 

In [38]:
df_medias.sum(axis=1)

Aerodactyl    2.235174e-08
Alakazam      2.793968e-08
Arbok         2.048910e-08
Arcanine      4.097819e-08
Beedrill     -4.284084e-08
Bellsprout   -4.097819e-08
Bulbasaur     7.171184e-08
Charmander    2.235174e-08
Jigglypuff    4.097819e-08
Meowth        2.793968e-08
Pidgey        7.450581e-08
Squirtle     -7.450581e-09
Voltorb       5.215406e-08
dtype: float32

In [39]:
normas = (df_medias*df_medias).sum(axis=1)
for m in normas.index:
    df_medias.loc[m] = df_medias.loc[m]/np.sqrt(normas[m])

# Passo 4:
## Facilitando a comparação entre os pokemons:
### Tabela que mostra o quanto os pokémons são semelhantes entre si, diagonal principal é 1, pois todo pokémon é idêntico a si mesmo.

In [40]:
df_compara = df_medias.dot(df_medias.transpose())
df_compara

Unnamed: 0,Aerodactyl,Alakazam,Arbok,Arcanine,Beedrill,Bellsprout,Bulbasaur,Charmander,Jigglypuff,Meowth,Pidgey,Squirtle,Voltorb
Aerodactyl,1.0,0.708597,0.180822,0.351251,0.665188,0.395471,-0.030521,0.56686,0.119861,0.455059,0.480101,0.296195,0.331802
Alakazam,0.708597,1.0,0.199108,0.76344,0.889255,0.675858,0.2118,0.58646,0.505548,0.837661,0.620609,0.632238,0.011999
Arbok,0.180822,0.199108,1.0,0.399188,0.206276,0.174641,0.637423,0.479259,0.193077,0.205959,0.26132,0.550131,0.721993
Arcanine,0.351251,0.76344,0.399188,1.0,0.817256,0.755229,0.555241,0.722745,0.753807,0.877178,0.828138,0.91815,0.069048
Beedrill,0.665188,0.889255,0.206276,0.817256,1.0,0.782279,0.245681,0.654988,0.578733,0.869836,0.779213,0.691146,0.126588
Bellsprout,0.395471,0.675858,0.174641,0.755229,0.782279,1.0,0.160962,0.58248,0.702093,0.805889,0.766434,0.688977,0.055557
Bulbasaur,-0.030521,0.2118,0.637423,0.555241,0.245681,0.160962,1.0,0.555434,0.479888,0.393532,0.393506,0.657868,0.376637
Charmander,0.56686,0.58646,0.479259,0.722745,0.654988,0.58248,0.555434,1.0,0.599258,0.657362,0.861551,0.837879,0.464217
Jigglypuff,0.119861,0.505548,0.193077,0.753807,0.578733,0.702093,0.479888,0.599258,1.0,0.76029,0.674901,0.733005,0.115235
Meowth,0.455059,0.837661,0.205959,0.877178,0.869836,0.805889,0.393532,0.657362,0.76029,1.0,0.771343,0.765124,0.015726


### Aqui, podemos observar que alguns pokémons tem semelhanças muito fortes entre si, pois apresentam as mesmas features em abundância (na média).

In [41]:
monstros = []
for feat in range(NUM_CLUSTERS):
    monstros.append(sorted(df_medias.nlargest(n=5, columns=[feat]).index) + [feat])
x = sorted(monstros)
pprint(x)

[['Aerodactyl', 'Alakazam', 'Beedrill', 'Bulbasaur', 'Charmander', 17],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Charmander', 'Pidgey', 16],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Charmander', 'Voltorb', 45],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Charmander', 'Voltorb', 47],
 ['Aerodactyl', 'Alakazam', 'Beedrill', 'Pidgey', 'Squirtle', 38],
 ['Aerodactyl', 'Alakazam', 'Charmander', 'Pidgey', 'Voltorb', 11],
 ['Aerodactyl', 'Alakazam', 'Meowth', 'Pidgey', 'Voltorb', 13],
 ['Aerodactyl', 'Arbok', 'Bellsprout', 'Jigglypuff', 'Voltorb', 1],
 ['Aerodactyl', 'Arbok', 'Bulbasaur', 'Jigglypuff', 'Voltorb', 28],
 ['Aerodactyl', 'Arbok', 'Bulbasaur', 'Meowth', 'Voltorb', 27],
 ['Aerodactyl', 'Arbok', 'Charmander', 'Squirtle', 'Voltorb', 6],
 ['Aerodactyl', 'Beedrill', 'Bellsprout', 'Meowth', 'Pidgey', 7],
 ['Aerodactyl', 'Bulbasaur', 'Charmander', 'Jigglypuff', 'Voltorb', 40],
 ['Aerodactyl', 'Bulbasaur', 'Charmander', 'Pidgey', 'Voltorb', 9],
 ['Alakazam', 'Arbok', 'Arcanine', 'Bulbasaur',

In [42]:
df_compara.sum(axis = 1).sort_values(ascending = False)

Squirtle      8.890807
Arcanine      8.810672
Charmander    8.568492
Pidgey        8.534653
Meowth        8.414959
Beedrill      8.306439
Alakazam      7.642574
Bellsprout    7.545871
Jigglypuff    7.215695
Bulbasaur     5.637450
Aerodactyl    5.520687
Arbok         5.209196
Voltorb       3.802022
dtype: float32

In [43]:
scr, 1/13

(0.4252767527675277, 0.07692307692307693)

In [86]:
hits, miss = 0, 0
for img, label in zip(TEST_IMG, TEST_LABEL):
    rep = representa(vocab, cv2.imread(img))
    top3 = pd.Series(clf.predict_proba(rep)[0], index = os.listdir('Assets/Data_Test')).nlargest(3)
    if label in top3.index.tolist():
        hits += 1
    else:
        miss += 1
        
hits, miss, hits/(hits+miss)

## Bibliografia:
- Modelo Bag of Visual Words, e parte da análise exploratória produzidos por/com assistência de Fábio Ayres.
- Dataset: [Pokémon Gen One](https://www.kaggle.com/thedagger/pokemon-generation-one/data) da plataforma Kaggle.com
