# Exemplo de "Bag of Visual Words"

Vocês estão recebendo este código do professor e devem dar o crédito devido, para que não se caracterize a situação de tentar passar esforço dos outros como sendo seu (a.k.a. plágio). Divirtam-se!

In [None]:
!pip install opencv-contrib-python
import cv2
import os
import os.path
import numpy as np
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator.
np.random.seed(0)


# Experiment configuration: vocabulary size, per-category split sizes,
# the categories to classify and the dataset root folder.
NUM_CLUSTERS = 25
NUM_IMAGES_TEST_PER_CATEGORY = 20
NUM_IMAGES_TRAIN_PER_CATEGORY = 60
CATEGORY_LIST = ['Alakazam', 'Aerodactyl']   # alternative: os.listdir(DATA_DIR)
DATA_DIR = 'Assets//Data_Filtered_Resized'

def get_images_from_category(category, num_train, num_test, data_dir):
    """Split the files of one category folder into train and test paths.

    Args:
        category: Name of the sub-folder of ``data_dir`` to read.
        num_train: Maximum number of files assigned to the training split.
        num_test: Maximum number of files assigned to the test split.
        data_dir: Root folder that contains one sub-folder per category.

    Returns:
        A ``(filenames_train, filenames_test)`` tuple of path lists. In
        sorted file-name order, the first ``num_train`` files go to training
        and the next ``num_test`` to testing; either list is shorter when the
        folder holds fewer files.
    """
    # Honour the data_dir argument; it used to be ignored in favour of the
    # module-level DATA_DIR.
    category_dir = os.path.join(data_dir, category)
    # os.listdir() returns entries in arbitrary order, so sort to make the
    # train/test split reproducible across runs and machines.
    paths = [os.path.join(category_dir, name)
             for name in sorted(os.listdir(category_dir))]
    filenames_train = paths[:num_train]
    filenames_test = paths[num_train:num_train + num_test]
    return filenames_train, filenames_test

def get_images_from_category_list(category_list, num_train, num_test, data_dir):
    """Collect train/test image paths and labels for several categories.

    Args:
        category_list: Category names (sub-folders of ``data_dir``).
        num_train: Number of training files requested per category.
        num_test: Number of test files requested per category.
        data_dir: Root folder that contains one sub-folder per category.

    Returns:
        A ``(filenames_train_all, filenames_test_all, target_train,
        target_test)`` tuple. Each target list holds one category name per
        path in the corresponding file list, in the same order.
    """
    filenames_train_all = []
    target_train = []
    filenames_test_all = []
    target_test = []
    for category in category_list:
        filenames_train, filenames_test = get_images_from_category(
            category, num_train, num_test, data_dir)
        filenames_train_all.extend(filenames_train)
        filenames_test_all.extend(filenames_test)
        # Emit one label per file actually returned. Using the global
        # NUM_IMAGES_* constants here misaligned labels and files whenever a
        # folder was short or the arguments differed from those constants.
        target_train.extend([category] * len(filenames_train))
        target_test.extend([category] * len(filenames_test))
    return filenames_train_all, filenames_test_all, target_train, target_test

def cria_vocabulario(imagens, num_clusters):
    """Build the visual vocabulary by clustering KAZE descriptors.

    Args:
        imagens: Iterable of image file paths.
        num_clusters: Cluster count passed to ``cv2.BOWKMeansTrainer``.

    Returns:
        The vocabulary returned by ``BOWKMeansTrainer.cluster()``.

    Raises:
        ValueError: If one of the paths cannot be read as an image.
    """
    km = cv2.BOWKMeansTrainer(num_clusters)
    kaze = cv2.KAZE_create()
    for p in imagens:
        img = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise ValueError("Could not read image: {}".format(p))
        # No mask is passed: the previous all-ones mask selected the whole
        # image anyway, and it was float64 while OpenCV masks must be 8-bit.
        kp, desc = kaze.detectAndCompute(img, None)
        if desc is None or len(desc) == 0:
            # Images without keypoints contribute nothing to the vocabulary.
            continue
        km.add(desc)
    return km.cluster()

def representa(vocab, img):
    """Compute the bag-of-visual-words descriptor of one image.

    Args:
        vocab: Vocabulary handed to ``BOWImgDescriptorExtractor.setVocabulary``
            (in this file, the output of ``cria_vocabulario``).
        img: Image on which KAZE keypoints are detected.

    Returns:
        The descriptor produced by ``BOWImgDescriptorExtractor.compute``. When
        no descriptor can be computed (e.g. no keypoints were detected), a
        1 x ``descriptorSize()`` float32 array of zeros is returned instead of
        ``None``, so callers can always call array methods on the result.
    """
    kaze = cv2.KAZE_create()
    bowdesc = cv2.BOWImgDescriptorExtractor(kaze, cv2.FlannBasedMatcher())
    bowdesc.setVocabulary(vocab)
    kp = kaze.detect(img)
    hist = bowdesc.compute(img, kp)
    if hist is None:
        # An image with no visual words is represented by an empty histogram.
        hist = np.zeros((1, bowdesc.descriptorSize()), dtype=np.float32)
    return hist

def transforma_imagens(imagens, vocab):
    """Encode every image path as a flattened bag-of-visual-words vector.

    Args:
        imagens: Iterable of image file paths, each read in grayscale.
        vocab: Vocabulary forwarded to ``representa``.

    Returns:
        A NumPy array built from one flattened ``representa`` result per
        path, in input order.
    """
    def _histograma(caminho):
        # Load as a single-channel image and encode it with the vocabulary.
        cinza = cv2.imread(caminho, cv2.IMREAD_GRAYSCALE)
        return representa(vocab, cinza).flatten()

    return np.array([_histograma(caminho) for caminho in imagens])

# Gather the train/test image paths and their labels for the chosen categories.
imagens_train, imagens_test, target_train, target_test = get_images_from_category_list(
    category_list=CATEGORY_LIST,
    num_train=NUM_IMAGES_TRAIN_PER_CATEGORY,
    num_test=NUM_IMAGES_TEST_PER_CATEGORY,
    data_dir=DATA_DIR,
)

# The vocabulary is learned from the training images only; both splits are
# then encoded with that same vocabulary.
vocab = cria_vocabulario(imagens_train, NUM_CLUSTERS)
X_train = transforma_imagens(imagens_train, vocab)
X_test = transforma_imagens(imagens_test, vocab)
y_train, y_test = target_train, target_test

# Random forest of 100 trees with a fixed seed; n_jobs=-1 is passed so the
# work is spread over all available processors.
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Fit on the training vectors and their category labels, then store the
# score obtained on the held-out test split in `scr`.
clf.fit(X_train, y_train)
scr = clf.score(X_test, y_test)

In [None]:
def show_example(path="Testes/Testes/9.png", Plot=True):
    """Load one image, resize it to 120x120 and return its BoVW descriptor.

    Args:
        path: Image file to read in grayscale.
        Plot: When true, draw the resized image with matplotlib.

    Returns:
        The result of ``representa`` for the resized image, computed with the
        module-level ``vocab``.
    """
    miniatura = cv2.resize(cv2.imread(path, cv2.IMREAD_GRAYSCALE), dsize=(120, 120))
    if Plot:
        # Fixed 0-255 range so the gray levels are not rescaled per image.
        plt.imshow(miniatura, cmap='gray', vmin=0, vmax=255)
    return representa(vocab, miniatura)


show_example()

## Análise Exploratória:

In [None]:
"""
Extrai histograma de frequências relativas de features de todas as imagens do Dataset.
"""

# Disabled code kept for reference. When enabled it builds Hist_Dict, mapping
# each sub-folder name of origin_dir to the list of
# show_example(<file path>, Plot = False) results for the files in that folder,
# and prints the entry for 'Alakazam'.
# NOTE(review): the following cell reads Hist_Dict, so it presumably requires
# this code to have been run at least once in the session; confirm.
# origin_dir = 'Assets//Data_Filtered_Resized'
# Hist_Dict = {}
# for pkmn in os.listdir(origin_dir):
#     Hist_Dict[pkmn] = []
#     current_dir = os.path.join(origin_dir,pkmn)
#     for k, img in enumerate(os.listdir(current_dir)):
#         Hist_Dict[pkmn].append(show_example(os.path.join(current_dir,img), Plot = False))
# print(Hist_Dict['Alakazam'])

In [None]:
# Build df_medias: one row per Hist_Dict key, holding the mean of that key's
# per-image histograms.
# Row labels are taken from Hist_Dict itself rather than from a second
# os.listdir() call, so every label is guaranteed to match the row computed
# for it even if the folder listing differs in order or content.
lista_nomes = list(Hist_Dict)
lista = []
for k in lista_nomes:
    # Mean over all histograms stored for this key.
    x = pd.Series(Hist_Dict[k]).mean()
    # Keep the first row of the averaged array (presumably each histogram is
    # a 1 x n array as returned by show_example; confirm).
    x = pd.Series(x[0])
    lista.append(x)
df_medias = pd.DataFrame(lista, index=lista_nomes)

In [None]:
# A notebook cell only renders the value of its last expression, so the
# caption is printed first and the table is left as the final expression.
# With the opposite order the table was never displayed.
print("Tabela das frequências relativas médias de cada feature por pokémon")
df_medias

In [None]:
# Subtract the average bin value (1 / number of features) so the data is
# centred near the origin, which makes the rows easier to tell apart.
# The feature count is read from the table instead of the literal 25, so this
# stays correct if the vocabulary size changes.
# NOTE(review): 1/n is the mean bin value only if each histogram row sums to 1; confirm.
df_medias = df_medias - (1 / df_medias.shape[1])

In [None]:
# Sum of each row across all feature columns (presumably a sanity check that
# the rows sum to roughly zero after centring; confirm).
df_medias.sum(axis="columns")

In [None]:
# Scale every row of df_medias to unit Euclidean length.
# `normas` holds the *squared* L2 norm of each row; the division is done for
# all rows at once, aligned on the row index, instead of a per-row .loc loop.
normas = (df_medias * df_medias).sum(axis=1)
df_medias = df_medias.div(np.sqrt(normas), axis=0)

In [None]:
# Caption first, table last: a notebook cell only renders the value of its
# final expression, so the similarity matrix must be the last statement.
print("Tabela que mostra o quanto os pokémons são semelhantes entre si, diagonal principal é 1, pois todo pokémon é idêntico a si mesmo.")
# Matrix of dot products between every pair of rows of df_medias.
df_medias.dot(df_medias.transpose())

In [None]:
# For each feature column, list the five rows with the largest value in that
# column (names sorted alphabetically) followed by the feature number.
# The number of features comes from the table itself rather than a literal 25,
# so this keeps working when the vocabulary size changes.
monstros = [
    sorted(df_medias.nlargest(n=5, columns=[feat]).index) + [feat]
    for feat in range(df_medias.shape[1])
]
# Sorting the lists puts identical groups of names next to each other.
x = sorted(monstros)
print(x, "Aqui, podemos observar que alguns pokémons tem semelhanças muito fortes entre si, pois apresentam as mesmas features em abundância (na média).")

## Bibliografia:
- Modelo Bag of Visual Words e parte da análise exploratória produzidos por, ou com a assistência de, Fábio Ayres.
- Dataset: [Pokémon Gen One](https://www.kaggle.com/thedagger/pokemon-generation-one/data) da plataforma Kaggle.com
