In [1]:
#import das libs necessárias
import pandas as pd # trabalhar com dataframes
import numpy as np # realizacao de algumas operacoes com matrizes

#imagens
import cv2 # transformacoes faceis em imagens
from PIL import Image # trabalhar com imagens

# ferramentais
import glob # exploracao de diretorios
from pylab import *

# plot 
import matplotlib.pyplot as plt # plotagem
%matplotlib inline

# Machine Learning
from sklearn import feature_selection

In [2]:
# itera por cada imagem  adicionando a classe de acordo com o nome da img
imagePaths = glob.glob(r"../data/sample/*")
IMG_SIZE = 400
X = []
Y = []
for img in imagePaths:
    img_data = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
    img_data = cv2.resize(img_data, (IMG_SIZE, IMG_SIZE))
    X.append([np.array(img_data)])
    Y.append(0 if "cat" in img else 1)

In [3]:
# prepara dataframe com img e classe
X_train = np.array([i[0] for i in X]).reshape(-1, IMG_SIZE, IMG_SIZE)
X_train = np.array([i.flatten() for i in X_train])

# Prepara Dataframe
df = pd.DataFrame(X_train)
df["class"] = Y
df.shape

(200, 160001)

In [4]:
#visualização do dataframe ao final do processo de entrada de dados
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159991,159992,159993,159994,159995,159996,159997,159998,159999,class
0,194,161,149,151,153,145,147,147,145,146,...,115,114,115,116,115,113,113,117,122,0
1,30,27,20,13,11,13,19,28,38,47,...,170,171,171,170,167,162,156,151,145,0
2,239,233,222,237,246,249,252,249,235,217,...,214,215,213,212,207,207,211,215,215,0
3,77,76,74,71,71,62,58,61,70,47,...,176,181,184,184,182,183,183,184,185,0
4,74,75,77,79,82,84,87,89,91,92,...,99,98,97,96,95,94,93,92,92,0
5,19,18,18,18,17,17,18,18,18,19,...,124,128,130,129,126,121,112,102,92,0
6,7,7,7,7,7,7,7,7,8,8,...,41,45,52,53,38,23,22,28,25,0
7,156,154,152,152,154,156,157,152,155,150,...,141,144,151,146,142,141,144,146,147,0
8,25,25,27,28,28,27,26,27,28,29,...,42,66,94,90,64,65,84,97,106,0
9,34,39,41,35,34,42,40,39,39,34,...,55,52,54,57,54,51,53,55,56,0


### Definição da função geradora dos K-folds

- Dados um pandas.DataFrame dataset, uma string class_name e um int k, retorna o dataset divido em k partes estratificadas pelos valores da coluna class_name.

#### Observação: 
- Vale lembrar que a priorizamos manter as mesmas distribuições entre as classes, por isso, entre os folds, as classes terão no máximo 1 instância a mais ou a menos que a outra.

In [5]:
#função que retorna o dataset divido em k partes estratificadas pelos valores da coluna class_name
def get_k_fold(dataset, class_name, k):
    #reconhece as classes que serão estratificadas
    classes = dataset[class_name].drop_duplicates()
    
    #cria uma lista com os dataframes que serao preenchidos (folds)
    folds=[pd.DataFrame() for i in range(k)]
    
    #itera sobre as classes para garantir que todas tenham a mesma distribuição do dataset original
    for cls in classes:
        #filtra o dataset para o valor de classe atual
        cls_data = dataset[dataset[class_name]==cls]
        
        #calcula o valor de n (numero de instâncias da classe em cada fold)
        n = int(len(cls_data)/k)
        
        #itera sobre os folds, adicionando uma quantidade n de instancias em cada fold
        for i in range(1,k+1):
            #adiciona as primeiras n instancias
            folds[i-1] = folds[i-1].append(cls_data[max(0, (i-1)*n) : n*i-1])
            #adiciona mais 1 instancia para os casos onde sobram instancias sem fold
            folds[i-1] = folds[i-1].append(cls_data[(k*n)+i:(k*n)+i+1])
            
    return folds

In [6]:
#chama a funcao para dividir o dataset em df em 13 folds estratificados pelos valores da coluna "class"
folds = get_k_fold(df, "class", 13)

In [7]:
#visualizacao do primeiro fold obtido
folds[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159991,159992,159993,159994,159995,159996,159997,159998,159999,class
0,194,161,149,151,153,145,147,147,145,146,...,115,114,115,116,115,113,113,117,122,0
1,30,27,20,13,11,13,19,28,38,47,...,170,171,171,170,167,162,156,151,145,0
2,239,233,222,237,246,249,252,249,235,217,...,214,215,213,212,207,207,211,215,215,0
3,77,76,74,71,71,62,58,61,70,47,...,176,181,184,184,182,183,183,184,185,0
4,74,75,77,79,82,84,87,89,91,92,...,99,98,97,96,95,94,93,92,92,0
5,19,18,18,18,17,17,18,18,18,19,...,124,128,130,129,126,121,112,102,92,0
100,42,42,42,43,44,44,44,44,43,43,...,55,46,43,44,48,42,36,33,32,0
91,115,118,116,110,105,107,112,118,128,142,...,71,85,99,99,87,73,69,71,71,1
92,228,176,136,124,131,135,139,141,136,132,...,248,246,248,250,250,249,247,247,247,1
93,159,163,173,181,183,183,168,152,165,177,...,199,200,197,195,195,196,196,196,196,1


In [8]:
#visualizacao do segundo fold obtido
folds[1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159991,159992,159993,159994,159995,159996,159997,159998,159999,class
7,156,154,152,152,154,156,157,152,155,150,...,141,144,151,146,142,141,144,146,147,0
8,25,25,27,28,28,27,26,27,28,29,...,42,66,94,90,64,65,84,97,106,0
9,34,39,41,35,34,42,40,39,39,34,...,55,52,54,57,54,51,53,55,56,0
10,18,21,25,28,28,25,25,27,32,35,...,60,64,66,64,58,56,57,61,66,0
11,90,86,80,73,64,56,54,50,41,34,...,62,57,44,39,31,39,44,48,51,0
12,152,148,145,150,158,165,167,168,164,157,...,202,197,193,190,190,197,201,198,194,0
101,57,58,50,41,46,28,36,49,59,80,...,38,24,24,35,45,137,185,203,207,0
98,21,21,21,20,21,23,26,30,34,37,...,125,134,136,132,127,136,135,132,130,1
108,92,101,90,75,83,91,80,60,68,86,...,248,248,248,248,248,248,247,247,247,1
109,195,225,247,251,242,240,230,177,110,90,...,69,83,100,111,115,118,114,96,74,1


In [9]:
#visualizacao do último fold
folds[-1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159991,159992,159993,159994,159995,159996,159997,159998,159999,class
84,133,133,133,133,133,133,133,133,133,133,...,113,117,120,123,122,121,120,119,118,0
85,56,51,48,50,57,65,71,70,67,58,...,211,216,212,206,197,191,193,202,209,0
86,114,115,118,120,119,113,107,107,111,118,...,38,44,37,23,24,42,48,40,24,0
87,127,126,129,105,101,135,135,134,135,138,...,98,80,83,90,96,105,117,117,117,0
88,96,100,103,103,103,105,107,107,107,106,...,111,115,118,117,113,111,110,109,108,0
89,9,8,7,7,6,6,6,7,7,7,...,22,28,32,31,33,39,43,41,36,0
184,14,15,16,17,18,19,18,17,16,13,...,9,9,9,11,12,14,15,16,17,1
185,149,149,149,148,147,146,146,145,145,145,...,65,64,64,64,64,64,65,65,65,1
186,11,11,12,14,15,16,18,23,35,47,...,36,31,32,35,38,37,34,32,32,1
187,54,56,58,60,61,61,60,57,56,62,...,71,72,72,72,72,72,73,74,74,1
