In [33]:
import os
import itertools
from PIL import Image,ImageEnhance,ImageFilter
import pandas as pd
import numpy as np 
import multiprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,KFold,cross_val_predict,train_test_split,GridSearchCV
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.preprocessing import normalize
from itertools import groupby
from random import randint
from collections import Counter

In [4]:
#Variables
%matplotlib inline
dataDir = "./data/out/"

## Construcción de DataFrame y Tratamiento de Imagenes

Nos aprovechamos de que las imágenes tienen la letra en el nombre del archivo, para cada imagen realzamos el contraste y la ponemos en blanco y negro (en la memoria estaría guay poner algo tipo, antes del tratamiento y después). Las imágenes originalmente tienen unas dimensiones de 160x120, nosotros para el DataFrame las expandimos, y tenemos un DataFrame final de 19801 columnas (19800 de cada pixel y 1 del etiquetado de la letra a la que corresponde esa imagen)

In [109]:
files = os.listdir(dataDir)
l1 = []
cols = []
height = None
width = None
for file in files:
    with open(dataDir+file,"rb") as f:
        im = Image.open(dataDir+file)
        enhancer = ImageEnhance.Contrast(im)
        im = enhancer.enhance(4.)
        #im = im.filter(ImageFilter.GaussianBlur(radius=4))
        label = file.split("_")[1].split(".")[0]
        imArray = np.array(im)
        imArray[imArray >= 128] = 255
        imArray[imArray < 128] = 0
        height,width = imArray.shape
        l1.append([bit for bit in imArray.reshape(imArray.shape[0]*imArray.shape[1])]+[label]) #Todos los bits de la imagen
        
df = pd.DataFrame(l1)
for i in range(height):
    for j in range(width):
        cols.append("%dx%d"%(i,j))
cols.append("Label")
df.columns = cols        

Esta función permite ver las imagenes del DataFrame con el que trabajamos, solo hay que pasarle el DataFrame, la fila y la forma de la imagen original

In [102]:
def showImage(df,row,shape):
    data = np.asarray(df.iloc[row,:-1],dtype=np.uint8).reshape(shape)
    img = Image.fromarray(data)
    img.show()
showImage(df,346,(height,width))    

Como nos piden probar con distintos clasificadores y distintas configuraciones,  la función `fitHyperparameter` encuentra los parámetros óptimos en un rango mediante GridSearch y validación cruzada para unos datos dado un clasificador y unos datos.
La función `testClassifiers` prueba distintos clasificadores que vienen dados por un diccionario

`clfs = {
        "NombreClasificador":{"clf": ClaseClasificador(),"confs":{"parametro":posibles valores,"parametro":.....}
        ...
       }  `   

In [111]:

def fitHyperparameter(clf,conf,X,y,splits,njobs=multiprocessing.cpu_count()):
    opt = GridSearchCV(clf,conf,cv=splits,n_jobs=njobs,iid=False)
    search = opt.fit(X, y)
    clf.set_params(**search.best_params_)
    return clf


def testClassifiers(clfs,data,testSize,splits):
    results = {}
    X,y = data.iloc[:,:-1],data.iloc[:,-1]
    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=testSize)
    for c in clfs:
        clf,conf = clfs[c]["clf"],clfs[c]["confs"]
        clf = fitHyperparameter(clf,conf,xTrain,yTrain,splits)
        clf.fit(xTrain,yTrain)
        results[c] = {"conf":clf.get_params(),"acc":accuracy_score(yTest, clf.predict(xTest), normalize=True, sample_weight=None)}
    return results


clfs = {
        "MLP":{"clf": MLPClassifier(max_iter=200,hidden_layer_sizes=(20,20), random_state=1),"confs":{"solver":("lbfgs","adam","sgd")}},
        "RandomForest":{"clf":RandomForestClassifier(n_jobs=8,),"confs":{"n_estimators":range(10,200,10)}},
        "KNN":{"clf":KNeighborsClassifier(n_jobs=8),"confs":{"n_neighbors":range(1,10),"weights":("uniform","distance")}},
        "SVM":{"clf":svm.SVC(C=0.25),"confs":{"kernel":("rbf","linear","sigmoid")}},
        "Logistic Regression":{"clf": LogisticRegression(tol=np.inf,multi_class="multinomial"),"confs":{"penalty":("none","l2"),"solver":("lbfgs","sag","saga"),"max_iter":range(10,100,15)}},
        "SGD":{"clf":SGDClassifier(eta0=0.5,learning_rate='constant',tol=np.inf),"confs":{"eta0":np.linspace(0.1,2,10),"max_iter":range(10,50,10)}}
       }      
print(testClassifiers(clfs,df,0.1,5))


{'MLP': {'conf': {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (20, 20), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 1, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}, 'acc': 0.09090909090909091}, 'RandomForest': {'conf': {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 140, 'n_jobs': 8, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}, 'acc': 0.8}, 'KNN': {'conf': {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'mink

## Nuevo Clasificador

Esto en la memoria lo explico yo, no worries, basicamente detecta patrones y hace similitudes con ellos, tiene un 0.98 de acierto por validación cruzada

In [51]:
def getImagePatterns(img,storage,patternSensibility=3):
        patternQueue = []
        for row in img:
            pattern = getPattern(row)
            if len(patternQueue) == 0:
                patternQueue.append(pattern)
            elif patternQueue[-1] == pattern:
                patternQueue.append(pattern)
                if len(patternQueue) == patternSensibility:
                    patternQueue = []
                    if len(storage) == 0:
                        storage.append((pattern,1))
                    elif pattern != storage[-1][0]:
                        storage.append((pattern,1))
                    else:
                        storage[-1] = (storage[-1][0],storage[-1][1] + 1)
            else:
                patternQueue = [pattern]

def getSimilarityScore(imagePatterns1,imagePatterns2):
        count = 0

        if len(imagePatterns1) < len(imagePatterns2):
            minList = imagePatterns1
            maxList = imagePatterns2
        else:
            minList = imagePatterns2
            maxList = imagePatterns1

        for i,x in enumerate(minList):
            if x[0] == maxList[i][0]:
                count += 1/(1+abs(x[1] - maxList[i][1]))
            else:
                continue 

        return count/len(maxList)

def getPattern(row,patternTrigger=5):
        pattern = []
        aux = []
        counter = 0
        color = -1
        i = 0
        while i < len(row):
            if color == row[i]:
                counter += 1
                if counter >= patternTrigger:
                    pattern.append(color)
                    counter = 0
            else:
                counter = 1
                color = row[i]
            i += 1

        return [cl for cl,_ in flatList(pattern)]
        

def flatList(l):
        flat=[]
        actual = l[0]
        count = 0
        i = 0
        while i < len(l):
            if i+1 == len(l):
                flat.append((l[i],count+1))
            elif l[i+1] == l[i]:
                count +=1
            else:
                flat.append((l[i],count+1))
                count = 0

            i += 1
        return flat

def cont(data):
    originalShape = data.shape
    HCont,VCont = [],[]
    for row in data:
        row = list(row)
        
        if 0 in row:
            first = row.index(0)
            row.reverse()
            last = (- row.index(0) -1) % len(row)
            row.reverse()
            HCont.append([0 if i >= first and i <=  last else 255 for i in range(len(row))])
        else:
            HCont.append(row)
    for row in np.transpose(data):
        row = list(row)
            first = row.index(0)
            row.reverse()
            last = (- row.index(0) -1) % len(row)
            row.reverse()
            VCont.append([0 if i >= first and i <=  last else 255 for i in range(len(row))])
        else:
            VCont.append(row)
    
    HCont = np.array(HCont,dtype=np.uint8).reshape(originalShape)
    VCont = np.transpose(np.array(VCont,dtype=np.uint8))
    
    ret = HCont + VCont
    ret[ret > 250] = 255
    ret[ret < 250] = 0
    return ret
    

In [47]:
class block():
    Hpatterns = []
    Vpatterns = []
    width = None
    blackOverWhite = 0.
    
    def __init__(self,width):
        self.Hpatterns = []
        self.Vpatterns = []
        self.blackOverWhite = 0.
        self.width = width
    
    
    def match(self,image):
        HScore,Vscore = 0,0
        Hpatterns = []
        getImagePatterns(image,Hpatterns)
        Hscore = getSimilarityScore(self.Hpatterns,Hpatterns)
        
        Vpatterns = []
        getImagePatterns(np.transpose(image),Vpatterns)
        Vscore = getSimilarityScore(self.Vpatterns,Vpatterns)
        
        return Hscore*Vscore
        
        
    
    def generatePatterns(self,image):
        vals,counts = np.unique(image,return_counts=True)
        self.blackOverWhite = counts[np.where(vals == 0)]/sum(counts)
        self.width = image.shape[1]
        getImagePatterns(image,self.Hpatterns)
        getImagePatterns(np.transpose(image),self.Vpatterns)
    

                        
    def __eq__(self,other):
        if other.Hpatterns == self.Hpatterns and other.Vpatterns == self.Vpatterns:
            return True
        else:
            return False
        

            
class PatternClassifier():
    patterns = {}
    classes = None
    
    def __init__(self,classes,height,width):
        self.classes = classes
        self.patterns = {cl:{"ink":.0,"blocks":[]} for cl in classes}    
    
    def fit(self,X,y):
        for cl in self.classes:
            bOW  = []
            imgs = np.array(X.iloc[np.where(y == cl)])
            for img in imgs:
                img = img.reshape((height,width))
                img = cont(img) 
                vals,counts = np.unique(img,return_counts=True)
                bOW.append(counts[np.where(vals == 0)]/sum(counts))
                b = block(width)
                b.generatePatterns(img)
                if b not in self.patterns[cl]["blocks"]:
                    self.patterns[cl]["blocks"].append(b)
            self.patterns[cl]["ink"] = np.mean(np.array(bOW))
    
    def predict(self,X):
        y = []
        for i,x in enumerate(np.array(X)):
            matches = []
            maxMatches = []
            #print(i,"/",height)
            img = x.reshape((height,width))
            img = cont(img)

            for cl in self.classes:
                for b in self.patterns[cl]["blocks"]:
                    matches.append((b.match(img),cl))
            
            sortedMatches = sorted(matches,reverse=True)
            
            parts = []
            for cl in self.classes:
                parts.append((np.mean([fit for fit,c in sortedMatches if c == cl][:3]),cl))
            y.append(sorted(parts,reverse=True)[0][1])

                
        return y


            
            
        
        
        


        

In [27]:
acc = []
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
kf = KFold(n_splits=10, shuffle=True)
pc = PatternClassifier(np.unique(y),height,width)
for train_index, test_index in kf.split(X):
    xTrain = X.iloc[train_index]
    yTrain = y.iloc[train_index]
    xTest = X.iloc[test_index]
    yTest = y.iloc[test_index]
    pc.fit(xTrain, yTrain)
    sc = accuracy_score(yTest, pc.predict(xTest), normalize=True, sample_weight=None)
    acc.append(sc)
print("Acc Cross validated:",np.mean(acc))

Acc Cross validated: 0.9800000000000001
