In [1]:
# Librairies
import pytesseract
import fitz
import io
from PIL import Image
import cv2 as cv
import numpy as np
from mtcnn import MTCNN
import re
import os
from tqdm import tqdm
from pathlib import Path
import matplotlib.image as mpimg 
import pandas as pd
import pickle
import joblib

## Data acquisition <br>
<p> 
    Loop in the directory containing legal files and do the following elements <br>
    <ul>
        <li> Extract pictures from pdf files </li>
        <li> Extract account number in corresponding page file </li>
        <li> Associate account to pictures and save them in the folder </li>
    </ul>
</p>

In [19]:
# Get matricule from file
detector = MTCNN()
def get_matricule(image):
    # Convert pdf file into string
    
    found = None
    
    string = pytesseract.image_to_string(image).strip()

    match_search = re.search('0\d{10}', string)
    if match_search:
        found = match_search.group(0)
        
    return found

#faces detector extraction funcfion
def get_face_detector(image):
    faces = detector.detect_faces(image)
    
    return faces



In [30]:
directory = 'datas'

def func_cv_cascade(image):
    
    faceCascade = cv.CascadeClassifier("face_detector.xml")
    
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
        
    faces = faceCascade.detectMultiScale2(gray,1.1,4, minSize=(145, 145))
        
    return faces

#Loop through the files
for subdir, dirs, files in os.walk(directory):
    
    for file in files:
        
        path = os.path.join(subdir, file)
        
        # File reading
        try:
            file = fitz.open(path)
            
            print(path)
        except Exception as e:
            print("Error occured: {}".format(e))
            break
        
        # Process
        
        # Read images inside pdf
        images = []
        for page_index in range(len(file)):

            #get page and images list
            page = file[page_index]
            images_list = page.getImageList()

            images_number = len(images_list)

            #print("Number of images: {}".format(images_number))

            if images_number != 0:

                for image_index, img in enumerate(images_list, start=1):

                    xref = img[0]
                    
                    # extract image bytes
                    base_image = file.extractImage(xref)
                    image_bytes = base_image["image"]
                    # get image extension
                    image_ext = base_image["ext"]
                    
                    # Load PIL                    
                    pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
                    
                    image = np.array(pil_image)
                    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
                    
#                     cv.imshow("im", image)
#                     cv.waitKey(0)

                    faces = func_cv_cascade(image)
                    #print(faces)
                                        
                    # Faces have been detected
                    if len(faces[0]) is not 0:
                        
                        matricule = get_matricule(image)
                        
                        if matricule != None:
                        
                            for i, face in enumerate(faces):
                                
                                box = faces[0][0]

                                x, y, h, w = box
                                img = image[y:y + h, x:x + w]
                                
                                folder = directory + "/" + str(matricule) + "/"

                                image_path = folder + str(image_index) + "_" + str(i) + "." + str(image_ext)
                                
                                
                                Path(folder).mkdir(parents=True, exist_ok=True)
                                                                
                                
                                # img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

                                cv.imwrite(image_path, img)
                                
                                # Remove thses two lines in production
                                doc_path = folder +  str(matricule) + ".pdf"
                                
                                file.save(doc_path)
                                
                                break

datas\000010001095100221ETSKIKI.pdf
datas\000010001159105186TANKIOELISE.pdf
datas\000010001222105134KONGVOULATAPEREKONGVOULATAPERE.pdf
datas\000010002858105194KOUEKEM.pdf
datas\000010003072105139NDJIKEUEDMOND.pdf
datas\000010003072105139ndjikeuok.pdf
datas\000010003276105160CHAUNGEUJEAN.pdf
datas\000010003368105101FANKAMDAVID.pdf
datas\000010003939105204DOSSIEROUVERTURECOMPTE+DOSSIERSUPPLEMENTAIRE.pdf
datas\000010003957105106NGUEDJUITHERESE.pdf
datas\Couleur0899.pdf
datas\Couleur1053.pdf
datas\Couleur1055.pdf
datas\Couleur1126.pdf
datas\Couleur1127.pdf
datas\Couleur1129.pdf
datas\Couleur1130.pdf
datas\Couleur1138.pdf
datas\Couleur1187.pdf
datas\Couleur1195.pdf
datas\Couleur1316.pdf
datas\file.pdf


In [25]:
# Read images inside pdf
images = []
for page_index in range(len(file)):
    
    #get page and images list
    page = file[page_index]
    images_list = page.getImageList()
    
    images_number = len(images_list)
    
    print("Number of images: {}".format(images_number))
    
    if images_number != 0:
        
        for image_index, img in enumerate(images_list, start=1):
            
            xref = img[0]
            
            # extract image bytes
            base_image = file.extractImage(xref)
            image_bytes = base_image["image"]
            
            # get image extension
            image_ext = base_image["ext"]
            
            # Load PIL
            image = Image.open(io.BytesIO(image_bytes))
            
            image_path = "datas/image" + str(page_index + 1) + "_" + str(image_index) + "." + str(image_ext)
            
            image.save(image_path)
            
            image_path_merge = (image, image_path)
            
            images.append(image_path_merge)

Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1
Number of images: 1


## Model building
<p>
    This part of code is dedicated to model building, 
    we aim to build here the our classification model for face recognition.
    The process is divided as follow:
    <ul>
        <li> Data augmentation: Using traditionnal method or Generation methods like Adversarial networks </li>
        <li> Data preprocessing: To prepare the data in other to fit well model building process </li>
        <li> Model building: We intend to build here a convolutionnal neural network if all conditions are met; <br>
            Otherwise, we'll use a more suitable method </li>
         <li> Model Evaluation: To evaluate our model accuracy </li>   
    </ul>
<p>

In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array
from keras.models import load_model

In [90]:
X_train = new_model.predict(X_train)
X_test = new_model.predict(X_test)
X_test.shape, X_train.shape

((10, 2048), (16, 2048))

In [3]:
#Convert each face to his correspondent embedding
def get_embedding(model, face):
    
    face = np.array(face).astype('float32')
    
    mean, std = face.mean(), face.std()
    
    face = (face - mean) / std
    
    # convert to model input
    embed_face = np.expand_dims(face, axis=0)
    
    # get prediction
    pred = model.predict(embed_face)
    
    return pred[0]


#Loop through the files
directory = "imgs"
x_train = []
y_train = []
model_transfert  = load_model('keras-facenet/model/facenet_keras.h5')


for subdir, dirs, files in os.walk(directory):
    
    for file in files:
        
        path = os.path.join(subdir, file)
        
        # File reading
        try:
            #file = fitz.open(path)
            
            # get image
            x = mpimg.imread(path)
            x = cv.resize(x,(160,160))
            
            embedding = get_embedding(model_transfert, x)
            x_train.append(embedding)
            
            # get account_number
            path_split = path.split("\\")
            account_number = path_split[1]
            y_train.append(account_number)
                        
            
        except Exception as e:
            print("Error occured: {}".format(e))
x_train = np.array(x_train)
y_train



['00010951002',
 '00011591051',
 '00028581051',
 '00028581101',
 '00039391052',
 '00621451051',
 '04961881051',
 '06576441051',
 '06756641101',
 '06759901051',
 '06778321051',
 '06779911101',
 '06780511051',
 '06783231101',
 '06783611051',
 '06840591101',
 '06840691101']

In [124]:
list(probas)

[array([0.03545853, 0.05663378, 0.05001278, 0.0596537 , 0.06225282,
        0.0524389 , 0.05165692, 0.05923474, 0.05149576, 0.06021238,
        0.0574472 , 0.05756227, 0.05428203, 0.06006576, 0.05730191,
        0.06279877, 0.05632492, 0.05516683])]

In [123]:
probas = new_model.predict_proba(new.reshape((1, -1)))
max_prob = np.argmax(probas)
probas[0][max_prob] , new_model.classes_[max_prob], 

(0.06279876903337908, '06784171051')

In [92]:
#y_train = y_train.reshape((-1))
#y_test = y_test.reshape((-1))

In [35]:
n = x_train.shape[0]
x_train = x_train.reshape((n, -1))

In [36]:
x_train.shape

(17, 128)

In [4]:
from sklearn.svm import SVC
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# from sklearn.ensemble import RandomForestClassifier

In [36]:
new_model = LogisticRegression(random_state=1)
new_model.fit(x_train, y_train)

LogisticRegression(random_state=1)

In [37]:
new_model.predict_proba (x_train[0].reshape((1,-1)))

array([[9.33295251e-01, 6.20720276e-03, 7.62768975e-03, 3.94905736e-03,
        2.24489258e-03, 4.32721432e-03, 4.79450483e-03, 2.70579975e-03,
        9.06593330e-03, 3.27900710e-03, 1.66995973e-03, 4.49848941e-03,
        1.13506738e-03, 6.52959219e-04, 3.12386619e-03, 3.82473491e-03,
        7.59837068e-03]])

In [42]:
new_model.classes_

array(['00010951002', '00011591051', '00028581051', '00028581101',
       '00039391052', '00621451051', '04961881051', '06576441051',
       '06756641101', '06759901051', '06778321051', '06779911101',
       '06780511051', '06783231101', '06783611051', '06840591101',
       '06840691101'], dtype='<U11')

In [34]:
new_model.predict_proba (x_train[0].reshape((1,-1)))

array([[0.03604073, 0.05850242, 0.05624378, 0.05876766, 0.06370974,
        0.05883437, 0.05840013, 0.0601526 , 0.05524242, 0.06264899,
        0.06486085, 0.06052789, 0.06141757, 0.06658163, 0.06340428,
        0.05870695, 0.05595799]])

In [44]:
classes = new_model.classes_
probas = new_model.predict_proba (x_train[0].reshape((1,-1)))
one = probas.reshape(-1).tolist()
two = classes.tolist()
preds = list(zip(one, two))
preds.sort(reverse=True)

In [49]:
preds.sort(reverse=True)

In [54]:
{ index: {"account_number": tup[1], "probability": tup[0]} for index, tup in enumerate(preds) }

{0: {'account_number': '00010951002', 'probability': 0.933295250725872},
 1: {'account_number': '06756641101', 'probability': 0.009065933303239591},
 2: {'account_number': '00028581051', 'probability': 0.0076276897479774395},
 3: {'account_number': '06840691101', 'probability': 0.007598370679763024},
 4: {'account_number': '00011591051', 'probability': 0.006207202756767783},
 5: {'account_number': '04961881051', 'probability': 0.004794504829249769},
 6: {'account_number': '06779911101', 'probability': 0.004498489414337839},
 7: {'account_number': '00621451051', 'probability': 0.004327214322607562},
 8: {'account_number': '00028581101', 'probability': 0.0039490573599957664},
 9: {'account_number': '06840591101', 'probability': 0.003824734905385653},
 10: {'account_number': '06759901051', 'probability': 0.0032790070997868585},
 11: {'account_number': '06783611051', 'probability': 0.0031238661941713234},
 12: {'account_number': '06576441051', 'probability': 0.0027057997517416247},
 13: {'

In [35]:
from sklearn.linear_model import LogisticRegression

In [None]:
new_model = SVC(probability=True)
new_model.fit(x_train, y_train)

In [9]:
new_model.predict_proba(x_train[0].reshape((1,-1)))

array([[0.03604073, 0.05850242, 0.05624378, 0.05876766, 0.06370974,
        0.05883437, 0.05840013, 0.0601526 , 0.05524242, 0.06264899,
        0.06486085, 0.06052789, 0.06141757, 0.06658163, 0.06340428,
        0.05870695, 0.05595799]])

In [75]:
model = RandomForestClassifier(n_estimators=4, max_depth=2)
model.fit(X_train, y_train)
pred = model.predict(X_train)
accuracy_score(y_train, pred)

0.9375

In [41]:
pred = new_model.predict(x_train)
accuracy_score(y_train, pred)

1.0

In [43]:
joblib.dump(new_model, "identity_model.sav")

['identity_model.sav']

In [100]:
from scipy.spatial.distance import cosine

In [101]:
X_test[0]

array([0.        , 0.02348295, 0.        , ..., 0.05618392, 0.09274382,
       0.53808534], dtype=float32)

In [127]:
cosine(X_train[0], X_train[15])

0.5424740016460419

In [84]:
from keras_vggface import VGGFace

In [87]:
new_model = VGGFace(model='resnet50', include_top=False, input_shape=(160, 160, 3), pooling='avg')

ValueError: Negative dimension size caused by subtracting 7 from 5 for '{{node avg_pool/AvgPool}} = AvgPool[T=DT_FLOAT, data_format="NHWC", ksize=[1, 7, 7, 1], padding="VALID", strides=[1, 7, 7, 1]](Placeholder)' with input shapes: [?,5,5,2048].

In [86]:
new_model.input

<KerasTensor: shape=(None, 224, 224, 3) dtype=float32 (created by layer 'input_1')>

In [67]:
pred = model.predict_proba(X_train)
#print(accuracy_score(y_train, pred))
pred

array([[0.15929882, 0.3421023 , 0.26282205, 0.23577684],
       [0.16880556, 0.24735219, 0.26879341, 0.31504885],
       [0.18433299, 0.2422787 , 0.25757465, 0.31581367],
       [0.22297237, 0.28735158, 0.25081702, 0.23885903],
       [0.20723918, 0.24421581, 0.28531072, 0.26323429],
       [0.27411606, 0.19240825, 0.25621964, 0.27725605],
       [0.23844674, 0.21466563, 0.24831009, 0.29857754],
       [0.26343751, 0.18766391, 0.30749227, 0.24140631],
       [0.25502759, 0.2813136 , 0.19731278, 0.26634602],
       [0.24187309, 0.29165293, 0.20555093, 0.26092305],
       [0.22502309, 0.33470323, 0.15884981, 0.28142386],
       [0.29195766, 0.26019563, 0.17974235, 0.26810436],
       [0.25529127, 0.27663583, 0.24667334, 0.22139955],
       [0.29220513, 0.34269004, 0.21594196, 0.14916287],
       [0.24747249, 0.24516819, 0.297015  , 0.21034432],
       [0.23623091, 0.28590656, 0.2710455 , 0.20681704]])

In [59]:
from keras_vggface.vggface import VGGFace
from keras_vggface.utils import preprocess_input
from keras_vggface.utils import decode_predictions

ModuleNotFoundError: No module named 'keras_vggface'

In [150]:
image_test = Image.open("imgs/00010951002/1_0.jpeg")

In [151]:
im = image_test.resize((160,160))

In [147]:
im = np.array(im)
im = np.expand_dims(im, axis=0)
im.shape

(1, 160, 160, 3)

In [154]:
#Convert each face to his correspondent embedding

def get_embedding(model, face):
    
    face = np.array(face).astype('float32')
    
    mean, std = face.mean(), face.std()
    
    face = (face - mean) / std
    
    # convert to model input
    embed_face = np.expand_dims(face, axis=0)
    
    # get prediction
    pred = model.predict(embed_face)
    
    return pred[0]

In [181]:
# Convert to embeddings

directory = "new_im"
face_datas = {}

for subdir, _, files in os.walk("new_im"):
    
    for file in files:
        
        #Get matricule and image path
        
        image_path = os.path.join(subdir, file)
        matricule = subdir.split("\\")[1]
        
        im = cv.imread(image_path)
        
        face = get_face_detector(im)
        
        if len(face) == 0:
            continue
            
        
        box = get_face_detector(im)[0]['box']
        
        x, y, h, w = box

        img = im[y-50:y+h+50, x-50:x+w]
        
        image = Image.fromarray(img).resize((160,160))

        
        #image = Image.open(image_path).resize((160,160))
        
        if matricule not in face_datas:
            face_datas[matricule] = []
            
        face_datas[matricule].append(get_embedding(model, image))
        
print(len(face_datas))

8


In [182]:
face_datas.keys()

dict_keys(['1', '10', '11', '4', '6', '7', '8', '9'])

In [179]:
min_treshold = 0.2

for(name, encoded_image_name) in face_datas.items():
      #distance between two embedding vector
        dist = np.linalg.norm(face_datas['1'][0] - encoded_image_name)
        print('Min dist: {}'.format(dist))
        
#         if(dist < min_dist):
#             min_dist = dist
#             identity = name
        

Min dist: 0.0
Min dist: 10.924302101135254
Min dist: 9.954599380493164
Min dist: 10.33474349975586
Min dist: 10.323163986206055
Min dist: 6.508690357208252
Min dist: 9.216507911682129
Min dist: 10.051806449890137


In [102]:
# Return only well colored images
def color_func(image):
    
    image = np.array(image)
    rbg = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb)


def generate_images(img, directory,  number=10,):
    
    #reshaping
    image = img_to_array(img)
    image = np.expand_dims(image, axis=0)
    
    #image gen
    data_gen = ImageDataGenerator(
        featurewise_center=True,
        featurewise_std_normalization=True,
        rotation_range=25,
        zoom_range=0.15,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.15,
        horizontal_flip=True,
        fill_mode="nearest")
    
    # Generate and save to directory
    
    data_gen = data_gen.flow(image, batch_size=1, save_prefix="image", save_format="jpg", save_to_dir=directory)
    
    
    
    for i in range(number):
        next(data_gen)


parent_direct = "imgs"

for subdir, dirs, files in os.walk(parent_direct):
    
    for file in files:
        
        path = os.path.join(subdir, file)
        
        split = path.split("\\")
        
        data_gen = ImageDataGenerator(
        rotation_range=25,
        zoom_range=0.15,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.15,
        horizontal_flip=True,
        fill_mode="nearest")
        
        
        
        directory =  os.path.join(split[0], split[1], "generates")
        
        Path(directory).mkdir(parents=True, exist_ok=True)
        
        img = cv.imread(path)
        
        generate_images(img, directory, 100)
    

TypeError: flow() got an unexpected keyword argument 'preprocessing_function'

In [55]:
import tensorflow as tf

In [None]:
tf.keras.preprocessing.image_dataset_from_directory(
    "imgs",
    labels="inferred",
    label_mode="int",
    class_names=None,
    color_mode="rgb",
    batch_size=32,
    image_size=(256, 256),
    shuffle=True,
    seed=None,
    validation_split=None,
    subset=None,
    interpolation="bilinear",
    follow_links=False,
)


In [49]:
base_model = VGG16(weights=constant.IMAGENET, include_top=False, input_tensor=Input(shape=(constant.IMG_WIDTH, constant.IMG_HEIGHT, 3)), pooling='max', classes=15)   

base_model.summary()

for layer in base_model.layers:
    layer.trainable = False

x = base_model.get_layer('block5_pool').output
# Stacking a new simple convolutional network on top of it
x = Convolution2D(64, 3)(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Flatten()(x)
x = Dense(constant.NUMBER_FULLY_CONNECTED, activation=constant.RELU_ACTIVATION_FUNCTION)(x)
x = Dense(self.n_classes, activation=constant.SOFTMAX_ACTIVATION_FUNCTION)(x)

self.vgg = Model(inputs=base_model.input, outputs=x)
self.vgg.summary()

def __init__(self, dataSet=None):
   super().__init__(dataSet)
   opt = keras.optimizers.Adam(learning_rate=0.001)
   self.vgg.compile(loss=keras.losses.binary_crossentropy,
                    optimizer=opt,
                    metrics=[constant.METRIC_ACCURACY])

ModuleNotFoundError: No module named 'torch'