In [48]:
from os.path import isfile, join
from os import listdir
import tempfile
from pdf2image import convert_from_path, convert_from_bytes
import time
from PIL import Image
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Convert pdf files to images

In [41]:
def concatenateImages(images, destination):
    
    new_im = Image.new('RGB', (444,95))
    
    widths, heights = zip(*(i.size for i in images))

    total_width = sum(widths)
    max_height = max(heights)

    new_im = Image.new('RGB', (total_width, max_height))

    x_offset = 0
    
    for im in images:
        
        new_im.paste(im, (x_offset,0))
        x_offset += im.size[0]

    new_im.save(destination, 'JPEG')

In [42]:
def convertPDF2Img(source, destination):
    
    onlyfiles = [f for f in listdir(source) if isfile(join(source, f))]

    for file in onlyfiles:
        images = convert_from_bytes(open(f'{source}/{file}', 'rb').read())
            
        concatenateImages(images, f'{destination}/{file[:-4]}.jpg')


In [43]:
start_time = time.time()

convertPDF2Img('Bad', 'badImages') 
convertPDF2Img('Good', 'goodImages') 

print(time.time() - start_time)

98.44232368469238



# Prepare Data For CNN

In [22]:
badImages = [f for f in listdir('Bad')] #array with names of a bad images
goodImages = [f for f in listdir('Good')] #array with names of a good images

In [54]:
badImagesDataset = pd.DataFrame({
    'Name of a file': badImages ,
    'GoodOrBad': [0] * len(badImages)
})

In [55]:
goodImagesDataset = pd.DataFrame({
    'Name of a file': goodImages ,
    'GoodOrBad': [1] * len(goodImages),
})

In [56]:
dataset = badImagesDataset.append(goodImagesDataset) #Creating final dataset with all images

In [57]:
dataset.sample(frac = 1).reset_index(drop = True) #Shuffle dataset 

Unnamed: 0,Name of a file,GoodOrBad
0,KPMG_Thomas Yap_AM.pdf,1
1,Rodyk_Nelson Lee_Sr Cosec.pdf,0
2,Pos Logistics_Norashra Omar_Finance VP.pdf,0
3,Zhenghe Capital_Rainbow Chen_Office Manager.pdf,0
4,Vertex Venture_Ocvia Freriana_Compliance Lead.pdf,0
...,...,...
95,Harneys_Serene Lai_Cosec Officer.pdf,1
96,Manulife_Kenneth Lin_Manager.pdf,1
97,Evalueserve_SHRIMALI Shipra_Sr Equity Research...,1
98,Guard Capital_Godiva Tse_Office Manager.pdf,1


In [58]:
X = dataset["Name of a file"]
y = dataset["GoodOrBad"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [62]:
for image, in X_train, y_train:
    print(image)

28                 FCL Mgmt_Stacia Heng_Sr Accounts.pdf
43                         Brotzeit_Zara Zhang_FP_A.pdf
5               Paulson_Co_Ruby Shiu_Office Manager.pdf
16               Alibaba_Chester Chow_Cosec Officer.pdf
15                          Chanel_Penny Wong_AP GL.pdf
                            ...                        
10                      UBS_Shawn Koh_CDD Assurance.pdf
21    KPMG_Savita Sharma_Forensic Assistant Manager.pdf
14                    Equiom_Cyann So_Trust Officer.pdf
42                       Xander_Rohit Khandelwal_VP.pdf
1                                KPMG_Thomas Yap_AM.pdf
Name: Name of a file, Length: 67, dtype: object
28    0
43    1
5     0
16    1
15    1
     ..
10    1
21    1
14    0
42    1
1     1
Name: GoodOrBad, Length: 67, dtype: int64
