# Problem description: 
### Given a document image, the document PDF and its OCR result, create a document classifier that predicts one of 4 classes: resume, email, invoice, letter

# Part 1 Approach
# Part 2 Image classification model
# Part 3 Text classification model
# Part 4 Final prediction

# Part 1 Approach
### I will train an image classification model and a text classification model, then combine the results of both models to make the final prediction

In [1]:
import os
import numpy as np
import pandas as pd
import efficientnet.tfkeras as efn 
import tensorflow as tf
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from tensorflow.keras.layers import Input, Dropout, Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.image import resize, decode_jpeg
from nltk.tokenize import RegexpTokenizer
from scipy.special import softmax

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 500)

In [2]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('document_type_data.csv', index_col=0)

In [3]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,ocr,text,label,file_name
0,"{'pageImages': [{'__typename': 'Image', 'width...","['Chaikin, ', 'Karen ', 'n ', ""O' "", 'o ', 'Fr...",email,2085136614c.pdf
1,"{'pageImages': [{'__typename': 'Image', 'width...","['> ', 'Jenny, ', 'After ', 'speaking ', 'with...",email,2085136814a.pdf
2,"{'pageImages': [{'__typename': 'Image', 'width...","['Please ', 'call ', 'with ', 'any ', 'questio...",email,2085140145a.pdf
3,"{'pageImages': [{'__typename': 'Image', 'width...","['2085158326 ', 'Williams, ', 'Carrie ', 'T. '...",email,2085158326.pdf
4,"{'pageImages': [{'__typename': 'Image', 'width...","['GJ ', '□3 ', 'A ', 'nice ', 'ending ', 'to '...",email,2085161311b.pdf


### shuffle sample

In [4]:
# Shuffle the rows once. The fixed seed keeps the row order, and therefore the
# positional train/test split taken from it, identical across notebook runs.
df = df.sample(frac=1, random_state=42)

# Part 2: Image classification model
### Many possible choices for the model
### As a baseline I prefer EfficientNet because there are several models to choose from with an increasing number of parameters. 
### Depending on the speed vs. quality trade-off needed for the task, I can choose whichever model I need. For the sake of speed in this example I will choose the model with the fewest parameters
### I will use whole image as an input

In [5]:
# Relative path of each page image: "<label>/<file name minus its last 4
# characters>.jpg". These paths are the inputs (X) of the image model.
df['image_path'] = df['label'] + '/' + df['file_name'].str.slice(stop=-4) + '.jpg'
x_image = df['image_path']
# One-hot encoded labels as an int32 array, one column per distinct label.
y = pd.get_dummies(df['label']).astype('int32').to_numpy()
# Training split: the first 80 rows by position.
x_train = x_image.iloc[:80]
y_train = y[:80]

In [6]:
# Hyperparameters of the image model.
labels = 4      # number of target classes
channels = 3    # channels requested when decoding each JPEG
# Arbitrary square input size; in a real project this is a hyperparameter to tune.
img_size = 512
batch_size = 16
# Kept low for the sake of speed. Normally I would train for ~100 epochs and
# use callbacks to stop training once the model has converged.
epochs = 3

In [7]:
def decode_image(filename, label=None, image_size=(img_size, img_size)):
    """Read one JPEG from disk and return it as a resized float image.

    Used inside a ``tf.data`` pipeline so images are read one at a time
    instead of being loaded into memory all at once.

    Args:
        filename: path of the JPEG file to read.
        label: optional label passed through unchanged.
        image_size: (height, width) the image is resized to.

    Returns:
        The image tensor (float32, pixel values divided by 255) when ``label``
        is None, otherwise the tuple ``(image, label)``.
    """
    raw = tf.io.read_file(filename)
    pixels = tf.cast(decode_jpeg(raw, channels=channels), tf.float32) / 255.0
    resized = resize(pixels, image_size)
    return resized if label is None else (resized, label)

def image_augment(filename, label):
    """Load the image at ``filename`` for training and pair it with ``label``.

    This is the hook for training-time augmentation (image quality changes,
    rotations, flips such as ``tf.image.random_flip_left_right``, ...).
    No augmentation is applied at the moment.
    """
    return decode_image(filename), label

In [8]:
# Training input pipeline: decode the images lazily, cache the decoded
# tensors, then shuffle, repeat indefinitely and batch.
# NOTE: cache() sits after image_augment, so any random augmentation added
# there would be computed once and replayed on every epoch.
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .map(image_augment)
    .cache()
    # The buffer covers the whole training set so that each pass is uniformly
    # shuffled; a 16-element buffer would only mix neighbouring samples.
    .shuffle(len(x_train))
    .repeat()
    .batch(batch_size)
    .prefetch(16)
    )

In [9]:
def build_model():
    """Build the image classifier: EfficientNetB0 backbone plus a dense head.

    The backbone is loaded with ImageNet weights, without its original top,
    and with global average pooling. The head is two 256-unit ELU layers, each
    followed by batch normalisation (with dropout after the first), ending in
    a softmax over ``labels`` classes.

    Returns:
        An uncompiled Keras ``Model``.
    """
    backbone = efn.EfficientNetB0(
        include_top=False,
        weights='imagenet',
        pooling='avg',
        input_shape=(img_size, img_size, channels),
    )
    features = Dense(256, activation='elu')(backbone.output)
    features = BatchNormalization(axis=-1)(features)
    features = Dropout(rate=0.5)(features)
    features = Dense(256, activation='elu')(features)
    features = BatchNormalization(axis=-1)(features)
    probabilities = Dense(labels, activation="softmax")(features)
    return Model(inputs=backbone.input, outputs=probabilities)

In [10]:
# Build the image classifier and compile it with Adam and categorical
# cross-entropy (the labels are one-hot encoded).
image_model = build_model()
opt = Adam(learning_rate=0.0005)
image_model.compile(
    loss=CategoricalCrossentropy(),
    optimizer=opt,
    metrics=['accuracy'],
)

In [11]:
# The training dataset repeats forever, so the epoch length is set explicitly
# to the number of whole batches in the training set.
history = image_model.fit(
    train_ds,
    epochs=epochs,
    steps_per_epoch=len(x_train) // batch_size,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Part 3: Language model for text
### Once again, there are many possible choices for the model, from Naive Bayes to modern models like BERT. I will use a smaller version of BERT - DistilBERT - in this task
### It might not be the best possible choice here because I don't have enough memory to tokenize the whole document without truncation, and it is not what I would usually pick as a baseline model.
### But I think that you don't need the whole document to make a correct prediction about its type. 
###  A CV will usually have a specific title at the beginning. Letters will have an address, etc.

In [12]:
# tokenizer that keeps only runs of word characters (letters, digits, underscore),
# which strips punctuation and other symbols from the text
clean_text = RegexpTokenizer(r'\w+')

In [13]:
# Tokenize the raw text of each document and re-join the tokens with single spaces.
df['full_text'] = df['text'].apply(lambda raw: ' '.join(clean_text.tokenize(raw)))

### This is the part that explains why feeding the whole document text to the model is not the best solution. 
### Possible workaround: use OCR extracted coordinates to create "phrases" and feed phrases to the model. 
### This will increase our dataset, improve robustness of the model and will allow usage of more advanced and heavy models

In [14]:
# distribution of the cleaned document lengths; note that len(x) on a string
# counts characters, not words or tokens
df.full_text.map(lambda x: len(x)).describe()

count     100.000000
mean     1079.770000
std       671.980742
min       177.000000
25%       565.750000
50%       911.000000
75%      1443.250000
max      3260.000000
Name: full_text, dtype: float64

In [15]:
# Inputs of the text model and one-hot int32 labels.
x_text = df['full_text']
y = pd.get_dummies(df['label']).astype('int32').to_numpy()
# Training split: the first 80 rows by position. This rebinds x_train and
# y_train, which held the image-model training data before.
x_train = x_text.iloc[:80]
y_train = y[:80]

In [16]:
# Training settings for the text model (batch_size and epochs are rebound here).
epochs = 2
batch_size = 16

# NOTE(review): 0.0005 is a high learning rate for fine-tuning a pretrained
# transformer - confirm it is intended.
optimizer = Adam(learning_rate=0.0005)
# from_logits=True: the loss is given unnormalised scores rather than probabilities.
loss = CategoricalCrossentropy(from_logits=True)
metric = CategoricalAccuracy('accuracy')

In [17]:
max_length = 256  # value passed to the tokenizer as max_length (with truncation and padding)
model_name = 'distilbert-base-uncased'

# Pretrained tokenizer and a DistilBERT sequence classifier with one output per class.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
text_model = TFDistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=labels,
)
text_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_20', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [18]:
# Tokenize the training texts: truncate or pad every document to max_length
# and return TensorFlow tensors. x_train now holds the tokenizer output.
x_train = tokenizer(
    x_train.tolist(),
    max_length=max_length,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf',
)

In [19]:
# Fine-tune the text model on the token ids and attention masks.
history = text_model.fit(
    [x_train['input_ids'], x_train['attention_mask']],
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


# Part 4: Final prediction
### There are two ways to get the best out of the two models
### 1. Extract the embedding feature layers from the CNN and BERT and train on those outputs together using models like multinomial logistic regression or XGBoost
### 2. Combine the probability predictions of the two models with different coefficients and pick the class with the highest combined probability
### I will use the second approach in this case

In [20]:
# Held-out split: every row from position 80 onwards, for both modalities.
x_image_test = x_image.iloc[80:]
x_text_test = x_text.iloc[80:]
y_test = y[80:]

In [21]:
# Test input pipeline for the image model: decode each image lazily and batch.
# No shuffling or repetition is applied.
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((x_image_test, y_test))
    .map(decode_image)
    .cache()
    .batch(batch_size)
    .prefetch(16)
)
# Tokenize the held-out texts: truncate or pad to max_length, TensorFlow tensors.
x_test = tokenizer(
    x_text_test.tolist(),
    max_length=max_length,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf',
)

In [22]:
# Predictions of the image model on the test images.
image_pred = image_model.predict(test_ds)
# The first element of the text model's prediction output is turned into
# per-row probabilities with a softmax over axis 1.
text_pred = softmax(
    text_model.predict(x=[x_test['input_ids'], x_test['attention_mask']])[0],
    axis=1,
)

In [23]:
def ensemble(image_pred, text_pred, w1=0.5, w2=0.5):
    """Blend the class probabilities of the image and text models.

    Computes ``w1 * image_pred + w2 * text_pred`` element-wise. The number of
    classes is taken from the inputs instead of being hard-coded.

    Args:
        image_pred: array-like of shape (num_samples, num_classes) with the
            image model's probabilities.
        text_pred: array-like of the same shape with the text model's
            probabilities.
        w1: weight applied to ``image_pred``.
        w2: weight applied to ``text_pred``.

    Returns:
        A float64 ``numpy.ndarray`` with the same shape as the inputs.

    Raises:
        ValueError: if the two prediction arrays have different shapes.
    """
    image_pred = np.asarray(image_pred, dtype=np.float64)
    text_pred = np.asarray(text_pred, dtype=np.float64)
    # Reject mismatched inputs explicitly rather than letting NumPy broadcast them.
    if image_pred.shape != text_pred.shape:
        raise ValueError(
            f"prediction shapes differ: {image_pred.shape} vs {text_pred.shape}"
        )
    return w1 * image_pred + w2 * text_pred

In [24]:
y_prob = ensemble(image_pred, text_pred)
# One-hot encode the arg-max class of every row. argmax selects exactly one
# class per sample, whereas comparing against the row maximum would mark
# several classes whenever probabilities tie.
y_pred = np.eye(y_prob.shape[1], dtype=int)[y_prob.argmax(axis=1)]

### Not much to say about metrics since our dataset is too small to make significant conclusions

In [25]:
# Per-class precision / recall / F1 on the held-out samples; both arguments
# are one-hot indicator matrices.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       1.00      0.40      0.57         5
           2       0.19      1.00      0.32         3
           3       1.00      0.40      0.57         5

   micro avg       0.35      0.35      0.35        20
   macro avg       0.55      0.45      0.36        20
weighted avg       0.53      0.35      0.33        20
 samples avg       0.35      0.35      0.35        20

