In [None]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

from sklearn.model_selection import train_test_split

np.random.seed(987)
plt.style.use("ggplot")

In [None]:
# Location of the raw dataset, relative to the notebook's working directory.
raw_file = os.path.join("..", "data", "raw", "raw_data.csv")
df_raw = pd.read_csv(raw_file)

# Work on a random subset of at most 10,000 rows. The size is capped because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement).
sample_size = min(10000, len(df_raw))
# reset_index(drop=True) discards the shuffled index directly instead of
# materialising it as an 'index' column and dropping that column afterwards,
# which would misbehave if the data already had a column named 'index'.
df_sample = df_raw.sample(sample_size).reset_index(drop=True)

In [None]:
# Classifier spreadsheet holding the code/name pairs for every hierarchy level.
unspsc_file = os.path.join("..","references","clasificador_de_bienes_y_servicios_v14_1.xlsx")
df_unspsc = pd.read_excel(unspsc_file)


def _code_name_lookup(code_column, name_column):
    """Return a dict mapping str(code) -> name for one level of the catalogue.

    When a code appears on several rows, the name from the last row wins.
    """
    codes_as_text = df_unspsc[code_column].astype('str')
    names = df_unspsc[name_column]
    return dict(zip(codes_as_text, names))


# One lookup table per hierarchy level.
segment_dict = _code_name_lookup('Código Segmento', 'Nombre Segmento')
family_dict = _code_name_lookup('Código Familia', 'Nombre Familia')
class_dict = _code_name_lookup('Código Clase', 'Nombre Clase')
commodity_dict = _code_name_lookup('Código Producto', 'Nombre Producto')

In [None]:
# Keep the first run of eight consecutive digits found in the raw category
# field; rows without such a run become missing values.
category_code = df_sample['codigo_de_categoria_principal'].str.extract('([0-9]{8})', expand=False)
df_sample['codigo_de_categoria_principal'] = category_code

# (level name, number of leading digits that identify it, code -> name lookup)
hierarchy_levels = [
    ('segment', 2, segment_dict),
    ('family', 4, family_dict),
    ('class', 6, class_dict),
    ('commodity', 8, commodity_dict),
]

# 1) Code of each level, still as text so it can be matched against the lookups.
for level, n_digits, _ in hierarchy_levels:
    df_sample[f'{level}_code'] = category_code.str[:n_digits]

# 2) Human-readable name of each level (missing when the code is not in the lookup).
for level, _, code_to_name in hierarchy_levels:
    df_sample[f'{level}_text'] = df_sample[f'{level}_code'].map(code_to_name)

# 3) Store the codes as nullable integers now that the text lookups are done.
for level, _, _ in hierarchy_levels:
    df_sample[f'{level}_code'] = df_sample[f'{level}_code'].astype('Int64')

## Keep only the rows whose class code appears in the UNSPSC catalogue

In [None]:
# Distinct, non-missing class codes listed in the catalogue, as a plain list.
# NOTE(review): despite "commodity" in the variable name, the values come from
# the class-code column ('Código Clase').
catalogue_class_codes = df_unspsc['Código Clase'].astype('Int64')
unspsc_commodity_code = catalogue_class_codes.dropna().unique().tolist()

In [None]:
# Keep the rows whose class code exists in the catalogue, and only the columns
# needed for modelling. The explicit .copy() makes `df` an independent frame:
# later cells add and drop columns on it, which on a filtered slice raises
# SettingWithCopyWarning.
df = df_sample.loc[
    df_sample['class_code'].isin(unspsc_commodity_code),
    ['descripcion_del_proceso', 'class_code', 'class_text'],
].copy()

In [None]:
# Tokenizer used by normalize_document.
wpt = nltk.WordPunctTokenizer()

# Spanish stop-word list. The stop-word corpus has to be downloaded before it
# can be read, so fetch it on first use instead of failing with LookupError
# on a fresh environment.
try:
    stop_words = nltk.corpus.stopwords.words('spanish')
except LookupError:
    nltk.download('stopwords')
    stop_words = nltk.corpus.stopwords.words('spanish')
# Text pre-processing function
def normalize_document(doc):
    """Lower-case a document, strip non-letter characters and drop stop words.

    Args:
        doc: text to normalise. ``None`` and float NaN (how pandas represents a
            missing description) are treated as empty text; any other
            non-string value is converted with ``str`` first.

    Returns:
        str: the remaining tokens joined by single spaces (``''`` when nothing
        is left).
    """
    # Missing values would otherwise fail on .lower().
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ''
    # Convert the text to lower case
    doc = str(doc).lower()
    # Remove every character that is neither a (possibly accented) letter nor
    # whitespace. The flags MUST be passed by keyword: the fourth positional
    # parameter of re.sub is `count`, so passing re.I|re.A positionally
    # silently limited the call to the first 258 replacements and applied no
    # flags at all. re.A is intentionally not used so that `\s` keeps matching
    # Unicode whitespace, as it effectively did before.
    doc = re.sub(r'[^a-zñàáâãäåèéêëìíîïòóôõöùúûüýÿ\s]', '', doc, flags=re.I)
    doc = doc.strip()
    # Tokenise the document
    tokens = wpt.tokenize(doc)
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    # Return the filtered text
    return ' '.join(tokens)
# Element-wise version of normalize_document for NumPy arrays of documents.
normalize_corpus = np.vectorize(normalize_document)

# Add the cleaned text column and drop the raw description in a single
# expression that yields a new frame. Assigning into / dropping from `df` in
# place raises SettingWithCopyWarning when `df` is a filtered slice of another
# frame, and in-place edits make the cell harder to reason about on re-runs.
df = (
    df
    .assign(Text=df['descripcion_del_proceso'].apply(normalize_document))
    .drop(columns='descripcion_del_proceso')
)

## Split train and test

In [None]:
# Number of distinct class codes left after filtering; this is the width of
# the one-hot targets and of the model's output layer.
distinct_class_codes = df["class_code"].unique()
num_classes = len(distinct_class_codes)

In [None]:
# Give every distinct class code a consecutive integer id (0, 1, 2, ...),
# numbered in the order the codes are returned by unique().
ordered_class_codes = df['class_code'].unique().tolist()
dict_labels = dict(zip(ordered_class_codes, range(len(ordered_class_codes))))

# Integer training label for each row.
df['Labels'] = df["class_code"].map(dict_labels)

In [None]:
# One-hot encode the integer labels (one column per class).
label_ids = df["Labels"].values
y = tf.keras.utils.to_categorical(label_ids, num_classes=num_classes)

# Hold out 25% of the rows for evaluation.
# NOTE(review): no random_state is passed, so the split follows NumPy's global
# RNG state at the moment this cell runs; it is also not stratified by class.
x_train, x_test, y_train, y_test = train_test_split(
    df['Text'],
    y,
    test_size=0.25,
)

# Data modeling

In [None]:
# TF-Hub handles of the text preprocessing model and the sentence encoder.
PREPROCESSOR_HANDLE = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"
ENCODER_HANDLE = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1"

# Wrap both hub models as Keras layers so they can be called on tensors and
# composed into a Keras model.
preprocessor = hub.KerasLayer(PREPROCESSOR_HANDLE)
encoder = hub.KerasLayer(ENCODER_HANDLE)

def get_embeddings(sentences):
  '''Embed a batch of sentences with the hub preprocessor and encoder.
  Args:
    - sentences: list of strings
  Output:
    - the encoder's 'pooled_output' entry for the batch
      (per the original note: tf.Tensor of shape=(len(sentences), 768))
  '''
  # The encoder returns a dict of outputs; only the pooled sentence vector is used.
  encoder_outputs = encoder(preprocessor(sentences))
  return encoder_outputs['pooled_output']

## Create and train the classification model

In [25]:
from keras import backend as K

def balanced_recall(y_true, y_pred):
    """Class-averaged (macro) recall: mean over classes of TP / (TP + FN).

    Args:
        y_true: 2-D one-hot ground truth, indexed as (sample, class).
        y_pred: 2-D predicted scores with the same layout.

    Returns:
        Scalar: the sum of the per-class recalls divided by the number of
        classes (``y_pred.shape[1]``).

    Notes:
        * Scores are turned into 0/1 decisions by rounding, i.e. a sample only
          counts as predicted for a class when its score rounds to 1.
        * A class with no true samples in the batch contributes a recall of 0
          to the average (0 / epsilon).
    """
    # All classes are handled at once by reducing over the sample axis; a
    # Python loop over classes would add one set of ops per class to the graph.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)
    # epsilon guards against division by zero for classes absent from the batch.
    recall_by_class = true_positives / (possible_positives + K.epsilon())
    return K.sum(recall_by_class) / y_pred.shape[1]

def balanced_precision(y_true, y_pred):
    """Class-averaged (macro) precision: mean over classes of TP / (TP + FP).

    Args:
        y_true: 2-D one-hot ground truth, indexed as (sample, class).
        y_pred: 2-D predicted scores with the same layout.

    Returns:
        Scalar: the sum of the per-class precisions divided by the number of
        classes (``y_pred.shape[1]``).

    Notes:
        * Scores are turned into 0/1 decisions by rounding, i.e. a sample only
          counts as predicted for a class when its score rounds to 1.
        * A class that is never predicted in the batch contributes a precision
          of 0 to the average (0 / epsilon).
    """
    # All classes are handled at once by reducing over the sample axis; a
    # Python loop over classes would add one set of ops per class to the graph.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0)
    # epsilon guards against division by zero for classes never predicted.
    precision_by_class = true_positives / (predicted_positives + K.epsilon())
    # Average over every class column.
    return K.sum(precision_by_class) / y_pred.shape[1]

def balanced_f1_score(y_true, y_pred):
    """Harmonic mean of balanced_precision and balanced_recall.

    Note that this combines the two class-averaged metrics; it is not the
    average of per-class F1 scores.
    """
    avg_precision = balanced_precision(y_true, y_pred)
    avg_recall = balanced_recall(y_true, y_pred)
    # epsilon keeps the ratio defined when both metrics are zero.
    ratio = (avg_precision * avg_recall) / (avg_precision + avg_recall + K.epsilon())
    return 2 * ratio

In [26]:
# Classifier graph: raw string -> hub preprocessing -> sentence encoder
# -> dropout -> softmax over the class labels.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
encoder_outputs = encoder(preprocessor(text_input))
pooled_embedding = encoder_outputs['pooled_output']
regularized = tf.keras.layers.Dropout(0.2, name="dropout")(pooled_embedding)
class_probabilities = tf.keras.layers.Dense(
    num_classes, activation='softmax', name="output"
)(regularized)

model = tf.keras.Model(text_input, class_probabilities)

In [27]:
# Upper bound on training epochs; early stopping may end training sooner.
n_epochs = 20

# Metrics reported every epoch: plain accuracy plus the three custom
# class-averaged metrics.
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
    balanced_recall,
    balanced_precision,
    balanced_f1_score,
]

# Stop when the validation loss has not improved for 3 epochs and restore the
# weights of the best epoch.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=METRICS,
)

# NOTE(review): the held-out test split doubles as the validation set that
# drives early stopping.
model_fit = model.fit(
    x_train,
    y_train,
    epochs=n_epochs,
    validation_data=(x_test, y_test),
    callbacks=[earlystop_callback],
)

Epoch 1/20

In [None]:
# Per-epoch values recorded by model.fit, keyed by metric name.
history = model_fit.history
metric_list = list(history.keys())
# Training metrics are the keys without the "val_" prefix that marks their
# validation counterparts; pairing by name avoids relying on key order.
train_metrics = [name for name in metric_list if not name.startswith("val_")]
num_metrics = len(train_metrics)

# EarlyStopping can end training before n_epochs, so the x axis is built from
# the number of epochs actually recorded. Using range(1, n_epochs + 1) makes
# ax.plot raise a shape-mismatch error whenever training stopped early.
epochs_run = len(history[metric_list[0]]) if metric_list else 0
x = list(range(1, epochs_run + 1))

fig, ax = plt.subplots(nrows=1, ncols=num_metrics, figsize=(30, 5))

# plt.subplots returns a bare Axes (not an array) for a single panel;
# np.atleast_1d makes both cases iterable.
for axis, name in zip(np.atleast_1d(ax), train_metrics):
    axis.plot(x, history[name], marker="o", label=name.replace("_", " "))
    val_name = "val_" + name
    if val_name in history:
        axis.plot(x, history[val_name], marker="o", label=val_name.replace("_", " "))
    axis.set_xlabel("epochs", fontsize=14)
    axis.set_title(name.replace("_", " "), fontsize=20)
    axis.legend(loc="lower left")

## Predict

In [None]:
def predict_class(reviews):
  '''Predict the label index of each input text.
  Args:
    - reviews (str or list of strings): a single text is treated as a
      one-element batch.
  Output:
    - list of int: for every text, the index of the highest-scoring model
      output. These are the integer ids stored in `dict_labels`, not the
      class codes themselves.
  '''
  # A bare string is not a batch of texts; wrap it so callers can pass either form.
  if isinstance(reviews, str):
    reviews = [reviews]
  # NOTE(review): training text went through normalize_document, while the
  # text given here is passed to the model unchanged. Confirm whether callers
  # are expected to normalise it first.
  probabilities = model.predict(reviews)
  # argmax along the class axis; tolist() yields plain Python ints.
  return np.argmax(probabilities, axis=1).tolist()


# predict_class is documented to take a list of strings, so the single
# description is passed as a one-element list rather than a bare string.
predict_class(['Prestar con plena autonomía técnica y administrativa sus servicios profesionales en sistemas de la información  bibliotecología y archivística en el área de Gestión Documental  para apoyar la misión de la Biblioteca de la FUGA.'])

## Blind set evaluation

# Save and load model

In [None]:
# Persist the trained model to disk, relative to the notebook's directory.
saved_model_path = "../models/text_classifier_v1"
model.save(saved_model_path)

We can now load the model as needed for future use:

In [None]:
from tensorflow import keras

# load model
# Use the same relative location the model was saved to ("../models/...");
# the absolute "/models/..." path pointed at the filesystem root instead.
# compile=False: only inference is needed here, and it avoids having to pass
# the custom metric functions used at compile time through `custom_objects`.
new_model = keras.models.load_model("../models/text_classifier_v1", compile=False)

# test predictions
# predict expects a batch of texts, so the single description is wrapped in a list.
[np.argmax(pred) for pred in new_model.predict(['Prestación de Servicios Profesionales como Abogado (a) en la Subsecretaría de Acceso a Servicíos de Justicia  en desarrollo del proyecto denominado:  Fortalecimiento de los servicios de acceso a la justicia en Santiago de Cali  Según ficha EBI No. 26002080'])]
# output: a one-element list holding the predicted label index for the text above