In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 20.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [4]:
# Mount Google Drive at /content/drive; the dataset is read from, and the model
# and history files are written to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the labelled text CSV from Drive. With header=None no row is treated as
# a header, so columns are addressed by integer label in the cells below.
data = pd.read_csv(
    "/content/drive/My Drive/celonis/labelled_text.csv",
    header=None,
    encoding='ISO-8859-1',
)

In [6]:
# Drop rows whose column 2 is "Irrelevant", then pull column 3 (text) and
# column 2 (label) out of the remaining rows as plain Python lists.
relevant_rows = data[data[2] != "Irrelevant"]
texts = list(relevant_rows[3])
labels = list(relevant_rows[2])

In [7]:
# Sanity check: the filtered texts and labels lists should have equal length.
len(texts), len(labels)

(61692, 61692)

In [8]:
# Encode the labels as integer class ids: "Neutral" -> 0, "Positive" -> 1,
# and every other value -> 2 (catch-all, same as a trailing else branch).
label_to_id = {"Neutral": 0, "Positive": 1}
labels_numeric = [label_to_id.get(label, 2) for label in labels]

In [9]:
# Sanity check: one numeric label was produced per text.
len(texts), len(labels_numeric)

(61692, 61692)

In [10]:
# Clean NaNs: keep only the entries whose text is an actual str, carrying the
# matching numeric label along so texts and labels stay aligned.
kept_pairs = [
    (text, label_id)
    for text, label_id in zip(texts, labels_numeric)
    if isinstance(text, str)
]
clean_texts = [text for text, _ in kept_pairs]
clean_labels = [label_id for _, label_id in kept_pairs]

In [11]:
# Load the tokenizer of the "bert-base-uncased" checkpoint. Only the tokenizer
# is used (token ids and vocab_size); the classifier built below is an LSTM.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [12]:
# Turn every cleaned text into its sequence of token ids. The [0] takes the
# first (presumably only) row of the tensor that encode() returns per text.
text_data = [
    tokenizer.encode(sample, return_tensors="tf")[0]
    for sample in clean_texts
]

In [13]:
# Number of examples left after dropping the non-string texts.
len(clean_texts), len(clean_labels)

(61121, 61121)

In [14]:
# Bring every token sequence to exactly 20 ids (padding added at the end) so
# the examples form a rectangular array.
# NOTE(review): `truncating` is not set, so the Keras default applies — per the
# pad_sequences docs that is 'pre', i.e. longer sequences lose tokens from the
# start. Confirm this is intended.
padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=text_data,
    maxlen=20,
    padding="post",
)

In [15]:
# One-hot encode the integer class ids (0, 1, 2) into 3-wide vectors, the
# format the categorical cross-entropy loss used at compile time expects.
one_hot_labels = tf.one_hot(clean_labels, depth=3)

In [16]:
# Only the padded inputs need converting to a tensor here; the labels already
# are one (output of tf.one_hot above).
padded_inputs = tf.convert_to_tensor(padded_inputs)

In [17]:
# Sanity check: labels and inputs must have the same number of rows.
one_hot_labels.shape, padded_inputs.shape

(TensorShape([61121, 3]), TensorShape([61121, 20]))

In [18]:
# Sentiment classifier: token embedding -> bidirectional LSTM -> dense hidden
# layer -> 3-unit output turned into class probabilities by a softmax layer.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=tokenizer.vocab_size,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True,
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(3))
model.add(tf.keras.layers.Softmax())

In [20]:
# Smoke test: push the first example through the model as a batch of one and
# print the resulting class probabilities.
first_example = np.array([padded_inputs[0]])
predictions = model.predict(first_example)
print(predictions)

[[0.33303478 0.33381215 0.33315307]]


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the examples as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    padded_inputs.numpy(),
    one_hot_labels.numpy(),
    test_size=0.1,
    random_state=50,
)

In [23]:
# Report the feature and label shapes of both splits.
for split_name, features, targets in (
    ("Train: ", X_train, Y_train),
    ("Test: ", X_test, Y_test),
):
    print(split_name, features.shape, targets.shape)

Train:  (55008, 20) (55008, 3)
Test:  (6113, 20) (6113, 3)


In [29]:
from keras import backend as K


def recall_m(y_true, y_pred):
    """Recall: correctly predicted positives divided by all actual positives.

    Values are clipped to [0, 1] and rounded to 0/1 before counting, and the
    counts are summed over every element of the tensors passed in.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # epsilon keeps the division defined when there are no actual positives
    return hits / (actual_positives + K.epsilon())


def precision_m(y_true, y_pred):
    """Precision: correctly predicted positives divided by all predicted positives.

    Values are clipped to [0, 1] and rounded to 0/1 before counting, so a
    prediction only counts as positive once it rounds up to 1.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # epsilon keeps the division defined when nothing is predicted positive
    return hits / (flagged_positives + K.epsilon())


def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m` on the same tensors."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

In [30]:
# Configure training: categorical cross-entropy on the softmax probabilities
# (hence from_logits=False), Adam with a 1e-4 learning rate, and accuracy plus
# the custom f1_m metric reported alongside the loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy', f1_m],
)

In [32]:
# Train for 10 epochs, setting aside 10% of the training split for per-epoch
# validation. The returned History object is exported at the end of the notebook.
history = model.fit(x=X_train, y=Y_train, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
# Persist the trained model to Drive as an .h5 file.
# NOTE(review): this file is tagged "v2" while the history files written at the
# end of the notebook are tagged "v3" — confirm which version label is intended.
model.save("/content/drive/My Drive/celonis/lstmv2.h5")

In [34]:
# Score the trained model on the held-out test split. Despite the name, this is
# the test-set result (X_test / Y_test), not validation history: the loss
# followed by the compiled metrics ('accuracy', f1_m).
val_history = model.evaluate(x=X_test, y=Y_test)



In [46]:
val_history  # test-set [loss, accuracy, f1_m]

[0.561454176902771, 0.8346147537231445, 0.8395311832427979]

In [49]:
# Bundle the per-epoch metrics with the training parameters for export.
# A new dict is built on purpose: assigning "params" into an alias of
# `history.history` would add that key to the History object's own metrics
# dict, so anything iterating over history.history afterwards would see it.
history_json = {**history.history, "params": history.params}

In [48]:
import json

In [53]:
# `pkl` is used below but not imported in any visible cell; import it here so
# this cell runs on a fresh kernel instead of raising NameError.
import pickle as pkl

# Save the training history (per-epoch metrics plus fit params) as JSON.
with open("/content/drive/My Drive/celonis/history_lstm_v3.json", "w+") as handle:
    json.dump(history_json, handle)

# Save the test-set evaluation result (loss and metrics list) as a pickle.
with open("/content/drive/My Drive/celonis/history_test_lstm_v3.pkl", "wb") as handle:
    pkl.dump(val_history, handle)