In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYA

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import pickle as pkl

In [None]:
# Mount Google Drive at /content/drive so the dataset and output files under it are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the labelled CSV (read without a header row, ISO-8859-1 encoded) into a DataFrame.
# NOTE(review): `data` is reassigned by the pickle-loading cell that follows before it is
# ever read, so on a top-to-bottom run this load has no effect — confirm whether it is still needed.
data = pd.read_csv("/content/drive/My Drive/celonis/labelled_text.csv", encoding='ISO-8859-1', header=None)

In [None]:
# Load the pickled dataset; it is unpacked as a (texts, labels) pair further down.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open("/content/drive/My Drive/celonis/initial_texts.pkl", "rb") as pkl_file:
  data = pkl.load(pkl_file)

In [None]:
# `data` must unpack into exactly two sequences: the raw texts and their labels.
# (Superseded DataFrame variant: list(data[data[2] != "Irrelevant"][3]), list(data[data[2] != "Irrelevant"][2]))
texts_temp, labels_temp = data

In [None]:
# Keep only the (text, label) pairs whose label is not "Irrelevant".
# Labels are looked up by position so they stay aligned with the texts.
kept_pairs = [
    (sample, labels_temp[pos])
    for pos, sample in enumerate(texts_temp)
    if labels_temp[pos] != "Irrelevant"
]
texts = [sample for sample, _ in kept_pairs]
labels = [tag for _, tag in kept_pairs]

# The unfiltered inputs (and the intermediate pairs) are no longer needed.
del kept_pairs
del texts_temp
del labels_temp

In [None]:
# Sanity check: texts and labels should have the same length after filtering.
len(texts), len(labels)

(61121, 61121)

In [None]:
# Encode sentiment labels as class ids: "Neutral" -> 0, "Positive" -> 1,
# and every remaining label (expected to be "Negative") -> 2.
labels_numeric = [
    0 if label == "Neutral" else (1 if label == "Positive" else 2)
    for label in labels
]

In [None]:
# Sanity check: there should be one numeric label per text.
len(texts), len(labels_numeric)

(61121, 61121)

In [None]:
# Drop entries whose text is not a string (e.g. NaN placeholders), keeping
# each surviving text paired with the numeric label at the same position.
valid_pairs = [
    (sample, labels_numeric[pos])
    for pos, sample in enumerate(texts)
    if isinstance(sample, str)
]
clean_texts = [sample for sample, _ in valid_pairs]
clean_labels = [class_id for _, class_id in valid_pairs]

In [None]:
# Load the pretrained "bert-base-uncased" tokenizer; it is used below to encode
# text into token ids and to size the embedding layer (tokenizer.vocab_size).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# Encode every cleaned text into a tensor of token ids.
# [0] keeps the first entry of what encode() returns (presumably dropping a
# leading batch axis of size one — confirm against the tokenizer docs).
text_data = [
    tokenizer.encode(sample, return_tensors="tf")[0]
    for sample in clean_texts
]

In [None]:
# Sanity check: cleaned texts and labels are still aligned in length.
len(clean_texts), len(clean_labels)

(61121, 61121)

In [None]:
# Fixed sequence length (number of token ids) every encoded text is padded/cut to.
maxlen = 50

In [None]:
# Bring every encoded text to a common length of `maxlen`, padding at the end ("post").
# NOTE(review): `truncating` is not passed, so the library default decides which end of
# longer sequences is cut — confirm that is the intended side.
padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    text_data, padding="post", maxlen=maxlen
)

In [None]:
# One-hot encode the integer class ids into 3 columns (one per class).
one_hot_labels = tf.one_hot(clean_labels, depth=3)

In [None]:
# (Unused) integer-label alternative: labels_data = tf.convert_to_tensor(clean_labels)
# Convert the padded id matrix into a TensorFlow tensor (rebinding the same name).
padded_inputs = tf.convert_to_tensor(padded_inputs)

In [None]:
# Confirm the label and input tensors agree on their first dimension.
one_hot_labels.shape, padded_inputs.shape

(TensorShape([61121, 3]), TensorShape([61121, 50]))

In [None]:
# Token-id classifier: embedding -> bidirectional LSTM -> dense -> 3-way softmax.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=tokenizer.vocab_size,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True,
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Three output units, one per class; Softmax turns them into class probabilities.
model.add(tf.keras.layers.Dense(3))
model.add(tf.keras.layers.Softmax())

In [None]:
# Smoke test: run the still-untrained model on the first padded sample,
# wrapped in an outer list so it is passed as a batch of one.
first_batch = np.array([padded_inputs[0]])
predictions = model.predict(first_batch)
print(predictions)

[[0.33719933 0.33087757 0.33192313]]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples as a test set; the fixed random_state keeps
# the split the same across runs.
inputs_np = padded_inputs.numpy()
targets_np = one_hot_labels.numpy()
X_train, X_test, Y_train, Y_test = train_test_split(
    inputs_np,
    targets_np,
    test_size=0.1,
    random_state=50,
)

In [None]:
# Report the input/target shapes of each split.
for split_label, split_x, split_y in (
    ("Train: ", X_train, Y_train),
    ("Test: ", X_test, Y_test),
):
    print(split_label, split_x.shape, split_y.shape)

Train:  (55008, 50) (55008, 3)
Test:  (6113, 50) (6113, 3)


In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall over the tensors passed in.

    Counts rounded true positives (y_true * y_pred, clipped to [0, 1]) and
    divides by the rounded count of actual positives in y_true. K.epsilon()
    is added to the denominator so an input without positives does not
    divide by zero.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the tensors passed in.

    Counts rounded true positives (y_true * y_pred, clipped to [0, 1]) and
    divides by the rounded count of predicted positives in y_pred. K.epsilon()
    is added to the denominator so an input without predicted positives does
    not divide by zero.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hit_count / (predicted_count + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m.

    K.epsilon() in the denominator keeps the result defined when both
    precision and recall are zero.
    """
    prec = precision_m(y_true, y_pred)
    rec = recall_m(y_true, y_pred)
    return 2 * ((prec * rec) / (prec + rec + K.epsilon()))

In [None]:
# Configure training: Adam at a 1e-4 learning rate, categorical cross-entropy on
# probabilities (from_logits=False, since the model ends in a Softmax layer),
# tracking accuracy and the custom f1_m metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy', f1_m],
)

In [None]:
# Train for 10 epochs; validation_split=0.1 sets aside a tenth of the training data for validation.
history = model.fit(x=X_train, y=Y_train, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Persist the trained model to an .h5 file on Drive; it is reloaded from this path further down.
model.save("/content/drive/My Drive/celonis/lstmv4.h5")

In [None]:
# Evaluate on the held-out test split.
# NOTE: despite its name, val_history holds test-set results, not the validation history from fit().
val_history = model.evaluate(x=X_test, y=Y_test)



In [None]:
# Test-set results: three values — the loss followed by the compiled metrics (accuracy, f1_m).
val_history

[0.5136594176292419, 0.8532635569572449, 0.8548662066459656]

In [None]:
# Build a JSON-ready record of the training run: the per-epoch metrics plus the fit parameters.
# A new dict is created so that adding "params" does not mutate `history.history` itself;
# assigning into an alias would leave a stray non-metric "params" key on the History object.
history_json = {**history.history, "params": history.params}

In [None]:
import json

In [None]:
# Write the training history (per-epoch metrics and fit params) to Drive as JSON.
with open("/content/drive/My Drive/celonis/history_lstm_v4.json", "w+") as json_out:
  json.dump(history_json, json_out)

# Write the test-set evaluation results to Drive as a pickle.
with open("/content/drive/My Drive/celonis/history_test_lstm_v4.pkl", "wb") as pkl_out:
  pkl.dump(val_history, pkl_out)

In [None]:
# Example sentence for a quick manual check of the trained model.
text = "I love this movie!"

In [None]:
# Encode the example into token ids (plain encode() call; the padding step used for the training data is not applied here).
tokenized = tokenizer.encode(text)

In [None]:
# Reload the saved model from Drive (rebinding `model`); the custom f1_m metric is passed
# through custom_objects so the loader can resolve it by name.
model = tf.keras.models.load_model("/content/drive/My Drive/celonis/lstmv4.h5", custom_objects={"f1_m": f1_m})

In [None]:
# Wrap the token ids in an explicit NumPy batch of one (shape (1, seq_len)), the same way
# the model was called for the smoke test earlier in this notebook. A nested Python list
# relies on Keras guessing whether the outer list means "batch" or "list of model inputs".
out = model.predict(np.array([tokenized]))

In [None]:
# Class scores for the example, in label-id order (0 = Neutral, 1 = Positive, 2 = Negative).
out

array([[0.03423184, 0.9453402 , 0.02042788]], dtype=float32)