In this notebook we classify fake news with a transformer model.<br>
<br>
We have already encoded the fake news train & test datasets in the **Fake news encoding** notebook,<br>
here we will:
- Add positional encodings to the data.
- Initialize a transformer model.
- Train the model on the train data.
- Evaluate the model's predictions on the test data.

We show several versions of transformer models and dataset encodings; in each version we try a different approach to encoding and model building.


A guide to Transformer models that we used:<br>
https://keras.io/examples/nlp/text_classification_with_transformer/

# Learning with Bag-of-words encodings

In this section, we use the corpus BOW encodings we created in the **Fake news encoding** notebook as data for our model.<br>
<br>
The corpus is encoded as follows: each document is converted to an array of numbers, where each number represents the matching word in the document. There are **56668** words altogether.<br>
<br>
We will build, fit and evaluate three different models:
1. Transformer model that **multiclass-classifies** **6** categories.
2. Transformer model that **binary-classifies**, fitted and tested only on documents that are labeled as **'true'**, **'false'**.
3. Transformer model that **binary-classifies**, fitted and tested on all documents but the labels will be merged to two categories - **'true'**, **'false'**.

## Model 1 - Multiclass classification

In [None]:
import pandas as pd
import numpy as np
import io
import time
from tqdm import tqdm
from math import sin, cos

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Get datasets

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Reading every split and concatenating once gives the same frame (the index
# is not reset, exactly as with the append loop) without re-copying the
# accumulated rows on each iteration.

#train
num_splits = 7
train = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/liar_train_{i}.csv')
    for i in range(num_splits)
])

#train encoded
num_splits = 4
train_encoded = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/train_encoded_v3_{i}.csv')
    for i in range(num_splits)
])

#test
# NOTE(review): this URL points at the 'liar-liar-dataset-train' repository while
# every other file comes from 'liar-liar-dataset' - confirm this is intended.
test = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset-train/main/liar_test.csv')

#test encoded
test_encoded = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/test_encoded_v3.csv')

In [None]:
import ast

# Document encodings are saved in the CSV as strings holding a list of numbers.
# ast.literal_eval only parses Python literals, so - unlike eval - the text
# downloaded from the network cannot execute arbitrary code.
train_encoded_lst = [ast.literal_eval(doc) for doc in train_encoded['full_text_encoded']]
test_encoded_lst = [ast.literal_eval(doc) for doc in test_encoded['full_text_encoded']]

### Get train and test labels One-hot encodings

In [None]:
# One-hot encode the labels as floats.
train_labels_oh = pd.get_dummies(train['label-liar'], dtype='float32')

# get_dummies derives its columns from the labels present in the frame it is
# given, so encoding the splits independently could shift the class indices
# if a label were missing from one split. Reusing the train columns keeps
# column k tied to the same label in both splits.
test_labels_oh = pd.get_dummies(test['label-liar'], dtype='float32').reindex(
    columns=train_labels_oh.columns, fill_value=0.0
)

train_labels = np.asarray(train_labels_oh).tolist()
test_labels = np.asarray(test_labels_oh).tolist()

### Truncate/pad sequences to a *max sequence length*
First we need to determine the value of *max sequence length*.<br>
Find minimum, maximum and average sequence length.

In [None]:
# Sequence-length statistics over the training documents.
seq_lens = [len(doc) for doc in train_encoded_lst]

min_len = min(seq_lens)
max_len = max(seq_lens)
# Arithmetic mean. The earlier running update avg = (avg + len) / 2 halves the
# weight of all previous documents at every step, so it reported a value
# dominated by the last few documents rather than the average.
avg_len = sum(seq_lens) / len(seq_lens)

print('min sequence length =', min_len)
print('max sequence length =', max_len)
print('avrage sequence length =', avg_len)

min sequence length = 33
max sequence length = 1616
avrage sequence length = 466.2858263541781


Decision: Our model will use a *max sequence length* of 500

In [None]:
def pad_list(lst, value=0, size=0):
  """Append `value` to the end of `lst` `size` times, in place.

  Nothing is appended when `size` is zero or negative. The same `value`
  object is appended every time (no copies are made). Returns None.
  """
  lst.extend([value] * size)

In [None]:
maxlen = 500 # max sequence length

# Slicing clips long documents to maxlen and the zero tail fills short ones.
# A negative repeat count yields an empty list, so clipped documents get no padding.
train_docs = [
    doc[:maxlen] + [0] * (maxlen - len(doc))
    for doc in tqdm(train_encoded_lst, position=0, leave=True)
]

test_docs = [
    doc[:maxlen] + [0] * (maxlen - len(doc))
    for doc in tqdm(test_encoded_lst, position=0, leave=True)
]

100%|██████████| 15052/15052 [00:00<00:00, 29457.99it/s]
100%|██████████| 1266/1266 [00:00<00:00, 44658.91it/s]


In [None]:
# Sanity check: collect the distinct document lengths of both splits.
# After truncating/padding the printed set should hold a single value.
doc_lens = {len(doc) for docs in (train_docs, test_docs) for doc in docs}

print(doc_lens)

{500}


### Transformer blocks and positional embeddings

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Runs multi-head self-attention followed by a two-layer feed-forward
    network. Each branch goes through dropout, is added back to its input
    (residual connection) and is layer-normalized.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """Create the sub-layers.

        embed_dim: key_dim of the attention layer and output size of the
            feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden units of the feed-forward network.
        rate: dropout rate used after both branches.
        """
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        hidden = layers.Dense(ff_dim, activation="relu")
        projection = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden, projection])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block to `inputs`; `training` switches the dropout layers."""
        # Attention branch: the block input is passed as both attention arguments.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        normed = self.layernorm1(inputs + attended)
        # Feed-forward branch on the normalized attention output.
        transformed = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + transformed)

In [None]:
def positional_embeddings(maxlen):
  """Return the list [sin(0), sin(1), ..., sin(maxlen - 1)]."""
  return [sin(pos) for pos in range(maxlen)]

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        """Create the lookup tables.

        maxlen: number of positions the position table holds.
        vocab_size: number of rows of the token table.
        embed_dim: size of every embedding vector.
        """
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x, maxlen=None):
        """Embed the token ids in `x` and add the position embeddings.

        maxlen: number of positions to embed; defaults to the last dimension of `x`.
        """
        if maxlen is None:
            maxlen = tf.shape(x)[-1]
        # An Embedding layer is a lookup table, so it must be indexed with the
        # integer positions 0..maxlen-1. The sin(pos) floats used before all lie
        # inside (-1, 1); used as indices they collapse onto a single row, which
        # gives every position the same vector and removes all order information.
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

### Build model and fit

In [None]:
# Hyper-parameters of the multiclass transformer classifier.
# NOTE(review): if token ids run up to vocab_size (e.g. 0 is reserved for
# padding), the token Embedding would need input_dim = vocab_size + 1 - confirm
# against the encoding notebook.
vocab_size = 56668 # taken from 'Fake news embedding' notebook
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Functional model: token ids -> token + position embeddings -> one
# transformer block -> average over the sequence axis -> small dense head.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs, maxlen)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# One softmax unit per label (6 classes).
outputs = layers.Dense(6, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Turn the padded documents and their one-hot labels into tensors.
x_train, y_train = (tf.convert_to_tensor(data) for data in (train_docs, train_labels))

In [None]:
# Adam optimizer with categorical cross-entropy; accuracy and recall are reported.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy", tf.keras.metrics.Recall()],
)
# 20% of the training tensors are held out for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.2,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


### Evaluate model on test

In [None]:
# Turn the padded test documents and their one-hot labels into tensors.
x_test, y_test = (tf.convert_to_tensor(data) for data in (test_docs, test_labels))

In [None]:
# Loss and compiled metrics of the trained model on the test tensors.
results = model.evaluate(x=x_test, y=y_test, batch_size=32)



## Model 2 - Binary classification, only 'true', 'false'

In [None]:
import pandas as pd
import numpy as np
import io
import time
from tqdm import tqdm
from math import sin, cos

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Get datasets

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Reading every split and concatenating once gives the same frame (the index
# is not reset, exactly as with the append loop) without re-copying the
# accumulated rows on each iteration.

#train
num_splits = 7
train = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/liar_train_{i}.csv')
    for i in range(num_splits)
])

#train encoded
num_splits = 4
train_encoded = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/train_encoded_v3_{i}.csv')
    for i in range(num_splits)
])

#test
# NOTE(review): this URL points at the 'liar-liar-dataset-train' repository while
# every other file comes from 'liar-liar-dataset' - confirm this is intended.
test = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset-train/main/liar_test.csv')

#test encoded
test_encoded = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/test_encoded_v3.csv')

### Filter documents and labels where labels are binary ('true' or 'false')

In [None]:
# Plain Python lists of the label strings and of the (still string-encoded)
# documents, so rows can be filtered by position.
train_labels_lst = list(train['label-liar'])
train_encoded_lst = list(train_encoded['full_text_encoded'])

test_labels_lst = list(test['label-liar'])
test_encoded_lst = list(test_encoded['full_text_encoded'])

In [None]:
def is_binary_train_index(i):
  """Return True when the i-th train label is exactly 'true' or 'false'."""
  return train_labels_lst[i] in ('true', 'false')


def is_binary_test_index(i):
  """Return True when the i-th test label is exactly 'true' or 'false'."""
  return test_labels_lst[i] in ('true', 'false')

In [None]:
import ast

train_labels_binary = []
train_encoded_binary = []

test_labels_binary = []
test_encoded_binary = []

# Keep only the documents labeled exactly 'true' or 'false'.
# Document encodings are saved as strings holding a list of numbers;
# ast.literal_eval only parses Python literals, so - unlike eval - the text
# downloaded from the network cannot execute arbitrary code.
for i, label in enumerate(train_labels_lst):
  if label in ('true', 'false'):
    train_labels_binary.append(label)
    train_encoded_binary.append(ast.literal_eval(train_encoded_lst[i]))

for i, label in enumerate(test_labels_lst):
  if label in ('true', 'false'):
    test_labels_binary.append(label)
    test_encoded_binary.append(ast.literal_eval(test_encoded_lst[i]))

### Get train and test labels One-hot encodings

In [None]:
train_labels_binary_df = pd.DataFrame({'label-liar': train_labels_binary})
test_labels_binary_df = pd.DataFrame({'label-liar': test_labels_binary})

# One-hot encode the labels as floats.
train_labels_oh = pd.get_dummies(train_labels_binary_df['label-liar'], dtype='float32')

# get_dummies derives its columns from the labels present in the frame it is
# given, so encoding the splits independently could shift the class indices
# if a label were missing from one split. Reusing the train columns keeps
# column k tied to the same label in both splits.
test_labels_oh = pd.get_dummies(test_labels_binary_df['label-liar'], dtype='float32').reindex(
    columns=train_labels_oh.columns, fill_value=0.0
)

train_labels = np.asarray(train_labels_oh).tolist()
test_labels = np.asarray(test_labels_oh).tolist()

### Truncate/pad sequences to a max sequence length
First we need to determine the value of max sequence length.<br>
Find minimum, maximum and average sequence length

In [None]:
# Sequence-length statistics over the filtered training documents.
seq_lens = [len(doc) for doc in train_encoded_binary]

min_len = min(seq_lens)
max_len = max(seq_lens)
# Arithmetic mean. The earlier running update avg = (avg + len) / 2 halves the
# weight of all previous documents at every step, so it reported a value
# dominated by the last few documents rather than the average.
avg_len = sum(seq_lens) / len(seq_lens)

print('min sequence length =', min_len)
print('max sequence length =', max_len)
print('avrage sequence length =', avg_len)

min sequence length = 38
max sequence length = 1616
avrage sequence length = 393.13873830976917


Decision: Our model will use a *max sequence length* of 500

In [None]:
def pad_list(lst, value=0, size=0):
  """Append `value` to the end of `lst` `size` times, in place.

  Nothing is appended when `size` is zero or negative. The same `value`
  object is appended every time (no copies are made). Returns None.
  """
  lst.extend([value] * size)

In [None]:
maxlen = 500 # max sequence length

# Slicing clips long documents to maxlen and the zero tail fills short ones.
# A negative repeat count yields an empty list, so clipped documents get no padding.
train_docs = [
    doc[:maxlen] + [0] * (maxlen - len(doc))
    for doc in tqdm(train_encoded_binary, position=0, leave=True)
]

test_docs = [
    doc[:maxlen] + [0] * (maxlen - len(doc))
    for doc in tqdm(test_encoded_binary, position=0, leave=True)
]

100%|██████████| 5330/5330 [00:00<00:00, 36780.76it/s]
100%|██████████| 457/457 [00:00<00:00, 52527.94it/s]


In [None]:
# Check that all sequence lengths are the same in BOTH splits.
# (The second loop used to iterate train_docs again, so the test documents
# were never checked.)
doc_lens = {len(doc) for docs in (train_docs, test_docs) for doc in docs}

print(doc_lens)

{500}


### Transformer blocks and positional embeddings

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Runs multi-head self-attention followed by a two-layer feed-forward
    network. Each branch goes through dropout, is added back to its input
    (residual connection) and is layer-normalized.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """Create the sub-layers.

        embed_dim: key_dim of the attention layer and output size of the
            feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden units of the feed-forward network.
        rate: dropout rate used after both branches.
        """
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        hidden = layers.Dense(ff_dim, activation="relu")
        projection = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden, projection])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block to `inputs`; `training` switches the dropout layers."""
        # Attention branch: the block input is passed as both attention arguments.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        normed = self.layernorm1(inputs + attended)
        # Feed-forward branch on the normalized attention output.
        transformed = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + transformed)

In [None]:
def positional_embeddings(maxlen):
  """Return the list [sin(0), sin(1), ..., sin(maxlen - 1)]."""
  return [sin(pos) for pos in range(maxlen)]

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        """Create the lookup tables.

        maxlen: number of positions the position table holds.
        vocab_size: number of rows of the token table.
        embed_dim: size of every embedding vector.
        """
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x, maxlen=None):
        """Embed the token ids in `x` and add the position embeddings.

        maxlen: number of positions to embed; defaults to the last dimension of `x`.
        """
        if maxlen is None:
            maxlen = tf.shape(x)[-1]
        # An Embedding layer is a lookup table, so it must be indexed with the
        # integer positions 0..maxlen-1. The sin(pos) floats used before all lie
        # inside (-1, 1); used as indices they collapse onto a single row, which
        # gives every position the same vector and removes all order information.
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

### Build model and fit

In [None]:
# Hyper-parameters of the binary transformer classifier.
# NOTE(review): if token ids run up to vocab_size (e.g. 0 is reserved for
# padding), the token Embedding would need input_dim = vocab_size + 1 - confirm
# against the encoding notebook.
vocab_size = 56668 # taken from 'Fake news embedding' notebook
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Functional model: token ids -> token + position embeddings -> one
# transformer block -> average over the sequence axis -> small dense head.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs, maxlen)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two softmax units, matching the two-column one-hot labels.
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Turn the padded documents and their one-hot labels into tensors.
x_train, y_train = (tf.convert_to_tensor(data) for data in (train_docs, train_labels))

In [None]:
# Adam optimizer with binary cross-entropy; accuracy and recall are reported.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy", tf.keras.metrics.Recall()],
)
# 20% of the training tensors are held out for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=4,
    validation_split=0.2,
    shuffle=True,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


### Evaluate model on test

In [None]:
# Turn the padded test documents and their one-hot labels into tensors.
x_test, y_test = (tf.convert_to_tensor(data) for data in (test_docs, test_labels))

In [None]:
# Loss and compiled metrics of the trained model on the test tensors.
results = model.evaluate(x=x_test, y=y_test, batch_size=32)



## Model 3 - Binary classification, merged classes

In [None]:
import pandas as pd
import numpy as np
import io
import time
from tqdm import tqdm
from math import sin, cos

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Get datasets

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Reading every split and concatenating once gives the same frame (the index
# is not reset, exactly as with the append loop) without re-copying the
# accumulated rows on each iteration.

#train
num_splits = 7
train = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/liar_train_{i}.csv')
    for i in range(num_splits)
])

#train encoded
num_splits = 4
train_encoded = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/train_encoded_v3_{i}.csv')
    for i in range(num_splits)
])

#test
# NOTE(review): this URL points at the 'liar-liar-dataset-train' repository while
# every other file comes from 'liar-liar-dataset' - confirm this is intended.
test = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset-train/main/liar_test.csv')

#test encoded
test_encoded = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/test_encoded_v3.csv')

In [None]:
import ast

# Document encodings are saved in the CSV as strings holding a list of numbers.
# ast.literal_eval only parses Python literals, so - unlike eval - the text
# downloaded from the network cannot execute arbitrary code.
train_encoded_lst = [ast.literal_eval(doc) for doc in train_encoded['full_text_encoded']]
test_encoded_lst = [ast.literal_eval(doc) for doc in test_encoded['full_text_encoded']]

### Merge labels
- ('true', 'mostly-true', 'half-true') => 'true'
- ('false', 'pants-fire', 'barely-true') => 'false'

In [None]:
# Distinct label values present in the training set, in order of appearance.
pd.unique(train['label-liar'])

array(['barely-true', 'pants-fire', 'half-true', 'mostly-true', 'true',
       'false'], dtype=object)

In [None]:
def merge_labels(label):
  """Collapse a label into one of the two classes 'true' / 'false'.

  'true', 'mostly-true' and 'half-true' map to 'true'; every other value
  maps to 'false'.
  """
  if label in ('true', 'mostly-true', 'half-true'):
    return 'true'
  return 'false'

In [None]:
train_labels_lst = train['label-liar'].tolist()
test_labels_lst = test['label-liar'].tolist()

# Map every original label onto its merged 'true' / 'false' class.
train_labels_merged = [merge_labels(label) for label in train_labels_lst]
test_labels_merged = [merge_labels(label) for label in test_labels_lst]

### Get train and test labels One-hot encodings

In [None]:
train_labels_merged_df = pd.DataFrame({'label-liar': train_labels_merged})
test_labels_merged_df = pd.DataFrame({'label-liar': test_labels_merged})

# One-hot encode the labels as floats.
train_labels_oh = pd.get_dummies(train_labels_merged_df['label-liar'], dtype='float32')

# get_dummies derives its columns from the labels present in the frame it is
# given, so encoding the splits independently could shift the class indices
# if a label were missing from one split. Reusing the train columns keeps
# column k tied to the same label in both splits.
test_labels_oh = pd.get_dummies(test_labels_merged_df['label-liar'], dtype='float32').reindex(
    columns=train_labels_oh.columns, fill_value=0.0
)

train_labels = np.asarray(train_labels_oh).tolist()
test_labels = np.asarray(test_labels_oh).tolist()

### Truncate/pad sequences to a max sequence length
First we need to determine the value of max sequence length.<br>
Find minimum, maximum and average sequence length

In [None]:
# Sequence-length statistics over the training documents.
seq_lens = [len(doc) for doc in train_encoded_lst]

min_len = min(seq_lens)
max_len = max(seq_lens)
# Arithmetic mean. The earlier running update avg = (avg + len) / 2 halves the
# weight of all previous documents at every step, so it reported a value
# dominated by the last few documents rather than the average.
avg_len = sum(seq_lens) / len(seq_lens)

print('min sequence length =', min_len)
print('max sequence length =', max_len)
print('avrage sequence length =', avg_len)

min sequence length = 33
max sequence length = 1616
avrage sequence length = 466.2858263541781


Decision: Our model will use a *max sequence length* of 500

In [None]:
def pad_list(lst, value=0, size=0):
  """Append `value` to the end of `lst` `size` times, in place.

  Nothing is appended when `size` is zero or negative. The same `value`
  object is appended every time (no copies are made). Returns None.
  """
  lst.extend([value] * size)

In [None]:
maxlen = 500 # max sequence length

# Slicing clips long documents to maxlen and the zero tail fills short ones.
# A negative repeat count yields an empty list, so clipped documents get no padding.
train_docs = [doc[:maxlen] + [0] * (maxlen - len(doc)) for doc in train_encoded_lst]

test_docs = [doc[:maxlen] + [0] * (maxlen - len(doc)) for doc in test_encoded_lst]

In [None]:
# Sanity check: collect the distinct document lengths of both splits.
# After truncating/padding the printed set should hold a single value.
doc_lens = {len(doc) for docs in (train_docs, test_docs) for doc in docs}

print(doc_lens)

{500}


### Transformer blocks and positional embeddings

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Runs multi-head self-attention followed by a two-layer feed-forward
    network. Each branch goes through dropout, is added back to its input
    (residual connection) and is layer-normalized.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """Create the sub-layers.

        embed_dim: key_dim of the attention layer and output size of the
            feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden units of the feed-forward network.
        rate: dropout rate used after both branches.
        """
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        hidden = layers.Dense(ff_dim, activation="relu")
        projection = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden, projection])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block to `inputs`; `training` switches the dropout layers."""
        # Attention branch: the block input is passed as both attention arguments.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        normed = self.layernorm1(inputs + attended)
        # Feed-forward branch on the normalized attention output.
        transformed = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + transformed)

In [None]:
def positional_embeddings(maxlen):
  """Return the list [sin(0), sin(1), ..., sin(maxlen - 1)]."""
  return [sin(pos) for pos in range(maxlen)]

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        """Create the lookup tables.

        maxlen: number of positions the position table holds.
        vocab_size: number of rows of the token table.
        embed_dim: size of every embedding vector.
        """
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x, maxlen=None):
        """Embed the token ids in `x` and add the position embeddings.

        maxlen: number of positions to embed; defaults to the last dimension of `x`.
        """
        if maxlen is None:
            maxlen = tf.shape(x)[-1]
        # An Embedding layer is a lookup table, so it must be indexed with the
        # integer positions 0..maxlen-1. The sin(pos) floats used before all lie
        # inside (-1, 1); used as indices they collapse onto a single row, which
        # gives every position the same vector and removes all order information.
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

### Build model and fit

In [None]:
# Hyper-parameters of the binary (merged labels) transformer classifier.
# NOTE(review): if token ids run up to vocab_size (e.g. 0 is reserved for
# padding), the token Embedding would need input_dim = vocab_size + 1 - confirm
# against the encoding notebook.
vocab_size = 56668 # taken from 'Fake news embedding' notebook
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Functional model: token ids -> token + position embeddings -> one
# transformer block -> average over the sequence axis -> small dense head.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs, maxlen)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two softmax units, matching the two-column one-hot labels.
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Turn the padded documents and their one-hot labels into tensors.
x_train, y_train = (tf.convert_to_tensor(data) for data in (train_docs, train_labels))

In [None]:
# Adam optimizer with binary cross-entropy; accuracy and recall are reported.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy", tf.keras.metrics.Recall()],
)
# 20% of the training tensors are held out for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=1,
    validation_split=0.2,
    shuffle=True,
)



### Evaluate model on test

In [None]:
# Turn the padded test documents and their one-hot labels into tensors.
x_test, y_test = (tf.convert_to_tensor(data) for data in (test_docs, test_labels))

In [None]:
# Loss and compiled metrics of the trained model on the test tensors.
results = model.evaluate(x=x_test, y=y_test, batch_size=32)



# Learning with Bert's contextualized embeddings

In this section, we use the fake news embeddings we extracted from Bert as data for our model.<br>
We extracted two different embeddings:<br>
**The first**, embeddings Bert learned with *max sequence length* of 128.<br>
**The second**, embeddings Bert learned with *max sequence length* of 512 (the largest *max sequence length* Bert supports).<br>
<br>
We will build, fit and evaluate three different models:
1. Transformer model that **multiclass-classifies** to **6** categories, fitted on **the first** embeddings.
2. Transformer model that **multiclass-classifies** to **6** categories, fitted on **the second** embeddings.
3. Transformer model that **binary-classifies**, fitted on **the second** embeddings.

## Model 1 - Multiclass-classification, *max sequence length* = 128

In [None]:
import pandas as pd
import numpy as np
import io
import time
from tqdm import tqdm
from math import sin, cos

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Get datasets

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Reading every split and concatenating once gives the same frame (the index
# is not reset, exactly as with the append loop) without re-copying the
# accumulated rows on each iteration.

#train
num_splits = 7
train = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/liar_train_{i}.csv')
    for i in range(num_splits)
])

#train embedded
num_splits = 4
train_embedded = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/train_embedded_{i}.csv')
    for i in range(num_splits)
])

#test
# NOTE(review): this URL points at the 'liar-liar-dataset-train' repository while
# every other file comes from 'liar-liar-dataset' - confirm this is intended.
test = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset-train/main/liar_test.csv')

#test embedded
test_embedded = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/test_embedded.csv')

In [None]:
# Preview the first three rows of the embedded training data.
train_embedded.iloc[:3]

Unnamed: 0,full_text_embedded
0,"[[-0.39664504, -3.201748, -1.6849961, 0.911999..."
1,"[[-0.112607814, -2.6428027, -1.084317, 1.01488..."
2,"[[1.1767617, -1.7852252, -2.0916524, 0.0093422..."


In [None]:
import ast

# Document embeddings are saved in the CSV as strings holding a list of word
# embedding vectors. ast.literal_eval only parses Python literals, so - unlike
# eval - the text downloaded from the network cannot execute arbitrary code.
train_embedded_lst = [ast.literal_eval(doc) for doc in train_embedded['full_text_embedded']]
test_embedded_lst = [ast.literal_eval(doc) for doc in test_embedded['full_text_embedded']]

### Get train and test labels One-hot encodings

In [None]:
# One-hot encode the labels as floats.
train_labels_oh = pd.get_dummies(train['label-liar'], dtype='float32')

# get_dummies derives its columns from the labels present in the frame it is
# given, so encoding the splits independently could shift the class indices
# if a label were missing from one split. Reusing the train columns keeps
# column k tied to the same label in both splits.
test_labels_oh = pd.get_dummies(test['label-liar'], dtype='float32').reindex(
    columns=train_labels_oh.columns, fill_value=0.0
)

train_labels = np.asarray(train_labels_oh).tolist()
test_labels = np.asarray(test_labels_oh).tolist()

### Truncate/pad sequences to a *max sequence length*

First we need to determine the value of max sequence length.<br>
Find minimum, maximum and average sequence length.

In [None]:
# Sequence-length statistics over the embedded training documents.
seq_lens = [len(doc) for doc in train_embedded_lst]

min_len = min(seq_lens)
max_len = max(seq_lens)
# Arithmetic mean. The earlier running update avg = (avg + len) / 2 halves the
# weight of all previous documents at every step, so it reported a value
# dominated by the last few documents rather than the average.
avg_len = sum(seq_lens) / len(seq_lens)

print('min sequence length =', min_len)
print('max sequence length =', max_len)
print('avrage sequence length =', avg_len)

min sequence length = 49
max sequence length = 109
avrage sequence length = 81.47448309689022


Decision: Our model will use a *max sequence length* of 80.

In [None]:
def pad_list(lst, value=0, size=0):
  """Append `value` to the end of `lst` `size` times, in place.

  Nothing is appended when `size` is zero or negative. The same `value`
  object is appended every time (no copies are made). Returns None.
  """
  lst.extend([value] * size)

In [None]:
maxlen = 80 # max sequence length

# Truncate every document to maxlen word vectors and pad short ones with
# all-zero vectors of width 5 (the width of each word vector).
#
# Every padding row must be its own list: the positional encodings are added
# to the rows in place, and padding with one shared [0]*5 object made all pad
# positions of a document alias a single row that accumulated the encodings
# of every padded position. Word rows are copied as well, so the in-place
# update does not modify the parsed source embeddings.
train_docs = [
    [list(word) for word in doc[:maxlen]]
    + [[0.0] * 5 for _ in range(maxlen - len(doc))]
    for doc in train_embedded_lst
]

test_docs = [
    [list(word) for word in doc[:maxlen]]
    + [[0.0] * 5 for _ in range(maxlen - len(doc))]
    for doc in test_embedded_lst
]

In [None]:
# Sanity check: collect the distinct document lengths of both splits.
# After truncating/padding the printed set should hold a single value.
doc_lens = {len(doc) for docs in (train_docs, test_docs) for doc in docs}

print(doc_lens)

{80}


### Add positional embeddings

In [None]:
def positional_embeddings(vector, dims):
  """Add sinusoidal positional encodings to `vector` in place.

  `vector` is indexed as vector[pos][i] and `dims` is the pair
  (number of positions, vector width). Even columns i receive
  sin(pos / 10000**(i / width)); odd columns receive
  cos(pos / 10000**((i - 1) / width)). Returns None.
  """
  num_positions, width = dims
  for pos in range(num_positions):
    for i in range(width):
      # Each odd column shares its frequency with the even column before it.
      wave = cos if i % 2 else sin
      vector[pos][i] += wave(pos / 10000 ** ((i - i % 2) / width))

In [None]:
word_size = 5 # each word is represented as a vector with dim = word_size

# Add the positional encodings in place to every document of both splits.
for vector in train_docs + test_docs:
  positional_embeddings(vector, (maxlen, word_size))

### Transformer block

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Runs multi-head self-attention followed by a two-layer feed-forward
    network. Each branch goes through dropout, is added back to its input
    (residual connection) and is layer-normalized.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """Create the sub-layers.

        embed_dim: key_dim of the attention layer and output size of the
            feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden units of the feed-forward network.
        rate: dropout rate used after both branches.
        """
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        hidden = layers.Dense(ff_dim, activation="relu")
        projection = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden, projection])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block to `inputs`; `training` switches the dropout layers."""
        # Attention branch: the block input is passed as both attention arguments.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        normed = self.layernorm1(inputs + attended)
        # Feed-forward branch on the normalized attention output.
        transformed = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + transformed)

### Build model and fit

In [None]:
# Hyper-parameters of the multiclass transformer classifier on Bert embeddings.
embed_dim = 32  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Functional model: maxlen word vectors of width 5 -> dense projection to
# embed_dim -> one transformer block -> average over the sequence axis ->
# small dense head.
inputs = layers.Input(shape=(maxlen,5))
embedding_layer = layers.Dense(embed_dim) # (not really an embedding layer, a regular fully-connected layer)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# One softmax unit per label (6 classes).
outputs = layers.Dense(6, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Turn the padded document embeddings and their one-hot labels into tensors.
x_train, y_train = (tf.convert_to_tensor(data) for data in (train_docs, train_labels))

In [None]:
# Adam optimizer with categorical cross-entropy; accuracy and recall are reported.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy", tf.keras.metrics.Recall()],
)
# 20% of the training tensors are held out for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.2,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


### Evaluate model on test

In [None]:
# Turn the padded test embeddings and their one-hot labels into tensors.
x_test, y_test = (tf.convert_to_tensor(data) for data in (test_docs, test_labels))

In [None]:
# Loss and compiled metrics of the trained model on the test tensors.
results = model.evaluate(x=x_test, y=y_test, batch_size=32)



## Model 2 - Multiclass-classification, *max sequence length* = 512

In [None]:
import pandas as pd
import numpy as np
import io
import time
from tqdm import tqdm
from math import sin, cos

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Get datasets

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Reading every split and concatenating once gives the same frame (the index
# is not reset, exactly as with the append loop) without re-copying the
# accumulated rows on each iteration.

#train
num_splits = 7
train = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/liar_train_{i}.csv')
    for i in range(num_splits)
])

#train embedded
num_splits = 12
train_embedded = pd.concat([
    pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/train_embedded_v2_{i}.csv')
    for i in range(num_splits)
])

#test
# NOTE(review): this URL points at the 'liar-liar-dataset-train' repository while
# every other file comes from 'liar-liar-dataset' - confirm this is intended.
test = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset-train/main/liar_test.csv')

#test embedded
# NOTE(review): the train split uses the 'v2' embedding files, but the test
# split loads the same 'test_embedded.csv' as the 128-length model - confirm
# whether a v2 test file should be used here.
test_embedded = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/test_embedded.csv')

In [None]:
# Preview the first three rows; each embedding is stored as the string form of a nested list.
train_embedded.head(3)

Unnamed: 0,full_text_embedded
0,"[[-0.37441736, -4.348481, -2.867022, -0.833148..."
1,"[[-1.0242298, -3.5270042, -1.9221768, 0.864446..."
2,"[[0.03841588, -3.0448728, -2.48761, 0.37428683..."


In [None]:
from ast import literal_eval

train_embedded_lst = train_embedded['full_text_embedded'].tolist()
test_embedded_lst = test_embedded['full_text_embedded'].tolist()

# Document embeddings are saved in the CSV as strings.
# Parse each string back into the actual embedding (a list of word embedding vectors).
# literal_eval only accepts Python literals, so - unlike eval() - text downloaded
# from the network cannot execute arbitrary code here.
train_embedded_lst = [literal_eval(doc) for doc in train_embedded_lst]
test_embedded_lst = [literal_eval(doc) for doc in test_embedded_lst]

### Get train and test labels One-hot encodings

In [None]:
# One-hot encode the labels. get_dummies derives its columns from the labels that
# are present, so encoding train and test independently only yields the same
# class-to-column mapping when both splits contain exactly the same label set.
# dtype=int keeps 0/1 integers on every pandas version (pandas 2 defaults to bool).
train_labels_oh = pd.get_dummies(train['label-liar'], dtype=int)

# Force the test encoding onto the training columns: a class missing from the test
# split becomes an all-zero column instead of shifting the remaining columns.
# A test label never seen in training has no column and becomes an all-zero row.
test_labels_oh = pd.get_dummies(test['label-liar'], dtype=int).reindex(
    columns=train_labels_oh.columns, fill_value=0
)

train_labels = np.asarray(train_labels_oh).tolist()
test_labels = np.asarray(test_labels_oh).tolist()

### truncate/pad sequences to a *max sequence length*

First we need to determine the value of *max sequence length*.<br>
Find minimum, maximum and average sequence length.

In [None]:
# Number of word vectors in every training document.
# The training split is measured (not the test split) because max sequence length
# is a modelling choice and should not be tuned on the evaluation data.
seq_lens = [len(doc) for doc in train_embedded_lst]

min_len = min(seq_lens)
max_len = max(seq_lens)
# True arithmetic mean. The previous running update avg = (avg + len) / 2 halved
# the weight of all earlier documents at each step, so the reported value mostly
# reflected the last few documents rather than the whole corpus.
avg_len = sum(seq_lens) / len(seq_lens)

print('min sequence length =', min_len)
print('max sequence length =', max_len)
print('avrage sequence length =', avg_len)

min sequence length = 68
max sequence length = 102
avrage sequence length = 83.20663866249106


Decision: Our model will use a *max sequence length* of 230.

In [None]:
def pad_list(lst, value=0, size=0):
  """Append `size` padding entries to `lst` in place.

  Every appended entry is an independent copy of `value`. Appending the very
  same object repeatedly would make all padding rows alias one list, so a later
  in-place update of one padded row (such as adding positional encodings with
  `+=`) would also change every other padded row.

  Args:
    lst: list to extend; it is modified in place.
    value: padding entry, e.g. a number or a zero vector given as a list.
    size: how many entries to append; nothing is appended when size <= 0.

  Returns:
    None.
  """
  from copy import deepcopy  # local import keeps this notebook cell self-contained

  count = 0
  while count < size:
    lst.append(deepcopy(value))
    count += 1

In [None]:
maxlen = 230 # max sequence length

# Bring every document to exactly maxlen word vectors: keep the first maxlen
# vectors and fill any remaining positions at the end with 5-dimensional zeros.
train_docs, test_docs = [], []
for docs_in, docs_out in ((train_embedded_lst, train_docs), (test_embedded_lst, test_docs)):
  for doc in docs_in:
    clipped = doc[:maxlen] # new outer list; the word vectors inside are still shared with doc
    pad_list(clipped, value=[0]*5, size=maxlen - len(clipped))
    docs_out.append(clipped)

In [None]:
# Sanity check: collect the distinct document lengths across both splits.
# After truncation/padding the set should contain a single value.
doc_lens = set(map(len, train_docs)).union(map(len, test_docs))

print(doc_lens)

{230}


### Add positional embeddings

In [None]:
def positional_embeddings(vector, dims):
  """Add sinusoidal positional encodings to `vector` in place.

  For position `pos` and feature index `i` the value added is
  sin(pos / 10000**(i/d)) when `i` is even and
  cos(pos / 10000**((i-1)/d)) when `i` is odd, where d is the feature count.

  Args:
    vector: nested list indexed as vector[position][feature]; modified in place.
    dims: pair (number of positions, number of features) to cover.

  Returns:
    None.
  """
  n_positions, n_features = dims

  # The wave function and its divisor depend only on the feature index,
  # so work them out once per column instead of once per cell.
  columns = []
  for col in range(n_features):
    if col % 2: # odd column: cosine, sharing the divisor of the even column before it
      columns.append((cos, 10000**((col-1)/n_features)))
    else: # even column: sine
      columns.append((sin, 10000**(col/n_features)))

  for position in range(n_positions):
    for col, (wave, divisor) in enumerate(columns):
      vector[position][col] += wave(position/divisor)

In [None]:
word_size = 5 # each word is represented as a vector with dim = word_size

# Add the positional signal to every train and test document.
# The documents are modified in place, so running this cell again adds it again.
pe_dims = (maxlen, word_size)
for docs in (train_docs, test_docs):
  for vector in docs:
    positional_embeddings(vector, pe_dims)

### Transformer block

In [None]:
class TransformerBlock(layers.Layer):
    """Single Transformer encoder block: self-attention, then a feed-forward network.

    Each sub-layer is followed by dropout, a residual addition and layer
    normalization. Because of the residual additions, the last axis of the
    input must have size `embed_dim`.

    Args:
        embed_dim: per-head key size of the attention layer and output size of
            the feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate used after the attention and feed-forward sub-layers.
        **kwargs: standard Keras layer arguments (for example `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        `training` defaults to None so the layer can also be called without
        passing the flag explicitly; it is forwarded to both dropout layers.
        """
        # Self-attention: the sequence is used as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

### Build model and fit

In [None]:
embed_dim = 32  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# One document = maxlen word vectors of width word_size. Using word_size (set in
# the positional-embedding cell) instead of a literal 5 keeps the model input in
# step with the data if the embedding width ever changes.
inputs = layers.Input(shape=(maxlen, word_size))
embedding_layer = layers.Dense(embed_dim) # regular fc layer projecting each word vector to embed_dim
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Pool the per-position outputs into a single vector per document.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Six-way softmax: one probability per label category.
outputs = layers.Dense(6, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Turn the padded training documents and their one-hot labels into tensors.
x_train, y_train = map(tf.convert_to_tensor, (train_docs, train_labels))

In [None]:
# Multiclass setup: Adam optimizer, categorical cross-entropy on one-hot labels.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy", tf.keras.metrics.Recall()],
)

# Two epochs, with 20% of the training data held out for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.2,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


### Evaluate model on test

In [None]:
# Turn the padded test documents and their one-hot labels into tensors.
x_test, y_test = map(tf.convert_to_tensor, (test_docs, test_labels))

In [None]:
# Score the fitted model on the test tensors, in batches of 32.
results = model.evaluate(x_test, y_test, batch_size=32)



## Model 3 - Binary classification, *max sequence length* = 230

In [None]:
import pandas as pd
import numpy as np
import io
import time
from tqdm import tqdm
from math import sin, cos

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Get datasets

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and it
# re-copied the accumulated frame on every iteration. pd.concat builds each frame
# once. Like append, it keeps every shard's own row labels (no ignore_index).

#train (stored as 7 CSV shards)
num_splits = 7
train = pd.concat([
  pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/liar_train_{i}.csv')
  for i in range(num_splits)
])

#train embedded (stored as 12 CSV shards)
num_splits = 12
train_embedded = pd.concat([
  pd.read_csv(f'https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/train_embedded_v2_{i}.csv')
  for i in range(num_splits)
])

#test
# NOTE(review): this file comes from the 'liar-liar-dataset-train' repository while
# every other file comes from 'liar-liar-dataset' - confirm this is intended.
test = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset-train/main/liar_test.csv')

#test embedded
# NOTE(review): the train embeddings are the 'v2' files but this one is not - confirm
# both were produced by the same encoding.
test_embedded = pd.read_csv('https://raw.githubusercontent.com/AlonBrul/liar-liar-dataset/main/test_embedded.csv')

In [None]:
# Preview the first three rows; each embedding is stored as the string form of a nested list.
train_embedded.head(3)

Unnamed: 0,full_text_embedded
0,"[[-0.37441736, -4.348481, -2.867022, -0.833148..."
1,"[[-1.0242298, -3.5270042, -1.9221768, 0.864446..."
2,"[[0.03841588, -3.0448728, -2.48761, 0.37428683..."


### Filter documents and labels where labels are binary ('true', 'false')

In [None]:
# Pull the label column and the serialized-embedding column of each split out
# as plain Python lists so they can be filtered by position.
label_col, embedding_col = 'label-liar', 'full_text_embedded'

train_labels_lst, test_labels_lst = train[label_col].tolist(), test[label_col].tolist()
train_embedded_lst = train_embedded[embedding_col].tolist()
test_embedded_lst = test_embedded[embedding_col].tolist()

In [None]:
def is_binary_train_index(i):
  """Return True when training row `i` is labelled exactly 'true' or 'false'."""
  return train_labels_lst[i] in ('true', 'false')


def is_binary_test_index(i):
  """Return True when test row `i` is labelled exactly 'true' or 'false'."""
  return test_labels_lst[i] in ('true', 'false')

In [None]:
# Keep only the rows labelled 'true' or 'false', in their original order.
# Labels and embeddings are selected by the same positions, which assumes the
# two lists of each split are row-aligned - TODO confirm.
train_keep = [i for i in range(len(train_labels_lst)) if is_binary_train_index(i)]
train_labels_binary = [train_labels_lst[i] for i in train_keep]
train_embedded_binary = [train_embedded_lst[i] for i in train_keep]

test_keep = [i for i in range(len(test_labels_lst)) if is_binary_test_index(i)]
test_labels_binary = [test_labels_lst[i] for i in test_keep]
test_embedded_binary = [test_embedded_lst[i] for i in test_keep]

In [None]:
from ast import literal_eval

# Document embeddings are saved in the CSV as strings.
# Parse each string back into the actual embedding (a list of word embedding vectors).
# literal_eval only accepts Python literals, so - unlike eval() - text downloaded
# from the network cannot execute arbitrary code here.
train_embedded_binary = [literal_eval(doc) for doc in train_embedded_binary]
test_embedded_binary = [literal_eval(doc) for doc in test_embedded_binary]

### Get train and test labels One-hot encodings

In [None]:
train_labels_binary_df = pd.DataFrame({'label-liar': train_labels_binary})
test_labels_binary_df = pd.DataFrame({'label-liar': test_labels_binary})

# One-hot encode the labels. get_dummies derives its columns from the labels that
# are present, so encoding train and test independently only yields the same
# class-to-column mapping when both splits contain exactly the same label set.
# dtype=int keeps 0/1 integers on every pandas version (pandas 2 defaults to bool).
train_labels_oh = pd.get_dummies(train_labels_binary_df['label-liar'], dtype=int)

# Force the test encoding onto the training columns: a class missing from the test
# split becomes an all-zero column instead of shifting the remaining columns.
test_labels_oh = pd.get_dummies(test_labels_binary_df['label-liar'], dtype=int).reindex(
    columns=train_labels_oh.columns, fill_value=0
)

train_labels = np.asarray(train_labels_oh).tolist()
test_labels = np.asarray(test_labels_oh).tolist()

### truncate/pad sequences to a *max sequence length*

First we need to determine the value of *max sequence length*.<br>
Find minimum, maximum and average sequence length

In [None]:
# Number of word vectors in every (binary-labelled) training document.
seq_lens = [len(doc) for doc in train_embedded_binary]

min_len = min(seq_lens)
max_len = max(seq_lens)
# True arithmetic mean. The previous running update avg = (avg + len) / 2 halved
# the weight of all earlier documents at each step, so the reported value mostly
# reflected the last few documents rather than the whole corpus.
avg_len = sum(seq_lens) / len(seq_lens)

print('min sequence length =', min_len)
print('max sequence length =', max_len)
print('avrage sequence length =', avg_len)

min sequence length = 65
max sequence length = 291
avrage sequence length = 234.99282043613502


Decision: Our model will use a *max sequence length* of 230

In [None]:
def pad_list(lst, value=0, size=0):
  """Append `size` padding entries to `lst` in place.

  Every appended entry is an independent copy of `value`. Appending the very
  same object repeatedly would make all padding rows alias one list, so a later
  in-place update of one padded row (such as adding positional encodings with
  `+=`) would also change every other padded row.

  Args:
    lst: list to extend; it is modified in place.
    value: padding entry, e.g. a number or a zero vector given as a list.
    size: how many entries to append; nothing is appended when size <= 0.

  Returns:
    None.
  """
  from copy import deepcopy  # local import keeps this notebook cell self-contained

  count = 0
  while count < size:
    lst.append(deepcopy(value))
    count += 1

In [None]:
maxlen = 230 # max sequence length

# Bring every document to exactly maxlen word vectors: keep the first maxlen
# vectors and fill any remaining positions at the end with 5-dimensional zeros.
train_docs, test_docs = [], []
for docs_in, docs_out in ((train_embedded_binary, train_docs), (test_embedded_binary, test_docs)):
  for doc in docs_in:
    clipped = doc[:maxlen] # new outer list; the word vectors inside are still shared with doc
    pad_list(clipped, value=[0]*5, size=maxlen - len(clipped))
    docs_out.append(clipped)

In [None]:
# Sanity check: collect the distinct document lengths across both splits.
# After truncation/padding the set should contain a single value.
doc_lens = set(map(len, train_docs)).union(map(len, test_docs))

print(doc_lens)

{230}


### Add positional embeddings

In [None]:
def positional_embeddings(vector, dims):
  """Add sinusoidal positional encodings to `vector` in place.

  For position `pos` and feature index `i` the value added is
  sin(pos / 10000**(i/d)) when `i` is even and
  cos(pos / 10000**((i-1)/d)) when `i` is odd, where d is the feature count.

  Args:
    vector: nested list indexed as vector[position][feature]; modified in place.
    dims: pair (number of positions, number of features) to cover.

  Returns:
    None.
  """
  n_positions, n_features = dims

  # The wave function and its divisor depend only on the feature index,
  # so work them out once per column instead of once per cell.
  columns = []
  for col in range(n_features):
    if col % 2: # odd column: cosine, sharing the divisor of the even column before it
      columns.append((cos, 10000**((col-1)/n_features)))
    else: # even column: sine
      columns.append((sin, 10000**(col/n_features)))

  for position in range(n_positions):
    for col, (wave, divisor) in enumerate(columns):
      vector[position][col] += wave(position/divisor)

In [None]:
word_size = 5 # each word is represented as a vector with dim = word_size

# Add the positional signal to every train and test document.
# The documents are modified in place, so running this cell again adds it again.
pe_dims = (maxlen, word_size)
for docs in (train_docs, test_docs):
  for vector in docs:
    positional_embeddings(vector, pe_dims)

### Transformer block

In [None]:
class TransformerBlock(layers.Layer):
    """Single Transformer encoder block: self-attention, then a feed-forward network.

    Each sub-layer is followed by dropout, a residual addition and layer
    normalization. Because of the residual additions, the last axis of the
    input must have size `embed_dim`.

    Args:
        embed_dim: per-head key size of the attention layer and output size of
            the feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate used after the attention and feed-forward sub-layers.
        **kwargs: standard Keras layer arguments (for example `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        `training` defaults to None so the layer can also be called without
        passing the flag explicitly; it is forwarded to both dropout layers.
        """
        # Self-attention: the sequence is used as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

### Build model and fit

In [None]:
embed_dim = 32  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# One document = maxlen word vectors of width word_size. Using word_size (set in
# the positional-embedding cell) instead of a literal 5 keeps the model input in
# step with the data if the embedding width ever changes.
inputs = layers.Input(shape=(maxlen, word_size))
embedding_layer = layers.Dense(embed_dim) # (not really an embedding_layer, regular fc layer)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Pool the per-position outputs into a single vector per document.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two-way softmax: one probability per one-hot label column.
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Turn the padded training documents and their one-hot labels into tensors.
x_train, y_train = map(tf.convert_to_tensor, (train_docs, train_labels))

In [None]:
# Binary setup: Adam optimizer, binary cross-entropy on the two one-hot columns.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy", tf.keras.metrics.Recall()],
)

# Two epochs, with 20% of the training data held out for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.2,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


### Evaluate model on test

In [None]:
# Turn the padded test documents and their one-hot labels into tensors.
x_test, y_test = map(tf.convert_to_tensor, (test_docs, test_labels))

In [None]:
# Score the fitted model on the test tensors, in batches of 32.
results = model.evaluate(x_test, y_test, batch_size=32)



In [None]:
%%shell
# Export a notebook to HTML.
# NOTE(review): the path points at Fake_news_encoding.ipynb - confirm that is the notebook meant to be exported here.
jupyter nbconvert --to html /content/Fake_news_encoding.ipynb