In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-propaganda-classification/twitter_dataset_translated_ukrainian.csv
/kaggle/input/twitter-propaganda-classification/twitter_dataset.csv


In [6]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import pandas as pd

from IPython.display import display, Markdown

In [7]:
display(Markdown("**Full Dataset:**"))
data = pd.read_csv('/kaggle/input/twitter-propaganda-classification/twitter_dataset.csv')
display(data.head())
display("Full Dataset Shape: ", data.shape)

display(Markdown("**Clean (Non-Propagandistic) Text Dataset:**"))
clean_data = data[data['is_propaganda'] == False]
clean_data = clean_data[['text']]
display(clean_data.head())
display("Clean Text Dataset Shape: ",clean_data.shape)

display(Markdown("**Human Propaganda Dataset:**"))
human_propaganda = data[data['is_propaganda'] == True]
human_propaganda = human_propaganda[['text']]
display(human_propaganda.head())
display("Human Propaganda Text Dataset Shape: ",human_propaganda.shape)

**Full Dataset:**

Unnamed: 0.1,Unnamed: 0,id,created_at,text,is_propaganda
0,1749,1514553915580329988,2022-04-14 10:39:27+00:00,Woman who held up poster of Marine Le Pen and ...,False
1,2409,1510803460320632839,2022-04-04 02:16:28+00:00,"⚡️Zelensky: Around 150,000 people trapped in M...",False
2,2463,1475560113536741379,2021-12-27 20:12:00+00:00,RT @natomission_ru: 🇷🇺#Russia Deputy FM Sergey...,True
3,116,1527722359314075649,2022-05-20 18:46:08+00:00,#Azovstal fully liberated – Russian military\n...,True
4,2742,1517110124325879808,2022-04-21 11:56:54+00:00,"RT @BloombergUK: ""He was almost foaming at the...",False


'Full Dataset Shape: '

(12990, 5)

**Clean (Non-Propagandistic) Text Dataset:**

Unnamed: 0,text
0,Woman who held up poster of Marine Le Pen and ...
1,"⚡️Zelensky: Around 150,000 people trapped in M..."
4,"RT @BloombergUK: ""He was almost foaming at the..."
8,Key UN climate change finding widely misinterp...
10,Lawyers for the two European tourists argued m...


'Clean Text Dataset Shape: '

(6495, 1)

**Human Propaganda Dataset:**

Unnamed: 0,text
2,RT @natomission_ru: 🇷🇺#Russia Deputy FM Sergey...
3,#Azovstal fully liberated – Russian military\n...
5,'Intense battle' | Russian army surrounds last...
6,Russia’s FSB has released footage reportedly s...
7,"Hundreds of activists gathered in Washington, ..."


'Human Propaganda Text Dataset Shape: '

(6495, 1)

# Text Generation with a Miniature GPT

In [8]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras import layers
from keras import ops
from keras.layers import TextVectorization
import numpy as np
import os
import string
import random
import tensorflow
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

In [9]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Mask the upper half of the dot product matrix in self attention.
    This prevents flow of information from future tokens to current token.
    1's in the lower triangle, counting from the lower right corner.
    """
    i = ops.arange(n_dest)[:, None]
    j = ops.arange(n_src)
    m = i >= j - n_src + n_dest
    mask = ops.cast(m, dtype)
    mask = ops.reshape(mask, [1, n_dest, n_src])
    mult = ops.concatenate(
        [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])], 0
    )
    return ops.tile(mask, mult)


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        input_shape = ops.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, "bool")
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

In [10]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        positions = ops.arange(0, maxlen, 1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [11]:
def create_model():
    inputs = layers.Input(shape=(maxlen,), dtype="int32")
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
    x = transformer_block(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        "adam",
        loss=[loss_fn, None],
    )  # No loss and optimization based on word embeddings from transformer block
    return model

In [12]:
def custom_standardization(input_string):
    """Remove html line-break tags and handle punctuation"""
    lowercased = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
    return tf.strings.regex_replace(stripped_html, f"([{string.punctuation}])", r" \1")

In [13]:
# Define a function to prepare inputs and labels for language modeling
def prepare_lm_inputs_labels(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

In [14]:
class TextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    Arguments:
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(
        self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1
    ):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        logits, indices = ops.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(ops.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        start_tokens = [_ for _ in self.start_tokens]
        if (epoch + 1) % self.print_every != 0:
            return
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:maxlen]
                sample_index = maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y, _ = self.model.predict(x, verbose=0)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = " ".join(
            [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
        )
        print(f"generated text:\n{txt}\n")

In [15]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 100  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer
batch_size = 128

In [16]:
# Shuffle the DataFrame
human_propaganda = human_propaganda.sample(frac=1).reset_index(drop=True)

In [17]:
# Create a vectorization layer
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)

In [18]:
# Adapt the vectorization layer to your text data
vectorize_layer.adapt(human_propaganda['text'].values)
vocab = vectorize_layer.get_vocabulary()

In [19]:
# Create a TensorFlow dataset from DataFrame
text_ds = tf.data.Dataset.from_tensor_slices(human_propaganda['text'].values)

# Shuffle and batch the dataset
text_ds = text_ds.shuffle(buffer_size=len(human_propaganda)).batch(batch_size)

# Map the prepare_lm_inputs_labels function to the dataset
text_ds = text_ds.map(prepare_lm_inputs_labels, num_parallel_calls=tf.data.AUTOTUNE)

# Prefetch the dataset
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

In [20]:
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index

start_prompt = "In recent months"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]
num_tokens_generated = 40
text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)

In [21]:
model = create_model()

model.fit(text_ds, verbose=2, epochs=25, callbacks=[text_gen_callback])

Epoch 1/25
generated text:
[UNK] recent months https : / /t .co /8u9sqguyrv                                   

51/51 - 465s - 9s/step - loss: 2.9838
Epoch 2/25
generated text:
[UNK] recent months to its russian gas , a briefing for the us in the russian federation and ukraine , a meeting , which are not the us                

51/51 - 494s - 10s/step - loss: 1.2019
Epoch 3/25
generated text:
[UNK] recent months from the us and a [UNK] in the west bank of [UNK] , but it was held talks with president vladimir putin .…                  

51/51 - 519s - 10s/step - loss: 1.0388
Epoch 4/25
generated text:
[UNK] recent months and ukraine crisis https : / /t .co [UNK]                                

51/51 - 509s - 10s/step - loss: 0.9264
Epoch 5/25
generated text:
[UNK] recent months of russia 's foreign ministers council of [UNK] ) https : / /t .co [UNK] https : / /t .co / /t .co                  

51/51 - 479s - 9s/step - loss: 0.8395
Epoch 6/25
generated text:
[UNK] recent months of forei

<keras.src.callbacks.history.History at 0x7a946d3d8190>