In [None]:
# Install the Hugging Face dependencies into the notebook runtime.
# NOTE(review): versions are unpinned, and sentence_transformers is not imported in
# any cell visible here — confirm it is still needed.
!pip install sentence_transformers
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
import regex as re
from keras.layers import Dropout, Conv1D, Flatten, Dense, MaxPooling1D, LSTM, Bidirectional, GlobalAveragePooling1D
from transformers import AutoTokenizer, TFAutoModel, AutoModelWithLMHead

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Reproducibility: seed every random number generator used in this notebook.
# A single seed value is shared by all generators below.
seed_value= 42

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here likely only affects child processes — confirm.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)


In [None]:
# Load the labelled dataset from the mounted Drive.
# Later cells read its "text" column (raw text) and its "source" column (class label).
tweetData = pd.read_csv("/content/drive/MyDrive/train_small.csv", encoding="utf-8")
tweetData

In [None]:
def clean_text(text):
    """Normalise one raw text for tokenisation.

    Steps, in order: drop @mentions and #hashtags, strip every character that
    is neither a word character nor whitespace, collapse whitespace runs
    (including newlines) to a single space, trim, and lower-case.

    Punctuation is removed *before* whitespace is normalised so that inputs
    such as "a - b" or "wow :)" do not end up with double or trailing spaces.

    Args:
        text: The raw text. ``None`` and NaN (missing CSV cells) are accepted.

    Returns:
        The cleaned, lower-cased string; ``""`` for missing input.
    """
    # NaN is the only value that is not equal to itself; treat it like None.
    if text is None or text != text:
        return ""
    text = str(text)

    text = re.sub(r'@\w+', '', text)      # @mentions
    text = re.sub(r'#\w+', '', text)      # #hashtags
    text = re.sub(r'[^\w\s]', '', text)   # punctuation, emoji and other symbols
    text = re.sub(r'\s+', ' ', text)      # newlines / repeated blanks -> one space
    return text.strip().lower()

In [None]:
get_text = tweetData["text"]
clean_text = list(map(clean_text, get_text))

In [None]:
X = clean_text
y = np.asarray(tweetData.source)
y_one_hot = tf.keras.utils.to_categorical(y-1, num_classes = 7)

In [None]:
# First split: 70% of the samples for training, 30% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y_one_hot, test_size=0.3, random_state=42
)
# Second split of the held-out part: 85% of it becomes the test set and 15% the
# validation set (roughly 25.5% and 4.5% of all samples respectively).
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.15, random_state=42
)

In [None]:
# Hugging Face checkpoint used for both the tokenizer and the transformer backbone.
MODEL_NAME = "lang-uk/electra-base-ukrainian-cased-discriminator"
# Fixed sequence length (in tokens) that every encoded text is padded/truncated to.
MAX_LEN = 512
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def encode(texts, tokenizer, max_len=None):
    """Convert texts into fixed-length id and attention-mask arrays.

    Each text is tokenised, truncated to ``max_len - 2`` tokens, wrapped in the
    tokenizer's classification/separator tokens and right-padded with the
    tokenizer's padding token.

    The special-token ids are read from ``tokenizer`` rather than hard-coded:
    the previous literals (0 / 2 / 1) follow the RoBERTa convention and are not
    guaranteed to match the vocabulary of the tokenizer that is passed in.

    Args:
        texts: Sized iterable of strings.
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids`` and
            the attributes ``cls_token_id``, ``sep_token_id``, ``pad_token_id``.
        max_len: Length of every output row. Defaults to the module-level
            ``MAX_LEN``.

    Returns:
        dict with two int32 arrays of shape ``(len(texts), max_len)``:
        ``'input_word_ids'`` (token ids) and ``'input_mask'`` (1 for real
        tokens including the special tokens, 0 for padding).

    Raises:
        ValueError: If ``max_len`` is smaller than 2 or the tokenizer does not
            define one of the required special tokens.
    """
    if max_len is None:
        max_len = MAX_LEN
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to fit the two special tokens")

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    pad_id = tokenizer.pad_token_id
    if cls_id is None or sep_id is None or pad_id is None:
        raise ValueError("tokenizer must define cls_token_id, sep_token_id and pad_token_id")

    ct = len(texts)
    # Rows start out fully padded / fully masked and are filled in below.
    input_ids = np.full((ct, max_len), pad_id, dtype='int32')
    attention_mask = np.zeros((ct, max_len), dtype='int32')

    for k, text in enumerate(texts):
        tok_text = tokenizer.tokenize(text)

        # Leave room for the two special tokens, then map tokens to ids.
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:(max_len - 2)])

        # Cannot exceed max_len because of the truncation above.
        input_length = len(enc_text) + 2

        input_ids[k, :input_length] = np.asarray([cls_id] + enc_text + [sep_id], dtype='int32')
        attention_mask[k, :input_length] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
    }

In [None]:
# Replace each split's list of texts with the dict of model inputs built by `encode`.
# NOTE(review): this cell overwrites its own inputs, so running it a second time
# would pass the already-encoded dicts back into `encode`; re-run the split cell first.
X_train = encode(X_train, tokenizer)
X_test = encode(X_test, tokenizer)
X_val = encode(X_val, tokenizer)

In [None]:
"""
========================================
Electra LINEAR
========================================
"""
def build_model1():
  """Build and compile the transformer + dense-head classifier.

  The transformer loaded from ``MODEL_NAME`` receives the token ids and the
  attention mask; its first output is flattened and fed through a 128-unit
  ReLU layer into a 7-way softmax.

  The model is compiled with categorical cross-entropy, which matches a
  softmax output trained on one-hot labels. (Binary cross-entropy, used
  previously, scores every output unit as an independent yes/no problem and
  makes Keras report binary accuracy for the ``'accuracy'`` metric.)

  Returns:
    A compiled ``tf.keras.Model`` taking ``input_word_ids`` and ``input_mask``
    (both int32, shape ``(batch, MAX_LEN)``) and returning 7 class
    probabilities per sample.
  """
  input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
  input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')

  trf_model = TFAutoModel.from_pretrained(MODEL_NAME, from_pt=True)
  x = trf_model(input_word_ids, attention_mask=input_mask)
  # First output element; presumably the per-token hidden states — TODO confirm.
  x = x[0]

  x = Flatten()(x)
  x = Dense(128, activation="relu")(x)
  x = Dense(7, activation='softmax')(x)

  model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=x)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
      loss=tf.keras.losses.categorical_crossentropy,
      metrics=['accuracy'])
  return model
  

In [None]:
"""
========================================
Electra + CNN
batch = 16
epochs = 3
========================================
"""

def build_model2(conv_size=128, num_classes=1):
  """Build and compile the transformer + CNN classifier.

  The first output of the transformer loaded from ``MODEL_NAME`` goes through
  two 1-D convolutions (kernel sizes 3 and 2), max pooling, flattening and a
  128-unit ReLU layer before the output head.

  Args:
    conv_size: Number of filters in each convolution.
    num_classes: Size of the output head. ``1`` (the default, unchanged
      behaviour) gives a single sigmoid unit trained with binary
      cross-entropy. Any larger value gives a softmax over ``num_classes``
      trained with categorical cross-entropy, which is what the 7-class
      one-hot labels prepared in this notebook require (``num_classes=7``).

  Returns:
    A compiled ``tf.keras.Model`` taking ``input_word_ids`` and ``input_mask``.

  Raises:
    ValueError: If ``num_classes`` is smaller than 1.
  """
  if num_classes < 1:
    raise ValueError("num_classes must be >= 1")

  input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
  input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')

  trf_model = TFAutoModel.from_pretrained(MODEL_NAME, from_pt=True)
  x = trf_model(input_word_ids, attention_mask=input_mask)
  # First output element; presumably the per-token hidden states — TODO confirm.
  x = x[0]

  x = Conv1D(conv_size, 3, padding='same', activation='relu')(x)
  x = Conv1D(conv_size, 2, padding='same', activation="relu")(x)
  x = MaxPooling1D(pool_size=5, strides=2, padding="same")(x)
  x = Flatten()(x)

  x = Dense(128, activation='relu')(x)

  # Pick the output activation and the loss together so they always agree.
  if num_classes == 1:
    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    loss = tf.keras.losses.binary_crossentropy
  else:
    x = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    loss = tf.keras.losses.categorical_crossentropy

  model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=x)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
      loss=loss,
      metrics=['accuracy'])

  return model

In [None]:
"""
========================================
Electra + LSTM
========================================
"""

def build_model3(num_classes=1):
  """Build and compile the transformer + bidirectional-LSTM classifier.

  The first output of the transformer loaded from ``MODEL_NAME`` goes through
  two stacked bidirectional LSTMs (256 and 128 units per direction), max
  pooling, flattening and a 64-unit ReLU layer before the output head.

  Args:
    num_classes: Size of the output head. ``1`` (the default, unchanged
      behaviour) gives a single sigmoid unit trained with binary
      cross-entropy. Any larger value gives a softmax over ``num_classes``
      trained with categorical cross-entropy, which is what the 7-class
      one-hot labels prepared in this notebook require (``num_classes=7``).

  Returns:
    A compiled ``tf.keras.Model`` taking ``input_word_ids`` and ``input_mask``.

  Raises:
    ValueError: If ``num_classes`` is smaller than 1.
  """
  if num_classes < 1:
    raise ValueError("num_classes must be >= 1")

  input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
  input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')

  trf_model = TFAutoModel.from_pretrained(MODEL_NAME, from_pt=True)
  x = trf_model(input_word_ids, attention_mask=input_mask)
  # First output element; presumably the per-token hidden states — TODO confirm.
  x = x[0]

  x = Bidirectional(LSTM(256, return_sequences=True))(x)
  x = Bidirectional(LSTM(128, return_sequences=True))(x)
  x = MaxPooling1D(pool_size=5, strides=2, padding="same")(x)
  x = Flatten()(x)
  x = Dense(64, activation='relu')(x)

  # Pick the output activation and the loss together so they always agree.
  if num_classes == 1:
    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    loss = tf.keras.losses.binary_crossentropy
  else:
    x = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    loss = tf.keras.losses.categorical_crossentropy

  model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=x)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
      loss=loss,
      metrics=['accuracy'])

  return model

In [None]:
# Train the dense-head classifier; the validation split is evaluated after every epoch.
model = build_model1()
fit_options = dict(
    epochs=3,
    batch_size=16,
    verbose=1,
    validation_data=(X_val, y_val),
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Run the trained model on the encoded test split; later cells take the argmax of
# each row of `y_pred` to obtain the predicted class index.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Both the one-hot ground truth and the predicted scores are reduced to class
# indices with argmax before computing the per-class metrics.
print(classification_report(y_test.argmax(axis=1), y_pred.argmax(axis=1)))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# `y_test` is one-hot and `y_pred` holds per-class scores; confusion_matrix needs
# plain class indices, so reduce both with argmax (as the classification report does).
y_true_labels = y_test.argmax(axis=1)
y_pred_labels = y_pred.argmax(axis=1)

# Pass every class index explicitly so the matrix keeps one row/column per class
# even if a class is missing from this test split.
class_ids = np.arange(y_test.shape[1])
con_mat_df = confusion_matrix(y_true_labels, y_pred_labels, labels=class_ids)

# Row-normalise (each row sums to 1); rows without any true samples stay at 0
# instead of producing NaN from a division by zero.
row_totals = con_mat_df.sum(axis=1)[:, np.newaxis]
cmn = np.divide(con_mat_df.astype('float'), row_totals,
                out=np.zeros(con_mat_df.shape, dtype='float'),
                where=row_totals != 0)

# The one-hot encoding was built from `source - 1`, so class index i is shown as
# source label i + 1.
disp = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=class_ids + 1)

disp.plot(cmap=plt.cm.Blues)
# The model trained above uses the ELECTRA checkpoint in MODEL_NAME, not RoBERTa.
plt.title("Electra Model")
plt.show()