# Twitter Political Bias Classifier

Set correct path for WSL2

In [None]:
%cd ~/../../mnt/g/Documentos/Coding/political-bias/political-bias/model

In [None]:
import warnings
# Suppress every warning category for the rest of the session. This keeps the
# notebook output clean but also hides deprecation notices from the libraries
# used below.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np


%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

##  Data

### Users

In [None]:
# Load the semicolon-separated account list and preview its first rows.
accounts = pd.read_csv('../../data/accounts.csv', sep=';')
accounts.head()

In [None]:
# (number of rows, number of columns) of the accounts table.
accounts.shape

In [None]:
# Distinct `spectrum` values among the accounts and how many accounts carry each.
# `unique` and `counts` are read by the following cells (bar chart, N_CLASSES).
unique, counts = np.unique(accounts['spectrum'], return_counts=True)
print(unique, counts)

In [None]:
# Bar chart of the per-class counts computed in the previous cell.
fig, ax = plt.subplots()
ax.bar(unique, counts)

In [None]:
# Number of target classes; used as the width of the softmax layer in create_model.
# NOTE(review): `unique` is reassigned by several later cells, so re-running this
# cell out of order recomputes N_CLASSES from whatever split was inspected last.
N_CLASSES = len(unique)
N_CLASSES

### Tweets

In [None]:
# Load the semicolon-separated tweet dump and preview its first rows.
tweets = pd.read_csv('../../data/tweets.csv', sep=';')
tweets.head()

In [None]:
# Distinct `spectrum` values among the tweets and the number of tweets per value.
# `unique` and `counts` are read by the bar-chart cell that follows.
unique, counts = np.unique(tweets['spectrum'], return_counts=True)
print(unique, counts)

In [None]:
# Bar chart of the per-class tweet counts computed in the previous cell.
fig, ax = plt.subplots()
ax.bar(unique, counts)

## Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the textual `spectrum` labels as integer class ids, fitting and
# transforming in one step. After this cell the column holds integers, so
# running it a second time would fit the encoder on the ids instead of the names.
le = LabelEncoder()
tweets['spectrum'] = le.fit_transform(tweets['spectrum'])
le.classes_

In [None]:
from nltk.tokenize import TweetTokenizer

from preprocessing import preprocessing

In [None]:
# Tweet-aware tokenizer handed to `preprocessing` in the next cell. As the option
# names indicate: keep the original casing, shorten elongated character runs and
# strip @handles (see the NLTK TweetTokenizer documentation for exact semantics).
tt_tknzr = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=True)

In [None]:
# Normalize every tweet with the project's `preprocessing` helper; the tokenizer
# is forwarded as its second positional argument.
tweets['text_normalized'] = tweets['text'].apply(preprocessing, args=(tt_tknzr,))

Removing unused columns

In [None]:
# Keep only the two columns the model needs. Selecting them by name (instead of
# "everything but the last two") makes the cell independent of the column order
# in tweets.csv.
KEPT_COLUMNS = ['spectrum', 'text_normalized']
tweets.drop(columns=tweets.columns.difference(KEPT_COLUMNS), inplace=True)

Removing duplicates

In [None]:
# Remove exact duplicate rows in place and report how many were discarded.
old_len = len(tweets)
tweets.drop_duplicates(inplace=True)
n_dropped = old_len - len(tweets)
print(f'{n_dropped} tweets were dropped.')

In [None]:
# Preview the cleaned table (label id + normalized text).
tweets.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Reproducible 80/10/10 split: first carve out 20% as a hold-out set, then cut
# that hold-out in half to obtain the validation and test sets.
RANDOM_SEED = 321

df_train, df_holdout = train_test_split(tweets, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
# Class distribution of the training split. The class names are recovered from
# the encoded ids that actually occur, so names and counts always line up even
# if a class is missing from the split.
unique, counts = np.unique(df_train['spectrum'], return_counts=True)
print(le.inverse_transform(unique), counts)

In [None]:
# Bar chart of the counts computed in the previous cell. Labels come from the
# ids in `unique`, so the bars cannot be mislabeled when a class is absent.
fig, ax = plt.subplots(1, 1)
ax.bar(le.inverse_transform(unique), counts)

In [None]:
# Class distribution of the validation split. This cell previously inspected
# df_test, duplicating the test-split cell further down and leaving the
# validation split unchecked.
unique, counts = np.unique(df_val['spectrum'], return_counts=True)
print(le.inverse_transform(unique), counts)

In [None]:
# Bar chart of the counts computed in the previous cell. Labels come from the
# ids in `unique`, so the bars cannot be mislabeled when a class is absent.
fig, ax = plt.subplots(1, 1)
ax.bar(le.inverse_transform(unique), counts)

In [None]:
# Class distribution of the test split. The class names are recovered from the
# encoded ids that actually occur, so names and counts always line up.
unique, counts = np.unique(df_test['spectrum'], return_counts=True)
print(le.inverse_transform(unique), counts)

In [None]:
# Bar chart of the counts computed in the previous cell. Labels come from the
# ids in `unique`, so the bars cannot be mislabeled when a class is absent.
fig, ax = plt.subplots(1, 1)
ax.bar(le.inverse_transform(unique), counts)

In [None]:
# Number of samples per split. The values are listed in the same order as the
# labels (the validation and test sizes used to be swapped).
split_sizes = {
    'Train': df_train.shape[0],
    'Validation': df_val.shape[0],
    'Test': df_test.shape[0],
}

fig, ax = plt.subplots(1, 1)
ax.bar(list(split_sizes.keys()), list(split_sizes.values()))

## Supervised Learning with BERT

In [None]:
import tensorflow as tf
# List the GPU devices TensorFlow can see (empty list when none is available).
tf.config.list_physical_devices("GPU")

### Loading the BERTimbau pre-trained Portuguese model

In [None]:
import transformers
from transformers import TFBertModel, TFBertForSequenceClassification, BertTokenizer

In [None]:
PRE_TRAINED_MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'

# Training settings.
# They are defined before the tokenizer is created because MAX_LEN is passed to
# it; previously MAX_LEN was only assigned in a later cell, so a top-to-bottom
# run failed with a NameError.
MAX_LEN = 200     # maximum number of tokens per tweet (longer inputs are truncated)
BATCH_SIZE = 3    # samples per mini-batch
N_EPOCHS = 5      # upper bound on training epochs (early stopping may end sooner)

bert_model = TFBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# `model_max_length` is the keyword the tokenizer actually recognises; the
# previous `model_max_len` spelling was accepted silently and had no effect.
bert_tknzr = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=False, model_max_length=MAX_LEN)

Keras Data Generator for mini-batch training

In [None]:
from typing import List, Tuple
from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):
    """Keras ``Sequence`` that tokenizes texts into BERT inputs one mini-batch at a time.

    Each item is ``([input_ids, attention_masks], labels)`` where both input
    arrays are ``int64`` with one row per sample and ``max_len`` columns
    (every text is padded/truncated to ``max_len`` tokens).

    Args:
        x_in: Array of raw texts.
        y_in: Array of labels, aligned with ``x_in``.
        tokenizer: Hugging Face tokenizer used to encode the texts.
        max_len: Fixed sequence length used for padding and truncation.
        batch_size: Number of samples per batch (must be positive).
        shuffle: Shuffle the sample order at construction and after every epoch.
        drop_remainder: When ``True`` (default, the historical behavior) a
            trailing batch smaller than ``batch_size`` is skipped. Set to
            ``False`` to serve every sample, e.g. for evaluation.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``x_in`` and ``y_in``
            differ in length.
    """

    def __init__(self, x_in: np.ndarray, y_in: np.ndarray, tokenizer: transformers.PreTrainedTokenizerBase, max_len: int, batch_size: int, shuffle: bool = True, drop_remainder: bool = True):
        # Let the Keras base class initialise its own state (newer Keras
        # versions warn when this call is missing).
        super().__init__()

        if batch_size <= 0:
            raise ValueError(f'batch_size must be positive, got {batch_size}')
        if len(x_in) != len(y_in):
            raise ValueError(f'x_in and y_in must have the same length, got {len(x_in)} and {len(y_in)}')

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_remainder = drop_remainder
        self.x = x_in
        self.y = y_in
        self.tokenizer = tokenizer
        self.datalen = len(y_in)
        self.max_len = max_len
        self._reset_indexes()

    def _reset_indexes(self) -> None:
        """Rebuild the sample order, shuffling it when ``self.shuffle`` is set."""
        self.indexes = np.arange(self.datalen)
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index: int) -> Tuple[List[np.ndarray], np.ndarray]:
        """Return batch number ``index`` as ``([input_ids, attention_masks], labels)``."""
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]
        x_batch = self.x[batch_indexes]
        y_batch = self.y[batch_indexes]

        input_ids = []
        attention_masks = []

        for text in x_batch:
            # Pad/truncate every text to exactly max_len tokens so the batch
            # can be stacked into rectangular arrays.
            encoded = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                return_attention_mask=True,
                max_length=self.max_len,
                padding='max_length',
                truncation=True
            )

            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])

        return [np.array(input_ids, dtype=np.int64), np.array(attention_masks, dtype=np.int64)], y_batch

    def __len__(self) -> int:
        """Return the number of batches served per epoch."""
        full_batches = self.datalen // self.batch_size
        if self.drop_remainder or self.datalen % self.batch_size == 0:
            return full_batches
        # One extra, smaller batch holds the leftover samples.
        return full_batches + 1

    def on_epoch_end(self) -> None:
        """Reset (and optionally reshuffle) the sample order between epochs."""
        self._reset_indexes()

In [None]:
def _build_generator(frame: pd.DataFrame, shuffle: bool) -> DataGenerator:
    """Wrap one data split in a DataGenerator using the shared tokenizer and settings."""
    texts = frame['text_normalized'].to_numpy()
    labels = frame['spectrum'].to_numpy()
    return DataGenerator(texts, labels, bert_tknzr, MAX_LEN, BATCH_SIZE, shuffle=shuffle)


# Only the training data is shuffled; validation and test keep their row order.
train_gen = _build_generator(df_train, shuffle=True)
test_gen = _build_generator(df_test, shuffle=False)
val_gen = _build_generator(df_val, shuffle=False)

### Adding inputs and a custom softmax output layer for classification

In [None]:
def create_model(bert_model: TFBertModel) -> tf.keras.Model:
    """Build and compile the classifier: BERT encoder followed by a softmax layer.

    The network takes two ``int64`` inputs of length ``MAX_LEN`` (token ids and
    attention mask), feeds them to ``bert_model`` and maps the second element of
    its outputs to ``N_CLASSES`` probabilities.

    Args:
        bert_model: Pre-trained BERT encoder to fine-tune.

    Returns:
        A compiled Keras model (Adam with learning rate 2e-5, sparse categorical
        cross-entropy loss, accuracy metric). Note that this is a plain
        ``tf.keras.Model`` wrapping the encoder, not a ``TFBertModel``.
    """
    input_ids = tf.keras.Input(shape=(MAX_LEN,), dtype='int64', name='input_ids')
    attention_masks = tf.keras.Input(shape=(MAX_LEN,), dtype='int64', name='attention_masks')

    bert_outputs = bert_model([input_ids, attention_masks])
    # Index 1 of the encoder outputs is used as the sentence representation
    # (the pooled output in TFBertModel — see the transformers documentation).
    pooled_output = bert_outputs[1]
    probabilities = tf.keras.layers.Dense(N_CLASSES, activation='softmax', name='output')(pooled_output)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=probabilities)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        # Labels are integer class ids, hence the sparse variant of the loss.
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

Checking compiled model

In [None]:
# Build the compiled classifier on top of the loaded BERT encoder and print its
# layer/parameter overview.
model = create_model(bert_model)
model.summary()

In [None]:
# Freezing BERT layers
# model.layers[2].trainable = False
# model.summary()

In [None]:
# Stop as soon as the validation loss fails to improve for one epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)

# Write a checkpoint after every epoch; the file name records the epoch number
# and the training/validation accuracy and loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoints/model.{epoch:02d}_{accuracy:.2f}-{loss:.2f}_{val_accuracy:.2f}-{val_loss:.2f}.h5'
)

# Log training curves for TensorBoard.
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs')

callbacks = [early_stopping, checkpoint, tensorboard]

#### Training

In [None]:
# Fine-tune on the training generator, evaluating on the validation generator
# after every epoch. `history` is used by the plotting cell below.
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=N_EPOCHS,
    callbacks=callbacks,
    shuffle=True,
    verbose=1,
)

Saving final model

In [None]:
# Persist the model as it stands after the last trained epoch.
# NOTE(review): the path is relative to the working directory set by the %cd
# cell at the top, which already ends in /model — confirm that writing to a
# nested ./model/ directory is intended.
model.save('./model/political-bias-final-model.h5')

## Visualizations

### History

In [None]:
from matplotlib.ticker import MaxNLocator

In [None]:
# Plot training and validation accuracy per epoch.
# Both curves are padded with a leading 0 so they start at epoch 0, and the
# x values now run 0..n to match that padding (they used to run 1..n+1, which
# shifted every point one epoch to the right).
n = len(history.history['accuracy'])
accuracy = np.concatenate(([0.0], history.history['accuracy']))
val_accuracy = np.concatenate(([0.0], history.history['val_accuracy']))
epochs = np.arange(n + 1)

fig, ax = plt.subplots(1, 1, figsize=(15, 10))

labels = ['Accuracy', 'Validation Accuracy']
colors = ['#1f77b4', '#ff7f0e']
for label, color, values in zip(labels, colors, (accuracy, val_accuracy)):
    ax.plot(epochs, values, color=color)
    ax.scatter(epochs, values, color=color)
    # Label each curve right below its last point instead of using a legend.
    ax.text(
        epochs[-1],
        values[-1] - 0.01,
        label,
        c=color,
        horizontalalignment='left',
        verticalalignment='top',
        size=14
    )

# Integer epoch ticks and percentage-formatted accuracy.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1, decimals=0))

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')

# The previous title ('Data split') described a different figure.
ax.set_title('Training and validation accuracy')

# plt.show() works with the inline backend; fig.show() expects a GUI backend.
plt.show()

## Testing Results

In [None]:
# Class probabilities for the test split: one row per sample served by test_gen,
# one column per class. test_gen is unshuffled and only yields full batches, so
# the rows follow df_test's order and a trailing partial batch is not predicted.
predictions = model.predict(test_gen)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

Accuracy

In [None]:
# Predicted class name per sample. Computed here so this cell does not rely on
# a variable that is only assigned in a later cell.
y_pred = le.inverse_transform(np.argmax(predictions, axis=1))

# test_gen is unshuffled and skips a trailing partial batch, so the predictions
# correspond to the first len(y_pred) test rows. Slicing by that length works
# for any test-set size (the former [:-1] only matched a remainder of exactly 1).
y_true = le.inverse_transform(df_test['spectrum'].iloc[:len(y_pred)])

accuracy = accuracy_score(y_true, y_pred)
accuracy

F1-Score

In [None]:
# Predicted class name per sample. Computed here so this cell does not rely on
# a variable that is only assigned in a later cell.
y_pred = le.inverse_transform(np.argmax(predictions, axis=1))

# test_gen is unshuffled and skips a trailing partial batch, so the predictions
# correspond to the first len(y_pred) test rows. Slicing by that length works
# for any test-set size (the former [:-1] only matched a remainder of exactly 1).
y_true = le.inverse_transform(df_test['spectrum'].iloc[:len(y_pred)])

# Micro-averaged F1 restricted to the classes present in the test split.
f1 = f1_score(
    y_true, y_pred,
    labels=list(set(le.inverse_transform(df_test['spectrum']))),
    average='micro'
)
f1

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Predicted class name per sample (argmax over the class axis, decoded in one
# vectorized call instead of one inverse_transform per row).
y_pred = le.inverse_transform(np.argmax(predictions, axis=1))

# test_gen is unshuffled and skips a trailing partial batch, so the predictions
# correspond to the first len(y_pred) test rows. Slicing by that length works
# for any test-set size (the former [:-1] only matched a remainder of exactly 1).
y_true = le.inverse_transform(df_test['spectrum'].iloc[:len(y_pred)])

# Row-normalized matrix (each row sums to 1 over the predicted classes).
# Passing `labels` fixes the row/column order and keeps the matrix the same
# size as `display_labels` even if a class never occurs in the test data.
cf_matrix = confusion_matrix(y_true, y_pred, labels=le.classes_, normalize='true')

disp = ConfusionMatrixDisplay(cf_matrix, display_labels=le.classes_)

fig, ax = plt.subplots(figsize=(10,10))
disp.plot(ax=ax, xticks_rotation=45)

disp.ax_.set_title('Confusion Matrix')
disp.im_.colorbar.remove()
disp.ax_.set_xlabel('Predicted label')

# plt.show()