In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModel, AutoTokenizer, AdamW
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import seaborn as sns
import os
import random
from IPython import display

# Checkpoint identifier handed to AutoModel / AutoTokenizer.from_pretrained below.
model_url = 'dumitrescustefan/bert-base-romanian-uncased-v1'
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook shows the selected device as the cell output.
device


device(type='cuda')

In [2]:
import random


def reset_numpy_seed(seed_value=42):
  """Seed NumPy's global random generator and print the outcome.

  Failures (for example NumPy not being importable) are reported with a
  printed message instead of being raised.
  """
  try:
    import numpy
    numpy.random.seed(seed_value)
    outcome = f'NumPy random seed set with value: {seed_value}'
  except Exception as err:
    outcome = f'NumPy random seed was not set: {err}'
  print(outcome)


def reset_tensorflow_seed(seed_value=42):
  """Seed TensorFlow's random generator when TensorFlow can be imported.

  Two seeding entry points are attempted, `tf.random.set_seed` and
  `tf.set_random_seed`, because which one exists depends on the installed
  TensorFlow version. The outcome is printed; nothing is raised.
  """
  try:
    import tensorflow as tf
    # Attribute lookup is deferred into the loop so that a missing API is
    # ignored per attempt, just like a failing call.
    setter_lookups = (
      lambda: tf.random.set_seed,
      lambda: tf.set_random_seed,
    )
    seeded = False
    for lookup in setter_lookups:
      try:
        lookup()(seed_value)
        seeded = True
      except Exception:
        continue
    if not seeded:
      print('TensorFlow random seed was not set')
    else:
      print(f'TensorFlow random seed set with value: {seed_value}')
  except Exception as err:
    print(f'TensorFlow random seed was not set: {err}')


def reset_torch_seed(seed_value=42):
  """Seed PyTorch's generator and, when CUDA is available, the CUDA generators.

  The outcome is printed; exceptions are reported rather than propagated.
  """
  try:
    import torch as _torch
    _torch.manual_seed(seed_value)
    cuda = _torch.cuda
    if cuda.is_available():
      # Current device first, then every device (multi-GPU setups).
      for seed_fn in (cuda.manual_seed, cuda.manual_seed_all):
        seed_fn(seed_value)
    outcome = f'PyTorch random seed set with value: {seed_value}'
  except Exception as err:
    outcome = f'PyTorch random seed was not set: {err}'
  print(outcome)


def set_random_seeds(seed_value=42):
  """Seed Python's `random` module, then NumPy, TensorFlow and PyTorch in turn.

  Each library-specific helper prints its own status line.
  """
  random.seed(seed_value)
  for reseed in (reset_numpy_seed, reset_tensorflow_seed, reset_torch_seed):
    reseed(seed_value)


if __name__ == '__main__':
  # The guard holds when this runs as a notebook cell (the status lines below
  # this cell are printed by the call).
  # Set the desired seed value
  seed = 42

  # Seed Python's `random`, NumPy, TensorFlow (when importable) and PyTorch
  set_random_seeds(seed)


NumPy random seed set with value: 42
TensorFlow random seed was not set: No module named 'tensorflow'
PyTorch random seed set with value: 42


In [3]:
class DatasetTransformer(Dataset):
  """Map-style dataset that tokenizes one text sample per lookup.

  Args:
    X: indexable collection of raw text strings.
    y: indexable collection of labels aligned with `X`; its length defines
      the dataset length.
    tokenizer: object exposing an `encode` method that accepts the keyword
      arguments used in `__getitem__` (a Hugging Face tokenizer in this
      notebook).
    max_length: number of token ids every sample is padded / truncated to.
      Defaults to 512, the value that was previously hard-coded.
  """

  # Legacy cedilla characters and the comma-below characters they are
  # replaced with before tokenization.
  # NOTE(review): presumably this matches the normalization the pretrained
  # tokenizer expects - confirm against the model card.
  _DIACRITIC_FIXES = (("ţ", "ț"), ("ş", "ș"), ("Ţ", "Ț"), ("Ş", "Ș"))

  def __init__(self, X, y, tokenizer, max_length=512):
    self.tokenizer = tokenizer
    self.X = X
    self.y = y
    self.max_length = max_length

  def __len__(self):
    """Return the number of samples (the number of labels)."""
    return len(self.y)

  def __getitem__(self, idx):
    """Return `(token_ids, label_tensor)` for the sample at position `idx`.

    `token_ids` is whatever `tokenizer.encode` returns for
    `return_tensors='pt'`, padded / truncated to `max_length`.
    """
    text = self.X[idx]
    label = self.y[idx]

    for legacy, modern in self._DIACRITIC_FIXES:
      text = text.replace(legacy, modern)

    text_tensor = self.tokenizer.encode(
      text,
      add_special_tokens=True,
      max_length=self.max_length,
      padding='max_length',
      return_tensors='pt',
      truncation=True,
    )

    return text_tensor, torch.tensor(label)



In [4]:
class TransformerModel(nn.Module):
  """Pretrained transformer encoder followed by a linear classification head.

  Args:
    in_dim: size of the encoder representation fed to the linear head.
    no_classes: number of output logits.
  """

  def __init__(self, in_dim=768, no_classes=2):
    super().__init__()

    self.transformer = AutoModel.from_pretrained(model_url)
    self.fc1 = nn.Linear(in_dim, no_classes)

  def forward(self, x):
    """Return classification logits for a batch of token ids.

    Args:
      x: token-id tensor; a singleton dimension at index 1 (as produced by
        tokenizing with `return_tensors='pt'` and then batching) is removed.

    Returns:
      Tensor with `no_classes` logits per sample.
    """
    token_ids = x.squeeze(1)

    # Inputs are padded to a fixed length, so mask the padding positions;
    # without a mask the encoder attends to pad tokens as if they were text.
    pad_id = getattr(self.transformer.config, 'pad_token_id', None)
    attention_mask = None if pad_id is None else (token_ids != pad_id).long()

    # First output is the per-token hidden states; keep the first token's.
    hidden_states = self.transformer(token_ids, attention_mask=attention_mask)[0]
    pooled = hidden_states[:, 0, :]

    # F.dropout defaults to training=True; tie it to the module mode so that
    # model.eval() really disables dropout during validation and prediction.
    pooled = F.dropout(pooled, p=0.1, training=self.training)
    return self.fc1(pooled)

In [5]:
# Load the tokenizer from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(model_url)

In [6]:
def train_epoch(model, optim, loss_fn, dataloader, epoch_idx):
    """Train the model for one epoch.

    Args:
        model: network to optimize; called on each batch of inputs.
        optim: optimizer whose `zero_grad` / `step` are called once per batch.
        loss_fn: callable receiving (logits, one-hot float targets).
        dataloader: iterable yielding (inputs, integer class labels) batches.
        epoch_idx: index of the current epoch; not used by the body, kept so
            existing callers keep working.

    Returns:
        Tuple `(epoch_loss, clf_report)`: the loss averaged over batches and
        the classification report as a dict.
    """
    # Put the model in training mode
    model.train()

    # Move batches to wherever the model lives; fall back to the notebook-level
    # `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    epoch_loss = 0
    preds = []
    gt = []

    for inputs, labels in dataloader:
        # Reset gradients
        optim.zero_grad()

        inputs = inputs.to(run_device)
        labels = labels.to(run_device)

        output = model(inputs)

        # One-hot targets sized from the logits, so the class count follows
        # the model instead of being fixed at 2.
        targets = F.one_hot(labels, num_classes=output.shape[-1]).float()

        # Calculate the loss, backpropagate and update the weights
        loss = loss_fn(output, targets)
        loss.backward()
        optim.step()

        epoch_loss += loss.item()

        # argmax over logits equals argmax over softmax probabilities
        preds.append(output.detach().argmax(dim=1).cpu().numpy())
        gt.append(labels.cpu().numpy())

    # Average the batch losses
    epoch_loss /= len(dataloader)

    preds = np.concatenate(preds)
    gt = np.concatenate(gt)

    # Get an epoch classification report
    clf_report = classification_report(gt, preds, output_dict=True)

    return epoch_loss, clf_report

def test(model, loss_fn, dataloader):
    """Evaluate the model on a data loader without computing gradients.

    Args:
        model: network to evaluate; switched to evaluation mode.
        loss_fn: callable receiving (logits, one-hot float targets).
        dataloader: iterable yielding (inputs, integer class labels) batches.

    Returns:
        Tuple `(test_loss, clf_report, clf_report_text)`: the loss averaged
        over batches, the classification report as a dict, and the same
        report as formatted text.
    """
    # Put the model in evaluation mode
    model.eval()

    # Move batches to wherever the model lives; fall back to the notebook-level
    # `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    test_loss = 0
    preds = []
    gt = []

    # Tell PyTorch that we won't be computing gradients
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(run_device)
            labels = labels.to(run_device)

            output = model(inputs)

            # One-hot targets sized from the logits, so the class count
            # follows the model instead of being fixed at 2.
            targets = F.one_hot(labels, num_classes=output.shape[-1]).float()
            test_loss += loss_fn(output, targets).item()

            # argmax over logits equals argmax over softmax probabilities
            preds.append(output.argmax(dim=1).cpu().numpy())
            gt.append(labels.cpu().numpy())

    test_loss /= len(dataloader)
    preds = np.concatenate(preds)
    gt = np.concatenate(gt)

    # Get a classification report, both as a dict and as printable text
    clf_report = classification_report(gt, preds, output_dict=True)
    clf_report_text = classification_report(gt, preds)

    return test_loss, clf_report, clf_report_text

In [7]:
# CSV files used as the default inputs of train_test_valid; both paths are
# relative to the notebook's working directory.
train_path = 'train.csv'
test_path = 'test.csv'

In [8]:
def preprocess_df(df):
    """Return a copy of `df` with a combined `title_content` text column.

    Missing values in `title` and `content` are replaced with empty strings,
    then the two columns are joined with a single space. The input frame is
    left untouched, so re-running a cell on the same raw frame is safe.

    Args:
        df: DataFrame with `title` and `content` columns.

    Returns:
        New DataFrame with `title` / `content` filled and `title_content` added.
    """
    cleaned = df.assign(
        title=df['title'].fillna(''),
        content=df['content'].fillna(''),
    )
    cleaned = cleaned.assign(title_content=cleaned['title'] + ' ' + cleaned['content'])

    # Other preprocessing

    return cleaned

In [9]:
# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def _collect_texts(df, remove_punctuation):
    """Return the `title_content` column as a list, optionally stripped of punctuation."""
    texts = df['title_content'].tolist()
    if remove_punctuation:
        texts = [_PUNCTUATION_RE.sub('', text) for text in texts]
    return texts


def train_test_valid(train_path=train_path, test_path=test_path, remove_punctuation=True, seed=42):
    """Load the train / test CSVs and split the training data into train / validation.

    Args:
        train_path: CSV with `title`, `content` and `class` columns.
        test_path: CSV with `title` and `content` columns.
        remove_punctuation: when True, drop every character that is neither a
            word character nor whitespace from the texts.
        seed: random state of the stratified train / validation split.

    Returns:
        Tuple `(X_train, X_val, X_test, y_train, y_val, y_test)`. 10% of the
        training samples go to validation, stratified by class. `y_test` is a
        list of zeros with one entry per test sample, since no labels are read
        from the test CSV.
    """
    df_train = preprocess_df(pd.read_csv(train_path))
    df_test = preprocess_df(pd.read_csv(test_path))

    X_train = _collect_texts(df_train, remove_punctuation)
    y_train = [int(class_id) for class_id in df_train['class']]
    X_test = _collect_texts(df_test, remove_punctuation)

    # And we finally split the training data into train/valid
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.1, stratify=y_train, random_state=seed
    )
    # Placeholder labels so the test split fits the same (text, label) interface
    y_test = [0] * len(X_test)

    return X_train, X_val, X_test, y_train, y_val, y_test


In [10]:
# Load the dataset splits, keeping punctuation in the text
X_train, X_val, X_test, y_train, y_val, y_test = train_test_valid(remove_punctuation=False)

# Train on a smaller amount of data: keep only the first `data_size` training samples
data_size = 1000
X_train = X_train[:data_size]
y_train = y_train[:data_size]

In [11]:
# Show the first training text together with its label as the cell output.
X_train[0], y_train[0]

('Cum şi-a refuzat Comisarul Firinel mântuirea? Închis la şliţ pentru a nu i se vedea „România nor-ma-lă,” noul regim şi-a trimis sculele pe post de cerberi.Astfel nu se explică minunea. Din obermaister al fortăreţei de suflete închise, Marginea dâmboviţeană, comisarul Firinel Ungureanu a ajuns director general al Administraţiei Naţionale a Penitenciarelor. Din înalta postură de cerber al proscrişilor, comisarul a făcut tot ce e cătăneşte posibil să cadă bine în ochii noilor stăpâni. Nu l-au deranjat nici regulamentele, nici drepturile fundamentale ale omului, nici informaţia că Papa n-are divizii de luptă, ci doar ambasadori. Prin cel puţin trei decizii le-a arătat creştinilor că-şi refuză mântuirea. L-a luat pe Liviu Dragnea din regimul special de detenţie şi l-a dus printre cei de drept comun, periclitându-i viaţa, doar pentru a fi pe plac celor care l-au numit. N-are rost să ne mai întrebăm ce e la bază şi ce e la vârf. Băieţii ştiu că nu slobozesc pe fiştecine să fie câine de lagă

In [12]:
BATCH_SIZE = 128

# Wrap every split in a dataset that tokenizes its texts on access
ds_train, ds_val, ds_test = (
    DatasetTransformer(texts, labels, tokenizer)
    for texts, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Settings shared by the three loaders
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=16)

# Training batches are drawn in random order; validation and test keep dataset order
train_dataloader = DataLoader(ds_train, sampler=RandomSampler(ds_train), **_loader_kwargs)
val_dataloader = DataLoader(ds_val, sampler=SequentialSampler(ds_val), **_loader_kwargs)
test_dataloader = DataLoader(ds_test, sampler=SequentialSampler(ds_test), **_loader_kwargs)

In [13]:
# Instantiate our model and move it to the selected device
model = TransformerModel().to(device)

# Freeze the Transformer except for its bias terms: a parameter stays
# trainable only when its name contains "bias"
trainable_params_transformer = []
for name, param in model.transformer.named_parameters():
    param.requires_grad = "bias" in name
    if param.requires_grad:
        trainable_params_transformer.append(param)

# We'll train the final layer and the bias terms
trainable_params = list(model.fc1.parameters()) + trainable_params_transformer

# Define our loss and optimizer
loss_fn = nn.BCEWithLogitsLoss()
optim = torch.optim.AdamW(trainable_params)

no_epochs = 10
# Start from +inf so the first epoch always counts as an improvement,
# whatever the scale of the loss
best_val_loss = float('inf')

In [None]:
# Per-epoch history of losses and accuracies
train_losses, val_losses = [], []
train_accs, val_accs = [], []

# tqdm renders a progress bar over the epochs
for epoch_idx in tqdm(range(no_epochs)):
    # One optimization pass over the training data
    train_loss, train_report = train_epoch(model, optim, loss_fn, train_dataloader, epoch_idx)
    train_acc = train_report['accuracy']
    train_losses.append(train_loss)
    train_accs.append(train_acc)

    # Evaluate on the validation split
    val_loss, val_report, val_report_text = test(model, loss_fn, val_dataloader)
    val_acc = val_report['accuracy']
    val_losses.append(val_loss)
    val_accs.append(val_acc)

    # Checkpoint whenever the validation loss improves on the best seen so far
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pt')

    # Report this epoch's metrics, one item per line
    print(
        f"Epoch {epoch_idx}",
        f"Train loss: {train_loss}",
        f"Train accuracy: {train_acc}",
        f"Validation loss: {val_loss}",
        f"Validation accuracy: {val_acc}",
        val_report_text,
        sep="\n",
    )


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Restore the best checkpoint written by the training loop. The file name must
# match the one passed to torch.save there ('best_model.pt'); map_location keeps
# the load working when the checkpoint was saved on a different device.
model.load_state_dict(torch.load('best_model.pt', map_location=device))

In [None]:
def predict(model, data_loader):
    """Run inference over a data loader and collect labels and predictions.

    Args:
        model: network returning one row of logits per sample; switched to
            evaluation mode.
        data_loader: iterable yielding (inputs, integer class labels) batches.

    Returns:
        Tuple `(gt, preds)` of NumPy arrays: the labels exactly as yielded by
        the loader, and the index of the largest logit for every sample.
    """
    model.eval()

    # Move batches to wherever the model lives; fall back to the notebook-level
    # `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    preds = []
    gt = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            output = model(inputs.to(run_device))

            # argmax over logits equals argmax over softmax probabilities
            preds.append(output.argmax(dim=1).cpu().numpy())
            # Labels are passed through untouched (no one-hot round trip)
            gt.append(labels.cpu().numpy())

    return np.concatenate(gt), np.concatenate(preds)

In [None]:
# Predict on the test loader; `gt` holds the labels attached to that loader.
gt, preds = predict(model, test_dataloader)

In [None]:
# Display the installed PyTorch version as the cell output.
torch.__version__