<a href="https://colab.research.google.com/github/EclipseQuinten/Thesis/blob/main/THESIS_SENTIMENT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModel
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Mount Google Drive (Colab only) and load the two training spreadsheets.
from google.colab import drive
drive.mount('/content/drive')

# Only the review text and its label column are kept from each spreadsheet.
trainVlaams = pd.read_excel('/content/drive/MyDrive/Thesis/TRAIN_VLAAMS.xlsx')[['review', 'number']]
trainNederlands = pd.read_excel('/content/drive/MyDrive/Thesis/TRAIN_NEDERLANDS.xlsx')[['review', 'number']]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
from torch import cuda

torch.cuda.empty_cache()  # hand cached, currently unused GPU memory back first
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Given a list of lists, flatten it by one level.
def flattenList(givenList):
  """Concatenate the elements of every sub-list of ``givenList`` into a new list.

  Only one level of nesting is removed; the order of the elements is kept.
  """
  return [element for sublist in givenList for element in sublist]

In [None]:
# Given a df, return a list of label-stratified folds followed by one test part.
def fiveFoldCrossValidation(df, n_folds=5, random_state=200):
  """Split ``df`` into ``n_folds`` folds plus one test part, stratified by label.

  The rows are grouped by the value of the ``number`` column (2, 1 and 0, in
  that order; rows holding any other value are left out). Every group is cut
  into ``n_folds + 1`` parts of roughly equal size, and the i-th parts of all
  groups are stacked to form the i-th returned frame.

  Args:
    df: DataFrame with a ``number`` column holding the class label.
    n_folds: number of folds to create besides the test part. The default of
      5 reproduces the original hard-coded behaviour.
    random_state: seed handed to every ``DataFrame.sample`` call.

  Returns:
    A list of ``n_folds + 1`` DataFrames: the folds first, the test part
    last. Each frame is rebuilt from plain row lists, so it has a fresh
    0..n-1 index and default integer column labels (callers re-assign the
    column names).

  Raises:
    ValueError: if ``n_folds`` is smaller than 1.
  """
  if n_folds < 1:
    raise ValueError("n_folds must be at least 1")

  n_parts = n_folds + 1
  rows_per_part = [[] for _ in range(n_parts)]

  for label in (2, 1, 0):
    remaining = df[df.number == label]
    for part_idx in range(n_folds):
      # Taking 1/k of what is still left, for k = n_parts down to 2, yields
      # parts of (near) equal size without computing row counts by hand.
      sampled = remaining.sample(frac=1 / (n_parts - part_idx), random_state=random_state)
      remaining = remaining.drop(sampled.index)
      rows_per_part[part_idx].extend(sampled.values.tolist())
    # Whatever has not been sampled into a fold becomes this label's test share.
    rows_per_part[-1].extend(remaining.values.tolist())

  return [pd.DataFrame(rows) for rows in rows_per_part]

In [None]:
# Load the tokenizer used by SentimentData to encode the reviews.
# Uncomment the preferred tokenizer; keep the checkpoint name in sync with the
# base model loaded in RobertaClass.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
#tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

In [None]:
# Input Data for the BERT model.
class SentimentData(Dataset):
    """Dataset that tokenizes one review per item, on the fly.

    Args:
        dataframe: frame with a ``review`` column (text) and a ``number``
            column (class label).
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_len: every encoded review is padded or truncated to exactly this
            many tokens, so items can be stacked into a batch.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.review
        self.targets = self.data.number
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded review and its label at position ``index``.

        Returns:
            A dict with the tensors ``ids``, ``mask`` and ``token_type_ids``
            (dtype long) and ``targets`` (dtype float).
        """
        # Positional lookup: a label lookup (self.text[index]) raises KeyError
        # as soon as the frame's index is not 0..n-1, e.g. after filtering.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding=` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Ask for truncation explicitly instead of relying on the
            # tokenizer's fallback (and its warning) when only max_length is set.
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Kept as float for compatibility; train/valid cast it to long.
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# BERT Class.
class RobertaClass(torch.nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Despite its name, the class wraps whichever checkpoint ``model_name``
    points to; the default is multilingual BERT.

    Args:
        model_name: checkpoint handed to ``AutoModel.from_pretrained``, e.g.
            "pdelobelle/robbert-v2-dutch-base" for the Dutch RoBERTa variant.
            Keep it in sync with the tokenizer's checkpoint.
        num_labels: number of output classes of the linear head.
        dropout: dropout probability applied before the linear head.
    """

    def __init__(self, model_name="bert-base-multilingual-uncased", num_labels=3, dropout=0.1):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768.
        self.classifier = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one row of unnormalised class scores (logits) per input sequence."""
        encoder_output = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_state = encoder_output[0]
        # Position 0 along the second axis: the first token of every sequence.
        pooler = hidden_state[:, 0]
        pooler = self.dropout(pooler)
        # No softmax here: the raw logits are returned and the notebook's
        # CrossEntropyLoss normalises them itself.
        return self.classifier(pooler)

In [None]:
# Creating train and test sets. Parameter j is between 0 and 4 and indicates which fold.
def setUpTrainAndTestSet(j):
  """Build the training loader for fold ``j`` and one test loader per language.

  Both language datasets are split with ``fiveFoldCrossValidation``; fold
  ``j`` of each is stacked into the training set, and the last part of each
  split is used as that language's test set.

  Args:
    j: index of the fold to train on (0-4 with the default split).

  Returns:
    ``[training_loader, testing_loader_vlaams, testing_loader_nederlands]``.

  Raises:
    ValueError: if ``j`` does not address one of the training folds. Without
      this check ``j=5`` or ``j=-1`` would silently train on the test part.
  """
  dataVlaams = fiveFoldCrossValidation(trainVlaams)
  dataNederlands = fiveFoldCrossValidation(trainNederlands)

  n_train_folds = len(dataVlaams) - 1  # the last element is the test part
  if not 0 <= j < n_train_folds:
    raise ValueError("j must be between 0 and %d, got %r" % (n_train_folds - 1, j))

  # The split returns frames with integer column labels; restore the names.
  for fold in dataVlaams + dataNederlands:
    fold.columns = ['review', 'number']

  test_data_vlaams = dataVlaams[-1]
  test_data_nederlands = dataNederlands[-1]

  # Stack the two folds. An outer merge on every column is not a
  # concatenation: it collapses rows present in both frames and multiplies
  # rows duplicated within a frame. ignore_index gives a clean 0..n-1 index.
  train_data = pd.concat([dataVlaams[j], dataNederlands[j]], ignore_index=True)

  training_set = SentimentData(train_data, tokenizer, MAX_LEN)
  testing_set_vlaams = SentimentData(test_data_vlaams, tokenizer, MAX_LEN)
  testing_set_nederlands = SentimentData(test_data_nederlands, tokenizer, MAX_LEN)

  train_params = {'batch_size': TRAIN_BATCH_SIZE,
  'shuffle': True,
  'num_workers': 0
  }

  test_params = {'batch_size': VALID_BATCH_SIZE,
  'shuffle': False,
  'num_workers': 0
  }

  training_loader = DataLoader(training_set, **train_params)
  testing_loader_vlaams = DataLoader(testing_set_vlaams, **test_params)
  testing_loader_nederlands = DataLoader(testing_set_nederlands, **test_params)
  return [training_loader, testing_loader_vlaams, testing_loader_nederlands]

In [None]:
# Count how many predictions agree with their targets.
def calcuate_accuracy(preds, targets):
    """Return the number of positions at which ``preds`` equals ``targets``.

    This is a raw count of correct predictions, not a ratio; the callers
    divide it by the number of examples themselves.
    """
    matches = preds == targets
    return matches.sum().item()

In [None]:
# The training function, takes the epoch number and the training loader.
def train(epoch, training_loader):
    """Run one training epoch over ``training_loader`` and print its metrics.

    Works on the notebook-level globals ``model``, ``loss_function``,
    ``optimizer`` and ``device``.

    Args:
        epoch: number of the current epoch; only used in the printed message.
        training_loader: iterable of batches, each a dict with the keys
            ``ids``, ``mask``, ``token_type_ids`` and ``targets``.

    Returns:
        None. The epoch's accuracy (in percent) and mean loss are printed.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrapping the loader itself (not enumerate(...)) lets tqdm read its
    # length, so the bar shows a total and an ETA.
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()

        # Index of the highest score per row = predicted class; detached so
        # the bookkeeping stays outside the autograd graph.
        _, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_steps == 0:
        # Nothing was iterated: avoid dividing by zero below.
        print(f"No training batches were provided for epoch {epoch}.")
        return

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")

    return

In [None]:
# Validation function, takes in the model and a testing loader.
def valid(model, testing_loader):
    """Evaluate ``model`` on every batch of ``testing_loader``.

    Uses the notebook-level global ``device`` to place the batches.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` that
            returns one row of class scores per example.
        testing_loader: iterable of batches, each a dict with the keys
            ``ids``, ``mask``, ``token_type_ids`` and ``targets``.

    Returns:
        A tuple ``(accuracy, precision, recall)``. The accuracy is a
        percentage (0-100); precision and recall are the weighted-average
        scikit-learn scores over the labels 0, 1 and 2 (fractions, 0-1).

    Raises:
        ValueError: if the loader yields no examples.
    """
    model.eval()
    n_correct = 0
    nb_examples = 0
    all_targets = []
    all_predictions = []
    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            # No .squeeze(): it would drop the batch axis when the last batch
            # holds a single example. No .cuda(): the outputs already live on
            # `device`, and forcing CUDA breaks CPU-only runs.
            outputs = model(ids, mask, token_type_ids)
            _, big_idx = torch.max(outputs, dim=1)

            n_correct += calcuate_accuracy(big_idx, targets)
            nb_examples += targets.size(0)

            # Collect plain Python ints rather than 0-d tensors.
            all_targets.extend(targets.tolist())
            all_predictions.extend(big_idx.tolist())

    if nb_examples == 0:
        raise ValueError("testing_loader yielded no examples to evaluate")

    epoch_accu = (n_correct*100)/nb_examples

    precision = sklearn.metrics.precision_score(all_targets, all_predictions, labels=[0,1,2], average='weighted')
    recall = sklearn.metrics.recall_score(all_targets, all_predictions, labels=[0,1,2], average='weighted')

    return epoch_accu, precision, recall

In [None]:
# Calculating the accuracy, recall and precision for both languages.
def calculateValid(testing_loader_vlaams, testing_loader_nederlands):
  """Evaluate the global ``model`` on both test loaders and print the metrics.

  ``valid`` returns the accuracy as a percentage but precision and recall as
  fractions between 0 and 1, so the latter two are scaled by 100 before being
  printed with a percent sign.

  Args:
    testing_loader_vlaams: loader with the Flemish test data.
    testing_loader_nederlands: loader with the Dutch test data.
  """
  loaders = (("Flemish", testing_loader_vlaams), ("Dutch", testing_loader_nederlands))
  # Evaluate both languages first so the report is printed as one block.
  results = [(language, valid(model, loader)) for language, loader in loaders]

  for language, (accuracy, precision, recall) in results:
    print("Accuracy on %s test data = %0.2f%%" % (language, accuracy))
    print("Recall on %s test data = %0.2f%%" % (language, recall * 100))
    print("Precision on %s test data = %0.2f%%" % (language, precision * 100))

In [None]:
def entireTrain(epochSize, trainingLoader):
  """Call ``train`` once per epoch, for ``epochSize`` epochs numbered from 0."""
  epoch_numbers = range(epochSize)
  for current_epoch in epoch_numbers:
    train(current_epoch, trainingLoader)


In [None]:
# Defining some key variables that will be used later on in the training
EPOCHS = 2
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
LEARNING_RATE = 1e-05

# Build the loaders, training on fold 4.
result = setUpTrainAndTestSet(4)
trainingLoader, testing_loader_vl, testing_loader_nl = result

# GPU memory in use before the model is created.
print(torch.cuda.memory_allocated())

model = RobertaClass()

print(torch.cuda.memory_allocated())

# Move the model to the device selected earlier instead of a hard-coded
# 'cuda', so the cell also runs on a machine without a GPU.
model.to(device)

# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE, betas=(0.4, 0.7))

entireTrain(EPOCHS, trainingLoader)

calculateValid(testing_loader_vl, testing_loader_nl)

# Drop the model and release the cached GPU memory it occupied.
del model
torch.cuda.empty_cache()

print(torch.cuda.memory_allocated())

0


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0


0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  del sys.path[0]
62it [00:44,  1.39it/s]


The Total Accuracy for Epoch 0: 80.38617886178862
Training Loss Epoch: 0.5399546719366505


62it [00:44,  1.40it/s]


The Total Accuracy for Epoch 1: 83.53658536585365
Training Loss Epoch: 0.4045388288795948


15it [00:07,  2.06it/s]
  _warn_prf(average, modifier, msg_start, len(result))


[tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2)

  del sys.path[0]
17it [00:08,  2.00it/s]

[tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2)


  _warn_prf(average, modifier, msg_start, len(result))
