In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

from sklearn.metrics import f1_score



In [3]:
# Load our data
data = pd.read_csv('ProblemData_512.csv') 
data

Unnamed: 0,text,label
0,καλημερα σας κατοχος της υπαριθμον 30010100311...,0
1,αγαπητοι κυριοι καλησπερα σας σημερα 31/1 2022...,1
2,στης 19.01 μπηκα λεωφορειο 022 παω τερμα δυο σ...,2
3,σταθμος τραμ γηπεδο καραισκακη. τηλεφωνο εκτακ...,3
4,"σταθμος κεραμεικος, γραμμη 3 . επιχειρησα αναν...",3
...,...,...
7096,καλημερα σας! παρακαλω ηθελα ενημερωθω ακυρωση...,1
7097,καλημερα σας καλο μηνα. λεωφορειο 730 τερμα τη...,1
7098,"καλημερα σας, απαραδεκτος οδηγος, καθως κοπελα...",2
7099,δρομολογιο 7 20 ενημερωστε γιατι περασε ποτε.,3


In [4]:
# There was one leftover NaN value for some reason...
data=data.dropna()
data = data.reset_index(drop = True)
data

Unnamed: 0,text,label
0,καλημερα σας κατοχος της υπαριθμον 30010100311...,0
1,αγαπητοι κυριοι καλησπερα σας σημερα 31/1 2022...,1
2,στης 19.01 μπηκα λεωφορειο 022 παω τερμα δυο σ...,2
3,σταθμος τραμ γηπεδο καραισκακη. τηλεφωνο εκτακ...,3
4,"σταθμος κεραμεικος, γραμμη 3 . επιχειρησα αναν...",3
...,...,...
7095,καλημερα σας! παρακαλω ηθελα ενημερωθω ακυρωση...,1
7096,καλημερα σας καλο μηνα. λεωφορειο 730 τερμα τη...,1
7097,"καλημερα σας, απαραδεκτος οδηγος, καθως κοπελα...",2
7098,δρομολογιο 7 20 ενημερωστε γιατι περασε ποτε.,3


In [5]:
# extract texts and labels into arrays
texts = data.text.values
labels = data.label.values

In [6]:
# Shouldn't be necessary but just to be safe
for text in texts:
    str(text)  

In [7]:
texts[0:5]

array(['καλημερα σας κατοχος της υπαριθμον 3001010031156797 ανωνυμη καρτας .τη 29/1/2022 προσπαθησα φορτισω μεσω κινητου τηλεφωνου nfc καποιο λογο φορτισε φαινεται εκρεμει φορτιση της αποθηκευμενης αξιας 10 ευρω μπορω φορτισω ουτε μεσω τηλεφωνου διοτι μου εμφανιζει μυνημα υπαρχει ηδη χρηματικη αξια αναμονη γεγονος υφισταται ,ουτε μεσω μηχανηματων εκδοσης λειτουργιας online κρατηση διοτι μου εμφανιζει μυνημα υπαρχουν δεδομενα φορτιση .οταν μπσινω μεσω site www.oasa.gr μενου της καρτας μου φαινεται σαν προιον αναμονη αποθηκευμενη αξια 10 ευρω πλην ομως μπορω κανενα τροπο φορτισω .τι μπορω κανω χασω 10 ευρω? υποψιν μονιμος κατοικος κρητης τη στιγμη επεστρεψα νησι. ευχαριστω',
       'αγαπητοι κυριοι καλησπερα σας σημερα 31/1 2022 ωρα 7:30 απογευμα κορη μου ετων 12.5 βρισκοταν τερμα 122 σαρωνιδα προορισμο λαγονησι δρομολογιο 7:40 εκτελεστηκε θεση βρισκοταν υπ αριθμον χεη 8394 λεωφορειο σταθμευμενο εμπροσθεν της τασης πραγματοποιωντας λεγομενα οδηγου διαλειμμα οδηγος αγενεστατος οταν ρωτηθη

In [8]:
labels[0:5]

array([0, 1, 2, 3, 3])

In [9]:
#load Greek BERT tokenizer 
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")

In [10]:
# testing tokens and token Ids for a random sentence
def print_rand_sentence():
    index = random.randint(0, len(texts)-1) #random index in texts list
    table = np.array([tokenizer.tokenize(texts[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(texts[index]))]).T #tokenize random text in texts list
    print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))  #print in table format

print_rand_sentence()

╒═══════════╤═════════════╕
│ Tokens    │   Token IDs │
╞═══════════╪═════════════╡
│ λεωφορειο │        5491 │
├───────────┼─────────────┤
│ συμφωνα   │         446 │
├───────────┼─────────────┤
│ ημερησιο  │       12371 │
├───────────┼─────────────┤
│ προγραμμα │         534 │
├───────────┼─────────────┤
│ φευγει    │        7436 │
├───────────┼─────────────┤
│ 9         │         132 │
├───────────┼─────────────┤
│ :         │         143 │
├───────────┼─────────────┤
│ 05        │        1597 │
├───────────┼─────────────┤
│ ,         │         119 │
├───────────┼─────────────┤
│ ξεκινησε  │        1158 │
├───────────┼─────────────┤
│ 9         │         132 │
├───────────┼─────────────┤
│ :         │         143 │
├───────────┼─────────────┤
│ 10        │         427 │
╘═══════════╧═════════════╛


In [11]:
# Find max sequensce lenght

MAX_LEN = 0
for text in texts:
    tokenized = tokenizer(text,return_tensors='pt',add_special_tokens=True)
    MAX_LEN = max(MAX_LEN, tokenized['input_ids'].size()[1])

In [12]:
MAX_LEN

511

In [13]:
# Tokenize and encode texts then extract token ids, attention masks and labels in torch tensor format.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
    return tokenizer.encode_plus(                    #returns dictionary with token ids, attention masks and token type ids
                        input_text,
                        add_special_tokens = True,   #[CLS], [SEP] tokens required by BERT
                        max_length = 511,            #calculated above
                        pad_to_max_length = True,    #padding tokens to be of the same size 
                        return_attention_mask = True,
                        return_tensors = 'pt'        #torch tensor format
                   )


for sample in texts:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0) #concatinating the tesnors in a single dimension
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [14]:
token_id

tensor([[  101,  7040,   383,  ...,     0,     0,     0],
        [  101, 10557,  5296,  ...,     0,     0,     0],
        [  101,   372,   278,  ...,     0,     0,     0],
        ...,
        [  101,  7040,   383,  ...,     0,     0,     0],
        [  101, 14240,   130,  ...,     0,     0,     0],
        [  101,  2576,  4463,  ...,     0,     0,     0]])

In [15]:
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [16]:
labels

tensor([0, 1, 2,  ..., 2, 3, 3])

In [17]:
#print ids and masks for a random sentence
def print_rand_sentence_encoding():
    index = random.randint(0, len(text) - 1)
    tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
    token_ids = [i.numpy() for i in token_id[index]]
    attention = [i.numpy() for i in attention_masks[index]]
    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table, 
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒═════════════╤═════════════╤══════════════════╕
│ Tokens      │   Token IDs │   Attention Mask │
╞═════════════╪═════════════╪══════════════════╡
│ [CLS]       │         101 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ καλησπερα   │        8299 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ σας         │         383 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ σημερα      │         432 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ στις        │         379 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ 12          │         463 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ /           │         122 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ 03          │        1824 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ /           │         122 │                1 │
├─────────────┼─────

In [18]:
# split data in training and validation sets

val_ratio = 0.2 
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# DataLoaders
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [19]:
# load Greek BERT model as a BertForSequenceClassification model

model = BertForSequenceClassification.from_pretrained(
    "nlpaueb/bert-base-greek-uncased-v1",
    num_labels = 5,
    output_attentions = False,
    output_hidden_states = False,
)


optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 3e-5,  # can also try 3e-5, 2e-5
                              eps = 1e-08 # AdamW's epsilon value. Probably optimal.
                              )


Some weights of the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

In [20]:
# Run on GPU
model.cuda()
device = torch.device('cuda')
#device = torch.device('cpu')

In [21]:
epochs = 30

# Define variables for early stopping
patience = 3
min_delta = 0.001
best_val_loss = float('inf')
counter = 0

for epoch in trange(epochs, desc='Epoch'):
    # Set model to training mode for trainig loop
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
    
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    # Set model to evaluation mode for validation loop
    model.eval()
    
    # Initialise metrics 
    val_loss = 0
    val_accuracy = []
    val_f1=[]

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask, 
                              labels=b_labels)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate validation metrics

        # Calculate loss
        eval_loss = eval_output.loss
        val_loss += eval_loss.item()

        # Calculate accuracy
        preds = np.argmax(logits, axis=1)
        accuracy = np.sum(preds == label_ids) / len(label_ids)
        val_accuracy.append(accuracy)

        # Calculate F1 score
        f1 = f1_score(label_ids, preds, average='weighted')
        val_f1.append(f1)

    # Calculate average metrics over all batches
    avg_val_loss = val_loss / len(validation_dataloader)
    avg_val_accuracy = np.mean(val_accuracy)
    avg_val_f1 = np.mean(val_f1)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\n\t - Validation loss: {:.4f}'.format(avg_val_loss))
    print('\n\t - Validation accuracy: {:.4f}'.format(avg_val_accuracy))
    print('\n\t - Validation F1 score: {:.4f}'.format(avg_val_f1))

    # Check if validation loss improved
    if avg_val_loss < best_val_loss - min_delta:
        best_val_loss = avg_val_loss
        counter = 0
        # Save the model
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        counter += 1
        if counter == patience:
            print("Validation loss did not improve for {} epochs. Early stopping...".format(patience))
            break

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]


	 - Train loss: 1.0346

	 - Validation loss: 0.8807

	 - Validation accuracy: 0.6320

	 - Validation F1 score: 0.6144


Epoch:   3%|▎         | 1/30 [05:17<2:33:14, 317.05s/it]


	 - Train loss: 0.8045

	 - Validation loss: 0.8712

	 - Validation accuracy: 0.6573

	 - Validation F1 score: 0.6455


Epoch:  10%|█         | 3/30 [15:50<2:22:29, 316.66s/it]


	 - Train loss: 0.6335

	 - Validation loss: 0.8913

	 - Validation accuracy: 0.6355

	 - Validation F1 score: 0.6302


Epoch:  13%|█▎        | 4/30 [21:06<2:17:08, 316.49s/it]


	 - Train loss: 0.4655

	 - Validation loss: 1.0722

	 - Validation accuracy: 0.6678

	 - Validation F1 score: 0.6436


Epoch:  13%|█▎        | 4/30 [26:22<2:51:28, 395.70s/it]


	 - Train loss: 0.3019

	 - Validation loss: 1.1763

	 - Validation accuracy: 0.6559

	 - Validation F1 score: 0.6538
Validation loss did not improve for 3 epochs. Early stopping...



