In [2]:
import pandas as pd
import numpy as np

# Load the training corpus; this CSV uses ">" as its field separator.
TRAINING_CSV = '../input/clean-greek-v1/training_df.csv'
df = pd.read_csv(TRAINING_CSV, sep=">")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Autor,Obra,Fragmento,Texto
0,Xenophon,Memorabilia,1.1.1,πολλακις εθαυμασα τισι ποτε λογοις αθηναιους ε...
1,Xenophon,Memorabilia,1.1.2,"πρωτον μεν ουν, ως ουκ ενομιζεν ους η πολις νο..."
2,Xenophon,Memorabilia,1.1.3,"δ᾽ ουδεν καινοτερον εισεφερε των αλλων, οσοι μ..."
3,Xenophon,Memorabilia,1.1.4,αλλ᾽ οι μεν πλειστοι φασιν υπο τε των ορνιθων ...
4,Xenophon,Memorabilia,1.1.5,καιτοι τις ουκ αν ομολογησειεν αυτον βουλεσθαι...


In [4]:
# Per-column summary of the frame (the rendered output lists count, unique,
# top and freq for each column).
df.describe()

Unnamed: 0,Autor,Obra,Fragmento,Texto
count,159847,159847,159847,159833
unique,56,446,52717,155924
top,Homer,Histories,1,αλλως τε και τουτο το χωριον εν τω πολεμω δημε...
freq,25314,20047,1132,33


In [5]:
# Count the missing values in each column.
df.isna().sum()

Autor         0
Obra          0
Fragmento     0
Texto        14
dtype: int64

In [6]:
# Remove, in place, every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [7]:
import unicodedata
import json
import wandb

from tqdm import tqdm
from ipywidgets import IntProgress

from sklearn.preprocessing import LabelEncoder

import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, BertForSequenceClassification
from torch.nn import functional as F
from transformers import AdamW

In [8]:
# Give every distinct author a consecutive integer id, following the order
# in which df.Autor.unique() returns them.
authors = df.Autor.unique()
autor_dict = dict(zip(authors, range(len(authors))))
print(autor_dict)

{'Xenophon': 0, 'Pseudo-Xenophon': 1, 'Hyperides': 2, 'Lycurgus': 3, 'Eusebius of Caesarea': 4, 'Demosthenes': 5, 'Homer': 6, 'Sophocles': 7, 'Isocrates': 8, 'Aristophanes': 9, 'Theophrastus': 10, 'Julian the Emperor': 11, 'Herodotus': 12, 'Strabo': 13, 'Aeschines': 14, 'Pausanias': 15, 'Andocides': 16, 'Antiphon': 17, 'Dinarchus': 18, 'Arrian': 19, 'Callimachus': 20, 'Lysias': 21, 'Apollonius Rhodius': 22, 'Thucydides': 23, 'Philostratus the Athenian': 24, 'Aristotle': 25, 'John, of Damascus (attributed author)': 26, 'Aeschylus': 27, 'Theocritus': 28, 'Apollodorus': 29, 'Plutarch': 30, 'Euripides': 31, 'Polybius': 32, 'Athenaeus': 33, 'Aelian': 34, 'Dionysius of Halicarnassus': 35, 'Procopius': 36, 'Appian': 37, 'Hippocrates': 38, 'Plato': 39, 'Basil, Saint, Bishop of Caesarea': 40, 'Aeneas Tacticus': 41, 'Asclepiodotus': 42, 'Quintus Smyrnaeus': 43, 'Clement of Alexandria': 44, 'Nonnus of Panopolis': 45, 'Cassius Dio Cocceianus': 46, 'Longinus': 47, 'Marcus Aurelius': 48, 'Longus': 4

In [9]:
# Persist the author -> id mapping as JSON.
with open('labels_encoder.json', 'w') as out_file:
    json.dump(autor_dict, out_file)

In [10]:
text_batch = df.Texto
# Store the integer class id as a real DataFrame column. Plain attribute
# assignment (`df.labels = ...`) does NOT create a column: pandas emits a
# UserWarning and the values live only on this Python object, so they vanish
# as soon as the frame is copied, filtered or written to disk.
# Attribute-style reads such as `df.labels` keep working with a real column.
df['labels'] = df.Autor.map(lambda author: autor_dict[author])

  


In [11]:
# Display the integer author ids obtained by mapping df.Autor through autor_dict.
df.labels

0          0
1          0
2          0
3          0
4          0
          ..
159842    44
159843    44
159844    44
159845    44
159846    44
Name: Autor, Length: 159833, dtype: int64

In [14]:
# Single source of truth for the pretrained checkpoint, shared by the
# tokenizer and the classifier so the two can never drift apart.
checkpoint = 'nlpaueb/bert-base-greek-uncased-v1'

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# raising on a hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The classification head gets one output per author in autor_dict.
model = BertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(autor_dict),
).to(device)

Some weights of the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

In [15]:
# Number of rows per author, most frequent first.
df['Autor'].value_counts()

Homer                                        25314
Euripides                                    16591
Aristophanes                                 12504
Polybius                                     12140
Sophocles                                     9250
Procopius                                     8258
Cassius Dio Cocceianus                        7128
Aeschylus                                     6731
Apollonius Rhodius                            5384
Herodotus                                     4329
Demosthenes                                   4277
Dionysius of Halicarnassus                    4258
Thucydides                                    3576
Eusebius of Caesarea                          3383
Pausanias                                     3170
Theophrastus                                  2658
Hippocrates                                   2623
Dio Chrysostom                                2570
Aristotle                                     2464
Plutarch                       

In [16]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split so both partitions keep the same author proportions.
# random_state pins the shuffle: without it every run produces a different
# train/validation partition and the reported losses are not reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df.Texto,
    df.labels,
    test_size=.2,
    stratify=df.labels,
    random_state=42,
)

# The tokenizer is called with plain Python lists of strings below.
train_texts = list(train_texts)
val_texts = list(val_texts)

In [17]:
# Both splits are tokenized with the same options: truncate to at most 512
# tokens and pad.
tokenize_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

In [18]:
from torch.utils.data import Dataset, DataLoader

class GreekDataset(Dataset):
    """Dataset pairing tokenizer encodings with one label per example.

    `encodings` is a mapping from field name to a per-example sequence, and
    `labels` is any iterable of labels in the same order as the examples.
    """

    def __init__(self, encodings, labels):
        """Keep the encodings and copy the labels into a plain list."""
        self.encodings = encodings
        # A list guarantees that labels are looked up by position in
        # __getitem__, whatever container they arrived in.
        self.labels = [*labels]

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [19]:
# Wrap each split's encodings and labels in a GreekDataset.
train_dataset = GreekDataset(encodings=train_encodings, labels=train_labels)
val_dataset = GreekDataset(encodings=val_encodings, labels=val_labels)

In [20]:
from transformers import Trainer, TrainingArguments

In [21]:
# Settings handed to the Trainer, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints/results and logs are written.
    output_dir='./results-stratified',
    logging_dir='./logs',
    # Optimisation schedule: epochs, learning-rate warmup steps, weight decay.
    num_train_epochs=2,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device for training and for evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Log every 100 steps; evaluate and save every 3000 steps.
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=3000,
    save_steps=3000,
)

In [22]:
# Inspect the validation labels returned by train_test_split.
val_labels

61611     13
86010     27
113283    32
10221      6
135034    39
          ..
133555    36
118762    32
149988    25
28835      6
108594    32
Name: Autor, Length: 31967, dtype: int64

In [23]:
# Log in to Weights & Biases (prompts for an API key, as the output shows).
# NOTE(review): presumably needed because the Trainer reports to wandb — confirm.
wandb.login()

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [24]:
# Wire the model, its training arguments and both datasets into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning; the result is the cell's last expression so it is displayed.
train_result = trainer.train()
train_result

[34m[1mwandb[0m: Currently logged in as: [33mdleirado[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Step,Training Loss,Validation Loss,Runtime,Samples Per Second
3000,0.9369,0.880408,545.2406,58.629
6000,0.7588,0.694633,544.9275,58.663
9000,0.5096,0.608101,544.8247,58.674
12000,0.4806,0.57466,544.8619,58.67
15000,0.4033,0.52269,544.8752,58.668


TrainOutput(global_step=15984, training_loss=0.7255179240061594, metrics={'train_runtime': 18751.3076, 'train_samples_per_second': 0.852, 'total_flos': 8.874582216700723e+16, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -356102144, 'train_mem_gpu_alloc_delta': 1362513408, 'train_mem_cpu_peaked_delta': 356294656, 'train_mem_gpu_peaked_delta': 12996649984})

In [25]:
# Write the fine-tuned model to the 'modelo_final' directory.
trainer.save_model(output_dir='modelo_final')