In [None]:
import pandas as pd
import numpy as np

# Location of the CSV holding the Greek texts, relative to the notebook.
DATA_PATH = '../input/textos-griegoscsv/textos_griegos.csv'

# Load the whole corpus into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# A notebook cell only renders its last expression, so the preview must be
# displayed explicitly; otherwise the result of df.head() is silently discarded.
display(df.head())

# Summary statistics of the columns (rendered as the cell output).
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Display the DataFrame as loaded, before any rows are dropped or filtered.
df

In [None]:
def sustituirchars(linea):
    """Rewrite the '.' and ',' punctuation of a line of text.

    The substitution runs in two passes:

    1. every '.' becomes ','
    2. every pair ',,' in the result (whether it was already present or
       was produced by pass 1) becomes a single '.'

    So an isolated '.' ends up as ',', while two adjacent characters
    drawn from '.' / ',' end up as a single '.'.

    Args:
        linea: the text to transform (a ``str``).

    Returns:
        The transformed string.
    """
    solo_comas = ','.join(linea.split('.'))
    # Pairs are consumed left to right without overlapping, so an odd run
    # leaves one trailing ',' behind.
    return '.'.join(solo_comas.split(',,'))

In [None]:
# Drop every row that contains a missing value. Rebinding the name (instead of
# mutating with inplace=True) keeps the step chainable and leaves any other
# reference to the previous frame untouched.
df = df.dropna()

In [None]:
# Display the DataFrame again to inspect it after the rows with missing values were dropped.
df

In [None]:
# Authors are kept only when they have strictly more than this many rows.
MIN_ROWS_PER_AUTHOR = 10

# Count rows per author once and reuse the result for both the mask and the lookup.
author_counts = df['Autor'].value_counts()
autores_more_than = author_counts[author_counts > MIN_ROWS_PER_AUTHOR].index

In [None]:
# Display the index of author names that passed the row-count filter.
autores_more_than

In [None]:
# Keep only the rows written by the selected authors. The explicit copy makes
# the result an independent frame, so the column assignments performed in later
# cells do not write into a slice (which can raise SettingWithCopyWarning).
df = df.loc[df.Autor.isin(autores_more_than)].copy()

In [None]:
# Run the punctuation substitution over every text in the 'Texto' column.
df['Texto'] = df['Texto'].apply(sustituirchars)

In [None]:
# Preview the first rows after the punctuation substitution on 'Texto'.
df.head()

In [None]:
import unicodedata

from tqdm import tqdm
from ipywidgets import IntProgress

from sklearn.preprocessing import LabelEncoder

import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, BertForSequenceClassification
from torch.nn import functional as F
from transformers import AdamW

In [None]:
def strip_accents_and_lowercase(s):
    """Remove accent marks from a string and lowercase it.

    The string is decomposed with Unicode NFD normalization, every character
    whose Unicode category is 'Mn' (non-spacing mark) is dropped, and the
    remaining characters are joined and lowercased.

    Args:
        s: the text to normalize (a ``str``).

    Returns:
        The accent-free, lowercased string.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).lower()

In [None]:
# Strip accents and lowercase every text in the 'Texto' column.
df['Texto'] = df['Texto'].apply(strip_accents_and_lowercase)

In [None]:
text_batch = df.Texto

# Keep the fitted encoder: label_encoder.classes_ / inverse_transform are the
# only way to map the integer class ids back to author names afterwards.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Autor'])

# Store the integer class id of each row next to its text.
df['labels'] = labels

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
PRETRAINED_NAME = 'nlpaueb/bert-base-greek-uncased-v1'

# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)

# One output class per distinct encoded author label.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    num_labels=len(np.unique(labels)),
).to(device)

In [None]:
# Number of rows per author in the filtered DataFrame.
df['Autor'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split stratified on the label column. The fixed random_state makes the
# split (and therefore the reported validation results) reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df.Texto,
    df.labels,
    test_size=.2,
    stratify=df.labels,
    random_state=42,
)

# Turn the text Series into plain Python lists before they are tokenized.
train_texts = list(train_texts)
val_texts = list(val_texts)

In [None]:
# Both splits are tokenized with exactly the same settings:
# truncation and padding enabled, with max_length set to 512.
tokenizer_options = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_texts, **tokenizer_options)
val_encodings = tokenizer(val_texts, **tokenizer_options)

In [None]:
from torch.utils.data import Dataset, DataLoader

class GreekDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with their class labels.

    Args:
        encodings: mapping from field name to a sequence that holds one
            entry per example (accessed via ``.items()`` and ``[idx]``).
        labels: iterable with one label per example; it is copied into a
            plain list, so ``idx`` is always a position in that list.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = list(labels)

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus a 'labels' tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap each split's encodings and labels in a GreekDataset.
train_dataset = GreekDataset(encodings=train_encodings, labels=train_labels)
val_dataset = GreekDataset(encodings=val_encodings, labels=val_labels)

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
# Training configuration passed to the Trainer.
training_args = TrainingArguments(
    # Output directory.
    output_dir='./results-stratified',
    # Total number of training epochs.
    num_train_epochs=2,
    # Batch size per device during training / evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Number of warmup steps for the learning rate scheduler.
    warmup_steps=500,
    # Strength of weight decay.
    weight_decay=0.01,
    # Directory for storing logs, and the logging interval in steps.
    logging_dir='./logs',
    logging_steps=100,
    # Step-based evaluation; evaluation and save intervals are both 3000 steps.
    evaluation_strategy='steps',
    eval_steps=3000,
    save_steps=3000,
)

In [None]:
# Display the labels of the validation split for a quick visual check.
val_labels

In [None]:
def compute_metrics(eval_pred):
    """Compute validation accuracy for the Trainer.

    Without a metrics callback, evaluation only reports the loss.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the gold class ids).

    Returns:
        dict with a single 'accuracy' entry in [0, 1].
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # report accuracy in addition to the loss
)

# Fine-tune the classifier.
trainer.train()

In [None]:
# Persist the fine-tuned model.
trainer.save_model('modelo_final')

# The Trainer was not given the tokenizer, so save it explicitly into the same
# directory; 'modelo_final' can then be reloaded on its own for inference.
tokenizer.save_pretrained('modelo_final')