In [1]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 49.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 65.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import shutil
from transformers import TrainerCallback
class CheckpointLoaderCallback(TrainerCallback):
    def __init__(self):
      self.checkpoint_count = 0

    def on_save(self, args, state, control, **kwargs):
      self.checkpoint_count += 1 

      shutil.move(f"/content/{args.output_dir}/checkpoint-{self.checkpoint_count*args.save_steps}","/content/drive/MyDrive/ModelTakeTwo")
    


In [4]:
import numpy as np
import pandas as pd
from IPython.display import clear_output
import transformers
from transformers import BertForSequenceClassification,BertTokenizer,Trainer,TrainingArguments,DataCollatorWithPadding
import torch
from datasets import Dataset
import ast

In [5]:
authors = ['Ataol Behramoğlu', 'Attila İlhan', 'Cahit Sıtkı Tarancı', 'Cahit Zarifoğlu',
 'Can Yücel', 'Cemal Safi', 'Cemal Süreya', 'Edip Cansever',
 'Mehmet Akif Ersoy', 'Nazım Hikmet Ran', 'Necip Fazıl Kısakürek',
 'Orhan Veli Kanık', 'Turgut Uyar', 'Özdemir Asaf', 'Ümit Yaşar Oğuzcan']

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/model/last", num_labels = len(authors))
checkpoint_loader = CheckpointLoaderCallback()
clear_output()

In [7]:
train_dataset = pd.read_csv("/content/drive/MyDrive/train_dataset.csv", sep=";")
for column in ["input_ids","token_type_ids","attention_mask"]:
    for i in range(0,len(train_dataset[column])):
        train_dataset[column][i] = np.array(ast.literal_eval(train_dataset[column][i]))

for i in range(0,len(train_dataset["label"])):
        train_dataset["label"][i] = np.array(train_dataset["label"][i])

clear_output()

In [8]:
test_dataset = pd.read_csv("/content/drive/MyDrive/test_dataset.csv",sep=";")
for column in ["input_ids","token_type_ids","attention_mask"]:
    for i in range(0,len(test_dataset[column])):
        test_dataset[column][i] = np.array(ast.literal_eval(test_dataset[column][i]))


for i in range(0,len(test_dataset["label"])):
    test_dataset["label"][i] = np.array(test_dataset["label"][i])

clear_output()

In [9]:
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

In [10]:
training_args = TrainingArguments(
    output_dir="test-trainer", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=15, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=8,  # batch size for evaluation
    eval_steps = 1000, # Number of update steps between two evaluations.
    logging_steps = 500,
    save_steps=1000, # after # steps model is saved
    prediction_loss_only=True,
    
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    callbacks = [checkpoint_loader]
)

In [12]:
trainer.train()

***** Running training *****
  Num examples = 41344
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 77520


Step,Training Loss
500,1.4494
1000,1.4863
1500,1.5583
2000,1.4851
2500,1.5875
3000,1.5674
3500,1.5664
4000,1.5664
4500,1.561
5000,1.5741


Saving model checkpoint to test-trainer/checkpoint-1000
Configuration saved in test-trainer/checkpoint-1000/config.json
Model weights saved in test-trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-2000
Configuration saved in test-trainer/checkpoint-2000/config.json
Model weights saved in test-trainer/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-2000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-3000
Configuration saved in test-trainer/checkpoint-3000/config.json
Model weights saved in test-trainer/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-3000/tokenizer_config.json
Special tokens file 

TrainOutput(global_step=77520, training_loss=0.7235410696797081, metrics={'train_runtime': 38351.2208, 'train_samples_per_second': 16.171, 'train_steps_per_second': 2.021, 'total_flos': 1.6318999769186304e+17, 'train_loss': 0.7235410696797081, 'epoch': 15.0})

In [None]:
model.save_pretrained("drive/MyDrive/ModelTakeTwo")