In [None]:
import os
if 'ozom671games.zip' not in os.listdir():
    from google.colab import drive
    drive.mount('/content/drive')
    !pip install transformers
    !pip install sentencepiece
    !pip install bitsandbytes
    !cp drive/MyDrive/ozom671games.zip ozom671games.zip
    !unzip ozom671games.zip
    !pip install --upgrade accelerate
    !pip install transformers==4.28.0

In [None]:
import pandas as pd
import json
from tqdm.notebook import tqdm
import numpy as np
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import torch
import os

In [None]:
import random
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch generators and set the cuDNN flags.

    Args:
        seed: Integer seed handed to every generator; its string form is also
            stored in the ``PYTHONHASHSEED`` environment variable.
    """
    # cuDNN: turn benchmark mode off and the deterministic flag on.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every generator, CPU first and then the CUDA ones.
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        apply_seed(seed)


seed_everything(228)

In [None]:
# Read the two training parquet files from the working directory, in order.
train, target = map(pd.read_parquet, ('train_data.parquet', 'train_pairs.parquet'))

In [None]:
# Read the two test parquet files from the working directory, in order.
test, test_target = map(pd.read_parquet, ('test_data.parquet', 'test_pairs_wo_target.parquet'))

In [None]:
# train['name'] = train['name'].apply(lambda x: x.replace('\n',' '))
# test['name'] = test['name'].apply(lambda x: x.replace('\n',' '))
# NOTE(review): with the two replacements above disabled, a name that itself
# contains '\n' is spread over several lines of the files written below —
# confirm this is intended.

# Training corpus: every distinct name from train and test, sorted, one per line.
text  = '\n'.join(sorted(set(train['name'].tolist() + test['name'].tolist())))
# Written as UTF-8 explicitly so the bytes on disk do not depend on the
# platform's default text encoding.
with open('train_text.txt','w', encoding='utf-8') as f:
    f.write(text)

# Validation file: names that occur in test but not in train, cut down to the
# first one by the [:1] slice, so the file holds a single name.
# NOTE(review): a validation loss computed on one name is unlikely to be
# informative — confirm the slice is deliberate.
text  = '\n'.join(test[~test['name'].isin(train['name'])]['name'].unique().tolist()[:1] )
with open('val_text.txt','w', encoding='utf-8') as f:
    f.write(text)

In [None]:
from transformers import (AutoModel,AutoModelForMaskedLM, 
                          AutoTokenizer, LineByLineTextDataset,
                          DataCollatorForLanguageModeling, PreTrainedTokenizerFast,
                          Trainer, TrainingArguments)
import re

In [None]:
# Hub checkpoint used for both the tokenizer and the masked-LM weights.
model_name = "cointegrated/LaBSE-en-ru"

# Tokenizer: request the fast implementation and store a copy under
# ./roberta-base (the directory keeps that name even though the captured
# output below reports a BertForMaskedLM model).
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.save_pretrained('./roberta-base')

# The same checkpoint, loaded through the masked-LM auto class.
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def _line_dataset(path):
    """Build a LineByLineTextDataset over `path` with the notebook's tokenizer and block_size=256."""
    return LineByLineTextDataset(tokenizer=tokenizer, file_path=path, block_size=256)


# Text files written earlier in the notebook.
train_dataset = _line_dataset("train_text.txt")
valid_dataset = _line_dataset("val_text.txt")

# Masked-LM collator with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

training_args = TrainingArguments(
    # Checkpointing: a save every 66668 steps, at most 5 kept.
    output_dir="./roberta_base_chk",
    overwrite_output_dir=True,
    save_steps=66668,
    save_total_limit=5,
    # Optimisation.
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    fp16=True,
    # Evaluation: same step interval as the checkpoint saves.
    evaluation_strategy='steps',
    eval_steps=66668,
    per_device_eval_batch_size=16,
    fp16_full_eval=True,
    prediction_loss_only=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # load_best_model_at_end=True,
    # No external experiment-tracking integration.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)



In [None]:
# Run training with the Trainer built in the previous cell, then write the
# model to ./roberta-base — the same directory the tokenizer was saved to
# earlier in this notebook.
trainer.train()
trainer.save_model('./roberta-base')

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
88891,1.0591,0.048872


In [None]:
# Zip the step-66668 checkpoint directory into drive/MyDrive (path relative to
# the working directory).
# NOTE(review): this archives a Trainer checkpoint directory, not the model
# saved to ./roberta-base by trainer.save_model — confirm that is the intended
# artifact.
!zip -r drive/MyDrive/pretrain_0.zip /content/roberta_base_chk/checkpoint-66668

  adding: content/roberta_base_chk/checkpoint-66668/ (stored 0%)
  adding: content/roberta_base_chk/checkpoint-66668/rng_state.pth (deflated 28%)
  adding: content/roberta_base_chk/checkpoint-66668/config.json (deflated 52%)
  adding: content/roberta_base_chk/checkpoint-66668/optimizer.pt (deflated 7%)
  adding: content/roberta_base_chk/checkpoint-66668/generation_config.json (deflated 8%)
  adding: content/roberta_base_chk/checkpoint-66668/scaler.pt (deflated 55%)
  adding: content/roberta_base_chk/checkpoint-66668/training_args.bin (deflated 48%)
  adding: content/roberta_base_chk/checkpoint-66668/pytorch_model.bin (deflated 7%)
  adding: content/roberta_base_chk/checkpoint-66668/trainer_state.json (deflated 82%)
  adding: content/roberta_base_chk/checkpoint-66668/scheduler.pt (deflated 48%)
