In [1]:
%%capture
!pip install datasets transformers[torch] evaluate -U

In [2]:
from datasets import load_from_disk, DatasetDict, Dataset
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, 
    TrainingArguments, DataCollatorWithPadding, AutoConfig
)
import evaluate
import os
from glob import glob
import warnings
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
warnings.filterwarnings('ignore')

2024-04-11 05:47:08.103844: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 05:47:08.103956: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 05:47:08.251126: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
os.environ["WANDB_MODE"] = 'dryrun'

In [4]:
!cp -r '/kaggle/input/sentence-similarity-dataset-csci393/sentence_similarity_dataset' './'

In [5]:
dataset = load_from_disk('sentence_similarity_dataset')
# dataset = DatasetDict({
#     split: dataset[split].select(range(200)) for split in dataset.keys()
# })

In [6]:
def tokenize_dataset(dataset, tokenizer_name):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    def tokenize_function(examples):
        return tokenizer(examples['sentence1'], examples['sentence2'], padding=False, truncation=True)
    return dataset.map(tokenize_function, batched=True), tokenizer

In [7]:
def setup_training(model_name, tokenizer, tokenized_dataset, batch_size=64, epochs=8, warmup_steps=100, compute_metrics=None):
    if '/' in model_name:
        model_directory = model_name.split('/')[-1]
    else:
        model_directory = model_name
        
    os.makedirs(model_directory, exist_ok = True)
    
    training_args = TrainingArguments(
        output_dir=f'{model_directory}/results',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=warmup_steps,
        weight_decay=0.1,
        logging_dir=f'{model_directory}/logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='loss',
        save_total_limit=1,
        save_only_model = True,
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    metric = evaluate.load("glue", "mrpc")
    if compute_metrics is None:
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            return metric.compute(predictions=predictions, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['validation'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    return trainer

In [8]:
experiments = [
    {
        'model_name': 'google-bert/bert-base-multilingual-cased',
        'epochs': 4,
        'batch': 64,
    },
    {
        'model_name': 'kz-transformers/kaz-roberta-conversational',
        'epochs': 4,
        'batch': 96,
    },
    {
        'model_name': 'amandyk/KazakhBERTmulti',
        'epochs': 4,
        'batch': 64,
    },
    {
        'model_name': 'intfloat/multilingual-e5-base',
        'epochs': 5,
        'batch': 64,
    },
    {
        'model_name': 'sentence-transformers/LaBSE',
        'epochs': 5,
        'batch': 32,
    }
]

In [9]:
res = []
for exp in experiments:
    print('-'*60)
    print(f'Starting training for {exp["model_name"]}')
    tokenized_dataset, tokenizer = tokenize_dataset(dataset, exp['model_name'])
    
    trainer = setup_training(exp['model_name'], tokenizer, tokenized_dataset, batch_size=exp['batch'], epochs=exp['epochs'], warmup_steps=100)
    
    trainer.train()
    
    print('Evaluating...')
    
    print('Validation:')
    val_eval = trainer.evaluate(tokenized_dataset['validation'])
    val_eval['model'] = exp["model_name"]
    val_eval['split'] = 'validation'
    res.append(val_eval)
    for k, v in val_eval.items():
        print(f'\t{k}:{v}')
    print()
    
    print('Test:')
    test_eval = trainer.evaluate(tokenized_dataset['test'])
    test_eval['model'] = exp["model_name"]
    test_eval['split'] = 'test'
    res.append(test_eval)
    for k, v in test_eval.items():
        print(f'\t{k}:{v}')
    print()
    
    print(f'Finished training for {exp["model_name"]}')
    print('-'*60+'\n')

------------------------------------------------------------
Starting training for google-bert/bert-base-multilingual-cased


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/10668 [00:00<?, ? examples/s]

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3736 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Tracking run with wandb version 0.16.5
[34m[1mwandb[0m: W&B syncing is set to [1m`offline`[0m in this directory.  
[34m[1mwandb[0m: Run [1m`wandb online`[0m or set [1mWANDB_MODE=online[0m to enable cloud syncing.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.531,0.488537,0.757337,0.7337
2,0.3948,0.452417,0.782391,0.781923
3,0.2298,0.578076,0.773801,0.747604
4,0.1217,0.698429,0.771654,0.745817


Evaluating...
Validation:


	eval_loss:0.45241668820381165
	eval_accuracy:0.7823908375089478
	eval_f1:0.781922525107604
	eval_runtime:6.0313
	eval_samples_per_second:231.625
	eval_steps_per_second:3.648
	epoch:4.0
	model:google-bert/bert-base-multilingual-cased
	split:validation

Test:
	eval_loss:0.4724922180175781
	eval_accuracy:0.7671306209850107
	eval_f1:0.7806354009077155
	eval_runtime:17.6621
	eval_samples_per_second:211.526
	eval_steps_per_second:3.34
	epoch:4.0
	model:google-bert/bert-base-multilingual-cased
	split:test

Finished training for google-bert/bert-base-multilingual-cased
------------------------------------------------------------

------------------------------------------------------------
Starting training for kz-transformers/kaz-roberta-conversational


config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.69M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Map:   0%|          | 0/10668 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3736 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/334M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at kz-transformers/kaz-roberta-conversational and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5663,0.56063,0.707946,0.689024
2,0.4831,0.565894,0.720115,0.744277
3,0.254,0.648989,0.740873,0.739568
4,0.1667,0.823601,0.738726,0.740583


Evaluating...
Validation:


	eval_loss:0.5606304407119751
	eval_accuracy:0.7079455977093773
	eval_f1:0.6890243902439024
	eval_runtime:1.9376
	eval_samples_per_second:721.004
	eval_steps_per_second:7.742
	epoch:4.0
	model:kz-transformers/kaz-roberta-conversational
	split:validation

Test:
	eval_loss:0.575599193572998
	eval_accuracy:0.6929871520342612
	eval_f1:0.6937249666221628
	eval_runtime:5.6298
	eval_samples_per_second:663.616
	eval_steps_per_second:6.927
	epoch:4.0
	model:kz-transformers/kaz-roberta-conversational
	split:test

Finished training for kz-transformers/kaz-roberta-conversational
------------------------------------------------------------

------------------------------------------------------------
Starting training for amandyk/KazakhBERTmulti


tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.45M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/10668 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3736 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/652M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at amandyk/KazakhBERTmulti and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6057,0.620865,0.65068,0.548148
2,0.5433,0.598176,0.693629,0.688501
3,0.3513,0.742321,0.680029,0.684989
4,0.2132,0.911031,0.669291,0.672805


Evaluating...
Validation:


	eval_loss:0.5981760025024414
	eval_accuracy:0.6936292054402291
	eval_f1:0.6885007278020379
	eval_runtime:3.3305
	eval_samples_per_second:419.461
	eval_steps_per_second:6.606
	epoch:4.0
	model:amandyk/KazakhBERTmulti
	split:validation

Test:
	eval_loss:0.5948558449745178
	eval_accuracy:0.691916488222698
	eval_f1:0.7183753364325911
	eval_runtime:9.5425
	eval_samples_per_second:391.51
	eval_steps_per_second:6.183
	epoch:4.0
	model:amandyk/KazakhBERTmulti
	split:test

Finished training for amandyk/KazakhBERTmulti
------------------------------------------------------------

------------------------------------------------------------
Starting training for intfloat/multilingual-e5-base


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Map:   0%|          | 0/10668 [00:00<?, ? examples/s]

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3736 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4747,0.452331,0.769506,0.746855
2,0.3797,0.441985,0.793128,0.792832
3,0.2224,0.498226,0.798855,0.782676
4,0.1667,0.615415,0.801718,0.792198
5,0.0821,0.726316,0.80315,0.794623


Evaluating...
Validation:


	eval_loss:0.4419846534729004
	eval_accuracy:0.7931281317108089
	eval_f1:0.7928315412186381
	eval_runtime:4.4082
	eval_samples_per_second:316.91
	eval_steps_per_second:4.991
	epoch:5.0
	model:intfloat/multilingual-e5-base
	split:validation

Test:
	eval_loss:0.44836169481277466
	eval_accuracy:0.7853319057815846
	eval_f1:0.8014851485148514
	eval_runtime:13.0348
	eval_samples_per_second:286.618
	eval_steps_per_second:4.526
	epoch:5.0
	model:intfloat/multilingual-e5-base
	split:test

Finished training for intfloat/multilingual-e5-base
------------------------------------------------------------

------------------------------------------------------------
Starting training for sentence-transformers/LaBSE


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/10668 [00:00<?, ? examples/s]

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3736 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5061,0.4641,0.771654,0.727583
2,0.3393,0.426508,0.801718,0.800288
3,0.1074,0.600532,0.811024,0.806167
4,0.0402,0.851308,0.80816,0.800892
5,0.0245,1.038559,0.811024,0.800604


Evaluating...
Validation:


	eval_loss:0.426507830619812
	eval_accuracy:0.8017179670722978
	eval_f1:0.8002883922134103
	eval_runtime:3.8799
	eval_samples_per_second:360.06
	eval_steps_per_second:11.34
	epoch:5.0
	model:sentence-transformers/LaBSE
	split:validation

Test:
	eval_loss:0.41818997263908386
	eval_accuracy:0.8120985010706638
	eval_f1:0.8186046511627908
	eval_runtime:11.2738
	eval_samples_per_second:331.387
	eval_steps_per_second:10.378
	epoch:5.0
	model:sentence-transformers/LaBSE
	split:test

Finished training for sentence-transformers/LaBSE
------------------------------------------------------------



In [10]:
res_df = pd.DataFrame(res)
res_df.to_csv('results.csv', index = False)
res_df

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch,model,split
0,0.452417,0.782391,0.781923,6.0313,231.625,3.648,4.0,google-bert/bert-base-multilingual-cased,validation
1,0.472492,0.767131,0.780635,17.6621,211.526,3.34,4.0,google-bert/bert-base-multilingual-cased,test
2,0.56063,0.707946,0.689024,1.9376,721.004,7.742,4.0,kz-transformers/kaz-roberta-conversational,validation
3,0.575599,0.692987,0.693725,5.6298,663.616,6.927,4.0,kz-transformers/kaz-roberta-conversational,test
4,0.598176,0.693629,0.688501,3.3305,419.461,6.606,4.0,amandyk/KazakhBERTmulti,validation
5,0.594856,0.691916,0.718375,9.5425,391.51,6.183,4.0,amandyk/KazakhBERTmulti,test
6,0.441985,0.793128,0.792832,4.4082,316.91,4.991,5.0,intfloat/multilingual-e5-base,validation
7,0.448362,0.785332,0.801485,13.0348,286.618,4.526,5.0,intfloat/multilingual-e5-base,test
8,0.426508,0.801718,0.800288,3.8799,360.06,11.34,5.0,sentence-transformers/LaBSE,validation
9,0.41819,0.812099,0.818605,11.2738,331.387,10.378,5.0,sentence-transformers/LaBSE,test


In [11]:
def get_mean_pooled_embeddings(texts, model_name, batch_size=32):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
    model.eval()  # Put the model in evaluation mode
    model.to('cuda' if torch.cuda.is_available() else 'cpu')  # Move model to GPU if available
    
    # Initialize an empty tensor for the pooled embeddings
    pooled_embeddings = torch.empty((0, model.config.hidden_size)).to(model.device)
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        
        # Move inputs to the same device as the model
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract hidden states
        hidden_states = outputs.hidden_states
        last_layer_hidden_states = hidden_states[-1]
        input_mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(last_layer_hidden_states.size()).float()
        sum_embeddings = torch.sum(last_layer_hidden_states * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_pooled_embeddings = sum_embeddings / sum_mask
        
        # Concatenate the embeddings from the current batch with the rest
        pooled_embeddings = torch.cat((pooled_embeddings, mean_pooled_embeddings), dim=0)
    
    # Move pooled embeddings back to CPU if they were on GPU
    pooled_embeddings = pooled_embeddings.to('cpu')
    
    return pooled_embeddings

In [12]:
model_checkpoints = glob('/kaggle/working/*/results/checkpoint*')
model_checkpoints

['/kaggle/working/LaBSE/results/checkpoint-668',
 '/kaggle/working/KazakhBERTmulti/results/checkpoint-334',
 '/kaggle/working/multilingual-e5-base/results/checkpoint-334',
 '/kaggle/working/kaz-roberta-conversational/results/checkpoint-112',
 '/kaggle/working/bert-base-multilingual-cased/results/checkpoint-334']

In [13]:
evaluation = {
    'model': [],
    'auc_score': [],
}

for ckpt in model_checkpoints:
    l1 = dataset['test']['sentence1']
    l2 = dataset['test']['sentence2']

    e1 = get_mean_pooled_embeddings(l1, ckpt, batch_size=256)
    e2 = get_mean_pooled_embeddings(l2, ckpt, batch_size=256)

    res = F.cosine_similarity(e1, e2)


    norm_res = (res + 1) / 2
    auc_score = roc_auc_score(dataset['test']['label'], norm_res.tolist())
    
    evaluation['model'].append(ckpt.split('/')[-3])
    evaluation['auc_score'].append(auc_score)

auc_scores = pd.DataFrame(evaluation)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at /kaggle/working/multilingual-e5-base/results/checkpoint-334 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at /kaggle/working/multilingual-e5-base/results/checkpoint-334 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at /kaggle/working/kaz-roberta-conversational/results/checkpoint-112 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at /kaggle/working/kaz-roberta-conversational/results/checkpoint-112 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

In [14]:
auc_scores.to_csv('auc_scores_trained.csv', index = False)
auc_scores.sort_values(by='auc_score', ascending = False)

Unnamed: 0,model,auc_score
0,LaBSE,0.789717
2,multilingual-e5-base,0.772008
3,kaz-roberta-conversational,0.738202
1,KazakhBERTmulti,0.732397
4,bert-base-multilingual-cased,0.730427


In [15]:
base_models = [
    'google-bert/bert-base-multilingual-cased',
    'kz-transformers/kaz-roberta-conversational',
    'amandyk/KazakhBERTmulti',
    'intfloat/multilingual-e5-base',
    'sentence-transformers/LaBSE',
]

evaluation = {
    'model': [],
    'auc_score': [],
}

for ckpt in base_models:
    l1 = dataset['test']['sentence1']
    l2 = dataset['test']['sentence2']

    e1 = get_mean_pooled_embeddings(l1, ckpt, batch_size=256)
    e2 = get_mean_pooled_embeddings(l2, ckpt, batch_size=256)

    res = F.cosine_similarity(e1, e2)


    norm_res = (res + 1) / 2
    auc_score = roc_auc_score(dataset['test']['label'], norm_res.tolist())
    
    evaluation['model'].append(ckpt.split('/')[-1])
    evaluation['auc_score'].append(auc_score)

base_auc_scores = pd.DataFrame(evaluation)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at kz-transformers/kaz-roberta-conversational and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at kz-transformers/kaz-roberta-conversational and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

Some weights of BertModel were not initialized from the model checkpoint at amandyk/KazakhBERTmulti and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

Some weights of BertModel were not initialized from the model checkpoint at amandyk/KazakhBERTmulti and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

In [16]:
base_auc_scores.to_csv('auc_scores_base.csv', index = False)
base_auc_scores.sort_values(by='auc_score', ascending = False)

Unnamed: 0,model,auc_score
4,LaBSE,0.78431
3,multilingual-e5-base,0.781997
1,kaz-roberta-conversational,0.741885
2,KazakhBERTmulti,0.72848
0,bert-base-multilingual-cased,0.72714


In [17]:
model_info = []
for ckpt in model_checkpoints:
    print(ckpt.split('/')[-3])
    model = AutoModel.from_pretrained(ckpt)
    total_params = sum(p.numel() for p in model.parameters())
    config = AutoConfig.from_pretrained(ckpt)

    # Print the model type
    print('\tParams', total_params)
    print('\tConfig:')
    print('\tArhitectrues:', config.architectures)
    print('\tModel type:', config.model_type)
    print('-'*50)
    
    model_info.append({
        'model': ckpt.split('/')[-3],
        'params': total_params,
        'type': config.model_type
    })

pd.DataFrame(model_info).to_csv('model_info.csv', index = False)

LaBSE
	Params 470926848
	Config:
	Arhitectrues: ['BertForSequenceClassification']
	Model type: bert
--------------------------------------------------
KazakhBERTmulti
	Params 162841344
	Config:
	Arhitectrues: ['BertForSequenceClassification']
	Model type: bert
--------------------------------------------------
multilingual-e5-base


Some weights of XLMRobertaModel were not initialized from the model checkpoint at /kaggle/working/multilingual-e5-base/results/checkpoint-334 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at /kaggle/working/kaz-roberta-conversational/results/checkpoint-112 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


	Params 278043648
	Config:
	Arhitectrues: ['XLMRobertaForSequenceClassification']
	Model type: xlm-roberta
--------------------------------------------------
kaz-roberta-conversational
	Params 83450880
	Config:
	Arhitectrues: ['RobertaForSequenceClassification']
	Model type: roberta
--------------------------------------------------
bert-base-multilingual-cased
	Params 177853440
	Config:
	Arhitectrues: ['BertForSequenceClassification']
	Model type: bert
--------------------------------------------------
