In [17]:
from __future__ import annotations
import time, sys, gc, logging, random
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch
from transformers import AutoTokenizer, LlamaForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import transformers
import peft
from accelerate import Accelerator
import bitsandbytes
from sklearn.metrics import accuracy_score, roc_auc_score
from shutil import rmtree
import language_tool_python
# import optuna
import concurrent
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait

print(transformers.__version__)
print(peft.__version__)
print(torch.__version__)


4.33.2
0.5.0
2.1.1+cu118


In [18]:
language_tool = language_tool_python.LanguageTool('en-US')
N_FOLD = 5
SEED = 42
DEBUG = True
IS_TRAIN = True

# Seed the same seed to all 
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything()
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

log_level = "DEBUG"

logger = logging.getLogger(__name__)
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.WARNING
)

# set the main code and the modules it uses to the same log-level according to the node
transformers.utils.logging.set_verbosity(log_level)

In [19]:
# Cross validation
def cv_split(train_data):
    skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    X = train_data.loc[:, train_data.columns != "label"]
    y = train_data.loc[:, train_data.columns == "label"]

    for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
        train_data.loc[valid_index, "fold"] = fold

    print(train_data.groupby("fold")["label"].value_counts())
    display(train_data.head())
    return train_data

def pre_processing_text(text):
    text = text.replace('\n', ' ')
    typos = language_tool.check(text) # typo is a list
    # Check how many typos
    #if len(typos) > 0:
    #print(f"The number of typos = {len(typos)}\n {typos}")
    text = language_tool.correct(text)
    return text

# Run pre-processing texts in parallel
def parallel_pre_processing_text(texts):
    print(f"Total number of texts {len(texts)}")
    results = []
    # run 'pre_processing' fucntions in the process pool
    with ThreadPoolExecutor(4) as executor:
        # results = list(tqdm(executor.map(pre_processing_text, texts)))
        # send in the tasks
        futures = [executor.submit(pre_processing_text, text) for text in texts]
        # wait for all tasks to complete
        for future in futures:
            results.append(future.result())
            if len(results) % 100 == 0:
                print(f"Finished {len(results)} / {len(texts)}\n", end='', flush=True)
    # wait for all tasks to complete
    print("results", len(results))
    return results
    
    
def load_train_data():
    train_df = pd.read_csv("/root/03-S_NLP/DetectAI/00-data/train_essays.csv", sep=',')
    train_prompts_df = pd.read_csv("/root/03-S_NLP/DetectAI/00-data/train_prompts.csv", sep=',')

    # rename column generated to label and remove used 'id' and 'prompt_id' columns
    # Label: 1 indicates generated texts (by LLMs) 
    train_df = train_df.rename(columns={'generated': 'label'})
    train_df = train_df.reset_index(drop=True)
    train_df = train_df.drop(['id', 'prompt_id'], axis=1)
#     print("Start processing training data's text")
#     start = time.time()
#     # Clear text in both train and test dataset
#     train_df['text'] = train_df['text'].progress_apply(lambda text: pre_processing_text(text))
#     display(train_df.head())
#     print(f"Correct the training data's texts with {time.time() - start : .1f} seconds")
    
    # Include external data
    external_df = pd.read_csv("/root/03-S_NLP/DetectAI/00-data/train_v2_drcat_02.csv", sep=',')
    # We only need 'text' and 'label' columns
    external_df = external_df[["text", "label"]]
    external_df["label"] = 1
    
#     print("Start processing external data's texts")
#     start = time.time()
#     external_df['text'] = parallel_pre_processing_text(external_df['text'].to_list())
#     print(f"Correct the external data's texts with {time.time() - start : .1f} seconds")
#     # external_df['text'] = external_df['text'].map(lambda text: pre_processing_text(text))
#     display(external_df.head())
#     external_df.to_csv('train_v2_drcat_02_fixed.csv', index=False)
    # Merge train and external data into train_data
    train_data = pd.concat([train_df, external_df])
    train_data.reset_index(inplace=True, drop=True)
    # print(f"Train data has shape: {train_data.shape}")
    print(f"Train data {train_data.value_counts('label')}") # 1: generated texts 0: human texts
    return train_data

In [20]:
load_train_data()

Train data label
1    44871
0     1375
Name: count, dtype: int64


Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
46241,"Dear Senator,\n\nI am writing to you today to ...",1
46242,"Dear Senator,\n\nI am writing to you today to ...",1
46243,"Dear Senator,\n\nI am writing to you today to ...",1
46244,"Dear Senator,\n\nI am writing to you today to ...",1


In [21]:
# Load the pretrained model and add an extra layer with PEFT library for fine-tuning
def load_model(fold):
    TARGET_MODEL = "mistralai/Mistral-7B-v0.1"
    # TARGET_MODEL = "/kaggle/input/mistral-7b-v0-1/Mistral-7B-v0.1"
    # LoRA: Low-Rank Adaptation of Large Language Models
    peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        target_modules=[
            "q_proj",
            "v_proj"
        ],
    )
    # Enable GPU to run the model with 4bit
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    # Load the model
    base_model = LlamaForSequenceClassification.from_pretrained(TARGET_MODEL,
                                                                num_labels=2, # label is 0 or 1
                                                                quantization_config=bnb_config,                                                                 
                                                                device_map="auto")
    base_model.config.pretraining_tp = 1 # 1 is 7b
    base_model.config.pad_token_id = tokenizer.pad_token_id
    
    if IS_TRAIN:
        # Parameter-Efficient Fine-Tuning (PEFT) methods enable efficient adaptation of 
        # pre-trained language models (PLMs) to various downstream applications 
        # without fine-tuning all the model's parameters. 
        # https://github.com/huggingface/peft
        model = get_peft_model(base_model, peft_config)
    else:
        OUTPUT_DIR = f"{fold}"
        # OUTPUT_DIR = f"/kaggle/working/mistral_7b_fold{fold}"
        # Load the pretrained model with PEFT
        model = PeftModel.from_pretrained(base_model, str(OUTPUT_DIR))
    
    model.print_trainable_parameters() # Display the trainable parameters
    
    return model, tokenizer

In [22]:
def preprocess_function(examples, tokenizer, max_length=512):
    examples["text"] = list(map(lambda text: pre_processing_text(text), examples["text"]))
    return tokenizer(examples["text"], truncation=True, max_length=max_length, padding=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy_val = accuracy_score(labels, predictions)
    roc_auc_val = roc_auc_score(labels, predictions)
    r = { "accuracy": accuracy_val,
          "roc_auc": roc_auc_val}
    # logging.debug(f'{r}')
    return r


def train_model_by_fold(fold):
    torch.cuda.empty_cache()
    gc.collect()
    print(f"Start training the fold {fold} model")
    # Create train and valid dataset for a fold
    fold_valid_df = train_data[train_data["fold"] == fold]
    fold_train_df = train_data[train_data["fold"] != fold]
    # Train the model with small (for debugging) or large samples
    if DEBUG:
        fold_train_df = fold_train_df.sample(frac =.05, random_state=SEED)
        fold_valid_df = fold_valid_df.sample(frac =.05, random_state=SEED)
    else:
        fold_train_df = fold_train_df.sample(frac =.3, random_state=SEED)
        fold_valid_df = fold_valid_df.sample(frac =.3, random_state=SEED)

    print(f'fold_train_df {fold_train_df.groupby("fold")["label"].value_counts()}')
    print(f'fold_valid_df {fold_valid_df.groupby("fold")["label"].value_counts()}')
    # create the dataset
    train_ds = Dataset.from_pandas(fold_train_df)
    valid_ds = Dataset.from_pandas(fold_valid_df)

    # Load the pretrained model and tokenizer
    model, tokenizer = load_model(fold)

    # Tokenize the train and valid dataset and pass tokenizer as function argument
    train_tokenized_ds = train_ds.map(preprocess_function, batched=True,
                                      fn_kwargs={"tokenizer": tokenizer})
    valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True,
                                      fn_kwargs={"tokenizer": tokenizer})
    # Create data collator with padding (padding to the longest sequence)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

    # Start training processing        
    TMP_DIR = Path(f"{fold}/")
    TMP_DIR.mkdir(exist_ok=True, parents=True)

    STEPS = 5 if DEBUG else 20
    EPOCHS = 1 if DEBUG else 10
    BATCH_SIZE = 2
    training_args = TrainingArguments(output_dir=TMP_DIR,
                                      learning_rate=5e-5,
                                      per_device_train_batch_size=BATCH_SIZE,
                                      per_device_eval_batch_size=1,
                                      gradient_accumulation_steps=16,
                                      max_grad_norm=0.3,
                                      optim='paged_adamw_32bit',
                                      lr_scheduler_type="cosine",
                                      num_train_epochs=EPOCHS,
                                      weight_decay=0.01,
                                      evaluation_strategy="epoch",
                                      save_strategy="epoch",
                                      load_best_model_at_end=True,
                                      push_to_hub=False,
                                      warmup_steps=STEPS,
                                      eval_steps=STEPS,
                                      logging_steps=STEPS,
                                      report_to='none', # if DEBUG else 'wandb'
                                      log_level='warning', # 'warning' is default level 
                                     )


    # Create the trainer 
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_tokenized_ds,
                      eval_dataset=valid_tokenized_ds,
                      tokenizer=tokenizer,
                      data_collator=data_collator,
                      compute_metrics=compute_metrics)

    trainer.train()

    OUTPUT_DIR = Path(f"{fold}/")
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
    # Save the fine-tuned model
    trainer.save_model(output_dir=str(OUTPUT_DIR))
    print(f"=== Finish the training for fold {fold} ===")
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

In [26]:
# # Check if we need to fine-tune the LLM model
if IS_TRAIN:
    start = time.time()
    # Load train data
    train_data = load_train_data()
    # Cross validation with 5 fold
    train_data = cv_split(train_data)
    # Train the model  
    # fold = 0
    for fold in range(N_FOLD):
        train_model_by_fold(fold)
    #     # Add multiple threads to run each fold model concurrently

    #with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
   #     futures = [executor.submit(train_model_by_fold, fold) for fold in range(2)]
   #     # wait for all tasks to complete
    #    wait(futures)
    #    print('All training tasks are done!')
    
    for idx, fold in enumerate(range(N_FOLD)):
        sys.exit(f"Training time of fold {fold} = {time.time() - start: .1f} seconds")

Train data label
1    44871
0     1375
Name: count, dtype: int64
fold  label
0.0   1        8975
      0         275
1.0   1        8974
      0         275
2.0   1        8974
      0         275
3.0   1        8974
      0         275
4.0   1        8974
      0         275
Name: count, dtype: int64


Unnamed: 0,text,label,fold
0,Cars. Cars have been around since they became ...,0,1.0
1,Transportation is a large necessity in most co...,0,0.0
2,"""America's love affair with it's vehicles seem...",0,3.0
3,How often do you ride in a car? Do you drive a...,0,2.0
4,Cars are a wonderful thing. They are perhaps o...,0,0.0


Start training the fold 0 model
fold_train_df fold  label
1.0   1        464
      0         18
2.0   1        457
      0         14
3.0   1        449
      0          9
4.0   1        424
      0         15
Name: count, dtype: int64
fold_valid_df fold  label
0.0   1        446
      0         16
Name: count, dtype: int64


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/tokenizer.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/config.json
You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Model config LlamaConfig {
  "architectures": [
    "MistralForCaus

trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 1850/1850 [09:51<00:00,  3.13 examples/s]
Map: 100%|██████████| 462/462 [02:30<00:00,  3.07 examples/s]
Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


Epoch,Training Loss,Validation Loss,Accuracy,Roc Auc
0,0.7717,0.712891,0.958874,0.496637


=== Finish the training for fold 0 ===
Start training the fold 1 model
fold_train_df fold  label
0.0   1        434
      0         19
2.0   1        449
      0         14
3.0   1        439
      0         11
4.0   1        472
      0         12
Name: count, dtype: int64
fold_valid_df fold  label
1.0   1        447
      0         15
Name: count, dtype: int64


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.30s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 1850/1850 [09:34<00:00,  3.22 examples/s]
Map: 100%|██████████| 462/462 [02:23<00:00,  3.21 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Roc Auc
0,0.6764,0.616211,0.954545,0.525503


=== Finish the training for fold 1 ===
Start training the fold 2 model
fold_train_df fold  label
0.0   1        443
      0         14
1.0   1        429
      0         16
3.0   1        469
      0         18
4.0   1        453
      0          8
Name: count, dtype: int64
fold_valid_df fold  label
2.0   1        447
      0         15
Name: count, dtype: int64


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.34s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 1850/1850 [09:35<00:00,  3.21 examples/s]
Map: 100%|██████████| 462/462 [02:23<00:00,  3.21 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Roc Auc
0,0.8283,0.692871,0.95671,0.494407


=== Finish the training for fold 2 ===
Start training the fold 3 model
fold_train_df fold  label
0.0   1        437
      0         11
1.0   1        474
      0         14
2.0   1        441
      0         14
4.0   1        442
      0         17
Name: count, dtype: int64
fold_valid_df fold  label
3.0   1        447
      0         15
Name: count, dtype: int64


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.19s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 1850/1850 [09:34<00:00,  3.22 examples/s]
Map: 100%|██████████| 462/462 [02:19<00:00,  3.31 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Roc Auc
0,0.5318,0.646973,0.95671,0.494407


=== Finish the training for fold 3 ===
Start training the fold 4 model
fold_train_df fold  label
0.0   1        424
      0         14
1.0   1        465
      0         13
2.0   1        444
      0         12
3.0   1        461
      0         17
Name: count, dtype: int64
fold_valid_df fold  label
4.0   1        447
      0         15
Name: count, dtype: int64


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.94s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 1850/1850 [09:31<00:00,  3.24 examples/s]
Map: 100%|██████████| 462/462 [02:17<00:00,  3.36 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Roc Auc
0,0.7379,0.556152,0.948052,0.522148


=== Finish the training for fold 4 ===


SystemExit: Training time of fold 0 =  21579.4 seconds

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [27]:
import concurrent
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
from scipy.special import expit as sigmoid
# Load test data
test_df = pd.read_csv("/root/03-S_NLP/DetectAI/00-data/test_essays.csv", sep=',')
test_df = test_df.rename(columns={'generated': 'label'})
test_df['text'] = test_df['text'].progress_apply(lambda text: pre_processing_text(text))
# print(f'test_df.shape: {test_df.shape}')
# test_df.head(3)

def clear_memory():
    torch.cuda.empty_cache()
    gc.collect()

# Sigmoid activation function can map 'x' between 0 and 1
def sigmoid(x):
    return 1 / (1 + np.exp(-x)) 

def predict_result_by_fold(fold):
    clear_memory()
    print(f"=== Start prediction with {fold} ===")
    model, tokenizer = load_model(fold) 
    # Load the test dataframe as dataset
    test_ds = Dataset.from_pandas(test_df)
    test_tokenized_ds = test_ds.map(preprocess_function, batched=True,
                                    fn_kwargs={"tokenizer": tokenizer})
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, 
                                            padding="longest")
    # Create the trainer
    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      data_collator=data_collator)
    pred_output = trainer.predict(test_tokenized_ds)
    logits = pred_output.predictions
    # Apply sigmoid to 
    probs = sigmoid(logits[:, 1])
    print(f"fold = {fold} probs = {probs}")
    global predictions
    for i, prob in enumerate(probs):
        predictions[i].append(prob)  
    # Clear memory
    del model, trainer, tokenizer, test_ds, test_tokenized_ds, data_collator
    clear_memory()
    
def predict_result():
    global predictions
    predictions = [[] for i in range(len(test_df))]
    start = time.time()
    print(f"=== Begin prediction  ===")

    for fold in range(N_FOLD):
       predict_result_by_fold(fold)
    # fold = 0
    # predict_result_by_fold(fold)
    print(f"Finish prediction in {time.time() - start: .1f} seconds")

    return predictions

100%|██████████| 3/3 [00:00<00:00,  5.05it/s]


In [28]:
predictions = predict_result()
probs = [np.mean(pred) for pred in predictions] 
print(probs)    

=== Begin prediction  ===
=== Start prediction with 0 ===


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.89s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 3/3 [00:00<00:00, 16.57 examples/s]


fold = 0 probs = [1. 1. 1.]
=== Start prediction with 1 ===


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.24s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 3/3 [00:00<00:00, 14.98 examples/s]


fold = 1 probs = [0.02785   0.0001465 0.0004637]
=== Start prediction with 2 ===


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.12s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 3/3 [00:00<00:00, 15.56 examples/s]


fold = 2 probs = [0.02785   0.0001465 0.0004637]
=== Start prediction with 3 ===


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.05s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 3/3 [00:00<00:00, 15.57 examples/s]


fold = 3 probs = [0.02785   0.0001465 0.0004637]
=== Start prediction with 4 ===


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.09s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 27,279,360 || all params: 7,137,939,456 || trainable%: 0.38217415779661107


Map: 100%|██████████| 3/3 [00:00<00:00, 16.92 examples/s]


fold = 4 probs = [0.02785   0.0001465 0.0004637]
Finish prediction in  111.2 seconds
[0.2223, 0.2001, 0.2003]


In [29]:
clear_memory()