# 1. Install Dependencies

In [None]:
!pip install clean-text unidecode hazm huggingface_hub



# 2. Imports

In [None]:
import gc
import os
import re
import string

import nltk
import pandas as pd
import torch
from cleantext.clean import remove_emoji as clean_text_remove_emoji
from hazm import Normalizer as HazmNormalizer
from hazm import stopwords_list
from huggingface_hub import notebook_login
from nltk.corpus import stopwords
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import get_linear_schedule_with_warmup

# Utils

## Constants

In [None]:
base_path = '/content'
# Directory that receives the per-model train/test CSV statistics.
# makedirs(exist_ok=True) is safe to re-run and also creates missing parents,
# unlike the previous "exists() then mkdir()" pair.
os.makedirs(f'{base_path}/stats', exist_ok=True)

# Map each emotion class name to its integer id (ids follow the order below).
# HAPPY is named "happiness" in the article (the name differs from the article).
label_dict = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ("HAPPY", "SAD", "ANGRY", "FEAR", "SURPRISE", "HATE", "OTHER")
    )
}

## Preprocess

In [None]:
# We first tried the Dadmatech tools, but they did not work well, so we implemented our own preprocessing.
# The emoticon replacement below is not called by clean_text yet; we ran out of time (future work).
def replace_emojis(text):
    """Replace common ASCII emoticons in ``text`` with Persian sentiment words.

    Handled emoticons: ``:D`` and `` xD``/`` XD`` (laughter), ``:)`` with any
    number of closing parentheses (happy), ``:(`` with any number of opening
    parentheses (sad) and ``-_-`` with any number of underscores (annoyed).

    Args:
        text: Input string.

    Returns:
        A new string with the emoticons substituted; other characters are
        left untouched.
    """
    # Happy
    grin = 'خنده'
    laugh = 'خنده'
    happy = 'خوشحال'
    # Raw strings avoid the invalid "\)" / "\(" escape sequences that plain
    # string literals trigger warnings for.
    _text = re.sub(r":D", grin, text)
    # The leading space only anchors the match to a standalone "xD"; put it
    # back so the replacement word is not glued to the preceding word.
    _text = re.sub(r" [xX]D", " " + laugh, _text)
    _text = re.sub(r":\)+", happy, _text)

    # Sad
    sad = 'ناراحت'
    annoyed = 'رنجیده'
    _text = re.sub(r":\(+", sad, _text)
    _text = re.sub(r"-_+-", annoyed, _text)
    return _text


def remove_emojis(text):
    """Return ``text`` after passing it through cleantext's ``remove_emoji``."""
    return clean_text_remove_emoji(text)


def remove_url(text):
    """Delete every ``http:``/``https:`` token (up to the next whitespace) from ``text``."""
    url_pattern = r"https?:\S+"
    return re.sub(url_pattern, '', text)


def remove_punc(text):
    """Strip ASCII punctuation from ``text`` and turn the Persian comma into a space.

    Every character of ``string.punctuation`` is deleted; the Persian comma is
    replaced by a single space. Other characters are kept as they are.
    """
    persian_virgol = '،'  # noqa
    # Single translation table: ASCII punctuation -> removed, Persian comma -> space.
    translation = {ord(mark): None for mark in string.punctuation}
    translation[ord(persian_virgol)] = ' '
    return text.translate(translation)


def remove_numbers(text):
    """Drop every run of digits (anything matched by ``\\d``) from ``text``."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)


def remove_hashtags(text):
    """Remove each ``#`` together with the non-whitespace characters that follow it."""
    hashtag_pattern = r'#\S+'
    return re.sub(hashtag_pattern, '', text)


def remove_mentions(text):
    """Remove each ``@`` together with the non-whitespace characters that follow it."""
    mention_pattern = r'@\S+'
    return re.sub(mention_pattern, '', text)


def remove_duplicate_spaces(text):
    """Collapse every whitespace run to one space and trim both ends of ``text``."""
    words = text.split()  # split() with no argument discards all whitespace runs
    return " ".join(words)


_HAZM_NORMALIZER = None


def _get_hazm_normalizer():
    """Return the shared hazm normalizer, building it on first use."""
    global _HAZM_NORMALIZER
    if _HAZM_NORMALIZER is None:
        _HAZM_NORMALIZER = HazmNormalizer()
    return _HAZM_NORMALIZER


def clean_text(text) -> str:
    """Run the full preprocessing pipeline on one sentence.

    Steps, in order: remove emojis, remove URLs, collapse whitespace, remove
    hashtags, remove mentions, remove digits, remove punctuation, and finally
    normalize the result with hazm's ``Normalizer``.

    Args:
        text: Raw input sentence.

    Returns:
        The cleaned and normalized sentence.
    """
    _text = remove_emojis(text)
    _text = remove_url(_text)
    _text = remove_duplicate_spaces(_text)
    _text = remove_hashtags(_text)
    _text = remove_mentions(_text)
    _text = remove_numbers(_text)
    _text = remove_punc(_text)

    # The normalizer is created once and reused: this function is applied to
    # every row of the datasets, so rebuilding it per sentence is wasted work.
    return _get_hazm_normalizer().normalize(_text)


# def combined_preprocess(text: str) -> str:
#     normalizer = Normalizer(full_cleaning=True)
#     normalizer.remove_stop_word = False  # if it's True, it reduces the accuracy
#     normalizer.remove_puncs = False  # we remove punctuations in clean_text function
#     normalized_text = normalizer.normalize(text)
#     return clean_text(normalized_text)

## Dataset

In [None]:
class SentenceDataset(Dataset):
    """Dataset that tokenizes one sentence per item for sequence classification.

    Args:
        sentences: Indexable collection of sentences (each item is passed
            through ``str()``).
        labels: Indexable collection of label names, aligned with ``sentences``.
        tokenizer: Callable tokenizer; it is invoked with the sentence plus
            ``add_special_tokens``, ``max_length``, ``padding``, ``truncation``
            and ``return_tensors`` keyword arguments and must return a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        label_dict: Mapping from label name to integer class id.
        max_length: Length every sentence is padded/truncated to (default 128).
    """

    def __init__(self, sentences, labels, tokenizer, label_dict, max_length=128):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.label2idx = dict(label_dict)  # private copy of the mapping
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``.

        Raises:
            KeyError: If the item's label is not a key of ``label_dict``.
        """
        sentence = str(self.sentences[idx])
        label = self.label2idx[self.labels[idx]]

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus``.
        encoding = self.tokenizer(
            sentence,
            add_special_tokens=True,  # Add the model's special tokens
            max_length=self.max_length,
            padding="max_length",  # Pad every sentence to max_length
            truncation=True,  # Truncate sentences that exceed max_length
            return_tensors="pt",  # Return PyTorch tensors
        )

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # drop the sequence axis whenever its length is 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,  # padding positions are masked out
            "labels": torch.tensor(label, dtype=torch.long),
        }

## Train with early stopping

In [None]:
# with Early Stopping to prevent overfitting
def train_bert_early_stopping(model_name, cache_dir, device: torch.device, label_dict, train_sentences, train_labels, val_sentences, val_labels, base_path, epochs=12, early_stop_patience=6, batch_size=16):
    """Fine-tune a sequence classifier, stopping early when validation loss stalls.

    Per-epoch metrics are written to ``{base_path}/stats/<model>_train.csv``
    and the checkpoint with the lowest validation loss is saved (model and
    tokenizer) to ``{base_path}/models/{model_name}/best``.

    Args:
        model_name: Name or path handed to ``from_pretrained``.
        cache_dir: Cache directory handed to ``from_pretrained``.
        device: Device the model and batches are moved to.
        label_dict: Mapping from label name to class id; its size sets ``num_labels``.
        train_sentences, train_labels: Training data; must expose ``.to_list()``.
        val_sentences, val_labels: Validation data; must expose ``.to_list()``.
        base_path: Root directory for the stats CSV and the saved model.
        epochs: Maximum number of epochs.
        early_stop_patience: Number of consecutive epochs without a new best
            validation loss after which training stops.
        batch_size: Batch size of both data loaders.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_dict), cache_dir=cache_dir, ignore_mismatched_sizes=True)
    model = model.to(device)

    # The stats directory is not guaranteed to exist for an arbitrary base_path.
    stats_dir = f"{base_path}/stats"
    os.makedirs(stats_dir, exist_ok=True)
    results_csv_path = f"{stats_dir}/{model_name.split('/')[-1]}_train.csv"

    train_dataset = SentenceDataset(
        sentences=train_sentences.to_list(),
        labels=train_labels.to_list(),
        tokenizer=tokenizer,
        label_dict=label_dict
    )
    val_dataset = SentenceDataset(
        sentences=val_sentences.to_list(),
        labels=val_labels.to_list(),
        tokenizer=tokenizer,
        label_dict=label_dict
    )

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=2e-5)
    best_val_loss = float('inf')
    best_epoch = 0
    no_improvement_counter = 0

    # Learning rate scheduler setup (linear decay over the full epoch budget,
    # even if early stopping ends the run sooner)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # The context manager closes the CSV even if training raises.
    with open(results_csv_path, "w") as f:
        f.write("epoch,train_loss,train_accuracy,val_loss,val_accuracy\n")

        for epoch in range(epochs):
            model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            description = f"Training Epoch {epoch + 1}"
            progress_bar = tqdm(train_dataloader, desc=description, colour='green')
            for batch in progress_bar:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                loss = outputs.loss
                train_loss += loss.item()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()

                _, predicted = torch.max(outputs.logits, dim=1)
                train_total += batch['labels'].size(0)
                train_correct += (predicted == batch['labels']).sum().item()
                progress_bar.set_postfix({"Loss": loss.item()})

            train_average_loss = train_loss / len(train_dataloader)
            train_accuracy = train_correct / train_total
            print(f"\nTrain Loss: {train_average_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")

            # Evaluate the model
            model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0

            with torch.no_grad():
                description = f"Validation Epoch {epoch + 1}"
                progress_bar = tqdm(val_dataloader, desc=description, colour='yellow')
                for batch in progress_bar:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(**batch)

                    loss = outputs.loss
                    val_loss += loss.item()

                    _, predicted = torch.max(outputs.logits, dim=1)
                    val_total += batch['labels'].size(0)
                    val_correct += (predicted == batch['labels']).sum().item()

            val_average_loss = val_loss / len(val_dataloader)
            val_accuracy = val_correct / val_total
            print(f"Validation Loss: {val_average_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}")
            print('*' * 50)
            f.write(
                f"{epoch + 1},{train_average_loss:.4f},{train_accuracy:.4f},{val_average_loss:.4f},{val_accuracy:.4f}\n")

            # Early stopping and saving the best model. The average loss is
            # compared (same ordering as the summed loss, since the number of
            # validation batches is constant) to match train_bert_with_l2.
            if val_average_loss < best_val_loss:
                best_val_loss = val_average_loss
                best_epoch = epoch
                no_improvement_counter = 0
                print(f"Saving new best model at epoch {epoch + 1}")
                output_dir = f"{base_path}/models/{model_name}/best"
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            else:
                no_improvement_counter += 1
                if no_improvement_counter >= early_stop_patience:
                    print(f"Early stopping at epoch {epoch + 1}. Best epoch: {best_epoch + 1}")
                    break

## Train with L2

In [None]:
# L2-style regularization via AdamW weight decay (more stable training)
def train_bert_with_l2(model_name, cache_dir, device, label_dict, train_sentences, train_labels, val_sentences, val_labels, base_path, epochs=12, weight_decay=1e-3, batch_size=16):
    """Fine-tune a sequence classifier with AdamW weight decay for a fixed number of epochs.

    Per-epoch metrics are written to ``{base_path}/stats/<model>_train.csv``;
    whenever the average validation loss reaches a new minimum, the model and
    tokenizer are saved to ``{base_path}/models/{model_name}/best``.

    Args:
        model_name: Name or path handed to ``from_pretrained``.
        cache_dir: Cache directory handed to ``from_pretrained``.
        device: Device the model and batches are moved to.
        label_dict: Mapping from label name to class id; its size sets ``num_labels``.
        train_sentences, train_labels: Training data; must expose ``.to_list()``.
        val_sentences, val_labels: Validation data; must expose ``.to_list()``.
        base_path: Root directory for the stats CSV and the saved model.
        epochs: Number of epochs (there is no early stopping here).
        weight_decay: ``weight_decay`` value passed to ``AdamW``.
        batch_size: Batch size of both data loaders.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_dict),
                                                               cache_dir=cache_dir, ignore_mismatched_sizes=True)
    # ignore_mismatched_sizes -> ignore the mismatched sizes between the model and the pretrained model
    # e.g. digikala: 2 class -> we have 7 class

    model.to(device)

    # The stats directory is not guaranteed to exist for an arbitrary base_path.
    stats_dir = os.path.join(base_path, "stats")
    os.makedirs(stats_dir, exist_ok=True)
    results_csv_path = os.path.join(stats_dir, f"{model_name.split('/')[-1]}_train.csv")

    with open(results_csv_path, "w") as f:
        f.write("epoch,train_loss,train_accuracy,val_loss,val_accuracy\n")

        train_dataset = SentenceDataset(sentences=train_sentences.to_list(), labels=train_labels.to_list(),
                                        tokenizer=tokenizer, label_dict=label_dict)
        val_dataset = SentenceDataset(sentences=val_sentences.to_list(), labels=val_labels.to_list(),
                                      tokenizer=tokenizer, label_dict=label_dict)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=weight_decay)  # with lambda parameter
        total_steps = len(train_dataloader) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        best_val_loss = float('inf')
        for epoch in range(epochs):
            model.train()
            train_loss, train_correct, train_total = 0, 0, 0
            progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}", colour='green')
            for batch in progress_bar:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                loss = outputs.loss
                train_loss += loss.item()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()

                _, predicted = torch.max(outputs.logits, dim=1)
                train_total += batch['labels'].size(0)
                train_correct += (predicted == batch['labels']).sum().item()

            train_average_loss = train_loss / len(train_dataloader)
            train_accuracy = train_correct / train_total
            print(f"Epoch {epoch + 1}: Train Loss: {train_average_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")

            model.eval()
            val_loss, val_correct, val_total = 0, 0, 0
            with torch.no_grad():
                for batch in tqdm(val_dataloader, desc=f"Validation Epoch {epoch + 1}", colour='yellow'):
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(**batch)

                    loss = outputs.loss
                    val_loss += loss.item()

                    _, predicted = torch.max(outputs.logits, dim=1)
                    val_total += batch['labels'].size(0)
                    val_correct += (predicted == batch['labels']).sum().item()

                val_average_loss = val_loss / len(val_dataloader)
                val_accuracy = val_correct / val_total
                print(f"Epoch {epoch + 1}: Val Loss: {val_average_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")
                print('*' * 50)

            f.write(
                f"{epoch + 1},{train_average_loss:.4f},{train_accuracy:.4f},{val_average_loss:.4f},{val_accuracy:.4f}\n")

            if val_average_loss < best_val_loss:  # best result is the lowest validation loss
                best_val_loss = val_average_loss
                output_dir = os.path.join(base_path, "models", model_name, "best")
                os.makedirs(output_dir, exist_ok=True)
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                print(
                    f"Epoch {epoch + 1}: New best model saved with val_loss {best_val_loss:.4f} & val_acc {val_accuracy:.4f}")

## Test BERT

In [None]:
def test_bert(model_name, cache_dir, device: torch.device, label_dict, test_sentences, test_labels, base_path, batch_size=16, use_url=False):
    """Evaluate a fine-tuned classifier on a test set and store the metrics as CSV.

    Prints loss, accuracy and macro-averaged precision/recall/F1, and writes
    them to ``{base_path}/stats/<model>_test.csv``.

    Args:
        model_name: Model identifier. With ``use_url=True`` it is passed to
            ``from_pretrained`` as is; otherwise the checkpoint is read from
            ``{base_path}/models/{model_name}/best``.
        cache_dir: Cache directory handed to ``from_pretrained``.
        device: Device the model and batches are moved to.
        label_dict: Mapping from label name to class id.
        test_sentences: Indexable collection of sentences.
        test_labels: Indexable collection of label names.
        base_path: Root directory for the saved models and the stats CSV.
        batch_size: Evaluation batch size.
        use_url: Whether ``model_name`` is loaded directly instead of the local
            ``best`` checkpoint.
    """
    # Load the best model
    model_path = model_name if use_url else f"{base_path}/models/{model_name}/best"
    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(label_dict),
                                                               cache_dir=cache_dir, ignore_mismatched_sizes=True)
    model = model.to(device)

    # The stats directory is not guaranteed to exist for an arbitrary base_path.
    stats_dir = f"{base_path}/stats"
    os.makedirs(stats_dir, exist_ok=True)
    results_csv_path = f"{stats_dir}/{model_name.split('/')[-1]}_test.csv"

    test_dataset = SentenceDataset(
        sentences=test_sentences,
        labels=test_labels,
        tokenizer=tokenizer,
        label_dict=label_dict
    )
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # Test the model
    model.eval()
    test_loss = 0
    test_correct = 0
    test_total = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        progress_bar = tqdm(test_dataloader, desc="Testing", colour='blue')
        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            loss = outputs.loss
            test_loss += loss.item()

            _, predicted = torch.max(outputs.logits, dim=1)
            test_total += batch['labels'].size(0)
            test_correct += (predicted == batch['labels']).sum().item()

            # Collect the predictions and true labels for each batch
            predictions.extend(predicted.view(-1).cpu().numpy())
            true_labels.extend(batch['labels'].view(-1).cpu().numpy())

    # Calculate the average loss and accuracy over all test data
    test_average_loss = test_loss / len(test_dataloader)
    test_accuracy = test_correct / test_total
    print(f"Test Loss: {test_average_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")

    # Compute precision, recall, and F1 score
    # average='macro': per-class metrics are averaged with equal weight per class
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')

    print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1 Score: {f1:.4f}")
    print('*' * 50)

    # Written in one go inside a context manager, so the handle is always
    # closed and no header-only file is left behind if evaluation fails.
    with open(results_csv_path, "w") as f:
        f.write("test_loss,test_accuracy,precision,recall,f1\n")
        f.write(f"{test_average_loss:.4f},{test_accuracy:.4f},{precision:.4f},{recall:.4f},{f1:.4f}\n")

## Predict BERT

In [None]:
def predict_bert(model_name, cache_dir, device, label_dict, text, base_path, use_url=False):
    """Classify a single text and return ``(label_name, probability)``.

    With ``use_url=True`` the model and tokenizer are loaded from ``model_name``
    directly; otherwise from ``{base_path}/models/{model_name}/best``. The
    returned probability is the softmax score of the predicted class.
    """
    # Resolve where the checkpoint lives, then load tokenizer and model
    model_path = model_name if use_url else f"{base_path}/models/{model_name}/best"
    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, cache_dir=cache_dir)
    model = model.to(device)
    model.eval()  # evaluation mode

    # Tokenize and move the encoded input to the target device
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=1)
    predicted_label_index = logits.argmax(dim=1).item()

    # Invert label_dict to translate the class id back into its name
    index_to_label = {index: name for name, index in label_dict.items()}
    predicted_label_name = index_to_label[predicted_label_index]
    confidence = probabilities[0][predicted_label_index].item()

    return predicted_label_name, confidence

## Evaluate BERT

In [None]:
# gets a csv, and cleans it, and evaluates it: returns accuracy, precision, recall, f1
def evaluate_bert(model_name, cache_dir, device, label_dict, csv_path, base_path, batch_size=16, use_url=False):
    """Clean a labelled file, evaluate a model on it, and report the metrics.

    The file at ``csv_path`` is read as header-less, tab-separated data with a
    sentence column and a label column. Sentences go through ``clean_text`` and
    labels are upper-cased before being mapped through ``label_dict``.

    Args:
        model_name: Model identifier. With ``use_url=True`` it is passed to
            ``from_pretrained`` as is; otherwise the checkpoint is read from
            ``{base_path}/models/{model_name}/best``.
        cache_dir: Cache directory handed to ``from_pretrained``.
        device: Device the model and batches are moved to.
        label_dict: Mapping from (upper-case) label name to class id.
        csv_path: Path of the tab-separated evaluation file.
        base_path: Root directory of the locally saved models.
        batch_size: Evaluation batch size.
        use_url: Whether ``model_name`` is loaded directly instead of the local
            ``best`` checkpoint.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are macro-averaged. The same values are also printed.
    """
    df = pd.read_csv(csv_path, sep="\t", header=None, names=["sentence", "label"])
    # df = pd.read_csv(csv_path)
    # Plain lists keep the dataset's positional indexing independent of the
    # DataFrame index.
    test_sentences = df['sentence'].apply(clean_text).to_list()
    test_labels = df['label'].str.upper().to_list()

    # Load the best model; the tokenizer comes from the same location as the
    # model and is loaded once.
    model_path = model_name if use_url else f"{base_path}/models/{model_name}/best"
    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(label_dict), cache_dir=cache_dir, ignore_mismatched_sizes=True)
    model = model.to(device)

    test_dataset = SentenceDataset(
        sentences=test_sentences,
        labels=test_labels,
        tokenizer=tokenizer,
        label_dict=label_dict
    )
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # Test the model
    model.eval()
    test_loss = 0
    test_correct = 0
    test_total = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        progress_bar = tqdm(test_dataloader, desc="Testing", colour='blue')
        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            loss = outputs.loss
            test_loss += loss.item()

            _, predicted = torch.max(outputs.logits, dim=1)
            test_total += batch['labels'].size(0)
            test_correct += (predicted == batch['labels']).sum().item()

            # Collect the predictions and true labels for each batch
            predictions.extend(predicted.view(-1).cpu().numpy())
            true_labels.extend(batch['labels'].view(-1).cpu().numpy())

    # Calculate the average loss and accuracy over all test data
    test_average_loss = test_loss / len(test_dataloader)
    test_accuracy = test_correct / test_total
    print(f"Test Loss: {test_average_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")

    # Compute precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')
    print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1 Score: {f1:.4f}")

    return test_accuracy, precision, recall, f1

# Load datasets and Preprocess

In [None]:
# pd.set_option('future.no_silent_downcasting', True)  # noqa

# Load datasets
raw_columns = ["sentence", "label"]  # the raw TSV files have no header row
train_df = pd.read_csv(f"{base_path}/train.tsv", sep="\t", header=None, names=raw_columns)
test_df = pd.read_csv(f"{base_path}/test.tsv", sep="\t", header=None, names=raw_columns)

section_rule = '*' * 50

# Distinct labels of each split, and whether both splits use the same label set
train_label_values = train_df.label.unique()
test_label_values = test_df.label.unique()
print(train_label_values)
print(test_label_values)
print(set(train_label_values) == set(test_label_values))
print(section_rule)

# Label mapping next to the first training example
first_example = train_df.loc[0]
print(label_dict)
print(first_example["sentence"])
print(first_example["label"])
print(section_rule)

print(train_df.dtypes)

['SAD' 'HATE' 'OTHER' 'FEAR' 'ANGRY' 'HAPPY' 'SURPRISE']
['SAD' 'HAPPY' 'OTHER' 'SURPRISE' 'FEAR' 'HATE' 'ANGRY']
True
**************************************************
{'HAPPY': 0, 'SAD': 1, 'ANGRY': 2, 'FEAR': 3, 'SURPRISE': 4, 'HATE': 5, 'OTHER': 6}
خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام پس بدم
SAD
**************************************************
sentence    object
label       object
dtype: object


## Raw: cleaning is time-consuming because it runs in a single thread

In [None]:
# clean data and save
tqdm.pandas()  # registers DataFrame/Series.progress_apply
for split_name, split_df in (("train", train_df), ("test", test_df)):
    cleaned_path = f"{base_path}/{split_name}_cleaned.tsv"
    if os.path.exists(cleaned_path):
        continue  # a cleaned copy is already on disk; skip the slow pass
    split_df['sentence'] = split_df['sentence'].progress_apply(clean_text)
    split_df.to_csv(cleaned_path, sep="\t", index=False)

# Function to apply preprocessing in parallel using joblib
# def parallel_apply(df, func):
#     processed_sentences = Parallel(n_jobs=-1)(
#         delayed(func)(text=sentence) for sentence in df['sentence'])
#     return processed_sentences
#
#
# if not os.path.exists(f"{base_path}/train_cleaned.tsv"):
#     train_df['sentence'] = parallel_apply(train_df, combined_preprocess)
#
# if not os.path.exists(f"{base_path}/test_cleaned.tsv"):
#     test_df['sentence'] = parallel_apply(test_df, combined_preprocess)

## Clean: load the previously saved cleaned files

In [None]:
# First row is header
train_df, test_df = (
    pd.read_csv(f"{base_path}/{split_name}_cleaned.tsv", sep="\t")
    for split_name in ("train", "test")
)

# Peek at the first rows of each split, plus one full test sentence
for cleaned_df in (train_df, test_df):
    print(cleaned_df.loc[0:5, "sentence"])
print(test_df.loc[6, "sentence"])

0    خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام ...
1       از صدای پرنده دم دمای صبح متنفرم متنفرم متنفرم
2    کیفیتش خیلی خوبه با شک خریدم ولی واقعا راضیم ب...
3    چون همش با دوربین ثبت‌شده ایا میشه اعتراض زد؟؟...
4                    این وضع ب طرز خنده داری گریه داره
5    خب من رسما از یک نفر متنفرم چون از گربه بدش می...
Name: sentence, dtype: object
0    این شاید اولین عزای عمومی واقعی است که یاد دار...
1    دیشب بعد از ارسال تویت مربوط به آثار باستانی ت...
2    کدوم شعبه پول نداده بگو الان برات آمار دقیق بد...
3    امروز وسط یه بحث با بابا مامانم گفتم آدم باید ...
4    امشب گفت نامزدی دوستش که ادم روشنفکری است بهم ...
5    به امید موفقیت تیم ملی و پیروزی در بازی امروز ...
Name: sentence, dtype: object
با آرزوی موفقیت و پیروزی


## Train/validation split and number of samples per split

In [None]:
# Split train dataset for validation
split_parts = train_test_split(train_df['sentence'], train_df['label'], test_size=0.1, random_state=42)
train_sentences, val_sentences, train_labels, val_labels = split_parts

# Report the size of every split
for split_label, split_size in (
    ("training", len(train_sentences)),
    ("validation", len(val_sentences)),
    ("test", len(test_df)),
):
    print(f"Number of {split_label} sentences: {split_size}")

Number of training sentences: 5512
Number of validation sentences: 613
Number of test sentences: 1151


# Variations of BERT and XLM-RoBERTa

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

# Checkpoints to fine-tune; uncomment entries to include them in the run
model_names = [
    # 'HooshvareLab/bert-fa-base-uncased-sentiment-snappfood',  # noqa
    # 'HooshvareLab/bert-fa-base-uncased-sentiment-digikala',  # noqa
    # 'HooshvareLab/bert-fa-base-uncased',  # noqa
    # 'HooshvareLab/bert-fa-zwnj-base',  # ParsBERT (v3.0) # noqa

    # 'FacebookAI/xlm-roberta-base',  # noqa
    # 'FacebookAI/xlm-roberta-large',  # noqa

    # 'HooshvareLab/roberta-fa-zwnj-base',  # noqa
    'pedramyazdipoor/persian_xlm_roberta_large',  # keeping the best one for showcasing
]
cache_dir = "/".join((base_path, "models", "huggingface_cache"))

Device: cuda


## Train Models

In [None]:
# Arguments shared by every training run; only the model name varies
l2_run_options = dict(
    cache_dir=cache_dir,
    device=device,
    label_dict=label_dict,
    train_sentences=train_sentences,
    train_labels=train_labels,
    val_sentences=val_sentences,
    val_labels=val_labels,
    base_path=base_path,
    batch_size=20,
    epochs=8,
)
for model_name in model_names:
    print(f"Model name: {model_name}")
    train_bert_with_l2(model_name=model_name, **l2_run_options)

Model name: pedramyazdipoor/persian_xlm_roberta_large


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at pedramyazdipoor/persian_xlm_roberta_large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|[32m██████████[0m| 276/276 [06:19<00:00,  1.37s/it]


Epoch 1: Train Loss: 1.5077 | Train Accuracy: 0.4269


Validation Epoch 1: 100%|[33m██████████[0m| 31/31 [00:12<00:00,  2.53it/s]


Epoch 1: Val Loss: 0.9628 | Val Accuracy: 0.6737
**************************************************
Epoch 1: New best model saved with val_loss 0.9628 & val_acc 0.6737


Training Epoch 2: 100%|[32m██████████[0m| 276/276 [06:17<00:00,  1.37s/it]


Epoch 2: Train Loss: 0.8747 | Train Accuracy: 0.6885


Validation Epoch 2: 100%|[33m██████████[0m| 31/31 [00:12<00:00,  2.53it/s]


Epoch 2: Val Loss: 0.8727 | Val Accuracy: 0.6819
**************************************************
Epoch 2: New best model saved with val_loss 0.8727 & val_acc 0.6819


Training Epoch 3: 100%|[32m██████████[0m| 276/276 [06:17<00:00,  1.37s/it]


Epoch 3: Train Loss: 0.6590 | Train Accuracy: 0.7783


Validation Epoch 3: 100%|[33m██████████[0m| 31/31 [00:12<00:00,  2.52it/s]


Epoch 3: Val Loss: 0.8452 | Val Accuracy: 0.7129
**************************************************
Epoch 3: New best model saved with val_loss 0.8452 & val_acc 0.7129


Training Epoch 4: 100%|[32m██████████[0m| 276/276 [06:17<00:00,  1.37s/it]


Epoch 4: Train Loss: 0.4725 | Train Accuracy: 0.8452


Validation Epoch 4: 100%|[33m██████████[0m| 31/31 [00:12<00:00,  2.53it/s]


Epoch 4: Val Loss: 0.9028 | Val Accuracy: 0.7210
**************************************************


Training Epoch 5: 100%|[32m██████████[0m| 276/276 [06:17<00:00,  1.37s/it]


Epoch 5: Train Loss: 0.3295 | Train Accuracy: 0.8940


Validation Epoch 5: 100%|[33m██████████[0m| 31/31 [00:12<00:00,  2.53it/s]


Epoch 5: Val Loss: 0.9215 | Val Accuracy: 0.7194
**************************************************


Training Epoch 6: 100%|[32m██████████[0m| 276/276 [06:17<00:00,  1.37s/it]


Epoch 6: Train Loss: 0.2252 | Train Accuracy: 0.9298


Validation Epoch 6: 100%|[33m██████████[0m| 31/31 [00:12<00:00,  2.53it/s]


Epoch 6: Val Loss: 1.0237 | Val Accuracy: 0.7325
**************************************************


Training Epoch 7: 100%|[32m██████████[0m| 276/276 [06:17<00:00,  1.37s/it]


Epoch 7: Train Loss: 0.1533 | Train Accuracy: 0.9530


Validation Epoch 7: 100%|[33m██████████[0m| 31/31 [00:12<00:00,  2.53it/s]


Epoch 7: Val Loss: 1.1105 | Val Accuracy: 0.7210
**************************************************


Training Epoch 8: 100%|[32m██████████[0m| 276/276 [06:17<00:00,  1.37s/it]


Epoch 8: Train Loss: 0.1098 | Train Accuracy: 0.9677


Validation Epoch 8: 100%|[33m██████████[0m| 31/31 [00:12<00:00,  2.53it/s]


Epoch 8: Val Loss: 1.1072 | Val Accuracy: 0.7308
**************************************************


## Free GPU memory

In [None]:
# !sudo fuser -v /dev/nvidia*
# Collect unreachable Python objects first so the tensors they still hold are
# freed, then release PyTorch's unoccupied cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

0

## Test fine-tuned models

In [None]:
# Materialize the test split once; every model is scored on the same lists
test_sentence_list = test_df['sentence'].to_list()
test_label_list = test_df['label'].to_list()

for model_name in model_names:
    print(f"Model name: {model_name}")
    test_bert(
        model_name,
        cache_dir,
        device,
        label_dict,
        test_sentence_list,
        test_label_list,
        base_path,
        batch_size=16,
    )

Model name: pedramyazdipoor/persian_xlm_roberta_large


Testing: 100%|[34m██████████[0m| 72/72 [00:25<00:00,  2.82it/s]


Test Loss: 1.0107 | Test Accuracy: 0.6994
Precision: 0.7031 | Recall: 0.6744 | F1 Score: 0.6716
**************************************************


## Test with the best model

In [None]:
# Score the published checkpoint; use_url=True loads it by its identifier
# instead of the local "best" directory.
model_name = "farzanrahmani/persian_xlm_roberta_large"
print(f"Model name: {model_name}")
best_model_test_options = dict(
    cache_dir=cache_dir,
    device=device,
    label_dict=label_dict,
    test_sentences=test_df['sentence'].to_list(),
    test_labels=test_df['label'].to_list(),
    base_path=base_path,
    batch_size=16,
    use_url=True,
)
test_bert(model_name=model_name, **best_model_test_options)

Model name: farzanrahmani/persian_xlm_roberta_large


Testing: 100%|[34m██████████[0m| 72/72 [00:25<00:00,  2.84it/s]


Test Loss: 0.8306 | Test Accuracy: 0.7298
Precision: 0.7492 | Recall: 0.7099 | F1 Score: 0.7182
**************************************************


# Push the models to the Hugging Face Hub

In [None]:
notebook_login()
# SECURITY: access tokens must never be stored in the notebook. They are read
# from the environment instead; any token that was ever committed here should
# be revoked in the Hugging Face account settings.
# Each value is None when its variable is unset.
# NOTE(review): with a None token the upload presumably falls back to the
# credential stored by notebook_login() - confirm.
TOKEN_OF_HAMED = os.environ.get("HF_TOKEN_HAMED")
TOKEN_OF_FARZAN = os.environ.get("HF_TOKEN_FARZAN")

In [None]:
TOKEN = TOKEN_OF_HAMED
# (local checkpoint directory, repository name on the Hub)
path_and_name = [
    # (f"{base_path}/models/HooshvareLab/bert-fa-base-uncased-sentiment-snappfood/best", "bert-fa-base-uncased-sentiment-snappfood"),
    # (f"{base_path}/models/FacebookAI/xlm-roberta-base/best", "xlm-roberta-base"),
    # (f"{base_path}/models/FacebookAI/xlm-roberta-large/best", "xlm-roberta-large"),
    (f"{base_path}/models/pedramyazdipoor/persian_xlm_roberta_large/best", "persian_xlm_roberta_large"),
    # (f"{base_path}/models/HooshvareLab/roberta-fa-zwnj-base/best", "roberta-fa-zwnj-base")
]

for local_dir, repo_name in path_and_name:
    tokenizer = AutoTokenizer.from_pretrained(local_dir, cache_dir=cache_dir)
    model = AutoModelForSequenceClassification.from_pretrained(local_dir, num_labels=len(label_dict), cache_dir=cache_dir, ignore_mismatched_sizes=True)
    # `token` is the current keyword; `use_auth_token` is deprecated.
    model.push_to_hub(repo_name, token=TOKEN)
    tokenizer.push_to_hub(repo_name, token=TOKEN, commit_message="Upload Tokenizer")



model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

# Example usage

In [None]:
# model_name = "hamedhf/persian_xlm_roberta_large"
model_name = "farzanrahmani/persian_xlm_roberta_large"
texts = [
    "من این محصول رو دوست داشتم",  # noqa
    "حالم از این وضع بهم می‌خوره",  # noqa
]

# Classify each sample sentence with the published model
for text in texts:
    label, probability = predict_bert(
        model_name=model_name,
        cache_dir=cache_dir,
        device=device,
        label_dict=label_dict,
        text=text,
        base_path=base_path,
        use_url=True,
    )
    print(f"Text: {text}")
    print(f"Predicted label: {label} with probability: {probability}")
    print('*' * 50)

tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Text: من این محصول رو دوست داشتم
Predicted label: HAPPY with probability: 0.9479843378067017
**************************************************
Text: حالم از این وضع بهم می‌خوره
Predicted label: HATE with probability: 0.6271396279335022
**************************************************


In [None]:
# evaluate_bert reads a header-less, tab-separated file with one
# "<sentence>\t<label>" row per line, so the sample file is written in
# exactly that layout (no header row, tabs instead of commas).
custom_rows = [
    ("من این محصول رو دوست داشتم", "happy"),
    ("حالم از این وضع بهم می‌خوره", "sad"),
]
csv_content = "".join(f"{sentence}\t{label}\n" for sentence, label in custom_rows)
csv_path = f"{base_path}/test_custom.tsv"
# Explicit UTF-8 so the Persian text does not depend on the platform default.
with open(csv_path, "w", encoding="utf-8") as f:
    f.write(csv_content)

model_name = "hamedhf/persian_xlm_roberta_large"
evaluate_bert(model_name, cache_dir, device, label_dict, csv_path, base_path, use_url=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Testing: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.41s/it]


Test Loss: 0.2426 | Test Accuracy: 1.0000
Precision: 1.0000 | Recall: 1.0000 | F1 Score: 1.0000


In [None]:
# Project test set, expected next to the other data files under base_path
# (header-less TSV: sentence, label).
csv_path = f"{base_path}/Project_Test - testset.tsv"

# model_name = "farzanrahmani/persian_xlm_roberta_large"
model_name = "hamedhf/persian_xlm_roberta_large"
evaluate_bert(model_name, cache_dir, device, label_dict, csv_path, base_path, use_url=True)

tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Testing: 100%|[34m██████████[0m| 3/3 [00:00<00:00,  3.89it/s]


Test Loss: 0.8148 | Test Accuracy: 0.7273
Precision: 0.9000 | Recall: 0.6417 | F1 Score: 0.6738
