# Install the required packages and dependencies

In [1]:
!pip install clean-text unidecode hazm



# Unzip the necessary files in Colab

In [None]:
!unzip "SA_collab.zip"

Archive:  SA_collab.zip
replace SA_collab/.idea/.gitignore? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: SA_collab/.idea/.gitignore  
  inflating: SA_collab/.idea/deployment.xml  
  inflating: SA_collab/.idea/inspectionProfiles/profiles_settings.xml  
  inflating: SA_collab/.idea/misc.xml  
  inflating: SA_collab/.idea/modules.xml  
  inflating: SA_collab/.idea/SA.iml  
  inflating: SA_collab/.idea/vcs.xml  
  inflating: SA_collab/.idea/workspace.xml  
  inflating: SA_collab/data/test.tsv  
  inflating: SA_collab/data/test_cleaned.tsv  
  inflating: SA_collab/data/train.tsv  
  inflating: SA_collab/data/train_cleaned.tsv  
  inflating: SA_collab/poetry.lock   
  inflating: SA_collab/poetry.toml   
  inflating: SA_collab/pyproject.toml  
  inflating: SA_collab/README.MD     
  inflating: SA_collab/src/main.ipynb  
  inflating: SA_collab/src/test.ipynb  
  inflating: SA_collab/src/utils/constants.py  
  inflating: SA_collab/src/utils/dataset.py  
  inflating: SA_collab/src/utils/

# Change directories to the location of the files

In [2]:
# Earlier attempts, kept for reference. A `!` shell command runs in its own subshell,
# so a `!cd` there does not change the working directory of the kernel:
# !cd "SA_collab"
# !cd './sample_data'
import os
# Switch the kernel itself into the extracted `src` folder so that the
# `utils.*` imports in the following cells resolve.
os.chdir("/content/SA_collab/src")

In [None]:
# Print the current working directory to confirm the chdir above took effect.
!pwd

/content/SA_collab/src


# import the necessary libraries

In [3]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from utils.constants import base_path, label_dict
from utils.parsbert import train_parsbert, test_parsbert, train_parsbert_with_l2, predict_parsbert
from utils.preprocess import clean_text

# Load raw datasets

In [4]:
# Read both raw TSV splits as header-less files and name the two columns explicitly.
_raw_columns = ["sentence", "label"]
train_df = pd.read_csv(f"{base_path}/data/train.tsv", sep="\t", header=None, names=_raw_columns)
test_df = pd.read_csv(f"{base_path}/data/test.tsv", sep="\t", header=None, names=_raw_columns)

_divider = '*' * 50

# Distinct labels in each split, then whether both splits use the same label set.
_train_labels_seen = train_df["label"].unique()
_test_labels_seen = test_df["label"].unique()
print(_train_labels_seen)
print(_test_labels_seen)
print(set(_train_labels_seen) == set(_test_labels_seen))
print(_divider)

# The label -> id mapping, followed by the first training example and its label.
_first_row = train_df.loc[0]
print(label_dict)
print(_first_row["sentence"])
print(_first_row["label"])
print(_divider)

print(train_df.dtypes)

['SAD' 'HATE' 'OTHER' 'FEAR' 'ANGRY' 'HAPPY' 'SURPRISE']
['SAD' 'HAPPY' 'OTHER' 'SURPRISE' 'FEAR' 'HATE' 'ANGRY']
True
**************************************************
{'HAPPY': 0, 'SAD': 1, 'ANGRY': 2, 'FEAR': 3, 'SURPRISE': 4, 'HATE': 5, 'OTHER': 6}
خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام پس بدم
SAD
**************************************************
sentence    object
label       object
dtype: object


In [5]:
# Clean each split once and cache the result on disk; a split whose cleaned
# file already exists is skipped entirely.
tqdm.pandas()  # registers `progress_apply` on pandas objects
for _split, _frame in (("train", train_df), ("test", test_df)):
    _cleaned_path = f"{base_path}/data/{_split}_cleaned.tsv"
    if os.path.exists(_cleaned_path):
        continue
    # Cleans the sentence column in place, then writes the frame with a header row.
    _frame['sentence'] = _frame['sentence'].progress_apply(clean_text)
    _frame.to_csv(_cleaned_path, sep="\t", index=False)

# Function to apply preprocessing in parallel using joblib
# def parallel_apply(df, func):
#     processed_sentences = Parallel(n_jobs=-1)(
#         delayed(func)(text=sentence) for sentence in df['sentence'])
#     return processed_sentences
#
#
# if not os.path.exists(f"{base_path}/data/train_cleaned.tsv"):
#     train_df['sentence'] = parallel_apply(train_df, combined_preprocess)
#
# if not os.path.exists(f"{base_path}/data/test_cleaned.tsv"):
#     test_df['sentence'] = parallel_apply(test_df, combined_preprocess)

# Load cleaned datasets

In [6]:
# The cleaned files are read with their first row as the header (column names).
train_df = pd.read_csv(f"{base_path}/data/train_cleaned.tsv", sep="\t", header=0)
test_df = pd.read_csv(f"{base_path}/data/test_cleaned.tsv", sep="\t", header=0)

# Preview rows 0..5 of each split, then one full test sentence.
for _frame in (train_df, test_df):
    print(_frame.loc[0:5, "sentence"])
print(test_df.at[6, "sentence"])

0    خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام ...
1       از صدای پرنده دم دمای صبح متنفرم متنفرم متنفرم
2    کیفیتش خیلی خوبه با شک خریدم ولی واقعا راضیم ب...
3    چون همش با دوربین ثبت‌شده ایا میشه اعتراض زد؟؟...
4                    این وضع ب طرز خنده داری گریه داره
5    خب من رسما از یک نفر متنفرم چون از گربه بدش می...
Name: sentence, dtype: object
0    این شاید اولین عزای عمومی واقعی است که یاد دار...
1    دیشب بعد از ارسال تویت مربوط به آثار باستانی ت...
2    کدوم شعبه پول نداده بگو الان برات آمار دقیق بد...
3    امروز وسط یه بحث با بابا مامانم گفتم آدم باید ...
4    امشب گفت نامزدی دوستش که ادم روشنفکری است بهم ...
5    به امید موفقیت تیم ملی و پیروزی در بازی امروز ...
Name: sentence, dtype: object
با آرزوی موفقیت و پیروزی


In [7]:
# Hold out 10% of the training data for validation (fixed seed for repeatability).
_split_parts = train_test_split(
    train_df['sentence'],
    train_df['label'],
    test_size=0.1,
    random_state=42,
)
train_sentences, val_sentences, train_labels, val_labels = _split_parts

print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of validation sentences: {len(val_sentences)}")
print(f"Number of test sentences: {len(test_df)}")

Number of training sentences: 5512
Number of validation sentences: 613
Number of test sentences: 1151


# Clear the GPU memory cache

In [None]:
# To see which processes currently hold the GPU: !sudo fuser -v /dev/nvidia*
import gc

# Run the garbage collector first so that unreachable Python objects (and any
# tensors they keep alive) are released; only after that can empty_cache()
# return their cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

0

In [None]:
# Print the current working directory.
!pwd

/content/SA_collab/src


# Load the tokenizer and model

### In this part we fine-tune two Persian RoBERTa-family models (an XLM-RoBERTa large and a RoBERTa base) and compare their performance

In [9]:
# Hugging Face Hub checkpoints that are fine-tuned and compared below.
model_names = [
    'pedramyazdipoor/persian_xlm_roberta_large',  # noqa
    'HooshvareLab/roberta-fa-zwnj-base',  # noqa
]
# Directory handed as `cache_dir` to the training/testing helpers and to `from_pretrained`.
cache_dir = f'{base_path}/models/huggingface_cache'

# Set up the device and fine-tune each model

In [10]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

# Arguments that are identical for every model in the comparison.
_train_kwargs = dict(
    cache_dir=cache_dir,
    device=device,
    label_dict=label_dict,
    train_sentences=train_sentences,
    train_labels=train_labels,
    val_sentences=val_sentences,
    val_labels=val_labels,
    base_path=base_path,
    batch_size=16,
    epochs=8,
)

for model_name in model_names:
    print(f"Model name: {model_name}")
    # `train_parsbert` (imported above) is the alternative trainer that was used before.
    train_parsbert_with_l2(model_name=model_name, **_train_kwargs)

Device: cuda
Model name: pedramyazdipoor/persian_xlm_roberta_large


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at pedramyazdipoor/persian_xlm_roberta_large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|[32m██████████[0m| 345/345 [07:16<00:00,  1.26s/it]


Epoch 1: Train Loss: 1.9039 | Train Accuracy: 0.2480


Validation Epoch 1: 100%|[33m██████████[0m| 39/39 [00:14<00:00,  2.63it/s]


Epoch 1: Val Loss: 1.7891 | Val Accuracy: 0.3148
**************************************************
Epoch 1: New best model saved with val_loss 1.7891 & val_acc 0.3148


Training Epoch 2: 100%|[32m██████████[0m| 345/345 [07:16<00:00,  1.27s/it]


Epoch 2: Train Loss: 1.8478 | Train Accuracy: 0.2743


Validation Epoch 2: 100%|[33m██████████[0m| 39/39 [00:14<00:00,  2.64it/s]


Epoch 2: Val Loss: 1.8485 | Val Accuracy: 0.3083
**************************************************


Training Epoch 3: 100%|[32m██████████[0m| 345/345 [07:15<00:00,  1.26s/it]


Epoch 3: Train Loss: 1.6178 | Train Accuracy: 0.3757


Validation Epoch 3: 100%|[33m██████████[0m| 39/39 [00:14<00:00,  2.63it/s]


Epoch 3: Val Loss: 1.0602 | Val Accuracy: 0.6476
**************************************************
Epoch 3: New best model saved with val_loss 1.0602 & val_acc 0.6476


Training Epoch 4: 100%|[32m██████████[0m| 345/345 [07:16<00:00,  1.26s/it]


Epoch 4: Train Loss: 0.9908 | Train Accuracy: 0.6477


Validation Epoch 4: 100%|[33m██████████[0m| 39/39 [00:14<00:00,  2.64it/s]


Epoch 4: Val Loss: 0.8169 | Val Accuracy: 0.6998
**************************************************
Epoch 4: New best model saved with val_loss 0.8169 & val_acc 0.6998


Training Epoch 5: 100%|[32m██████████[0m| 345/345 [07:16<00:00,  1.26s/it]


Epoch 5: Train Loss: 0.7548 | Train Accuracy: 0.7366


Validation Epoch 5: 100%|[33m██████████[0m| 39/39 [00:14<00:00,  2.65it/s]


Epoch 5: Val Loss: 0.7466 | Val Accuracy: 0.7325
**************************************************
Epoch 5: New best model saved with val_loss 0.7466 & val_acc 0.7325


Training Epoch 6: 100%|[32m██████████[0m| 345/345 [07:17<00:00,  1.27s/it]


Epoch 6: Train Loss: 0.5773 | Train Accuracy: 0.8033


Validation Epoch 6: 100%|[33m██████████[0m| 39/39 [00:14<00:00,  2.64it/s]


Epoch 6: Val Loss: 0.7794 | Val Accuracy: 0.7129
**************************************************


Training Epoch 7: 100%|[32m██████████[0m| 345/345 [07:15<00:00,  1.26s/it]


Epoch 7: Train Loss: 0.4520 | Train Accuracy: 0.8489


Validation Epoch 7: 100%|[33m██████████[0m| 39/39 [00:14<00:00,  2.64it/s]


Epoch 7: Val Loss: 0.7808 | Val Accuracy: 0.7308
**************************************************


Training Epoch 8: 100%|[32m██████████[0m| 345/345 [07:15<00:00,  1.26s/it]


Epoch 8: Train Loss: 0.3573 | Train Accuracy: 0.8857


Validation Epoch 8: 100%|[33m██████████[0m| 39/39 [00:14<00:00,  2.65it/s]


Epoch 8: Val Loss: 0.8276 | Val Accuracy: 0.7374
**************************************************
Model name: HooshvareLab/roberta-fa-zwnj-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/roberta-fa-zwnj-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|[32m██████████[0m| 345/345 [02:06<00:00,  2.73it/s]


Epoch 1: Train Loss: 1.3229 | Train Accuracy: 0.5034


Validation Epoch 1: 100%|[33m██████████[0m| 39/39 [00:04<00:00,  8.44it/s]


Epoch 1: Val Loss: 0.9318 | Val Accuracy: 0.6493
**************************************************
Epoch 1: New best model saved with val_loss 0.9318 & val_acc 0.6493


Training Epoch 2: 100%|[32m██████████[0m| 345/345 [02:06<00:00,  2.73it/s]


Epoch 2: Train Loss: 0.8106 | Train Accuracy: 0.7056


Validation Epoch 2: 100%|[33m██████████[0m| 39/39 [00:04<00:00,  8.56it/s]


Epoch 2: Val Loss: 0.8770 | Val Accuracy: 0.6721
**************************************************
Epoch 2: New best model saved with val_loss 0.8770 & val_acc 0.6721


Training Epoch 3: 100%|[32m██████████[0m| 345/345 [02:06<00:00,  2.72it/s]


Epoch 3: Train Loss: 0.5157 | Train Accuracy: 0.8322


Validation Epoch 3: 100%|[33m██████████[0m| 39/39 [00:04<00:00,  8.50it/s]


Epoch 3: Val Loss: 0.9620 | Val Accuracy: 0.6835
**************************************************


Training Epoch 4: 100%|[32m██████████[0m| 345/345 [02:06<00:00,  2.74it/s]


Epoch 4: Train Loss: 0.2641 | Train Accuracy: 0.9251


Validation Epoch 4: 100%|[33m██████████[0m| 39/39 [00:04<00:00,  8.56it/s]


Epoch 4: Val Loss: 1.1100 | Val Accuracy: 0.6770
**************************************************


Training Epoch 5: 100%|[32m██████████[0m| 345/345 [02:06<00:00,  2.74it/s]


Epoch 5: Train Loss: 0.1409 | Train Accuracy: 0.9641


Validation Epoch 5: 100%|[33m██████████[0m| 39/39 [00:04<00:00,  8.56it/s]


Epoch 5: Val Loss: 1.3184 | Val Accuracy: 0.6525
**************************************************


Training Epoch 6: 100%|[32m██████████[0m| 345/345 [02:06<00:00,  2.73it/s]


Epoch 6: Train Loss: 0.0785 | Train Accuracy: 0.9826


Validation Epoch 6: 100%|[33m██████████[0m| 39/39 [00:04<00:00,  8.60it/s]


Epoch 6: Val Loss: 1.4042 | Val Accuracy: 0.6623
**************************************************


Training Epoch 7: 100%|[32m██████████[0m| 345/345 [02:06<00:00,  2.74it/s]


Epoch 7: Train Loss: 0.0452 | Train Accuracy: 0.9926


Validation Epoch 7: 100%|[33m██████████[0m| 39/39 [00:04<00:00,  8.64it/s]


Epoch 7: Val Loss: 1.4391 | Val Accuracy: 0.6542
**************************************************


Training Epoch 8: 100%|[32m██████████[0m| 345/345 [02:06<00:00,  2.74it/s]


Epoch 8: Train Loss: 0.0356 | Train Accuracy: 0.9937


Validation Epoch 8: 100%|[33m██████████[0m| 39/39 [00:04<00:00,  8.51it/s]

Epoch 8: Val Loss: 1.4451 | Val Accuracy: 0.6591
**************************************************





# Test trained models on the test set

In [11]:
# Arguments that are identical for every model being evaluated.
_test_kwargs = dict(
    cache_dir=cache_dir,
    device=device,
    label_dict=label_dict,
    base_path=base_path,
    batch_size=16,
)

for model_name in model_names:
    print(f"Model name: {model_name}")
    # Fresh Python lists are built per model, exactly as many times as models are tested.
    test_parsbert(
        model_name=model_name,
        test_sentences=test_df['sentence'].tolist(),
        test_labels=test_df['label'].tolist(),
        **_test_kwargs,
    )

Model name: pedramyazdipoor/persian_xlm_roberta_large


Testing: 100%|[34m██████████[0m| 72/72 [00:29<00:00,  2.48it/s]


Test Loss: 0.8306 | Test Accuracy: 0.7298
Precision: 0.7492 | Recall: 0.7099 | F1 Score: 0.7182
**************************************************
Model name: HooshvareLab/roberta-fa-zwnj-base


Testing: 100%|[34m██████████[0m| 72/72 [00:08<00:00,  8.06it/s]

Test Loss: 1.3254 | Test Accuracy: 0.5517
Precision: 0.6195 | Recall: 0.5245 | F1 Score: 0.5376
**************************************************





# Push the models to the Hugging Face model hub

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook; the pushes in the
# following cells need an authenticated session.
notebook_login()

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# SECURITY: a Hugging Face access token used to be hard-coded in this cell. It was
# saved with the notebook and must be treated as compromised: revoke it at
# https://huggingface.co/settings/tokens and create a new one.
# No token is passed any more; push_to_hub authenticates with the credentials
# stored by `notebook_login()` (or the HF_TOKEN environment variable / Colab secret).

# Presumably the best checkpoint saved by the training loop for this model.
model_path = "/content/SA_collab/models/pedramyazdipoor/persian_xlm_roberta_large/best"
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
# `ignore_mismatched_sizes=True` was dropped on purpose: a fine-tuned checkpoint must
# load exactly. With the flag, a size mismatch would silently re-initialise the
# classifier head and an untrained head would be pushed to the Hub.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label_dict),
    cache_dir=cache_dir,
)
model.push_to_hub("persian_xlm_roberta_large")
tokenizer.push_to_hub("persian_xlm_roberta_large", commit_message="Upload Tokenizer")



model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/farzanrahmani/persian_xlm_roberta_large/commit/68c8449a133801e554eeb5983cbed0c10d85055b', commit_message='Upload Tokenizer', commit_description='', oid='68c8449a133801e554eeb5983cbed0c10d85055b', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# SECURITY: a Hugging Face access token used to be hard-coded in this cell. It was
# saved with the notebook and must be treated as compromised: revoke it at
# https://huggingface.co/settings/tokens and create a new one.
# No token is passed any more; push_to_hub authenticates with the credentials
# stored by `notebook_login()` (or the HF_TOKEN environment variable / Colab secret).

# Presumably the best checkpoint saved by the training loop for this model.
model_path = "/content/SA_collab/models/HooshvareLab/roberta-fa-zwnj-base/best"
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
# `ignore_mismatched_sizes=True` was dropped on purpose: a fine-tuned checkpoint must
# load exactly. With the flag, a size mismatch would silently re-initialise the
# classifier head and an untrained head would be pushed to the Hub.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label_dict),
    cache_dir=cache_dir,
)
model.push_to_hub("roberta-fa-zwnj-base")
tokenizer.push_to_hub("roberta-fa-zwnj-base", commit_message="Upload Tokenizer")

model.safetensors:   0%|          | 0.00/473M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/farzanrahmani/roberta-fa-zwnj-base/commit/b623a2948d6efa7209ed2be27fa9b245c16f229a', commit_message='Upload Tokenizer', commit_description='', oid='b623a2948d6efa7209ed2be27fa9b245c16f229a', pr_url=None, pr_revision=None, pr_num=None)