In [1]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from utils.constants import base_path, label_dict
from utils.parsbert import train_parsbert, test_parsbert
from utils.preprocess import clean_text

# Load raw datasets

In [2]:
pd.set_option('future.no_silent_downcasting', True)  # noqa

# Load datasets
train_df = pd.read_csv(f"{base_path}/data/train.tsv", sep="\t", header=None, names=["sentence", "label"])
test_df = pd.read_csv(f"{base_path}/data/test.tsv", sep="\t", header=None, names=["sentence", "label"])

# print distinct labels
print(train_df.label.unique())
print(test_df.label.unique())
print(set(train_df.label.unique()) == set(test_df.label.unique()))
print('*' * 50)

print(label_dict)
print(train_df.loc[0, "sentence"])
print(train_df.loc[0, "label"])
print('*' * 50)

print(train_df.dtypes)

['SAD' 'HATE' 'OTHER' 'FEAR' 'ANGRY' 'HAPPY' 'SURPRISE']
['SAD' 'HAPPY' 'OTHER' 'SURPRISE' 'FEAR' 'HATE' 'ANGRY']
True
**************************************************
{'HAPPY': 0, 'SAD': 1, 'ANGRY': 2, 'FEAR': 3, 'SURPRISE': 4, 'HATE': 5, 'OTHER': 6}
خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام پس بدم
SAD
**************************************************
sentence    object
label       object
dtype: object


In [3]:
# clean data and save
tqdm.pandas()
if not os.path.exists(f"{base_path}/data/train_cleaned.tsv"):
    train_df['sentence'] = train_df['sentence'].progress_apply(clean_text)
    train_df.to_csv(f"{base_path}/data/train_cleaned.tsv", sep="\t", index=False)
if not os.path.exists(f"{base_path}/data/test_cleaned.tsv"):
    test_df['sentence'] = test_df['sentence'].progress_apply(clean_text)
    test_df.to_csv(f"{base_path}/data/test_cleaned.tsv", sep="\t", index=False)

# Function to apply preprocessing in parallel using joblib
# def parallel_apply(df, func):
#     processed_sentences = Parallel(n_jobs=-1)(
#         delayed(func)(text=sentence) for sentence in df['sentence'])
#     return processed_sentences
# 
# 
# if not os.path.exists(f"{base_path}/data/train_cleaned.tsv"):
#     train_df['sentence'] = parallel_apply(train_df, combined_preprocess)
# 
# if not os.path.exists(f"{base_path}/data/test_cleaned.tsv"):
#     test_df['sentence'] = parallel_apply(test_df, combined_preprocess)

# Load cleaned datasets

In [4]:
# First row is header
train_df = pd.read_csv(f"{base_path}/data/train_cleaned.tsv", sep="\t")
test_df = pd.read_csv(f"{base_path}/data/test_cleaned.tsv", sep="\t")

print(train_df.loc[0:5, "sentence"])
print(test_df.loc[0:5, "sentence"])
print(test_df.loc[6, "sentence"])

0    خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام ...
1       از صدای پرنده دم دمای صبح متنفرم متنفرم متنفرم
2    کیفیتش خیلی خوبه با شک خریدم ولی واقعا راضیم ب...
3    چون همش با دوربین ثبت‌شده ایا میشه اعتراض زد؟؟...
4                    این وضع ب طرز خنده داری گریه داره
5    خب من رسما از یک نفر متنفرم چون از گربه بدش می...
Name: sentence, dtype: object
0    این شاید اولین عزای عمومی واقعی است که یاد دار...
1    دیشب بعد از ارسال تویت مربوط به آثار باستانی ت...
2    کدوم شعبه پول نداده بگو الان برات آمار دقیق بد...
3    امروز وسط یه بحث با بابا مامانم گفتم آدم باید ...
4    امشب گفت نامزدی دوستش که ادم روشنفکری است بهم ...
5    به امید موفقیت تیم ملی و پیروزی در بازی امروز ...
Name: sentence, dtype: object
با آرزوی موفقیت و پیروزی


In [5]:
# Split train dataset for validation
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df['sentence'], train_df['label'], test_size=0.2, random_state=42
)
print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of validation sentences: {len(val_sentences)}")
print(f"Number of test sentences: {len(test_df)}")

Number of training sentences: 4900
Number of validation sentences: 1225
Number of test sentences: 1151


# Load the tokenizer and model

In [6]:
model_names = [
    'HooshvareLab/bert-fa-base-uncased-sentiment-snappfood',  # noqa
    'HooshvareLab/bert-fa-base-uncased-sentiment-digikala',  # noqa
    'HooshvareLab/bert-fa-base-uncased',  # noqa
    'HooshvareLab/bert-fa-zwnj-base'  # ParsBERT (v3.0) # noqa
]
cache_dir = f'{base_path}/models/huggingface_cache'

# Set up the training arguments

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

for model_name in model_names:
    print(f"Model name: {model_name}")
    train_parsbert(
        model_name=model_name,
        cache_dir=cache_dir,
        device=device,
        label_dict=label_dict,
        train_sentences=train_sentences,
        train_labels=train_labels,
        val_sentences=val_sentences,
        val_labels=val_labels,
        base_path=base_path
    )

Device: cuda
Model name: HooshvareLab/bert-fa-base-uncased-sentiment-snappfood


  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased-sentiment-snappfood and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|[32m██████████[0m| 613/613 [04:00<00:00,  2.55it/s, Loss=1.34] 



Train Loss: 1.2156 | Train Accuracy: 0.5627


Validation Epoch 1: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.34it/s]


Validation Loss: 0.8682 | Validation Accuracy: 0.6906
**************************************************
Saving new best model at epoch 1


Training Epoch 2: 100%|[32m██████████[0m| 613/613 [04:02<00:00,  2.53it/s, Loss=1.48]  



Train Loss: 0.6635 | Train Accuracy: 0.7771


Validation Epoch 2: 100%|[33m██████████[0m| 154/154 [00:19<00:00,  7.72it/s]


Validation Loss: 0.8854 | Validation Accuracy: 0.6939
**************************************************


Training Epoch 3: 100%|[32m██████████[0m| 613/613 [04:04<00:00,  2.51it/s, Loss=0.185] 



Train Loss: 0.3032 | Train Accuracy: 0.9088


Validation Epoch 3: 100%|[33m██████████[0m| 154/154 [00:19<00:00,  7.79it/s]


Validation Loss: 1.2488 | Validation Accuracy: 0.6490
**************************************************


Training Epoch 4: 100%|[32m██████████[0m| 613/613 [04:03<00:00,  2.52it/s, Loss=0.488]  



Train Loss: 0.1258 | Train Accuracy: 0.9659


Validation Epoch 4: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.33it/s]


Validation Loss: 1.2613 | Validation Accuracy: 0.6849
**************************************************


Training Epoch 5: 100%|[32m██████████[0m| 613/613 [04:02<00:00,  2.53it/s, Loss=0.00785]



Train Loss: 0.0710 | Train Accuracy: 0.9804


Validation Epoch 5: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.42it/s]


Validation Loss: 1.4506 | Validation Accuracy: 0.6694
**************************************************


Training Epoch 6: 100%|[32m██████████[0m| 613/613 [04:03<00:00,  2.52it/s, Loss=0.0453] 



Train Loss: 0.0404 | Train Accuracy: 0.9886


Validation Epoch 6: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.34it/s]


Validation Loss: 1.5195 | Validation Accuracy: 0.6694
**************************************************


Training Epoch 7: 100%|[32m██████████[0m| 613/613 [04:03<00:00,  2.52it/s, Loss=0.0143] 



Train Loss: 0.0347 | Train Accuracy: 0.9896


Validation Epoch 7: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.35it/s]


Validation Loss: 1.5756 | Validation Accuracy: 0.6816
**************************************************
Early stopping at epoch 7. Best epoch: 1
Model name: HooshvareLab/bert-fa-base-uncased-sentiment-digikala


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/651M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased-sentiment-digikala and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|[32m██████████[0m| 613/613 [03:47<00:00,  2.69it/s, Loss=1.1]  



Train Loss: 1.2020 | Train Accuracy: 0.5608


Validation Epoch 1: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.37it/s]


Validation Loss: 0.8663 | Validation Accuracy: 0.6816
**************************************************
Saving new best model at epoch 1


Training Epoch 2: 100%|[32m██████████[0m| 613/613 [03:57<00:00,  2.58it/s, Loss=1.8]   



Train Loss: 0.6816 | Train Accuracy: 0.7655


Validation Epoch 2: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.34it/s]


Validation Loss: 0.9621 | Validation Accuracy: 0.6710
**************************************************


Training Epoch 3: 100%|[32m██████████[0m| 613/613 [03:59<00:00,  2.56it/s, Loss=0.132] 



Train Loss: 0.3099 | Train Accuracy: 0.9051


Validation Epoch 3: 100%|[33m██████████[0m| 154/154 [00:19<00:00,  7.79it/s]


Validation Loss: 1.0855 | Validation Accuracy: 0.6792
**************************************************


Training Epoch 4: 100%|[32m██████████[0m| 613/613 [04:00<00:00,  2.55it/s, Loss=0.0184] 



Train Loss: 0.1418 | Train Accuracy: 0.9571


Validation Epoch 4: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.36it/s]


Validation Loss: 1.2407 | Validation Accuracy: 0.6898
**************************************************


Training Epoch 5: 100%|[32m██████████[0m| 613/613 [03:59<00:00,  2.56it/s, Loss=0.14]   



Train Loss: 0.0622 | Train Accuracy: 0.9841


Validation Epoch 5: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.37it/s]


Validation Loss: 1.4113 | Validation Accuracy: 0.6931
**************************************************


Training Epoch 6: 100%|[32m██████████[0m| 613/613 [03:59<00:00,  2.56it/s, Loss=0.00318]



Train Loss: 0.0430 | Train Accuracy: 0.9884


Validation Epoch 6: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.66it/s]


Validation Loss: 1.4495 | Validation Accuracy: 0.6890
**************************************************


Training Epoch 7: 100%|[32m██████████[0m| 613/613 [04:00<00:00,  2.55it/s, Loss=0.00519]



Train Loss: 0.0265 | Train Accuracy: 0.9929


Validation Epoch 7: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.52it/s]


Validation Loss: 1.5158 | Validation Accuracy: 0.7004
**************************************************
Early stopping at epoch 7. Best epoch: 1
Model name: HooshvareLab/bert-fa-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|[32m██████████[0m| 613/613 [03:55<00:00,  2.60it/s, Loss=1.36] 



Train Loss: 1.2278 | Train Accuracy: 0.5514


Validation Epoch 1: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.34it/s]


Validation Loss: 0.9095 | Validation Accuracy: 0.6776
**************************************************
Saving new best model at epoch 1


Training Epoch 2: 100%|[32m██████████[0m| 613/613 [03:59<00:00,  2.56it/s, Loss=1.16]  



Train Loss: 0.6792 | Train Accuracy: 0.7682


Validation Epoch 2: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.54it/s]


Validation Loss: 0.8828 | Validation Accuracy: 0.6922
**************************************************
Saving new best model at epoch 2


Training Epoch 3: 100%|[32m██████████[0m| 613/613 [04:00<00:00,  2.55it/s, Loss=0.0165]



Train Loss: 0.3294 | Train Accuracy: 0.8967


Validation Epoch 3: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.31it/s]


Validation Loss: 1.0791 | Validation Accuracy: 0.6759
**************************************************


Training Epoch 4: 100%|[32m██████████[0m| 613/613 [03:59<00:00,  2.56it/s, Loss=0.0211] 



Train Loss: 0.1273 | Train Accuracy: 0.9629


Validation Epoch 4: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.29it/s]


Validation Loss: 1.2289 | Validation Accuracy: 0.6849
**************************************************


Training Epoch 5: 100%|[32m██████████[0m| 613/613 [03:58<00:00,  2.57it/s, Loss=0.117]  



Train Loss: 0.0655 | Train Accuracy: 0.9820


Validation Epoch 5: 100%|[33m██████████[0m| 154/154 [00:20<00:00,  7.53it/s]


Validation Loss: 1.3353 | Validation Accuracy: 0.6931
**************************************************


Training Epoch 6: 100%|[32m██████████[0m| 613/613 [03:59<00:00,  2.56it/s, Loss=0.00488]



Train Loss: 0.0385 | Train Accuracy: 0.9910


Validation Epoch 6: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.28it/s]


Validation Loss: 1.3817 | Validation Accuracy: 0.7029
**************************************************


Training Epoch 7: 100%|[32m██████████[0m| 613/613 [03:59<00:00,  2.56it/s, Loss=0.248]  



Train Loss: 0.0300 | Train Accuracy: 0.9929


Validation Epoch 7: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.28it/s]


Validation Loss: 1.4625 | Validation Accuracy: 0.7004
**************************************************


Training Epoch 8: 100%|[32m██████████[0m| 613/613 [03:59<00:00,  2.56it/s, Loss=0.00371]



Train Loss: 0.0175 | Train Accuracy: 0.9949


Validation Epoch 8: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.30it/s]


Validation Loss: 1.5663 | Validation Accuracy: 0.7037
**************************************************
Early stopping at epoch 8. Best epoch: 2
Model name: HooshvareLab/bert-fa-zwnj-base


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|[32m██████████[0m| 613/613 [03:51<00:00,  2.64it/s, Loss=0.522]



Train Loss: 1.2881 | Train Accuracy: 0.5243


Validation Epoch 1: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.30it/s]


Validation Loss: 0.9698 | Validation Accuracy: 0.6449
**************************************************
Saving new best model at epoch 1


Training Epoch 2: 100%|[32m██████████[0m| 613/613 [03:53<00:00,  2.63it/s, Loss=0.585]



Train Loss: 0.7859 | Train Accuracy: 0.7249


Validation Epoch 2: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.29it/s]


Validation Loss: 0.8919 | Validation Accuracy: 0.6743
**************************************************
Saving new best model at epoch 2


Training Epoch 3: 100%|[32m██████████[0m| 613/613 [03:54<00:00,  2.62it/s, Loss=1.1]   



Train Loss: 0.4584 | Train Accuracy: 0.8494


Validation Epoch 3: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.31it/s]


Validation Loss: 1.0340 | Validation Accuracy: 0.6702
**************************************************


Training Epoch 4: 100%|[32m██████████[0m| 613/613 [03:54<00:00,  2.61it/s, Loss=0.0191]



Train Loss: 0.2026 | Train Accuracy: 0.9396


Validation Epoch 4: 100%|[33m██████████[0m| 154/154 [00:19<00:00,  8.07it/s]


Validation Loss: 1.2498 | Validation Accuracy: 0.6596
**************************************************


Training Epoch 5: 100%|[32m██████████[0m| 613/613 [03:54<00:00,  2.62it/s, Loss=0.0138] 



Train Loss: 0.0929 | Train Accuracy: 0.9753


Validation Epoch 5: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.28it/s]


Validation Loss: 1.4132 | Validation Accuracy: 0.6473
**************************************************


Training Epoch 6: 100%|[32m██████████[0m| 613/613 [03:54<00:00,  2.61it/s, Loss=0.00242]



Train Loss: 0.0555 | Train Accuracy: 0.9869


Validation Epoch 6: 100%|[33m██████████[0m| 154/154 [00:19<00:00,  7.71it/s]


Validation Loss: 1.4666 | Validation Accuracy: 0.6718
**************************************************


Training Epoch 7: 100%|[32m██████████[0m| 613/613 [03:53<00:00,  2.62it/s, Loss=0.00829]



Train Loss: 0.0374 | Train Accuracy: 0.9910


Validation Epoch 7: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.26it/s]


Validation Loss: 1.5017 | Validation Accuracy: 0.6678
**************************************************


Training Epoch 8: 100%|[32m██████████[0m| 613/613 [03:55<00:00,  2.61it/s, Loss=0.00285]



Train Loss: 0.0208 | Train Accuracy: 0.9951


Validation Epoch 8: 100%|[33m██████████[0m| 154/154 [00:21<00:00,  7.28it/s]

Validation Loss: 1.6148 | Validation Accuracy: 0.6759
**************************************************
Early stopping at epoch 8. Best epoch: 2





In [8]:
for model_name in model_names:
    print(f"Model name: {model_name}")
    test_parsbert(
        model_name=model_name,
        cache_dir=cache_dir,
        device=device,
        label_dict=label_dict,
        test_sentences=test_df['sentence'].to_list(),
        test_labels=test_df['label'].to_list(),
        base_path=base_path
    )

Model name: HooshvareLab/bert-fa-base-uncased-sentiment-snappfood


Testing: 100%|[34m██████████[0m| 144/144 [00:18<00:00,  7.93it/s]


Test Loss: 1.1351 | Test Accuracy: 0.5995
Precision: 0.6524 | Recall: 0.5897 | F1 Score: 0.5936
**************************************************
Model name: HooshvareLab/bert-fa-base-uncased-sentiment-digikala


Testing: 100%|[34m██████████[0m| 144/144 [00:18<00:00,  7.89it/s]


Test Loss: 1.1684 | Test Accuracy: 0.5934
Precision: 0.6430 | Recall: 0.5804 | F1 Score: 0.5848
**************************************************
Model name: HooshvareLab/bert-fa-base-uncased


Testing: 100%|[34m██████████[0m| 144/144 [00:18<00:00,  7.91it/s]


Test Loss: 1.1109 | Test Accuracy: 0.6229
Precision: 0.6745 | Recall: 0.5994 | F1 Score: 0.6134
**************************************************
Model name: HooshvareLab/bert-fa-zwnj-base


Testing: 100%|[34m██████████[0m| 144/144 [00:18<00:00,  7.95it/s]

Test Loss: 1.1629 | Test Accuracy: 0.6003
Precision: 0.6630 | Recall: 0.5911 | F1 Score: 0.6030
**************************************************



