In [2]:
import pandas as pd
import random
import numpy as np
import torch
from torch import nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModel

seed = 10
random.seed(10)
np.random.seed(10)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)

cpu


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from IPython import display
def rtl_print(outputs, font_size="15px", n_to_br=False):
    outputs = outputs if isinstance(outputs, list) else [outputs] 
    if n_to_br:
        outputs = [output.replace('\n', '') for output in outputs]
        
    outputs = [f'{output}' for output in outputs]
    display.display(display.HTML(' '.join(outputs)))


In [5]:
df = pd.read_excel('./data/datasets/sentipers.xlsx', sheet_name=0)
df.head()

Unnamed: 0,text,polarity
0,اینک قصد داریم پرینتر دیگری از پرینترهای لیزری...,neutral
1,پرینتری چند کاره از رده‌ی Entry Level یا سطح م...,neutral
2,به هر صورت اکنون ما در دنیایی زندگی می‌کنیم، ...,neutral
3,به صورتی که توانایی کپی کردن، اسکن، فکس، پر...,neutral
4,به هر صورت معمولا چیزی که بیشتر کاربران از پری...,very good


In [6]:
label_encoder = LabelEncoder()
df['polarity_id'] = label_encoder.fit_transform(df['polarity'])

In [22]:
df.head()

Unnamed: 0,text,polarity,polarity_id
0,اینک قصد داریم پرینتر دیگری از پرینترهای لیزری...,neutral,2
1,پرینتری چند کاره از رده‌ی Entry Level یا سطح م...,neutral,2
2,به هر صورت اکنون ما در دنیایی زندگی می‌کنیم، ...,neutral,2
3,به صورتی که توانایی کپی کردن، اسکن، فکس، پر...,neutral,2
4,به هر صورت معمولا چیزی که بیشتر کاربران از پری...,very good,4


In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=seed)
val_df, test_df =  train_test_split(test_df, test_size=0.5, random_state=seed)

In [23]:
text = df.iloc[0]["text"]

In [31]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = 50

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["text"]
        polarity = row["polarity"]
        polarity_id = row["polarity_id"]
        # Tokenize inputs
        encoded_inputs = self.tokenizer.encode_plus(
            text,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = encoded_inputs["input_ids"].squeeze()
        attention_mask = encoded_inputs["attention_mask"].squeeze()
        token_type_id = encoded_inputs["token_type_ids"].squeeze()
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_id,
            "polarity": polarity,
            "polarity_id": torch.tensor(polarity_id, dtype=torch.long),
        }

In [32]:
from multiprocessing import cpu_count

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/labse")
train_dataset = Dataset(train_df, tokenizer)
val_dataset = Dataset(val_df, tokenizer)
test_dataset = Dataset(test_df, tokenizer)


batch_size = 32
num_workers = cpu_count() - 2
pin_memory = True if device == "cuda" else False
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
    pin_memory=pin_memory,
)

In [21]:
config = AutoConfig.from_pretrained("sentence-transformers/labse")
model = AutoModel.from_pretrained("sentence-transformers/LaBSE")

Downloading (…)lve/main/config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [26]:
class LaBSEModel(nn.Module):
    """Performs prediction, given the input of BERT embeddings."""

    def __init__(
        self,
        label_num=3,
        reinit_num=0,
        freeze_layers=True,
    ):
        super(LaBSEModel, self).__init__()
        # self.bert = BertModel.from_pretrained('bert-base-uncased')
        # self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.config = AutoConfig.from_pretrained("sentence-transformers/LaBSE")
        # self.tokenizer = AutoTokenizer.from_pretrained(
        #     "sentence-transformers/LaBSE"
        # )
        self.model = AutoModel.from_pretrained("sentence-transformers/LaBSE")

        self.num_hidden_layers = self.config.num_hidden_layers
        self.vdim = self.config.hidden_size

        self.nli_head1 = nn.Linear(self.vdim, 250)
        self.nli_head2 = nn.Linear(250, label_num)
        self.sm = nn.Softmax(dim=1)
        self.reinit(layer_num=reinit_num, freeze=freeze_layers)


    def reinit(self, layer_num, freeze):
        """Reinitialise parameters of last N layers and freeze all others"""
        if freeze:
            for _, pp in self.model.named_parameters():
                pp.requires_grad = False

        # if layer_num >= 0:
        #     layer_idx = [self.num_hidden_layers - 1 - i for i in range(layer_num)]
        #     layer_names = ["encoder.layer.{}".format(j) for j in layer_idx]
        #     for pn, pp in self.model.named_parameters():
        #         if any([ln in pn for ln in layer_names]) or "pooler." in pn:
        #             pp.data = torch.randn(pp.shape) * 0.02
        #             pp.requires_grad = True


    def forward(self, input_ids, attention_mask, token_type_ids, checkpoint=False):
        cls_vecs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )[1]

        logits = self.nli_head1(cls_vecs)
        logits = self.nli_head2(logits)
        probs = self.sm(logits)

        torch.cuda.empty_cache() # releases all unoccupied cached memory

        return logits, probs

In [27]:
model = LaBSEModel(label_num=len(label_encoder.classes_))
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-4,eps=1e-6)

In [33]:

from train import train
from easydict import EasyDict

dataset = EasyDict(
    {
        "train": train_dataset,
        "val": test_dataset,
        "test": test_dataset,
    }
)
data_loader = EasyDict(
    {
        "train": train_loader,
        "val": test_loader,
        "test": test_loader,
    }
)



start = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    data_loader=data_loader,
    dataset=dataset,
    device=device,
    epochs=20,
)

model/state-LaBSEModel-optimizer-AdamW-loss-CrossEntropyLoss.pth Not exist
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after paralleli

                                       

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/amir/.pyenv/versions/3.11.2/envs/uni/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/amir/.pyenv/versions/3.11.2/envs/uni/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/amir/.pyenv/versions/3.11.2/envs/uni/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 264, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/amir/.pyenv/versions/3.11.2/envs/uni/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 127, in collate
    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/amir/.pyenv/versions/3.11.2/envs/uni/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 127, in <dictcomp>
    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/amir/.pyenv/versions/3.11.2/envs/uni/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 119, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/amir/.pyenv/versions/3.11.2/envs/uni/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
    return torch.stack(batch, 0, out=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: stack expects each tensor to be equal size, but got [52] at entry 0 and [50] at entry 1
