In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from hazm import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
import re
import emoji
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline

In [2]:
import os

folder_path = "../stopwords"
STOPWORDS = set([
    "از", "به", "در", "با", "که", "را", "تا", "و", "یا", "اما", "اگر", "برای", "بر", 
    "این", "آن", "یک", "هر", "هم", "همه", "چند", "چنین", "دیگر", "چون", "مثل", 
    "مانند", "چرا", "زیرا", "ولی", "آیا", "اگرچه", "لذا", "نیز", "باید", "می", 
    "باشد", "است", "بود", "هست", "شد", "شو", "باش", "کرد", "کن", "کند", "کرده", 
    "شده", "می‌شود", "خواهد", "خواهند", "خواهی", "خواهیم", "توان", "تواند", 
    "توانند", "توانست", "توانسته", "بوده", "نبود", "نباشد", "نیست", "نیستند", 
    "بودند", "باشند", "هستند", "دارم", "داری", "دارد", "دارند", "داریم", "داشت", 
    "داشتند", "داشته", "داشتم", "ندارم", "ندارد", "ندارند", "نداریم", "نداشت", 
    "نداشتند", "نداشته", "ای", "ایم", "اید", "اند", "ام", "ت", "ها", "های", "هایی", 
    "شان", "ش", "مان", "تان", "اینها", "آنها", "چیز", "چیزی", "چرا", "چه", "که", 
    "کدام", "چگونه", "چقدر", "چراکه", "آنان", "او", "آن", "ایشان", "ما", "شما", 
    "آنچه", "آنجا", "اینجا", "اینجاست", "آنجاست", "همان", "خود", "همه‌اش", 
    "هیچ", "هیچ‌کدام", "هرگز", "هیچگاه", "حالا", "اکنون", "دیروز", "امروز", 
    "فردا", "شب", "روز", "بعد", "قبل", "ساعت", "وقت", "زمان", "چندین", "بار", 
    "کم", "بیشتر", "کمتر", "حتی", "فقط", "تنها", "بالا", "پایین", "روی", "زیر", 
    "جلو", "پشت", "نزدیک", "دور", "وسط", "بیرون", "درون", "داخل", "کنار", 
    "اینجا", "آنجا", "هیچ‌جا", "هرجا", "هرکجا", "جا", "مکان", "محل", "چپ", "راست", 
    "بعدا", "سپس", "آنگاه", "دیگر", "چیزهای", "یعنی", "خب", "آره", "نه", "باشه", 
    "آها", "بله", "نمیدانم", "کسی", "دیگری", "هیچ‌کسی", "چیزها"
])
# for filename in os.listdir(folder_path):
#     file_path = os.path.join(folder_path, filename)

#     if os.path.isfile(file_path) and filename.endswith(".txt"):  
#         with open(file_path, "r", encoding="utf-8") as file:
#             words = file.read().split()
#             STOPWORDS.update(words)  

In [3]:
reviews = pd.read_csv('../data/BaSalam.reviews.csv', low_memory=False)

In [4]:
df = reviews[(reviews['description'].notna())][['description', 'star']]

In [5]:
def is_sticker(token):
    # بررسی فرمت فایل
    if re.match(r'.*\.(webp|png|gif|jpg)$', token):
        return True
    # بررسی ایموجی
    if emoji.is_emoji(token):
        return True
    # بررسی لینک
    if re.match(r'https?://[^\s]+', token):
        return True
    return False

In [6]:
normalizer = Normalizer()
stemmer = Stemmer()


def preprocessing(comment):
    # حذف ایموجی‌ها
    comment = emoji.replace_emoji(comment, replace="")
    # حذف لینک‌ها
    comment = re.sub(r'https?://\S+|www\.\S+', '', comment)
    # حذف علامت‌های نگارشی
    comment = re.sub(r'[^\w\s]', '', comment)
    # حذف اعداد
    comment = re.sub(r'\d+', '', comment)
    text =  comment
    normalized = normalizer.normalize(text)
    tokens = word_tokenize(normalized)
    filtered = []
    for token in tokens:
        token = str(token)
        token = token.lower()
        token = re.sub(r'[\u200c\u200b\u200d]', ' ', token)
        if not token in STOPWORDS and not token.isdigit() and not is_sticker(token):
            filtered.append(token)
    return ' '.join(filtered)

# def preprocessing(text):
#     punc_removed = text.translate(str.maketrans('', '', string.punctuation))
#     normalized = normalizer.normalize(punc_removed)
#     stemmed = stemmer.stem(normalized)
#     tokens = word_tokenize(stemmed)
#     filtered = []
#     for token in tokens:
#         token = str(token)
#         token = token.lower()
#         if not token in stopwords_list() and not token.isdigit():
#             filtered.append(token)
#     return filtered

In [7]:
df['satisfaction'] = df['star'].apply(lambda x: 1 if x > 3 else 0)

In [8]:
df_train, to_much = train_test_split(df, test_size=0.95, random_state=42)

In [9]:
df_train['cleaned_comment'] = df_train['description'].apply(preprocessing)

In [10]:
df_train = df_train.drop(df_train[df_train['cleaned_comment'].apply(lambda x: len(word_tokenize(x)) > 1)==False].index)

In [11]:
data = df_train.reset_index()[['cleaned_comment', 'satisfaction']]
data.columns = ['comment', 'label']
data.head()

Unnamed: 0,comment,label
0,همچی عین تو عکس دقیق اندازه ممنون,1
1,سلام یه کفش بی کیفیت لطفا خرید نکنید رایگان به...,0
2,بسیار خوش عطر خورشت روغن انداخت لعاب داد کاملا...,1
3,نظرم شبیه عکس تو عکس زیاد کیفیت ممنون غرفه دار...,0
4,من خیلی کاربردی مخصوصا قهوه,1


In [12]:
data['label'] =data['label'].map({
    0:'negative',
    1:'positive'
}) 

In [13]:
negative_data = data[data['label'] == 'negative']
positive_data = data[data['label'] == 'positive']

cutting_point = min(len(negative_data), len(positive_data))

if cutting_point <= len(negative_data):
    negative_data = negative_data.sample(n=cutting_point).reset_index(drop=True)

if cutting_point <= len(positive_data):
    positive_data = positive_data.sample(n=cutting_point).reset_index(drop=True)

new_data = pd.concat([negative_data, positive_data])
new_data = new_data.sample(frac=1).reset_index(drop=True)
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18470 entries, 0 to 18469
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  18470 non-null  object
 1   label    18470 non-null  object
dtypes: object(2)
memory usage: 288.7+ KB


In [14]:
labels = list(sorted(data['label'].unique()))
labels

['negative', 'positive']

In [15]:
new_data['label_id'] = new_data['label'].apply(lambda t: labels.index(t))

train, test = train_test_split(new_data, test_size=0.1, random_state=1, stratify=new_data['label'])
train, valid = train_test_split(train, test_size=0.1, random_state=1, stratify=train['label'])

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

x_train, y_train = train['comment'].values.tolist(), train['label_id'].values.tolist()
x_valid, y_valid = valid['comment'].values.tolist(), valid['label_id'].values.tolist()
x_test, y_test = test['comment'].values.tolist(), test['label_id'].values.tolist()

print(train.shape)
print(valid.shape)
print(test.shape)

(14960, 3)
(1663, 3)
(1847, 3)


In [16]:
from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'device: {device}')

train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

device: cpu
CUDA is not available.  Training on CPU ...


In [18]:
# general config
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [19]:
# create a key finder based on label 2 id and id to label

label2id = {label: i for i, label in enumerate(labels)}
id2label = {v: k for k, v in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'negative': 0, 'positive': 1}
id2label: {0: 'negative', 1: 'positive'}


In [20]:
# setup the tokenizer and configuration

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "positive": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



In [21]:
idx = np.random.randint(0, len(train))
sample_comment = train.iloc[idx]['comment']
sample_label = train.iloc[idx]['label']

print(f'Sample: \n{sample_comment}\n{sample_label}')

Sample: 
فعلا بدستم رسیده استفاده کردم حتما نظر میدم ممنون ازغرفه دار
negative


In [22]:
tokens = tokenizer.tokenize(sample_comment)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f'  Comment: {sample_comment}')
print(f'   Tokens: {tokenizer.convert_tokens_to_string(tokens)}')
print(f'Token IDs: {token_ids}')

  Comment: فعلا بدستم رسیده استفاده کردم حتما نظر میدم ممنون ازغرفه دار
   Tokens: فعلا بدستم رسیده استفاده کردم حتما نظر میدم ممنون ازغرفه دار
Token IDs: [9771, 5529, 2015, 4583, 2988, 5501, 4567, 3138, 27409, 25303, 2791, 7969, 3395, 2916]


In [23]:
encoding = tokenizer.encode_plus(
    sample_comment,
    max_length=32,
    truncation=True,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    return_token_type_ids=True,
    return_attention_mask=True,
    padding='max_length',
    return_tensors='pt',  # Return PyTorch tensors
)

print(f'Keys: {encoding.keys()}\n')
for k in encoding.keys():
    print(f'{k}:\n{encoding[k]}')

Keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

input_ids:
tensor([[    2,  9771,  5529,  2015,  4583,  2988,  5501,  4567,  3138, 27409,
         25303,  2791,  7969,  3395,  2916,     4,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])
token_type_ids:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])


In [24]:
class TaaghcheDataset(torch.utils.data.Dataset):
    """ Create a PyTorch dataset for Taaghche. """

    def __init__(self, tokenizer, comments, targets=None, label_list=None, max_len=128):
        self.comments = comments
        self.targets = targets
        self.has_target = isinstance(targets, list) or isinstance(targets, np.ndarray)

        self.tokenizer = tokenizer
        self.max_len = max_len

        
        self.label_map = {label: i for i, label in enumerate(label_list)} if isinstance(label_list, list) else {}
    
    def __len__(self):
        return len(self.comments)

    def __getitem__(self, item):
        comment = str(self.comments[item])

        if self.has_target:
            target = self.label_map.get(str(self.targets[item]), str(self.targets[item]))

        encoding = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')
        
        inputs = {
            'comment': comment,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        if self.has_target:
            inputs['targets'] = torch.tensor(target, dtype=torch.long)
        
        return inputs


def create_data_loader(x, y, tokenizer, max_len, batch_size, label_list):
    dataset = TaaghcheDataset(
        comments=x,
        targets=y,
        tokenizer=tokenizer,
        max_len=max_len, 
        label_list=label_list)
    
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size)

In [25]:
label_list = ['negative', 'positive']
train_data_loader = create_data_loader(train['comment'].to_numpy(), train['label'].to_numpy(), tokenizer, MAX_LEN, TRAIN_BATCH_SIZE, label_list)
valid_data_loader = create_data_loader(valid['comment'].to_numpy(), valid['label'].to_numpy(), tokenizer, MAX_LEN, VALID_BATCH_SIZE, label_list)
test_data_loader = create_data_loader(test['comment'].to_numpy(), None, tokenizer, MAX_LEN, TEST_BATCH_SIZE, label_list)

In [26]:
sample_data = next(iter(train_data_loader))

print(sample_data.keys())

print(sample_data['comment'])
print(sample_data['input_ids'].shape)
print(sample_data['input_ids'][0, :])
print(sample_data['attention_mask'].shape)
print(sample_data['attention_mask'][0, :])
print(sample_data['token_type_ids'].shape)
print(sample_data['token_type_ids'][0, :])
print(sample_data['targets'].shape)
print(sample_data['targets'][0])

dict_keys(['comment', 'input_ids', 'attention_mask', 'token_type_ids', 'targets'])
['توجه قیمت خوب نظر کیفیت متوسط ارسال موقع', 'اصلا راضی نیستم مثلا سایز سفارش دادم اومده می بینم سایز کوچیک تره', 'همه اش سبز زرد داشتولی تمیز', 'ممنون وتشکر', 'بسیار بداونقدر خشک نمی شد کوبید بشه خورد من جاهای دیگه هلیله سیاه دیده بودماینطوری نبودن', 'سلام ممنون سپاسگزارم جناب آقای پریانی بابت ارسال سریع بسته بندی بسیار خوب کالای باکیفیت مرغوب آرزوی سلامتی خیر برکت برایتان', 'برام ارسال کردن آوردن آبکاری اش خوب مرجوع کردم یکبار دیگه برام ارسال کردن گلس هاش شکسته بودن آخر سر خودشون برام سه گلس خودشون برام آوردن دستشون درد نکنه آخر درست', 'باتشکر عوامل وغرفه داران باسلام سفارش رسید', 'تشکر غرفه دار محترم واقعا حلوا بسیار عالی ارسالشون سریع پیشنهاد می کنم یکبار حلوا استفاده کنید', 'سلام بسته من رسید قشنگه اونی تو عکس شلوارش کلا فرق میکنه', 'مزه شیره بد تمیز داخلش آشغال', 'کیفیت عالی', 'ممنون فروشنده بسته بندی وارسال خیلی خوب یه مزه آهن میده', 'کالا اشتباه فرستاده_شد', 'لباس راضیم برخورد فروشنده خوب', 'ارزا

In [27]:
sample_test = next(iter(test_data_loader))
print(sample_test.keys())

dict_keys(['comment', 'input_ids', 'attention_mask', 'token_type_ids'])


In [28]:
class SentimentModel(nn.Module):

    def __init__(self, config):
        super(SentimentModel, self).__init__()

        self.bert = BertModel.from_pretrained(MODEL_NAME_OR_PATH)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        _, pooled_output = self.bert(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids)
        
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits 

In [29]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()
pt_model = None

!nvidia-smi

NVIDIA-SMI has failed because you do not have suffient permissions. Please try running as an administrator.


In [30]:
pt_model = SentimentModel(config=config)
pt_model = pt_model.to(device)

print('pt_model', type(pt_model))

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pt_model <class '__main__.SentimentModel'>


In [33]:
# sample data output

sample_data_comment = sample_data['comment']
sample_data_input_ids = sample_data['input_ids']
sample_data_attention_mask = sample_data['attention_mask']
sample_data_token_type_ids = sample_data['token_type_ids']
sample_data_targets = sample_data['targets']

# available for using in GPU
sample_data_input_ids = sample_data_input_ids.to(device)
sample_data_attention_mask = sample_data_attention_mask.to(device)
sample_data_token_type_ids = sample_data_token_type_ids.to(device)
sample_data_targets = sample_data_targets.to(device)


# outputs = F.softmax(
#     pt_model(sample_data_input_ids, sample_data_attention_mask, sample_data_token_type_ids), 
#     dim=1)

outputs = pt_model(sample_data_input_ids, sample_data_attention_mask, sample_data_token_type_ids)
_, preds = torch.max(outputs, dim=1)

print(outputs[:5, :])
print(preds[:5])

TypeError: dropout(): argument 'input' (position 1) must be Tensor, not str

In [None]:
def simple_accuracy(y_true, y_pred):
    return (y_true == y_pred).mean()

def acc_and_f1(y_true, y_pred, average='weighted'):
    acc = simple_accuracy(y_true, y_pred)
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average=average)
    return {
        "acc": acc,
        "f1": f1,
    }

def y_loss(y_true, y_pred, losses):
    y_true = torch.stack(y_true).cpu().detach().numpy()
    y_pred = torch.stack(y_pred).cpu().detach().numpy()
    y = [y_true, y_pred]
    loss = np.mean(losses)

    return y, loss


def eval_op(model, data_loader, loss_fn):
    model.eval()

    losses = []
    y_pred = []
    y_true = []

    with torch.no_grad():
        for dl in tqdm(data_loader, total=len(data_loader), desc="Evaluation... "):
            
            input_ids = dl['input_ids']
            attention_mask = dl['attention_mask']
            token_type_ids = dl['token_type_ids']
            targets = dl['targets']

            # move tensors to GPU if CUDA is available
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            targets = targets.to(device)

            # compute predicted outputs by passing inputs to the model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)
            
            # convert output probabilities to predicted class
            _, preds = torch.max(outputs, dim=1)

            # calculate the batch loss
            loss = loss_fn(outputs, targets)

            # accumulate all the losses
            losses.append(loss.item())

            y_pred.extend(preds)
            y_true.extend(targets)
    
    eval_y, eval_loss = y_loss(y_true, y_pred, losses)
    return eval_y, eval_loss


def train_op(model, 
             data_loader, 
             loss_fn, 
             optimizer, 
             scheduler, 
             step=0, 
             print_every_step=100, 
             eval=False,
             eval_cb=None,
             eval_loss_min=np.Inf,
             eval_data_loader=None, 
             clip=0.0):
    
    model.train()

    losses = []
    y_pred = []
    y_true = []

    for dl in tqdm(data_loader, total=len(data_loader), desc="Training... "):
        step += 1

        input_ids = dl['input_ids']
        attention_mask = dl['attention_mask']
        token_type_ids = dl['token_type_ids']
        targets = dl['targets']

        # move tensors to GPU if CUDA is available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        targets = targets.to(device)

        # clear the gradients of all optimized variables
        optimizer.zero_grad()

        # compute predicted outputs by passing inputs to the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)
        
        # convert output probabilities to predicted class
        _, preds = torch.max(outputs, dim=1)

        # calculate the batch loss
        loss = loss_fn(outputs, targets)

        # accumulate all the losses
        losses.append(loss.item())

        # compute gradient of the loss with respect to model parameters
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if clip > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        # perform optimization step
        optimizer.step()

        # perform scheduler step
        scheduler.step()

        y_pred.extend(preds)
        y_true.extend(targets)

        if eval:
            train_y, train_loss = y_loss(y_true, y_pred, losses)
            train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

            if step % print_every_step == 0:
                eval_y, eval_loss = eval_op(model, eval_data_loader, loss_fn)
                eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

                if hasattr(eval_cb, '__call__'):
                    eval_loss_min = eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min)

    train_y, train_loss = y_loss(y_true, y_pred, losses)

    return train_y, train_loss, step, eval_loss_min