In [1]:
import pandas as pd
import numpy as np
from hazm import stopwords_list
from tqdm import tqdm


In [2]:
reviews = pd.read_csv('data/BaSalam.reviews.csv', low_memory=False)

In [3]:
reviews.iloc[0]

_id                             661ba7096a6e1c5d7e653541
productId                                         824662
star                                                   5
user_id                                         15127771
isPost                                             False
isPublic                                            True
id                                              11220580
createdAt                            2024-04-03T23:45:57
updatedAt                            2024-04-03T23:45:57
hashId                                            rBNa53
isPosted                                           False
isLikedByCurrentUser                               False
isDislikedByCurrentUser                            False
likeCount                                              0
dislikeCount                                           0
attachments                {'photos': [], 'video': None}
history_count                                          1
user_id_of_user                

# بیز ساده‌لوحانه (Naive Bayes)

In [13]:
from hazm import Normalizer, Stemmer, word_tokenize
import string

normalizer = Normalizer()
stemmer = Stemmer()

def preprocessing(text):
    punc_removed = text.translate(str.maketrans('', '', string.punctuation))
    normalized = normalizer.normalize(punc_removed)
    stemmed = stemmer.stem(normalized)
    tokens = word_tokenize(stemmed)
    filtered = []
    for token in tokens:
        token = str(token)
        token = token.lower()
        if not token in stopwords_list() and not token.isdigit():
            filtered.append(token)
    return filtered

In [62]:
df = reviews[(reviews['description'].notna())][['description', 'star']]

In [63]:
df['satisfaction'] = df['star'].apply(lambda x: 1 if x > 3 else 0)

In [64]:
df.shape

(2001788, 3)

In [65]:
from sklearn.model_selection import train_test_split
df, blah = train_test_split(df, test_size=0.95, random_state=42)
data, test = train_test_split(df, test_size=0.2, random_state=42)

In [68]:
class_count = data['satisfaction'].value_counts().to_dict()

prior_probability = {key: value / len(data) for key, value in class_count.items()}
prior_probability

{1: 0.8955427058485594, 0: 0.1044572941514406}

In [69]:
def token_counter(texts):
    count_dict = {}
    for text in tqdm(texts):
        preprocessed = preprocessing(text)
        for token in preprocessed:
            if token in count_dict:
                count_dict[token] += 1
            else:
                count_dict[token] = 1
    return count_dict

In [70]:
negative_class_count = token_counter(data[data['satisfaction'] == 0]['description'])
print(f'Negative class - Vocab size: {len(negative_class_count)}, Total count: {sum(negative_class_count.values())}')

100%|██████████| 8364/8364 [00:28<00:00, 292.30it/s]

Negative class - Vocab size: 12889, Total count: 75578





In [78]:
negative_class_count

{'انتظار': 84,
 'داشته_باشید': 11,
 'ریمل\u200cهای': 1,
 'دیگه': 306,
 'مژه': 5,
 'هاتون': 2,
 'فرم': 6,
 'بلندی': 5,
 'بده': 51,
 'اصلا': 946,
 'اینجوری': 13,
 '،': 1643,
 'رنگ': 399,
 'خاصی': 34,
 'داره': 462,
 'مژه\u200cها': 2,
 'مشکی': 76,
 'میکنه': 112,
 'واقع': 17,
 'فقط\u200cی': 2,
 'مقدار': 45,
 'شکل': 31,
 'میده': 116,
 'اونم': 31,
 'میمونه': 18,
 'آب': 75,
 'زده_باشید': 1,
 'بوش': 24,
 'خوبه': 560,
 'برخورد': 80,
 'غرفه': 646,
 'راضی': 423,
 'سپاسگزار': 9,
 'نپسندید': 7,
 'بدنبود': 23,
 'ظاهر': 36,
 'شیک': 15,
 'جنس': 521,
 'بی\u200cکیف': 41,
 'ممنون': 369,
 'فروشنده': 458,
 'اصلاتاثیرندا': 1,
 'قرقره': 2,
 'یکم': 104,
 'گیر': 16,
 'بدنیس': 6,
 'درودnعسل': 1,
 'ارسالی': 67,
 'بو': 68,
 'طعم': 133,
 'کمتر': 37,
 'عسل': 49,
 'معمولی': 136,
 'اس': 119,
 'فوق\u200cالعاده': 35,
 'رفتار': 26,
 'زشت': 8,
 'ستاره': 85,
 'دادم': 356,
 'اومده': 27,
 'نوشته': 48,
 'حیف': 43,
 'اون': 244,
 'دوتا': 50,
 'جورابی': 2,
 'اشانتیون': 9,
 'لیاقت': 1,
 'نداری': 3,
 'درصورتیکه': 4,
 'جفت': 12,
 '

In [71]:
positive_class_count = token_counter(data[data['satisfaction'] == 1]['description'])
print(f'Positive class - Vocab size: {len(positive_class_count)}, Total count: {sum(positive_class_count.values())}')

100%|██████████| 71707/71707 [02:58<00:00, 401.18it/s]

Positive class - Vocab size: 36499, Total count: 447743





In [72]:
class_based_count = [negative_class_count, positive_class_count]
vocab_size = len(set(list(negative_class_count.keys()) + list(positive_class_count.keys())))
total_count = [sum(negative_class_count.values()), sum(positive_class_count.values())] 

In [73]:
def compute_probability(text, cls):
    total_probability = 1.0
    preprocessed = preprocessing(text)
    for token in preprocessed:
        try:
            word_count = class_based_count[cls][token]
        except:
            word_count = 0
        word_prob = (word_count + 1) / (total_count[cls] + vocab_size + 1)
        total_probability = total_probability * word_prob
    total_probability = total_probability * prior_probability[cls]
    return total_probability

In [74]:
def predict(test):
    predictions = []
    for text in test:
        neg_prob = compute_probability(text, 0)
        pos_prob = compute_probability(text, 1)
        if neg_prob > pos_prob:
            predictions.append(0)
        else:
            predictions.append(1)
    return np.array(predictions)

In [85]:
from sklearn.metrics import accuracy_score, classification_report


In [81]:

train_predictions = predict(data['description'])
accuracy_score(data['satisfaction'], train_predictions)

0.9313609171859975

In [88]:
test_predictions = predict(test['description'])
accuracy_score(test['satisfaction'], test_predictions)

0.9117793985413128

In [89]:
print(classification_report(data['satisfaction'], train_predictions))

              precision    recall  f1-score   support

           0       0.71      0.57      0.63      8364
           1       0.95      0.97      0.96     71707

    accuracy                           0.93     80071
   macro avg       0.83      0.77      0.80     80071
weighted avg       0.93      0.93      0.93     80071



In [90]:
print(classification_report(test['satisfaction'], test_predictions))

              precision    recall  f1-score   support

           0       0.58      0.52      0.55      2057
           1       0.95      0.96      0.95     17961

    accuracy                           0.91     20018
   macro avg       0.76      0.74      0.75     20018
weighted avg       0.91      0.91      0.91     20018



In [91]:
test_result = pd.DataFrame({"predict" : test_predictions,
                            "real": test['satisfaction'],
                            "comment": test['description'],
                            'star': test['star']})