In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from hazm import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
import re
import emoji
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline

In [2]:
# Presumably the directory holding external stop-word files.
# NOTE(review): no visible cell reads from this path — confirm it is still needed.
folder_path = "../stopwords"
# Hand-written Persian stop-word list; `preprocessing` drops any token found here.
# Some words are listed more than once; building a set collapses the repeats.
# Several entries contain a zero-width non-joiner (U+200C), which is invisible
# in most editors — take care not to strip it when editing this list.
STOPWORDS = set([
    "از", "به", "در", "با", "که", "را", "تا", "و", "یا", "اما", "اگر", "برای", "بر",
    "این", "آن", "یک", "هر", "هم", "همه", "چند", "چنین", "دیگر", "چون", "مثل",
    "مانند", "چرا", "زیرا", "ولی", "آیا", "اگرچه", "لذا", "نیز", "باید", "می",
    "باشد", "است", "بود", "هست", "شد", "شو", "باش", "کرد", "کن", "کند", "کرده",
    "شده", "می‌شود", "خواهد", "خواهند", "خواهی", "خواهیم", "توان", "تواند",
    "توانند", "توانست", "توانسته", "بوده", "نبود", "نباشد", "نیست", "نیستند",
    "بودند", "باشند", "هستند", "دارم", "داری", "دارد", "دارند", "داریم", "داشت",
    "داشتند", "داشته", "داشتم", "ندارم", "ندارد", "ندارند", "نداریم", "نداشت",
    "نداشتند", "نداشته", "ای", "ایم", "اید", "اند", "ام", "ت", "ها", "های", "هایی",
    "شان", "ش", "مان", "تان", "اینها", "آنها", "چیز", "چیزی", "چرا", "چه", "که",
    "کدام", "چگونه", "چقدر", "چراکه", "آنان", "او", "آن", "ایشان", "ما", "شما",
    "آنچه", "آنجا", "اینجا", "اینجاست", "آنجاست", "همان", "خود", "همه‌اش",
    "هیچ", "هیچ‌کدام", "هرگز", "هیچگاه", "حالا", "اکنون", "دیروز", "امروز",
    "فردا", "شب", "روز", "بعد", "قبل", "ساعت", "وقت", "زمان", "چندین", "بار",
    "کم", "بیشتر", "کمتر", "حتی", "فقط", "تنها", "بالا", "پایین", "روی", "زیر",
    "جلو", "پشت", "نزدیک", "دور", "وسط", "بیرون", "درون", "داخل", "کنار",
    "اینجا", "آنجا", "هیچ‌جا", "هرجا", "هرکجا", "جا", "مکان", "محل", "چپ", "راست",
    "بعدا", "سپس", "آنگاه", "دیگر", "چیزهای", "یعنی", "خب", "آره", "نه", "باشه",
    "آها", "بله", "نمیدانم", "کسی", "دیگری", "هیچ‌کسی", "چیزها"
])

In [3]:
def is_sticker(token):
    """Tell whether ``token`` is a non-textual artifact rather than a word.

    A token qualifies when it ends with an image extension (``.webp``,
    ``.png``, ``.gif``, ``.jpg``), when ``emoji.is_emoji`` accepts it, or when
    it starts with an ``http://`` / ``https://`` URL. The checks run in that
    order and stop at the first hit.

    Args:
        token: A single token string.

    Returns:
        bool: ``True`` if any of the checks matches, otherwise ``False``.
    """
    # File-format check: anything ending in a known image extension.
    has_image_extension = re.match(r'.*\.(webp|png|gif|jpg)$', token) is not None
    # `or` short-circuits, so the emoji check and then the link check only run
    # when the earlier checks did not already match.
    return bool(
        has_image_extension
        or emoji.is_emoji(token)
        or re.match(r'https?://[^\s]+', token)
    )

In [4]:
# Text helpers built once at module level so they are not re-created per comment.
normalizer = Normalizer()  # used by preprocessing() to normalize text before tokenizing
stemmer = Stemmer()  # NOTE(review): no visible cell uses this — confirm it is still needed


def preprocessing(comment):
    """Clean one raw comment and return its kept tokens joined by spaces.

    Steps: strip emojis, URLs, punctuation and digits; normalize with the
    module-level ``normalizer``; tokenize with ``word_tokenize``; lowercase;
    then drop stop words, digit-only tokens and sticker-like tokens.

    Args:
        comment: Raw comment text. ``None``/NaN (e.g. an empty CSV cell) yields
            ``''``; any other non-string value is converted with ``str``.

    Returns:
        str: The surviving tokens separated by single spaces.
    """
    # pandas hands missing cells to ``Series.map`` as float NaN; the string
    # helpers below would raise on it, so treat missing text as empty.
    if comment is None or (isinstance(comment, float) and comment != comment):
        return ''
    comment = str(comment)

    # Remove emojis.
    comment = emoji.replace_emoji(comment, replace="")
    # Remove links.
    comment = re.sub(r'https?://\S+|www\.\S+', '', comment)
    # Remove punctuation.
    comment = re.sub(r'[^\w\s]', '', comment)
    # Remove digits.
    comment = re.sub(r'\d+', '', comment)

    normalized = normalizer.normalize(comment)

    filtered = []
    for raw_token in word_tokenize(normalized):
        token = str(raw_token).lower()
        # Test the intact token first: several STOPWORDS entries contain a
        # zero-width non-joiner and can only match before it is replaced below.
        if token in STOPWORDS:
            continue
        # Zero-width characters become spaces, which may split the token into
        # several words; filter each piece on its own so a stop word exposed
        # by the split is dropped as well.
        for piece in re.sub(r'[\u200c\u200b\u200d]', ' ', token).split():
            if piece in STOPWORDS or piece.isdigit() or is_sticker(piece):
                continue
            filtered.append(piece)
    return ' '.join(filtered)


In [5]:
# Load the three tab-separated SnappFood splits in one pass.
train, dev, test = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in (
        '../data/snappfood/train.csv',
        '../data/snappfood/dev.csv',
        '../data/snappfood/test.csv',
    )
)

In [6]:
# Store the cleaned text next to the raw comment in every split.
for split_frame in (train, dev, test):
    split_frame['processed_comment'] = split_frame['comment'].map(preprocessing)

In [None]:
# Preview the first rows of the training split, including the new processed_comment column.
train.head()

In [None]:
# Count how many training rows carry each label.
cnt = dict(train['label'].value_counts())

labels, sizes = list(cnt), list(cnt.values())
# One bar per label; the bar height is that label's row count.
fig = px.histogram(x=labels, y=sizes)
fig.show()

In [9]:
def tokenize(text):
    """Split ``text`` into tokens by delegating to ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``.
    """
    tokens = word_tokenize(text)
    return tokens

In [10]:
# Inputs are the cleaned comments; targets are the numeric label ids.
x_train = train['processed_comment']
y_train = train['label_id']

x_dev = dev['processed_comment']
y_dev = dev['label_id']

x_test = test['processed_comment']
y_test = test['label_id']

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def get_comments(df):
    """Return the raw values held by a pandas object.

    Args:
        df: Any object exposing ``.values`` (the pipeline passes a Series of
            comments).

    Returns:
        Whatever ``df.values`` yields — a NumPy array for a plain Series.
    """
    raw_values = df.values
    return raw_values

# Vectorization pipeline: pull the raw values out of the Series, then build
# TF-IDF features from them.
pipeline = Pipeline([
    ('get_comments', FunctionTransformer(get_comments, validate=False)),
    ('tfidf', TfidfVectorizer())
])

# Read the text straight from the split frames rather than from
# x_train/x_dev/x_test: those names are rebound to feature matrices right here,
# so feeding them back in would hand a matrix (which has no `.values`) to
# get_comments when the cell is run a second time.
# The vocabulary is fitted on the training split only and reused for dev/test.
x_train = pipeline.fit_transform(train['processed_comment'])
x_dev = pipeline.transform(dev['processed_comment'])
x_test = pipeline.transform(test['processed_comment'])

In [None]:
# Display the vectorized training matrix returned by the pipeline.
x_train

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Feed-forward binary classifier over the vectorized comments.
# The input width is read from the feature matrix instead of being hard-coded,
# so the network still matches when the fitted vocabulary size changes.
n_features = x_train.shape[1]

model = Sequential([
    Dense(units=128, activation='relu', input_shape=(n_features,)),  # first hidden layer
    Dense(units=64, activation='relu'),  # second hidden layer
    Dense(units=32, activation='relu'),  # third hidden layer
    Dense(units=1, activation='sigmoid')  # output layer with a single neuron
])

model.summary()

# NOTE(review): confirm the installed Keras resolves the 'f1_score' metric name
# and that it is meaningful for a single sigmoid output.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['f1_score'])


In [None]:
# Train for 10 epochs on the training features; no validation data is passed.
model.fit(x_train, y_train, epochs=10)

In [None]:
# Model outputs for the training features (the output layer is a single sigmoid unit).
y_predict_train = model.predict(x_train)

In [None]:
# Model outputs for the dev and test features.
y_predict_dev = model.predict(x_dev)
y_predict_test = model.predict(x_test)

In [None]:
# Scatter plot of the raw model outputs on the training split.
fig = px.scatter(y_predict_train)
fig.show()

In [None]:
# Scatter plot of the raw model outputs on the dev split.
fig = px.scatter(y_predict_dev)
fig.show()

In [None]:
# Scatter plot of the raw model outputs on the test split.
fig = px.scatter(y_predict_test)
fig.show()

In [136]:
# Cutoff compared against the model outputs: a score above it is labelled 1, otherwise 0.
threshold = 0.05

In [None]:
# Turn the model outputs into hard labels: 1.0 where the score exceeds
# `threshold`, 0.0 otherwise. np.ravel flattens the single-column prediction
# array so the comparison runs over the whole split at once.
predicted_label_train = (np.ravel(y_predict_train) > threshold).astype(float)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the
# report's precision and recall columns were swapped and the support column
# counted predictions instead of true labels.
print( "f1 score : ", f1_score(y_train, predicted_label_train))
print(classification_report(y_train, predicted_label_train))

In [None]:
# Turn the model outputs into hard labels: 1.0 where the score exceeds
# `threshold`, 0.0 otherwise. np.ravel flattens the single-column prediction
# array so the comparison runs over the whole split at once.
predicted_label_dev = (np.ravel(y_predict_dev) > threshold).astype(float)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the
# report's precision and recall columns were swapped and the support column
# counted predictions instead of true labels.
print( "f1 score : ", f1_score(y_dev, predicted_label_dev))
print(classification_report(y_dev, predicted_label_dev))

In [None]:
# Turn the model outputs into hard labels: 1.0 where the score exceeds
# `threshold`, 0.0 otherwise. The shared `threshold` replaces a literal 0.1 so
# the test split is scored with the same cutoff as train and dev.
predicted_label_test = (np.ravel(y_predict_test) > threshold).astype(float)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the
# report's precision and recall columns were swapped and the support column
# counted predictions instead of true labels.
print( "f1 score : ", f1_score(y_test, predicted_label_test))
print(classification_report(y_test, predicted_label_test))

In [154]:
# Side-by-side table: each test comment, its true label, the predicted label
# and the model output rounded to two decimals.
df = pd.DataFrame(data={
    'comment': test['comment'],
    'true_label': y_test,
    'predicted_label': predicted_label_test,
    'probibilty': [round(row[0], 2) for row in y_predict_test.tolist()],
})

In [None]:
# Show only the rows where the predicted label agrees with the true label.
df.loc[df['true_label'].eq(df['predicted_label'])]

In [6]:
import tensorflow as tf

# Load a previously saved Keras model from the working directory.
# NOTE(review): no visible cell writes 'my_model.keras' — confirm where it is saved.
loaded_model = tf.keras.models.load_model('my_model.keras')

In [None]:
# Print the loaded model's layer summary.
loaded_model.summary()

In [None]:
import pandas as pd
# Load the BaSalam reviews file (read with pandas' default comma separator).
data = pd.read_csv('../data/BaSalam.reviews.csv')

In [34]:
# Draw a fixed-seed sample of up to 10,000 non-empty review texts, so re-running
# the notebook scores the same rows and a file with fewer usable rows does not
# make `sample` raise.
descriptions = data['description'].dropna()
reviews = descriptions.sample(n=min(10000, len(descriptions)), random_state=42)

In [35]:
# Reload the raw training comments from disk; this rebinds x_train to a Series of text.
x_train = pd.read_csv('../data/snappfood/train.csv', sep='\t')['comment']

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
def get_comments(df):
    """Return the raw values held by a pandas object.

    Args:
        df: Any object exposing ``.values`` (the pipeline passes a Series of
            comments).

    Returns:
        Whatever ``df.values`` yields — a NumPy array for a plain Series.
    """
    raw_values = df.values
    return raw_values

# Vectorization pipeline: pull the raw values out of the Series, then build
# TF-IDF features from them.
pipeline = Pipeline([
    ('get_comments', FunctionTransformer(get_comments, validate=False)),
    ('tfidf', TfidfVectorizer())
])

# The network in this notebook is trained on TF-IDF features of *preprocessed*
# comments, so the same `preprocessing` step runs here before vectorizing;
# fitting on raw text builds a different vocabulary from the one the network saw.
# NOTE(review): assumes my_model.keras is that network — confirm.
# This cell rebinds x_train and reviews to feature matrices, so re-run the
# cells that load them before running it again.
x_train = pipeline.fit_transform(x_train.map(preprocessing))
reviews = pipeline.transform(reviews.map(preprocessing))
reviews

In [None]:
# Score the vectorized review sample with the loaded model.
loaded_model.predict(reviews)