In [1]:
import keras
import kagglehub
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import (
    Dense,
    LSTM,
    Embedding,
    Dropout,
    TextVectorization,
    BatchNormalization,
    Input,
    Concatenate,
    Conv1D,
    GlobalMaxPooling1D,
)
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from hazm import *
import tensorflow as tf

# Report how many GPU devices TensorFlow lists, framed by a 50-character rule
# above and below the message.
separator = "=" * 50
gpu_devices = tf.config.list_physical_devices("GPU")
print("\n", separator, "\nNum GPUs Available: ", len(gpu_devices), "\n", separator)

# Fetch the latest version of the dataset; the returned value is used below
# as the directory that contains the data file.
dataset_slug = "soheiltehranipour/snappfood-persian-sentiment-analysis"
path = kagglehub.dataset_download(dataset_slug)

# The file is read with a tab separator even though it carries a .csv extension.
csv_file = path + "/Snappfood - Sentiment Analysis.csv"
_ds = pd.read_csv(csv_file, sep="\t")
_ds.head(15)

2025-09-24 18:37:34.948397: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-24 18:37:34.948425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-24 18:37:34.949118: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-24 18:37:34.953212: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-24 18:37:37.068163: I external/local_xla/xla/


Num GPUs Available:  0 


Unnamed: 0.1,Unnamed: 0,comment,label,label_id
0,,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0
1,,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0
2,,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0
3,,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0
4,,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0
5,,بدترین پیتزایی که تا به حال خورده بودم,SAD,1.0
6,,از همه لحاظ عالی ممنونم,HAPPY,0.0
7,,کیفیت غذا متوسط رو به پایین بود انگار داخل یه ...,SAD,1.0
8,,همه اقلام تازه و به روز وخیلیییییی سریع بدستم ...,HAPPY,0.0
9,,همه چی خوب ولی هات داگ دورش کلا سوخته بود و دا...,SAD,1.0


In [2]:
# Keep only the review text and its numeric label, under shorter column names.
_ds = _ds[["comment", "label_id"]].rename(
    columns={"comment": "text", "label_id": "label"}
)
# Display names for the two numeric label values.
map_label = {0: "Happy", 1: "Sad"}
_ds.head()

Unnamed: 0,text,label
0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,1.0
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,0.0
2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,1.0
3,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,0.0
4,شیرینی وانیلی فقط یک مدل بود.,0.0


In [3]:
# Cast the comments to pandas' string dtype, then discard every row that
# contains a missing value in any column.
_ds = _ds.assign(text=_ds["text"].astype("string")).dropna()
_ds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69480 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    69480 non-null  string 
 1   label   69480 non-null  float64
dtypes: float64(1), string(1)
memory usage: 1.6 MB


In [4]:
def TextPreprocess(text: list[str]) -> list[list[str]]:
    """Normalize, tokenize and lemmatize each input string with Hazm.

    Args:
        text: The raw strings to process. The items are consumed by plain
            iteration, so any iterable of strings is accepted.

    Returns:
        One list of lemmatized tokens per input string, in input order.
    """
    # Build the Hazm helpers once. They do not depend on the string being
    # processed, so creating them inside the loop only repeated their setup
    # for every single item.
    normalizer = Normalizer()
    word_tokenizer = WordTokenizer()
    lemmatizer = Lemmatizer()

    final_text = []
    # Iterate over the values themselves instead of `text[i]` so the function
    # does not rely on the input supporting 0..n-1 positional lookups.
    for raw in text:
        # The normalizer works on the whole string, before it is split.
        normalized = normalizer.normalize(raw)
        tokens = word_tokenizer.tokenize(normalized)
        final_text.append([lemmatizer.lemmatize(tok) for tok in tokens])

    return final_text

# Quick sanity check of the pipeline on a single hand-written example.
sample_texts = ["من یه آدم خوشحال هستم. خیلی خری کچل بی خاصیت نادون ابله!"]
print(TextPreprocess(sample_texts))

[['من', 'یه', 'آدم', 'خوشحال', '#هست', '.', 'خیلی', 'خرید#خر', 'کچل', 'بی\u200cخاصیت', 'نادون', 'ابله', '!']]


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Hazm-based preprocessing: normalize -> tokenize -> lemmatize

def hazm_preprocess_texts(texts: list[str]) -> list[list[str]]:
    """Turn raw texts into lists of lemmas using Hazm.

    Every item is normalized, split into word tokens, and each token is
    lemmatized. A ``None`` item is treated as an empty string; any other
    item is passed through ``str()`` before processing.

    Args:
        texts: The items to process.

    Returns:
        One list of lemmatized tokens per input item, in input order.
    """
    # The three helpers are shared by all items.
    normalizer, word_tokenizer, lemmatizer = Normalizer(), WordTokenizer(), Lemmatizer()

    def to_lemmas(item) -> list[str]:
        # Coerce to text first so the normalizer always receives a string.
        raw = str(item) if item is not None else ""
        words = word_tokenizer.tokenize(normalizer.normalize(raw))
        return list(map(lemmatizer.lemmatize, words))

    return [to_lemmas(item) for item in texts]

# Pull the labels and raw comments out of the cleaned frame
labels = _ds["label"].to_numpy()
texts = _ds["text"].tolist()

# Run the Hazm pipeline so every comment becomes a list of lemmas
hazm_tokens = hazm_preprocess_texts(texts)

# The inputs are already token lists, so the Keras Tokenizer is created with
# an empty `filters` string and Hazm's token boundaries are kept as they are.
tokenizer = Tokenizer(
    filters="",
    oov_token="<OOV>",
    num_words=20000,
)
tokenizer.fit_on_texts(hazm_tokens)

# Map tokens to integer ids, then pad / cut at the end so each row has 128 ids
sequences = tokenizer.texts_to_sequences(hazm_tokens)
X = pad_sequences(sequences, maxlen=128, truncating="post", padding="post")
y = labels

print("X shape:", X.shape)
print("Vocab size:", 1 + len(tokenizer.word_index))
X[:2], y[:2]


X shape: (69480, 128)
Vocab size: 27065


(array([[   29,   435,   318,     7,   974,   525,  3527,    93,   142,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,

In [6]:
# Two-branch text classifier: one shared embedding feeds a Conv1D branch and
# an LSTM branch, and their outputs are concatenated before the dense head.
input_layer = Input(shape=(None,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)(input_layer)

# Convolutional branch: width-3 filters, reduced to one vector by max pooling.
cnn_path = Conv1D(kernel_size=3, filters=128, activation='relu')(embedding_layer)
cnn_path = GlobalMaxPooling1D()(cnn_path)

# Recurrent branch: the LSTM's final output, followed by dropout.
lstm_path = LSTM(units=128)(embedding_layer)
lstm_path = Dropout(0.4)(lstm_path)

concatenated_layer = Concatenate()([cnn_path, lstm_path])
dense_path = Dense(128, activation='relu')(concatenated_layer)
batchnorm_layer = BatchNormalization()(dense_path)

# final output layer: one softmax unit per class listed in `map_label`.
# The head used to have 5 units although the labels only take the values
# 0 and 1, which left three outputs that could never be a target.
num_classes = len(map_label)
output_layer = Dense(num_classes, activation='softmax')(batchnorm_layer)

In [7]:
# Wrap the layer graph in a Model and configure training with the Adam
# optimizer, sparse categorical cross-entropy and an accuracy metric.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 128)            3464320   ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, None, 128)            49280     ['embedding[0][0]']           
                                                                                                  
 lstm (LSTM)                 (None, 128)                  131584    ['embedding[0][0]']           
                                                                                              

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train shapes:", x_train.shape)
print("Test shapes:", x_test.shape)

Train shapes: (55584, 128)
Test shapes: (13896, 128)


In [9]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
callbacks = [
    # Stop once val_loss has not improved for 10 epochs. restore_best_weights
    # rolls the model back to the best epoch; without it the model keeps the
    # weights from the last (worse) epoch and any later evaluation scores those.
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    # Multiply the learning rate by 0.1 after 4 epochs without val_loss improvement.
    ReduceLROnPlateau(monitor='val_loss', patience=4, factor=0.1),
]

# 20% of the training rows are set aside as the validation set that both
# callbacks monitor. The returned History is kept so the per-epoch metrics
# remain available after training.
history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=50,
    validation_split=0.2,
    callbacks=callbacks,
)

Epoch 1/50


20/87 [=====>........................] - ETA: 20s - loss: 1.1629 - accuracy: 0.6416

KeyboardInterrupt: 

In [None]:
# Score the trained model on the held-out test split.
model.evaluate(x=x_test, y=y_test)