# SnapFood Persian Comments Sentiment Analysis

In [38]:
import kagglehub
import tensorflow as tf
import keras
from keras.layers import (Dense, TextVectorization, GlobalAveragePooling1D, Input, Embedding)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd
import numpy as np
from keras import Sequential

# Download latest version
path = kagglehub.dataset_download("soheiltehranipour/snappfood-persian-sentiment-analysis")
dataset = pd.read_csv(path + "/Snappfood - Sentiment Analysis.csv", on_bad_lines='warn', sep='\t')
dataset.head()

Unnamed: 0.1,Unnamed: 0,comment,label,label_id
0,,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0
1,,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0
2,,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0
3,,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0
4,,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0


In [39]:
dataset = dataset[["comment", "label_id"]]
print(dataset.info())
print(dataset["label_id"].value_counts())
dataset = dataset.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   comment   70000 non-null  object 
 1   label_id  69480 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.1+ MB
None
label_id
0.0    34916
1.0    34564
Name: count, dtype: int64


In [40]:
vectorizer_layer = TextVectorization(max_tokens=10000, output_mode='int', output_sequence_length=128)
vectorizer_layer.adapt(dataset["comment"])

print("Vocabulary size:", vectorizer_layer.vocabulary_size())
print("Example vectorized comment:", vectorizer_layer(tf.constant(["This is a great restaurant!"])))

Vocabulary size: 10000
Example vectorized comment: tf.Tensor(
[[9882 5463 5466 9910    1    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]], shape=(1, 128), dtype=int64)


In [42]:
x = dataset["comment"].values
y = dataset["label_id"].values

vectorized_x = vectorizer_layer(tf.constant(x)).numpy()

xtrain, xtest, ytrain, ytest = train_test_split(vectorized_x, y, test_size=0.15, random_state=42)

In [50]:
model = Sequential(
    [
        Embedding(
            input_dim=vectorizer_layer.vocabulary_size(),
            output_dim=128,
            input_length=128,
            mask_zero=True
        ),
        GlobalAveragePooling1D(),
        Dense(128, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [51]:
callbacks = [
    keras.callbacks.EarlyStopping(
        patience=20, restore_best_weights=True, monitor="val_loss"
    ),
    keras.callbacks.ReduceLROnPlateau(patience=5, factor=0.5, monitor="val_loss"),
]
model.fit(
    xtrain,
    ytrain,
    epochs=100,
    batch_size=1024,
    validation_split=0.15,
    callbacks=callbacks,
)

Epoch 1/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 78ms/step - accuracy: 0.7929 - loss: 0.5150 - val_accuracy: 0.8449 - val_loss: 0.3836 - learning_rate: 0.0010
Epoch 2/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 76ms/step - accuracy: 0.8553 - loss: 0.3510 - val_accuracy: 0.8519 - val_loss: 0.3654 - learning_rate: 0.0010
Epoch 3/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 76ms/step - accuracy: 0.8680 - loss: 0.3260 - val_accuracy: 0.8525 - val_loss: 0.3616 - learning_rate: 0.0010
Epoch 4/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 75ms/step - accuracy: 0.8760 - loss: 0.3093 - val_accuracy: 0.8500 - val_loss: 0.3609 - learning_rate: 0.0010
Epoch 5/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 76ms/step - accuracy: 0.8817 - loss: 0.2949 - val_accuracy: 0.8467 - val_loss: 0.3616 - learning_rate: 0.0010
Epoch 6/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4

<keras.src.callbacks.history.History at 0x782c7f66e6f0>

In [52]:
model.evaluate(xtest, ytest)

[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8515 - loss: 0.3546


[0.35456475615501404, 0.851468026638031]