<a href="https://colab.research.google.com/github/2077DevWave/Sentiment-Survey-Analyzer/blob/main/NoteBooks/comment_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
![ ! -d "/content/Sentiment-Survey-Analyzer" ] && git clone https://github.com/2077DevWave/Sentiment-Survey-Analyzer.git || echo "Repository already exists."
%cd /content/Sentiment-Survey-Analyzer/

Repository already exists.
/content/Sentiment-Survey-Analyzer


In [5]:
!pip install numpy
!pip install git+https://github.com/2077DevWave/hazm.git
!pip install  tensorflow
!pip install "nltk"
!pip install "scikit-learn"
!pip install "regex"
!pip install --upgrade --force-reinstall "pandas==2.2.2" tqdm gensim

Collecting git+https://github.com/2077DevWave/hazm.git
  Cloning https://github.com/2077DevWave/hazm.git to /tmp/pip-req-build-8tyoto42
  Running command git clone --filter=blob:none --quiet https://github.com/2077DevWave/hazm.git /tmp/pip-req-build-8tyoto42
  Resolved https://github.com/2077DevWave/hazm.git to commit 68cf615599db360d30f260f4428c81d8423d6cc4
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pandas==2.2.2
  Using cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy>=1.23.2 (from pandas==2.2.2)
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadat

In [3]:
!curl -L -o big_train.csv "https://huggingface.co/datasets/2077devwave/persian_commercial_comments_filing/resolve/main/train.csv?download=true"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1171  100  1171    0     0   5515      0 --:--:-- --:--:-- --:--:--  5523
100  101M  100  101M    0     0   128M      0 --:--:-- --:--:-- --:--:--  128M


In [14]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
import gc
import torch
from numba import cuda
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from hazm import Normalizer, word_tokenize, Stemmer, stopwords_list
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from multiprocessing import Pool, cpu_count

In [8]:
# Check for GPU
print("Num GPUs Available:", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available: 1


In [10]:
# Load Dataset
train_data = pd.read_csv('big_train.csv', usecols=['body', 'recommendation_status'])
# train_data = pd.read_csv('Dataset/train.csv')
test_data = pd.read_csv('Dataset/test.csv')

In [11]:
# Handle Missing Values and Encode Labels
train_data['recommendation_status'] = train_data['recommendation_status'].fillna("no_idea")
label_map = {"no_idea": 2, "recommended": 1, "not_recommended": 0}
train_data['recommendation_status'] = train_data['recommendation_status'].map(label_map)

In [12]:
# Text Preprocessing
stopwords = set(stopwords_list())
normalizer = Normalizer()
stemmer = Stemmer()

In [15]:
# Precompile regex patterns for efficiency
digit_pattern = re.compile(r'[۰-۹\d]+')
punctuation_pattern = re.compile(r'[!()\[\]{};:\'",؟<>./?@#$%^&*_~]')
whitespace_pattern = re.compile(r'\s+')

def preprocess_text(text):
    text = normalizer.normalize(str(text))
    text = digit_pattern.sub('', text)
    text = punctuation_pattern.sub(' ', text)
    text = whitespace_pattern.sub(' ', text).strip()

    tokens = word_tokenize(text)
    return [stemmer.stem(token) for token in tokens if token not in stopwords and token.strip()]

# Use multiprocessing to parallelize text preprocessing
def parallel_preprocessing(data):
    with Pool(cpu_count()) as pool:
        return pool.map(preprocess_text, data)

# Apply the function in parallel
train_data['preprocess'] = parallel_preprocessing(train_data['body'].tolist())


In [16]:
# Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['preprocess'])
sequences = tokenizer.texts_to_sequences(train_data['preprocess'])
max_len = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = train_data['recommendation_status'].values

In [17]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=512),  # Using smaller dimensions for embedding
    Bidirectional(LSTM(128, return_sequences=True)),  # Bidirectional LSTM layer with 128 neurons
    Dropout(0.2),  # Dropout layer to prevent overfitting
    Bidirectional(LSTM(64)),  # Bidirectional LSTM layer with 64 neurons
    Dropout(0.2),  # Dropout layer
    Dense(64, activation='relu'),  # Fully Connected layer with 64 neurons and ReLU activation function
    Dense(3, activation='softmax')  # Output layer with 3 classes and Softmax activation function
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
# Train Model with GPU
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

model.fit(X_train, y_train, batch_size=1024, epochs=10, validation_data=(X_test, y_test),
          callbacks=[early_stopping, model_checkpoint])

Epoch 1/10
[1m253/353[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m1:57[0m 1s/step - accuracy: 0.7454 - loss: 0.6112

In [14]:
# Evaluate Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy}")

[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 21ms/step - accuracy: 0.6433 - loss: 1.3007
Accuracy: 0.6454484462738037


In [15]:
# Prediction Function
def predict_recommendation(comment):
    preprocessed_comment = preprocess_text(comment)
    seq = tokenizer.texts_to_sequences([preprocessed_comment])
    padded_seq = pad_sequences(seq, maxlen=max_len, padding='post')
    prediction = model.predict(padded_seq)
    return {v: k for k, v in label_map.items()}[np.argmax(prediction)]

In [22]:
# Test Prediction
print(predict_recommendation("عالی بود"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
recommended
