# part 1 - embedding

**IMDB**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [None]:
def train_model_with_embedding(embedding_dim, X_train, y_train, X_test, y_test):
    max_words = 10000
    maxlen = 50

    X_train_padded = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test_padded = sequence.pad_sequences(X_test, maxlen=maxlen)

    # Create a simple model with Embedding and Dense layers for binary classification
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_split=0.2)
    y_pred = (model.predict(X_test_padded) > 0.5).astype("int32")

    # Compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print evaluation metrics
    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [None]:
# Load IMDB dataset with a maximum of 10000 words
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

embedding_dims = [10, 32, 64, 100]

# Iterate over different embedding dimensions and train models
for embedding_dim in embedding_dims:
    print(f"Training model with Embedding Dimension: {embedding_dim}")
    train_model_with_embedding(embedding_dim, X_train, y_train, X_test, y_test)

Training model with Embedding Dimension: 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.77944
Precision: 0.78711
Recall: 0.76608
F1 Score: 0.77645
Confusion Matrix:
[[9910 2590]
 [2924 9576]]
Training model with Embedding Dimension: 32
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.79380
Precision: 0.78915
Recall: 0.80184
F1 Score: 0.79544
Confusion Matrix:
[[ 9822  2678]
 [ 2477 10023]]
Training model with Embedding Dimension: 64
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.79316
Precision: 0.79104
Recall: 0.79680
F1 Score: 0.79391
Confusion Matrix:
[[9869 2631]
 [2540 9960]]
Training model with Embedding Dimension: 100
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.79412
Precision: 0.79098
Recall: 0.79952
F1 Score: 0.79523
Confusion Matrix:
[[9859 2641]
 [2506 9994]]


**Persian-Text-Sentiment**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

!pip install datasets

from datasets import load_dataset

persian_dataset = load_dataset("SeyedAli/Persian-Text-Sentiment")



In [None]:
def preprocess_persian_text_sentiment(dataset):
    # Preprocess Persian Text Sentiment dataset
    texts = dataset['train']['text']
    labels = dataset['train']['label']
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    max_words = len(word_index) + 1
    X = pad_sequences(sequences, maxlen=50)
    y = np.array(labels)
    return X, y, max_words

In [None]:
def train_model_with_embedding(embedding_dim, X_train, y_train, X_test, y_test, max_words):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=50))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


In [None]:
X_persian, y_persian, max_words_persian = preprocess_persian_text_sentiment(persian_dataset)

embedding_dims = [10, 32, 64, 100]

for embedding_dim in embedding_dims:
    print(f"Training model with Embedding Dimension: {embedding_dim}")
    train_model_with_embedding(embedding_dim, X_persian, y_persian, X_persian, y_persian, max_words_persian)

Training model with Embedding Dimension: 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.93787
Precision: 0.92911
Recall: 0.94808
F1 Score: 0.93850
Confusion Matrix:
[[25906  2020]
 [ 1450 26476]]
Training model with Embedding Dimension: 32
Epoch 1/5
Epoch 2/5

**persian_news_dataset**

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder

def preprocess_persian_news_dataset(dataset, maxlen=50):
    texts = []
    labels = []
    for doc in dataset:
        if len(doc['title']) > 0 and len(doc['category']) > 0:
            texts.append(doc['text'])
            labels.append(doc['category'])

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    max_words = len(word_index) + 1
    X = pad_sequences(sequences, maxlen=maxlen)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(labels)

    return X, y, max_words

dataset = load_dataset("saied/persian_news_dataset", split="train", streaming=True)

docs = []
document_count = 40000
counter = 0
skip_counter = 0

for doc in dataset:
    if len(doc['title']) == 0 or len(doc['category']) == 0:
        skip_counter += 1
        if skip_counter % 10000 == 0:
            print(f'{skip_counter} skipped')
        continue
    docs.append(doc)
    counter += 1
    if counter == document_count:
        break

print(f"Number of documents processed: {len(docs)}")

# Prepare the data
not_proccess_x = [doc['text'] for doc in docs]
not_proccess_y = [doc['category'] for doc in docs]

# Tokenize the text data
max_words = 10000
max_length = 50

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(not_proccess_x)

x = tokenizer.texts_to_sequences(not_proccess_x)
x = pad_sequences(x, maxlen=max_length)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(not_proccess_y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

embedding_dims = [10, 32, 64, 100]

for embedding_dim in embedding_dims:
    print(f"Training model with Embedding Dimension: {embedding_dim}")
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length),
        Flatten(),
        Dense(64, activation="relu"),
        Dense(len(set(y)), activation="softmax"),
    ])

    model.compile(optimizer="rmsprop",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

    model.summary()
    model.fit(x_train, y_train, epochs=5, batch_size=512)
    print("---------------------------------------")
    test_loss, test_acc = model.evaluate(x_test, y_test)
    print(f"test_acc: {test_acc} \t test_loss: {test_loss}")

    y_pred_probs = model.predict(x_test)
    y_pred = np.argmax(y_pred_probs, axis=1)

    cm = confusion_matrix(y_test, y_pred)

    print("Confusion Matrix:")
    print(cm)

    unique_categories = label_encoder.classes_

    for i, category in enumerate(unique_categories):
        tp = cm[i, i]
        fp = cm[:, i].sum() - tp
        fn = cm[i, :].sum() - tp
        tn = cm.sum() - (tp + fp + fn)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        accuracy = (tp + tn) / cm.sum() if cm.sum() > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print("Category:", category)
        print(f"Precision: {precision:.5f}")
        print(f"Recall: {recall:.5f}")
        print(f"Accuracy: {accuracy:.5f}")
        print(f"F1 Score: {f1_score:.5f}")
        print("--------------------------------------")

# part 2 - pre-trained embedding

Step 1: Preprocess the Persian News Dataset

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder

def preprocess_persian_news_dataset(dataset, maxlen=50):
    texts = []
    labels = []
    for doc in dataset:
        if len(doc['title']) > 0 and len(doc['category']) > 0:
            texts.append(doc['text'])
            labels.append(doc['category'])

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    max_words = len(word_index) + 1
    X = pad_sequences(sequences, maxlen=maxlen)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(labels)

    return X, y, max_words, tokenizer.word_index

dataset = load_dataset("saied/persian_news_dataset", split="train", streaming=True)

docs = []
document_count = 1000
counter = 0
skip_counter = 0

for doc in dataset:
    if len(doc['title']) == 0 or len(doc['category']) == 0:
        skip_counter += 1
        if skip_counter % 10000 == 0:
            print(f'{skip_counter} skipped')
        continue
    docs.append(doc)
    counter += 1
    if counter == document_count:
        break

print(f"Total documents loaded: {len(docs)}")

X, y, max_words, word_index = preprocess_persian_news_dataset(docs, maxlen=50)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


10000 skipped
20000 skipped
30000 skipped
40000 skipped
50000 skipped
60000 skipped
70000 skipped
80000 skipped
90000 skipped
100000 skipped
110000 skipped
120000 skipped
130000 skipped
140000 skipped
150000 skipped
160000 skipped
170000 skipped
180000 skipped
190000 skipped
200000 skipped
210000 skipped
Total documents loaded: 1000


Step 2: Create an Embedding Matrix from GloVe


In [None]:
import gensim.downloader as api

def create_embedding_matrix(word_index, max_words, embedding_dim):
    word_vectors = api.load("glove-wiki-gigaword-100")

    # Initialize embedding matrix
    embedding_matrix = np.zeros((max_words, embedding_dim))

    # Fill embedding matrix with pre-trained GloVe vectors
    for word, i in word_index.items():
        if i < max_words:
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                # Use random initialization for missing words
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

    return embedding_matrix


Step 3: Build and Train the Model Using the Embedding Matrix


In [None]:
def train_model_with_pretrained_embedding(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix, maxlen=50):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

train_model_with_pretrained_embedding(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.59572
Precision: 0.59576
Recall: 0.59572
F1 Score: 0.59568
Confusion Matrix:
[[7323 5177]
 [4930 7570]]


**IMDB**

In [None]:
from tensorflow.keras.datasets import imdb

max_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_words)
word_index = imdb.get_word_index()

X_train = pad_sequences(X_train, maxlen=50)
X_test = pad_sequences(X_test, maxlen=50)

embedding_matrix = create_embedding_matrix(word_index, max_words, embedding_dim)

train_model_with_pretrained_embedding(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.58084
Precision: 0.58085
Recall: 0.58084
F1 Score: 0.58082
Confusion Matrix:
[[7183 5317]
 [5162 7338]]


**Persian-Text-Sentiment**

In [None]:
import gensim.downloader as api
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from datasets import load_dataset
import numpy as np

def load_persian_text_sentiment_dataset():
    dataset = load_dataset("SeyedAli/Persian-Text-Sentiment")
    return dataset

def preprocess_persian_text_sentiment_dataset(dataset):
    texts = [item['text'] for item in dataset['train']]
    labels = [item['label'] for item in dataset['train']]
    return texts, labels

def create_embedding_matrix(word_index, max_words, embedding_dim):
    word_vectors = api.load("glove-wiki-gigaword-100")

    embedding_matrix = np.zeros((max_words, embedding_dim))

    for word, i in word_index.items():
        if i < max_words:
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

    return embedding_matrix

def train_model_with_pretrained_embedding(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix, maxlen=50):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

dataset = load_persian_text_sentiment_dataset()
X, y = preprocess_persian_text_sentiment_dataset(dataset)

max_words = 10000
maxlen = 50
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X = pad_sequences(sequences, maxlen=maxlen)

embedding_dim = 100
embedding_matrix = create_embedding_matrix(tokenizer.word_index, max_words, embedding_dim)

train_model_with_pretrained_embedding(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.59636
Precision: 0.59642
Recall: 0.59636
F1 Score: 0.59630
Confusion Matrix:
[[7305 5195]
 [4896 7604]]


**persian_news_dataset**

In [None]:
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
import gensim.downloader as api

# Load the dataset
dataset = load_dataset("saied/persian_news_dataset", split="train", streaming=True)

# Preprocess the dataset
docs = []
document_count = 40000
counter = 0
skip_counter = 0

for doc in dataset:
    if len(doc['title']) == 0 or len(doc['category']) == 0:
        skip_counter += 1
        if skip_counter % 10000 == 0:
            print(f'{skip_counter} skipped')
        continue
    docs.append(doc)
    counter += 1
    if counter == document_count:
        break

print(f"Number of documents processed: {len(docs)}")

texts = [doc['text'] for doc in docs]
labels = [doc['category'] for doc in docs]

max_words = 10000
max_length = 50

# Tokenize the text data
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
x = pad_sequences(sequences, maxlen=max_length)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Load GloVe embeddings
word_vectors = api.load("glove-wiki-gigaword-100")

embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < max_words:
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Define the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False),
    LSTM(32, return_sequences=True),
    LSTM(64),
    Flatten(),
    Dense(10, activation="relu"),
    Dense(5, activation="relu"),
    Dense(len(set(y)), activation="softmax"),
])

# Compile the model
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Build and fit the model
model.summary()
model.fit(x_train, y_train, epochs=4, batch_size=512)

# Evaluate the model
print("-------------------------------------")
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"test_acc: {test_acc:.5f} \t test_loss: {test_loss:.5f}")

# Make predictions on the test set
y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

# Calculate precision, recall, and other metrics for each class
unique_categories = label_encoder.classes_

for i, category in enumerate(unique_categories):
    tp = cm[i, i]
    fp = cm[:, i].sum() - tp
    fn = cm[i, :].sum() - tp
    tn = cm.sum() - (tp + fp + fn)

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    accuracy = (tp + tn) / cm.sum() if cm.sum() > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Category: {category}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"F1 Score: {f1_score:.5f}")
    print("----------------------------------------")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


10000 skipped
20000 skipped
30000 skipped
40000 skipped
50000 skipped
60000 skipped
70000 skipped
80000 skipped
90000 skipped
100000 skipped
110000 skipped
120000 skipped
130000 skipped
140000 skipped
150000 skipped
160000 skipped
170000 skipped
180000 skipped
190000 skipped
200000 skipped
210000 skipped
220000 skipped
230000 skipped
240000 skipped
Number of documents processed: 40000
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 50, 100)           1000000   
                                                                 
 lstm_4 (LSTM)               (None, 50, 32)            17024     
                                                                 
 lstm_5 (LSTM)               (None, 64)                24832     
                                                                 
 flatten_8 (Flatten)         (None, 64)                0      

# part 3 - simpleRNN

**IMDB**

In [None]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, SimpleRNN
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def train_model_with_embedding(embedding_dim, X_train, y_train, X_test, y_test):
    max_words = 10000
    maxlen = 50

    X_train_padded = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test_padded = sequence.pad_sequences(X_test, maxlen=maxlen)

    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(SimpleRNN(32))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_split=0.2)
    y_pred = (model.predict(X_test_padded) > 0.5).astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

(X_train_imdb, y_train_imdb), (X_test_imdb, y_test_imdb) = imdb.load_data(num_words=10000)

embedding_dim_imdb = 100

print("Training model with Embedding Dimension 100 for IMDb dataset")
train_model_with_embedding(embedding_dim_imdb, X_train_imdb, y_train_imdb, X_test_imdb, y_test_imdb)


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

**Persian Text Sentiment**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
import numpy as np
from tensorflow.keras.layers import SimpleRNN

def train_model_with_embedding_and_simplernn(embedding_dim, X_train, y_train, X_test, y_test, max_words):
    model = Sequential([
        Embedding(max_words, embedding_dim, input_length=50),
        SimpleRNN(16, return_sequences=True),  # First SimpleRNN layer with 16 units
        SimpleRNN(32),  # Second SimpleRNN layer with 32 units
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    # Compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print evaluation metrics
    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

print("Training model with Embedding Dimension 100 and SimpleRNN for Persian Text Sentiment dataset")
train_model_with_embedding_and_simplernn(embedding_dim_persian, X_persian, y_persian, X_persian, y_persian, max_words_persian)


Training model with Embedding Dimension 100 and SimpleRNN for Persian Text Sentiment dataset
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.94451
Precision: 0.95882
Recall: 0.92892
F1 Score: 0.94364
Confusion Matrix:
[[26812  1114]
 [ 1985 25941]]


**Persian News Dataset**

In [None]:
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

dataset = load_dataset("saied/persian_news_dataset", split="train", streaming=True)

docs = []
document_count = 40000
counter = 0
skip_counter = 0

for doc in dataset:
    if len(doc['title']) == 0 or len(doc['category']) == 0:
        skip_counter += 1
        if skip_counter % 10000 == 0:
            print(f'{skip_counter} skipped')
        continue
    docs.append(doc)
    counter += 1
    if counter == document_count:
        break

print(f"Number of documents processed: {len(docs)}")

not_proccess_x = [doc['text'] for doc in docs]
not_proccess_y = [doc['category'] for doc in docs]

max_words = 10000
max_length = 50
embedding_dim = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(not_proccess_x)

x = tokenizer.texts_to_sequences(not_proccess_x)
x = pad_sequences(x, maxlen=max_length)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(not_proccess_y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length),
    SimpleRNN(16, return_sequences=True),  # First RNN layer with return_sequences=True
    SimpleRNN(32),  # Second RNN layer
    Dense(10, activation="relu"),
    Dense(5, activation="relu"),
    Dense(len(set(y)), activation="softmax"),
])

model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()
model.fit(x_train, y_train, epochs=4, batch_size=512)

print("-------------------------------------")
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"test_acc: {test_acc:.5f} \t test_loss: {test_loss:.5f}")

y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)

cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

unique_categories = label_encoder.classes_

for i, category in enumerate(unique_categories):
    tp = cm[i, i]
    fp = cm[:, i].sum() - tp
    fn = cm[i, :].sum() - tp
    tn = cm.sum() - (tp + fp + fn)

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    accuracy = (tp + tn) / cm.sum() if cm.sum() > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Category: {category}")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1_score:.5f}")
    print("---------------------------------------------------")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

10000 skipped
20000 skipped
30000 skipped
40000 skipped
50000 skipped
60000 skipped
70000 skipped
80000 skipped
90000 skipped
100000 skipped
110000 skipped
120000 skipped
130000 skipped
140000 skipped
150000 skipped
160000 skipped
170000 skipped
180000 skipped
190000 skipped
200000 skipped
210000 skipped
220000 skipped
230000 skipped
240000 skipped
Number of documents processed: 40000
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 50, 100)           1000000   
                                                                 
 simple_rnn_6 (SimpleRNN)    (None, 50, 16)            1872      
                                                                 
 simple_rnn_7 (SimpleRNN)    (None, 32)                1568      
                                                                 
 dense_12 (Dense)            (None, 10)                330    

# part 4 - LSTM

**imdb**

In [None]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import gensim.downloader as api

max_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_words)
word_index = imdb.get_word_index()

X_train = pad_sequences(X_train, maxlen=50)
X_test = pad_sequences(X_test, maxlen=50)

def create_embedding_matrix(word_index, max_words, embedding_dim):
    word_vectors = api.load("glove-wiki-gigaword-100")

    embedding_matrix = np.zeros((max_words, embedding_dim))

    for word, i in word_index.items():
        if i < max_words:
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

    return embedding_matrix

embedding_dim = 100
embedding_matrix = create_embedding_matrix(word_index, max_words, embedding_dim)

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=50, weights=[embedding_matrix], trainable=False),
    LSTM(32, return_sequences=True),
    LSTM(64),
    Flatten(),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

print("-------------------------------------")
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"test_acc: {test_acc:.5f} \t test_loss: {test_loss:.5f}")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
-------------------------------------
test_acc: 0.72244 	 test_loss: 0.54155


**Persian Text Sentiment**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from datasets import load_dataset
import numpy as np
import gensim.downloader as api

def load_persian_text_sentiment_dataset():
    dataset = load_dataset("SeyedAli/Persian-Text-Sentiment")
    return dataset

def preprocess_persian_text_sentiment_dataset(dataset):
    texts = [item['text'] for item in dataset['train']]
    labels = [item['label'] for item in dataset['train']]
    return texts, labels

def create_embedding_matrix(word_index, max_words, embedding_dim):
    word_vectors = api.load("glove-wiki-gigaword-100")

    embedding_matrix = np.zeros((max_words, embedding_dim))

    for word, i in word_index.items():
        if i < max_words:
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

    return embedding_matrix

def train_model_with_lstm(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix, maxlen=50):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(LSTM(32, return_sequences=True))
    model.add(LSTM(64))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

dataset = load_persian_text_sentiment_dataset()
X, y = preprocess_persian_text_sentiment_dataset(dataset)

max_words = 10000
maxlen = 50
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X = pad_sequences(sequences, maxlen=maxlen)

embedding_dim = 100
embedding_matrix = create_embedding_matrix(tokenizer.word_index, max_words, embedding_dim)

train_model_with_lstm(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.77156
Precision: 0.77271
Recall: 0.77156
F1 Score: 0.77132
Confusion Matrix:
[[10050  2450]
 [ 3261  9239]]


**Persian News Dataset**

In [19]:
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
import gensim.downloader as api

# Load the dataset
dataset = load_dataset("saied/persian_news_dataset", split="train", streaming=True)

# Preprocess the dataset
docs = []
document_count = 40000
counter = 0
skip_counter = 0

for doc in dataset:
    if len(doc['title']) == 0 or len(doc['category']) == 0:
        skip_counter += 1
        if skip_counter % 10000 == 0:
            print(f'{skip_counter} skipped')
        continue
    docs.append(doc)
    counter += 1
    if counter == document_count:
        break

print(f"Number of documents processed: {len(docs)}")

texts = [doc['text'] for doc in docs]
labels = [doc['category'] for doc in docs]

max_words = 10000
max_length = 50

# Tokenize the text data
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
x = pad_sequences(sequences, maxlen=max_length)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Load GloVe embeddings using gensim
word_vectors = api.load("glove-wiki-gigaword-100")

embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < max_words:
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Define the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False),
    LSTM(32, return_sequences=True),
    LSTM(64),
    Flatten(),
    Dense(10, activation="relu"),
    Dense(5, activation="relu"),
    Dense(len(set(y)), activation="softmax"),
])

# Compile the model
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Build and fit the model
model.summary()
model.fit(x_train, y_train, epochs=4, batch_size=512)

# Evaluate the model
print("-------------------------------------")
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"test_acc: {test_acc:.5f} \t test_loss: {test_loss:.5f}")

# Make predictions on the test set
y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

# Calculate precision, recall, and other metrics for each class
unique_categories = label_encoder.classes_

for i, category in enumerate(unique_categories):
    tp = cm[i, i]
    fp = cm[:, i].sum() - tp
    fn = cm[i, :].sum() - tp
    tn = cm.sum() - (tp + fp + fn)

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    accuracy = (tp + tn) / cm.sum() if cm.sum() > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Category: {category}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"F1 Score: {f1_score:.5f}")
    print("----------------------------------------")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


10000 skipped
20000 skipped
30000 skipped
40000 skipped
50000 skipped
60000 skipped
70000 skipped
80000 skipped
90000 skipped
100000 skipped
110000 skipped
120000 skipped
130000 skipped
140000 skipped
150000 skipped
160000 skipped
170000 skipped
180000 skipped
190000 skipped
200000 skipped
210000 skipped
220000 skipped
230000 skipped
240000 skipped
Number of documents processed: 40000
Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 50, 100)           1000000   
                                                                 
 lstm_6 (LSTM)               (None, 50, 32)            17024     
                                                                 
 lstm_7 (LSTM)               (None, 64)                24832     
                                                                 
 flatten_9 (Flatten)         (None, 64)                0     