# part one

**imdb**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Conv1D, MaxPooling1D
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def train_model_with_cnn(embedding_dim, X_train, y_train, X_test, y_test, max_words, filters, kernel_size, use_maxpool):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=50))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    if use_maxpool:
        model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Load IMDB dataset with a maximum of 10000 words
(X_train_imdb, y_train_imdb), (X_test_imdb, y_test_imdb) = imdb.load_data(num_words=10000)
max_words_imdb = 10000

# Pad sequences
maxlen = 50
X_train_imdb = sequence.pad_sequences(X_train_imdb, maxlen=maxlen)
X_test_imdb = sequence.pad_sequences(X_test_imdb, maxlen=maxlen)

configs = [
    (16, 5, False), (16, 5, True), (16, 7, False), (16, 7, True),
    (32, 5, False), (32, 5, True), (32, 7, False), (32, 7, True)
]

embedding_dims = [10, 32, 64, 100]

for embedding_dim in embedding_dims:
    for filters, kernel_size, use_maxpool in configs:
        print(f"Training model with Embedding Dimension: {embedding_dim}, filters={filters}, kernel_size={kernel_size}, maxpool={use_maxpool}")
        train_model_with_cnn(embedding_dim, X_train_imdb, y_train_imdb, X_test_imdb, y_test_imdb, max_words_imdb, filters, kernel_size, use_maxpool)


Training model with Embedding Dimension: 10, filters=16, kernel_size=5, maxpool=False
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.78452
Precision: 0.78046
Recall: 0.79176
F1 Score: 0.78607
Confusion Matrix:
[[9716 2784]
 [2603 9897]]
Training model with Embedding Dimension: 10, filters=16, kernel_size=5, maxpool=True
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.78592
Precision: 0.76696
Recall: 0.82144
F1 Score: 0.79326
Confusion Matrix:
[[ 9380  3120]
 [ 2232 10268]]
Training model with Embedding Dimension: 10, filters=16, kernel_size=7, maxpool=False
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.78172
Precision: 0.76428
Recall: 0.81472
F1 Score: 0.78869
Confusion Matrix:
[[ 9359  3141]
 [ 2316 10184]]
Training model with Embedding Dimension: 10, filters=16, kernel_size=7, maxpool=True
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.78452
Prec

**Persian Text Sentiment Dataset**

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Conv1D, MaxPooling1D
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from datasets import load_dataset

def preprocess_persian_text_sentiment(dataset):
    texts = dataset['train']['text']
    labels = dataset['train']['label']
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    max_words = len(word_index) + 1
    X = pad_sequences(sequences, maxlen=50)
    y = np.array(labels)
    return X, y, max_words

def train_model_with_cnn(embedding_dim, X_train, y_train, X_test, y_test, max_words, filters, kernel_size, use_maxpool):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=50))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    if use_maxpool:
        model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Load Persian Text Sentiment dataset
persian_dataset = load_dataset("SeyedAli/Persian-Text-Sentiment")
X_persian, y_persian, max_words_persian = preprocess_persian_text_sentiment(persian_dataset)

embedding_dim = 100
configs = [
    (16, 5, False), (16, 5, True), (16, 7, False), (16, 7, True),
    (32, 5, False), (32, 5, True), (32, 7, False), (32, 7, True)
]

for filters, kernel_size, use_maxpool in configs:
    print(f"Training model with Embedding Dimension: {embedding_dim}, filters={filters}, kernel_size={kernel_size}, maxpool={use_maxpool}")
    train_model_with_cnn(embedding_dim, X_persian, y_persian, X_persian, y_persian, max_words_persian, filters, kernel_size, use_maxpool)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/524 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/55852 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/13964 [00:00<?, ? examples/s]

Training model with Embedding Dimension: 100, filters=16, kernel_size=5, maxpool=False
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.95655
Precision: 0.95866
Recall: 0.95424
F1 Score: 0.95645
Confusion Matrix:
[[26777  1149]
 [ 1278 26648]]
Training model with Embedding Dimension: 100, filters=16, kernel_size=5, maxpool=True
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.95621
Precision: 0.95387
Recall: 0.95878
F1 Score: 0.95632
Confusion Matrix:
[[26631  1295]
 [ 1151 26775]]
Training model with Embedding Dimension: 100, filters=16, kernel_size=7, maxpool=False
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.95651
Precision: 0.96246
Recall: 0.95008
F1 Score: 0.95623
Confusion Matrix:
[[26891  1035]
 [ 1394 26532]]
Training model with Embedding Dimension: 100, filters=16, kernel_size=7, maxpool=True
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.95

**Persian News Dataset**

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder

def preprocess_persian_news_dataset(dataset, maxlen=50):
    texts = []
    labels = []
    for doc in dataset:
        if len(doc['title']) > 0 and len(doc['category']) > 0:
            texts.append(doc['text'])
            labels.append(doc['category'])

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    max_words = len(word_index) + 1
    X = pad_sequences(sequences, maxlen=maxlen)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(labels)

    return X, y, max_words

dataset = load_dataset("saied/persian_news_dataset", split="train", streaming=True)

docs = []
document_count = 40000
counter = 0
skip_counter = 0

for doc in dataset:
    if len(doc['title']) == 0 or len(doc['category']) == 0:
        skip_counter += 1
        if skip_counter % 10000 == 0:
            print(f'{skip_counter} skipped')
        continue
    docs.append(doc)
    counter += 1
    if counter == document_count:
        break

print(f"Number of documents processed: {len(docs)}")

not_proccess_x = [doc['text'] for doc in docs]
not_proccess_y = [doc['category'] for doc in docs]

max_words_news = 10000
max_length_news = 50

tokenizer = Tokenizer(num_words=max_words_news, oov_token="<OOV>")
tokenizer.fit_on_texts(not_proccess_x)

x_news = tokenizer.texts_to_sequences(not_proccess_x)
x_news = pad_sequences(x_news, maxlen=max_length_news)

label_encoder = LabelEncoder()
y_news = label_encoder.fit_transform(not_proccess_y)

x_train_news, x_test_news, y_train_news, y_test_news = train_test_split(x_news, y_news, test_size=0.3, random_state=42)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


10000 skipped
20000 skipped
30000 skipped
40000 skipped
50000 skipped
60000 skipped
70000 skipped
80000 skipped
90000 skipped
100000 skipped
110000 skipped
120000 skipped
130000 skipped
140000 skipped
150000 skipped
160000 skipped
170000 skipped
180000 skipped
190000 skipped
200000 skipped
210000 skipped
220000 skipped
230000 skipped
240000 skipped
Number of documents processed: 40000


In [None]:
embedding_dim = 100
configs = [
    (16, 5, False), (16, 5, True), (16, 7, False), (16, 7, True),
    (32, 5, False), (32, 5, True), (32, 7, False), (32, 7, True)
]

for filters, kernel_size, use_maxpool in configs:
    print(f"Training model with Embedding Dimension: {embedding_dim}, filters={filters}, kernel_size={kernel_size}, maxpool={use_maxpool}")
    model = Sequential([
        Embedding(input_dim=max_words_news, output_dim=embedding_dim, input_length=max_length_news),
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'),
        MaxPooling1D() if use_maxpool else Flatten(),
        Flatten(),
        Dense(64, activation="relu"),
        Dense(len(set(y_news)), activation="softmax"),
    ])

    model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.summary()
    model.fit(x_train_news, y_train_news, epochs=5, batch_size=512)

    test_loss, test_acc = model.evaluate(x_test_news, y_test_news)
    print(f"test_acc: {test_acc} \t test_loss: {test_loss}")

    y_pred_probs = model.predict(x_test_news)
    y_pred = np.argmax(y_pred_probs, axis=1)

    cm = confusion_matrix(y_test_news, y_pred)

    print("Confusion Matrix:")
    print(cm)

    unique_categories = label_encoder.classes_

    for i, category in enumerate(unique_categories):
        tp = cm[i, i]
        fp = cm[:, i].sum() - tp
        fn = cm[i, :].sum() - tp
        tn = cm.sum() - (tp + fp + fn)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        accuracy = (tp + tn) / cm.sum() if cm.sum() > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f"Category: {category}")
        print(f"Precision: {precision:.5f}")
        print(f"Recall: {recall:.5f}")
        print(f"Accuracy: {accuracy:.5f}")
        print(f"F1 Score: {f1_score:.5f}")
        print("--------------------------------------")

Training model with Embedding Dimension: 100, filters=16, kernel_size=5, maxpool=False
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 50, 100)           1000000   
                                                                 
 conv1d_8 (Conv1D)           (None, 46, 16)            8016      
                                                                 
 flatten_8 (Flatten)         (None, 736)               0         
                                                                 
 flatten_9 (Flatten)         (None, 736)               0         
                                                                 
 dense_16 (Dense)            (None, 64)                47168     
                                                                 
 dense_17 (Dense)            (None, 6)                 390       
                                 

# part 2

**imdb**

In [3]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Flatten
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import gensim.downloader as api

# Load and preprocess data
max_words = 10000
maxlen = 50
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_words)
word_index = imdb.get_word_index()
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

# Create embedding matrix
def create_embedding_matrix(word_index, max_words, embedding_dim):
    word_vectors = api.load("glove-wiki-gigaword-100")
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))
    return embedding_matrix

embedding_dim = 100
embedding_matrix = create_embedding_matrix(word_index, max_words, embedding_dim)

# Define and compile the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False),
    Conv1D(filters=32, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(32, return_sequences=True),
    LSTM(64),
    Flatten(),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"test_acc: {test_acc:.5f} \t test_loss: {test_loss:.5f}")

# Predict and calculate metrics
y_pred = (model.predict(X_test) > 0.5).astype("int32")

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')
cm = confusion_matrix(y_test, y_pred)

print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")
print("Confusion Matrix:")
print(cm)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
test_acc: 0.68504 	 test_loss: 0.57953
Evaluation Metrics:
Accuracy: 0.68504
Precision: 0.70913
Recall: 0.62744
F1 Score: 0.66579
Confusion Matrix:
[[9283 3217]
 [4657 7843]]


**Persian Text Sentiment Dataset**

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/542.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from datasets import load_dataset
import numpy as np
import gensim.downloader as api

# Load and preprocess data
def load_persian_text_sentiment_dataset():
    dataset = load_dataset("SeyedAli/Persian-Text-Sentiment")
    return dataset

def preprocess_persian_text_sentiment_dataset(dataset):
    texts = [item['text'] for item in dataset['train']]
    labels = [item['label'] for item in dataset['train']]
    return texts, labels

def create_embedding_matrix(word_index, max_words, embedding_dim):
    word_vectors = api.load("glove-wiki-gigaword-100")
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))
    return embedding_matrix

def train_model_with_lstm(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix, maxlen=50):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(32, return_sequences=True))
    model.add(LSTM(64))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(np.array(X_train), np.array(y_train), epochs=5, batch_size=64, validation_split=0.2)
    y_pred = (model.predict(np.array(X_test)) > 0.5).astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    cm = confusion_matrix(y_test, y_pred)

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print("Confusion Matrix:")
    print(cm)

dataset = load_persian_text_sentiment_dataset()
X, y = preprocess_persian_text_sentiment_dataset(dataset)

max_words = 10000
maxlen = 50
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X = pad_sequences(sequences, maxlen=maxlen)

embedding_dim = 100
embedding_matrix = create_embedding_matrix(tokenizer.word_index, max_words, embedding_dim)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

train_model_with_lstm(embedding_dim, X_train, y_train, X_test, y_test, max_words, embedding_matrix)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluation Metrics:
Accuracy: 0.80646
Precision: 0.78931
Recall: 0.83036
F1 Score: 0.80931
Confusion Matrix:
[[6631 1837]
 [1406 6882]]


**Persian News Dataset**

In [7]:
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Flatten
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import gensim.downloader as api

# Load and preprocess data
dataset = load_dataset("saied/persian_news_dataset", split="train", streaming=True)
docs = []
document_count = 40000
counter = 0
skip_counter = 0

for doc in dataset:
    if len(doc['title']) == 0 or len(doc['category']) == 0:
        skip_counter += 1
        if skip_counter % 10000 == 0:
            print(f'{skip_counter} skipped')
        continue
    docs.append(doc)
    counter += 1
    if counter == document_count:
        break

print(f"Number of documents processed: {len(docs)}")

texts = [doc['text'] for doc in docs]
labels = [doc['category'] for doc in docs]

max_words = 10000
max_length = 50

# Tokenize the text data
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x = pad_sequences(sequences, maxlen=max_length)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Create embedding matrix
def create_embedding_matrix(word_index, max_words, embedding_dim):
    word_vectors = api.load("glove-wiki-gigaword-100")
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))
    return embedding_matrix

embedding_dim = 100
embedding_matrix = create_embedding_matrix(tokenizer.word_index, max_words, embedding_dim)

# Define and compile the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False),
    Conv1D(filters=32, kernel_size=7, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(32, return_sequences=True),
    LSTM(64),
    Flatten(),
    Dense(10, activation="relu"),
    Dense(5, activation="relu"),
    Dense(len(set(y)), activation="softmax"),
])

model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Train the model
model.summary()
model.fit(x_train, y_train, epochs=4, batch_size=512)

# Evaluate the model
print("-------------------------------------")
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"test_acc: {test_acc:.5f} \t test_loss: {test_loss:.5f}")

# Make predictions on the test set
y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

# Calculate precision, recall, and other metrics for each class
unique_categories = label_encoder.classes_

for i, category in enumerate(unique_categories):
    tp = cm[i, i]
    fp = cm[:, i].sum() - tp
    fn = cm[i, :].sum() - tp
    tn = cm.sum() - (tp + fp + fn)

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    accuracy = (tp + tn) / cm.sum() if cm.sum() > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Category: {category}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    print(f"Accuracy: {accuracy:.5f}")
    print(f"F1 Score: {f1_score:.5f}")
    print("----------------------------------------")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

10000 skipped
20000 skipped
30000 skipped
40000 skipped
50000 skipped
60000 skipped
70000 skipped
80000 skipped
90000 skipped
100000 skipped
110000 skipped
120000 skipped
130000 skipped
140000 skipped
150000 skipped
160000 skipped
170000 skipped
180000 skipped
190000 skipped
200000 skipped
210000 skipped
220000 skipped
230000 skipped
240000 skipped
Number of documents processed: 40000
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 100)           1000000   
                                                                 
 conv1d_5 (Conv1D)           (None, 44, 32)            22432     
                                                                 
 max_pooling1d_5 (MaxPoolin  (None, 22, 32)            0         
 g1D)                                                            
                                                              