In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, roc_auc_score
import tensorflow as tf

In [2]:
data_folder = 'text'
texts = []
labels = []
# Load data from text files
for file_name in os.listdir(data_folder):
    with open(os.path.join(data_folder, file_name), 'r', encoding='utf-8') as file:
        for line in file:
            text, label = line.strip().split('\t')
            texts.append(text)
            labels.append(label)

In [3]:
label_to_index = {label: i for i, label in enumerate(set(labels))}
encoded_labels = np.array([label_to_index[label] for label in labels])

In [4]:
texts_train, texts_temp, labels_train, labels_temp = train_test_split(texts, encoded_labels, test_size=0.2, random_state=42)
texts_val, texts_test, labels_val, labels_test = train_test_split(texts_temp, labels_temp, test_size=0.5, random_state=42)

In [5]:
max_words = 10000  # Consider only the top 10,000 words in the dataset
max_len = 100  # Limit each sentence to 100 words
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts_train)
# Convert texts to sequences of integers
sequences_train = tokenizer.texts_to_sequences(texts_train)
sequences_val = tokenizer.texts_to_sequences(texts_val)
sequences_test = tokenizer.texts_to_sequences(texts_test)
# Pad sequences to ensure uniform length
X_train = pad_sequences(sequences_train, maxlen=max_len)
X_val = pad_sequences(sequences_val, maxlen=max_len)
X_test = pad_sequences(sequences_test, maxlen=max_len)

In [7]:
embedding_dim = 100  # Dimensionality of word embeddings
lstm_units = 128  # Number of LSTM units
dropout_rate = 0.5  # Dropout rate
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(units=lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate),
    Dense(units=len(label_to_index), activation='softmax')
])
lstm_model.summary()

In [8]:
learning_rate = 0.001  # Learning rate for Adam optimizer
lstm_model.compile(optimizer=Adam(learning_rate=learning_rate),
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

In [9]:
batch_size = 64
epochs = 10
lstm_history = lstm_model.fit(X_train, labels_train,
                              batch_size=batch_size,
                              epochs=epochs,
                              validation_data=(X_val, labels_val),
                              verbose=1)

Epoch 1/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 117ms/step - accuracy: 0.3583 - loss: 1.7139 - val_accuracy: 0.3625 - val_loss: 1.6265
Epoch 2/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 116ms/step - accuracy: 0.4043 - loss: 1.5606 - val_accuracy: 0.4510 - val_loss: 1.4664
Epoch 3/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 111ms/step - accuracy: 0.5292 - loss: 1.3179 - val_accuracy: 0.5245 - val_loss: 1.3528
Epoch 4/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 137ms/step - accuracy: 0.6276 - loss: 1.0955 - val_accuracy: 0.5629 - val_loss: 1.3032
Epoch 5/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 125ms/step - accuracy: 0.6817 - loss: 0.9210 - val_accuracy: 0.5501 - val_loss: 1.3214
Epoch 6/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 129ms/step - accuracy: 0.7104 - loss: 0.8044 - val_accuracy: 0.5448 - val_loss: 1.3917
Epoch 7/10

In [10]:
loss, accuracy = lstm_model.evaluate(X_test, labels_test, verbose=1)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.5396 - loss: 1.6852
Test Loss: 1.6417317390441895
Test Accuracy: 0.5255863666534424


In [11]:
predictions = lstm_model.predict(X_test)
y_pred = np.argmax(predictions, axis=1)

[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step


In [12]:
print("Classification Report:")
print(classification_report(labels_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.56      0.52        63
           1       0.74      0.67      0.70        21
           2       0.50      0.40      0.44       139
           3       0.55      0.64      0.59       368
           4       0.38      0.21      0.27        81
           5       0.54      0.58      0.56       225
           6       0.32      0.15      0.20        41

    accuracy                           0.53       938
   macro avg       0.50      0.46      0.47       938
weighted avg       0.51      0.53      0.51       938



In [19]:
lstm_model.save("lstm_model.h5")


