In [6]:


import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))


def text_to_word_list(text):
    text = text.split()
    return text

def replace_strings(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\u00C0-\u017F"          #latin
                           u"\u2000-\u206F"          #generalPunctuations
                               
                           "]+", flags=re.UNICODE)
    english_pattern=re.compile('[a-zA-Z0-9]+', flags=re.I)
    #latin_pattern=re.compile('[A-Za-z\u00C0-\u00D6\u00D8-\u00f6\u00f8-\u00ff\s]*',)
    
    text=emoji_pattern.sub(r'', text)
    text=english_pattern.sub(r'', text)

    return text

def remove_punctuations(my_str):
    # define punctuation
    punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\’,<>./?@#$%^&*_~‘—॥”‰⚽️✌�￰৷￰'''
    
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char

    # display the unpunctuated string
    return no_punct



def joining(text):
    out=' '.join(text)
    return out

def preprocessing(text):
    out=remove_punctuations(replace_strings(text))
    return out



train_url = '/kaggle/input/80-20ratiofinal/train8020.csv'
test_url = '/kaggle/input/80-20ratiofinal/test8020.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('/kaggle/input/bangla-stopwords/stopwords_bangla.xlsx',index_col=False)
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

df_train['Comment'] = df_train.Comment.apply(lambda x: preprocessing(str(x)))
df_test['Comment'] = df_test.Comment.apply(lambda x:preprocessing(str(x)))
df = pd.concat([df_train,df_test],ignore_index = True)

df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
print(df_train.head(5))

def encode(s):
    d = {
        "Code Switching":0,
        "Grammatical":1,
        "Multiple Errors":2,
        "Spelling":3
    }
    if s in d:
        return d[s]
    else:
        return 4
    
df['Category'] = df.Category.apply(lambda x: encode(x))
df_train['Category'] = df_train.Category.apply(lambda x: encode(x))
df_test['Category'] = df_test.Category.apply(lambda x: encode(x))


      Video ID           Channel name     Time of Publishing  \
0  f-rEHLDfXro  ইতিহাসের অনুসন্ধানে 7G  2022-11-26T10:03:44Z   
1  aLeV04Bz5xk              Channel 24  2023-07-19T13:37:46Z   
2  zBPoDX9TSXA       Cine Fever Bangla  2022-07-15T17:32:43Z   
3  sAoIhoP1zhk                ATN News  2023-06-10T16:32:00Z   
4  jV4gHJYIi34               BigganPiC  2023-02-15T08:00:19Z   

                                               Title          Genre  \
0  ১০০ বছর আগে হারিয়ে যাওয়া গ্রাম কেমন ছিল ইতিহাস...  Miscellaneous   
1  দেশে প্রথমবারের মতো চ্যানেল 24-এর পর্দায় সংবাদ...           News   
2  বড়লোক অনন্ত বর্ষা কে ধুয়ে দিল পরীমনি!দেখুন কি বলল  Entertainment   
3  বিএনপি নির্বাচিত হলে কাকে প্রধানমন্ত্রী করা হব...           News   
4  পৃথিবীর কেন্দ্র Earth core and Earthquake wave...  Miscellaneous   

                                             Comment  Error        Category  \
0  হ্যালো যতগুলা পিক দেখালেন সবগুলা তো মনে হচ্ছে ...      1        Spelling   
1  মানুষের মতো সুন্দর সাবলীল ভ

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention
from tensorflow.keras.models import Model
# import tensorflow_addons as tfa

# Sample data and labels (replace with your own dataset)
data = df_train['Comment']
labels = df_train['Category']

# Tokenize text data
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
max_sequence_length = 100  # Adjust this based on your data

# Pad sequences
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode labels
Y = tf.keras.utils.to_categorical(labels, num_classes=5)

# Create LSTM with Attention model
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(max_words, 100)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)

# Attention mechanism using TensorFlow Addons
attention = Attention()([lstm_layer, lstm_layer])
attention = tf.keras.layers.GlobalAveragePooling1D()(attention)

output_layer = Dense(5, activation='sigmoid')(attention)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, Y, epochs=10, batch_size=100, validation_split=0.2)

# Evaluate the model on test data (replace with your test data)
test_data = df_test['Comment'] # Replace with your actual test data
test_labels = df_test['Category']  # Replace with your actual test labels

test_sequences = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
Y_test = tf.keras.utils.to_categorical(test_labels, num_classes=5)

loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.6474559307098389, Test Accuracy: 0.5199123620986938


In [3]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)

# Calculate macro recall
macro_recall = recall_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
macro_precision = precision_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
# Print the results
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Macro Recall: {:0.3f}\n  Macro Precision: {:0.3f}'.format(loss, accuracy, macro_recall, macro_precision))


Test set
  Loss: 1.647
  Accuracy: 0.520
  Macro Recall: 0.367
  Macro Precision: 0.368


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention
from tensorflow.keras.models import Model
# import tensorflow_addons as tfa

# Sample data and labels (replace with your own dataset)
data = df_train['Comment']
labels = df_train['Category']

# Tokenize text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
max_sequence_length = 100  # Adjust this based on your data

# Pad sequences
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode labels
Y = tf.keras.utils.to_categorical(labels, num_classes=5)

# Create LSTM with Attention model
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(max_words, 100)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)

# Attention mechanism using TensorFlow Addons
attention = Attention()([lstm_layer, lstm_layer])
attention = tf.keras.layers.GlobalAveragePooling1D()(attention)

output_layer = Dense(5, activation='sigmoid')(attention)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, Y, epochs=10, batch_size=300, validation_split=0.2)

# Evaluate the model on test data (replace with your test data)
test_data = df_test['Comment'] # Replace with your actual test data
test_labels = df_test['Category']  # Replace with your actual test labels

test_sequences = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
Y_test = tf.keras.utils.to_categorical(test_labels, num_classes=5)

loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.3338813781738281, Test Accuracy: 0.5525686740875244


In [5]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)

# Calculate macro recall
macro_recall = recall_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
macro_precision = precision_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
# Print the results
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Macro Recall: {:0.3f}\n  Macro Precision: {:0.3f}'.format(loss, accuracy, macro_recall, macro_precision))


Test set
  Loss: 1.334
  Accuracy: 0.553
  Macro Recall: 0.380
  Macro Precision: 0.391


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention
from tensorflow.keras.models import Model
# import tensorflow_addons as tfa

# Sample data and labels (replace with your own dataset)
data = df_train['Comment']
labels = df_train['Category']

# Tokenize text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
max_sequence_length = 100  # Adjust this based on your data

# Pad sequences
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode labels
Y = tf.keras.utils.to_categorical(labels, num_classes=5)

# Create LSTM with Attention model
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(max_words, 100)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)

# Attention mechanism using TensorFlow Addons
attention = Attention()([lstm_layer, lstm_layer])
attention = tf.keras.layers.GlobalAveragePooling1D()(attention)

output_layer = Dense(5, activation='sigmoid')(attention)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, Y, epochs=10, batch_size=100, validation_split=0.2)

# Evaluate the model on test data (replace with your test data)
test_data = df_test['Comment'] # Replace with your actual test data
test_labels = df_test['Category']  # Replace with your actual test labels

test_sequences = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
Y_test = tf.keras.utils.to_categorical(test_labels, num_classes=5)

loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.853079080581665, Test Accuracy: 0.5356431603431702


In [7]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)

# Calculate macro recall
macro_recall = recall_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
macro_precision = precision_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
# Print the results
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Macro Recall: {:0.3f}\n  Macro Precision: {:0.3f}'.format(loss, accuracy, macro_recall, macro_precision))


Test set
  Loss: 1.853
  Accuracy: 0.536
  Macro Recall: 0.387
  Macro Precision: 0.386


In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention
from tensorflow.keras.models import Model
# import tensorflow_addons as tfa

# Sample data and labels (replace with your own dataset)
data = df_train['Comment']
labels = df_train['Category']

# Tokenize text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
max_sequence_length = 100  # Adjust this based on your data

# Pad sequences
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode labels
Y = tf.keras.utils.to_categorical(labels, num_classes=5)

# Create LSTM with Attention model
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(max_words, 100)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)

# Attention mechanism using TensorFlow Addons
attention = Attention()([lstm_layer, lstm_layer])
attention = tf.keras.layers.GlobalAveragePooling1D()(attention)

output_layer = Dense(5, activation='sigmoid')(attention)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, Y, epochs=10, batch_size=500, validation_split=0.2)

# Evaluate the model on test data (replace with your test data)
test_data = df_test['Comment'] # Replace with your actual test data
test_labels = df_test['Category']  # Replace with your actual test labels

test_sequences = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
Y_test = tf.keras.utils.to_categorical(test_labels, num_classes=5)

loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.2322776317596436, Test Accuracy: 0.5262843370437622


In [9]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)

# Calculate macro recall
macro_recall = recall_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
macro_precision = precision_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
# Print the results
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Macro Recall: {:0.3f}\n  Macro Precision: {:0.3f}'.format(loss, accuracy, macro_recall, macro_precision))


Test set
  Loss: 1.232
  Accuracy: 0.526
  Macro Recall: 0.364
  Macro Precision: 0.389


  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention
from tensorflow.keras.models import Model
# import tensorflow_addons as tfa

# Sample data and labels (replace with your own dataset)
data = df_train['Comment']
labels = df_train['Category']

# Tokenize text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
max_sequence_length = 100  # Adjust this based on your data

# Pad sequences
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode labels
Y = tf.keras.utils.to_categorical(labels, num_classes=5)

# Create LSTM with Attention model
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(max_words, 100)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)

# Attention mechanism using TensorFlow Addons
attention = Attention()([lstm_layer, lstm_layer])
attention = tf.keras.layers.GlobalAveragePooling1D()(attention)

output_layer = Dense(5, activation='sigmoid')(attention)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, Y, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on test data (replace with your test data)
test_data = df_test['Comment'] # Replace with your actual test data
test_labels = df_test['Category']  # Replace with your actual test labels

test_sequences = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
Y_test = tf.keras.utils.to_categorical(test_labels, num_classes=5)

loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 2.343489408493042, Test Accuracy: 0.5312624573707581


In [5]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)

# Calculate macro recall
macro_recall = recall_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
macro_precision = precision_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
# Print the results
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Macro Recall: {:0.3f}\n  Macro Precision: {:0.3f}'.format(loss, accuracy, macro_recall, macro_precision))


Test set
  Loss: 2.343
  Accuracy: 0.531
  Macro Recall: 0.377
  Macro Precision: 0.381


In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention
from tensorflow.keras.models import Model
# import tensorflow_addons as tfa

# Sample data and labels (replace with your own dataset)
data = df_train['Comment']
labels = df_train['Category']

# Tokenize text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
max_sequence_length = 100  # Adjust this based on your data

# Pad sequences
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode labels
Y = tf.keras.utils.to_categorical(labels, num_classes=5)

# Create LSTM with Attention model
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(max_words, 100)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)

# Attention mechanism using TensorFlow Addons
attention = Attention()([lstm_layer, lstm_layer])
attention = tf.keras.layers.GlobalAveragePooling1D()(attention)

output_layer = Dense(5, activation='sigmoid')(attention)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, Y, epochs=10, batch_size=300, validation_split=0.2)

# Evaluate the model on test data (replace with your test data)
test_data = df_test['Comment'] # Replace with your actual test data
test_labels = df_test['Category']  # Replace with your actual test labels

test_sequences = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
Y_test = tf.keras.utils.to_categorical(test_labels, num_classes=5)

loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.3097609281539917, Test Accuracy: 0.5659099817276001


In [8]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)

# Calculate macro recall
macro_recall = recall_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
macro_precision = precision_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
# Print the results
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Macro Recall: {:0.3f}\n  Macro Precision: {:0.3f}'.format(loss, accuracy, macro_recall, macro_precision))


Test set
  Loss: 1.310
  Accuracy: 0.566
  Macro Recall: 0.379
  Macro Precision: 0.393


  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention
from tensorflow.keras.models import Model
# import tensorflow_addons as tfa

# Sample data and labels (replace with your own dataset)
data = df_train['Comment']
labels = df_train['Category']

# Tokenize text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
max_sequence_length = 100  # Adjust this based on your data

# Pad sequences
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode labels
Y = tf.keras.utils.to_categorical(labels, num_classes=5)

# Create LSTM with Attention model
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(max_words, 100)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)

# Attention mechanism using TensorFlow Addons
attention = Attention()([lstm_layer, lstm_layer])
attention = tf.keras.layers.GlobalAveragePooling1D()(attention)

output_layer = Dense(5, activation='sigmoid')(attention)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, Y, epochs=10, batch_size=100, validation_split=0.2)

# Evaluate the model on test data (replace with your test data)
test_data = df_test['Comment'] # Replace with your actual test data
test_labels = df_test['Category']  # Replace with your actual test labels

test_sequences = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
Y_test = tf.keras.utils.to_categorical(test_labels, num_classes=5)

loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.740644097328186, Test Accuracy: 0.5553564429283142


In [10]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)

# Calculate macro recall
macro_recall = recall_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
macro_precision = precision_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
# Print the results
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Macro Recall: {:0.3f}\n  Macro Precision: {:0.3f}'.format(loss, accuracy, macro_recall, macro_precision))

Test set
  Loss: 1.741
  Accuracy: 0.555
  Macro Recall: 0.386
  Macro Precision: 0.391
