In [1]:
import numpy as np
import pandas as pd
import nltk
import plotly.express as px
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import spacy


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/Dataset_12.5.2025.csv",encoding='utf-8')

In [4]:
df.head(5)

Unnamed: 0,Index,Sentence,Label
0,0,BRI နိုင်ငံများ အပြန်အလှန် ရင်းနှီးမြှုပ်နှံမှ...,Business
1,1,ရှမ်းပြည်နယ်မြောက်ပိုင်း မူဆယ်နယ်မြို့ ရှိ ဆင်...,Business
2,2,ယခုကဲ့သို့ သယ်ဆောင်ခွင့်ပြု ရာ တွင် လည်း ထောက်...,Business
3,3,ထိုသို့ မီးဖိုချောင်သုံးအသီးအနှံ အသားငါးများ က...,Business
4,4,မူဆယ်နယ်စပ်ကုန်သည်တဦး က နိုဝင်ဘာ ၂၉ ရက်မှာ စာရ...,Business


In [5]:
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Product&Service,9962
Political,9835
Business,9166
Health,8978
Sports,8217
Culture&History,7976
Education,7395
Social,7196
Entertainment,6972
Science&Technology,5187


Check Null values

In [6]:
df.isnull().sum()

Unnamed: 0,0
Index,0
Sentence,0
Label,0


In [7]:
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = stopwords.words('english')
stop_words.remove('not')
lemmatizer = WordNetLemmatizer()

class MyanmarTextPreprocessor():
    def __init__(self, dict_path: str, stop_path: str):
        self.dictionary = self.load_dictionary(dict_path)
        self.stopwords = self.load_stopwords(stop_path)

    # Load dictionary into a set
    def load_dictionary(self, dict_path):
        dictionary = set()
        with open(dict_path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if word:
                    dictionary.add(word)
        return dictionary

    # Load stopwords into a set
    def load_stopwords(self, stopword_path):
        stopwords = set()
        with open(stopword_path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if word:
                    stopwords.add(word)
        return stopwords

    # Merge syllables based on dictionary
    def merge_with_dictionary(self, syllables):
        merged_tokens = []
        i = 0
        while i < len(syllables):
            matched = False
            for j in range(len(syllables), i, -1):
                combined = ''.join(syllables[i:j])
                if combined in self.dictionary:
                    merged_tokens.append(combined)
                    i = j
                    matched = True
                    break
            if not matched:
                merged_tokens.append(syllables[i])
                i += 1
        return merged_tokens

    def preprocessing(self, text: str):
        text = re.sub(r"(([A-Za-z0-9]+)|[က-အ|ဥ|ဦ](င်္|[က-အ][ှ]*[့း]*[်]|္[က-အ]|[ါ-ှႏꩻ][ꩻ]*){0,}|.)", r"\1 ", text)
        text = text.strip().split()
        merged_tokens = self.merge_with_dictionary(text)
        filtered_tokens = [token for token in merged_tokens if token not in self.stopwords]
        return ' '.join(filtered_tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
preprocessor = MyanmarTextPreprocessor('/content/drive/MyDrive/Datasets/dict-words.txt', '/content/drive/MyDrive/Datasets/sw.txt')


In [None]:
# spacy_eng = spacy.load("en_core_web_sm")
# def get_entities(x):
#   entity = []
#   text = spacy_eng(x)
#   for word in text.ents:
#     entity.append(word.label_)
#   return ",".join(entity)

#   df['entity'] = df['Sentiment'].progess_apply(get_entities)

In [9]:
df.head()

Unnamed: 0,Index,Sentence,Label
0,0,BRI နိုင်ငံများ အပြန်အလှန် ရင်းနှီးမြှုပ်နှံမှ...,Business
1,1,ရှမ်းပြည်နယ်မြောက်ပိုင်း မူဆယ်နယ်မြို့ ရှိ ဆင်...,Business
2,2,ယခုကဲ့သို့ သယ်ဆောင်ခွင့်ပြု ရာ တွင် လည်း ထောက်...,Business
3,3,ထိုသို့ မီးဖိုချောင်သုံးအသီးအနှံ အသားငါးများ က...,Business
4,4,မူဆယ်နယ်စပ်ကုန်သည်တဦး က နိုဝင်ဘာ ၂၉ ရက်မှာ စာရ...,Business


Model Building

In [10]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Layer, Dense, Dropout, MultiHeadAttention, LayerNormalization, Input, GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

In [11]:
sentences = df['Sentence']
label = df['Label']
X_train, X_val, y_train, y_val = train_test_split(sentences, label, test_size=0.2, stratify=label, random_state=40)

In [12]:
print(X_train.shape)
print(y_train.shape)

(74796,)
(74796,)


In [13]:
X_train_clean = X_train.apply(preprocessor.preprocessing)
X_val_clean = X_val.apply(preprocessor.preprocessing)

In [14]:
X_train_clean.head(5)

Unnamed: 0,Sentence
923,မူဝါဒ ဆိုင်ရာ ပံ့ပိုး e-commerce အစိမ်း ရောင် ...
59663,It lets your Healthcare provider spot Health p...
10804,"အလင်း အလျင် နာရီ 1 , 079 , 252 , 848 . 8 ကီလို..."
2692,ဈေးဝယ် စင်တာ အဆင့် ဟိုတယ် ပန်း ခြုံ ကဲ့သို့ လူ...
2585,အစေ့ အဆံ


Data Leakage


In [15]:
# Convert to sets to quickly check for overlap
overlap = set(X_train_clean).intersection(set(X_val_clean))
print(f"Number of overlapping samples: {len(overlap)}")

Number of overlapping samples: 1735


In [16]:
X_val_clean = [x for x in X_val_clean if x not in X_train_clean]

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token="<OOV>", split=' ')  # 'split' helps with syllable/token split
tokenizer.fit_on_texts(X_train_clean)

X_train_seq = tokenizer.texts_to_sequences(X_train_clean)
X_val_seq = tokenizer.texts_to_sequences(X_val_clean)

MAX_LEN = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post', truncating='post')

vocab_size = len(tokenizer.word_index) + 1


In [24]:
print(X_train_pad[:3])

[[ 1359   140  1129  1073  3118  4864   838   587   132    32  4865   111
   4705   357    81  3785   114   146     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [   18 11852    83  3203  4783  3620   206   450   604     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0    

In [38]:
map_label = {
    'Social': 0,
    'Entertainment': 1,
    'Product&Service': 2,
    'Business': 3,
    'Sports': 4,
    'Science&Technology': 5,
    'Education': 6,
    'Culture&History': 7,
    'Health': 8,
    'Environmental': 9,
    'Political': 10,
    'Gambling': 11,
    'Adult Content': 12,
}
y_train_encoded = y_train.map(map_label)
y_val_encoded = y_val.map(map_label)


Attention is all You Need


In [39]:
# Transformer Encoder with attention mask support
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, heads, neurons, dropout_rate=0.5):
        super(TransformerEncoder, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=heads, key_dim=embed_dim)
        self.ffn = Sequential([
            layers.Dense(neurons, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, mask=None, training=False):
        # Multi-head self-attention with mask
        attn_output = self.att(inputs, inputs, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Token + Position Embedding
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

Model Structure

In [40]:
emb_dim = 300
heads = 4
neurons = 32
maxlen = 100
vocab_size = 40701

inputs = Input(shape=(maxlen,), dtype=tf.int32)
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, emb_dim)
x = embedding_layer(inputs)
transformer_block = TransformerEncoder(emb_dim, heads, neurons)
x = transformer_block(x)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = Dropout(0.3)(x)
outputs = layers.Dense(13, activation="softmax")(x)
model = Model(inputs=inputs, outputs=outputs)

In [41]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.00003), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [42]:
DLmodel = "DL.keras"
checkpoint = ModelCheckpoint(DLmodel,
                             monitor = "val_loss",
                             mode = "min",
                             save_best_only = True,
                             verbose = 1)
early_stopping = EarlyStopping(monitor = "val_loss",min_delta = 0.0001,patience = 1,verbose = 1)
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                            patience=3,
                                            verbose=1,
                                            factor=0.2,
                                            min_lr=0.00000001)

In [43]:
history = model.fit(X_train_pad,y_train_encoded,
                    validation_data=(X_val_pad,y_val_encoded),
                    epochs=25,
                    batch_size=32,
                    callbacks=[early_stopping])

Epoch 1/25
[1m2338/2338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 48ms/step - accuracy: 0.2674 - loss: 2.1822 - val_accuracy: 0.7597 - val_loss: 0.8355
Epoch 2/25
[1m2338/2338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 44ms/step - accuracy: 0.7952 - loss: 0.7173 - val_accuracy: 0.8400 - val_loss: 0.5382
Epoch 3/25
[1m2338/2338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 42ms/step - accuracy: 0.8803 - loss: 0.4040 - val_accuracy: 0.8644 - val_loss: 0.4659
Epoch 4/25
[1m2338/2338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 43ms/step - accuracy: 0.9226 - loss: 0.2646 - val_accuracy: 0.8755 - val_loss: 0.4465
Epoch 5/25
[1m2338/2338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 42ms/step - accuracy: 0.9478 - loss: 0.1750 - val_accuracy: 0.8794 - val_loss: 0.4533
Epoch 5: early stopping


In [53]:
val_loss, val_acc = model.evaluate(X_val_pad, y_val_encoded, batch_size=32)
print(f"Validation Accuracy: {val_acc:.4f}")

[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.8793 - loss: 0.4473
Validation Accuracy: 0.8794


In [55]:
import pickle
from tensorflow.keras.models import load_model

model.save('my_transformer_model3.keras')
with open('tokenizer3.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


In [56]:
from sklearn.metrics import classification_report
y_pred_prob = model.predict(X_val_pad)          # Predict probabilities
y_pred = y_pred_prob.argmax(axis=1)             # Convert to predicted class indices
y_true = y_val_encoded
print(classification_report(y_true, y_pred, digits=4))


[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step
              precision    recall  f1-score   support

           0     0.7624    0.6713    0.7140      1439
           1     0.8930    0.8917    0.8923      1394
           2     0.8408    0.8901    0.8647      1993
           3     0.8689    0.8893    0.8789      1833
           4     0.9178    0.9574    0.9372      1644
           5     0.7241    0.7898    0.7555      1037
           6     0.8723    0.9148    0.8931      1479
           7     0.9389    0.9060    0.9221      1595
           8     0.9419    0.8853    0.9127      1796
           9     0.9534    0.8627    0.9058      1020
          10     0.8653    0.9044    0.8844      1967
          11     0.9651    0.9083    0.9358       883
          12     0.9501    0.9516    0.9508       620

    accuracy                         0.8794     18700
   macro avg     0.8842    0.8787    0.8806     18700
weighted avg     0.8805    0.8794    0.8792     18700



In [57]:
# Transformer Encoder with attention mask support
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, heads, neurons, dropout_rate=0.5,**kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=heads, key_dim=embed_dim)
        self.ffn = Sequential([
            layers.Dense(neurons, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, mask=None, training=False):
        # Multi-head self-attention with mask
        attn_output = self.att(inputs, inputs, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Token + Position Embedding
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim,**kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [134]:
model = load_model('/content/my_transformer_model.h5', custom_objects={
    'TransformerEncoder': TransformerEncoder,
    'TokenAndPositionEmbedding': TokenAndPositionEmbedding,
})
with open('/content/tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

new_sentence = "ရပ်ကွက်ထဲကစုံတွဲ"
cleaned = preprocessor.preprocessing(new_sentence)
seq = tokenizer.texts_to_sequences([cleaned])  # reuse the saved tokenizer
seq_pad = pad_sequences(seq, maxlen=maxlen, padding='post', truncating='post')



In [135]:
pred_probs = model.predict(seq_pad)
pred_index = np.argmax(pred_probs, axis=1)[0]

# Reverse label map
inv_map_label = {v: k for k, v in map_label.items()}
pred_label = inv_map_label[pred_index]

print(f"Predicted class: {pred_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 813ms/step
Predicted class: Social


End