In [9]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from fastapi import FastAPI
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pydantic import BaseModel
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Embedding, Layer, Dense, Dropout, MultiHeadAttention, LayerNormalization, Input, GlobalAveragePooling1D
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; later cells read the
# dictionary, stopword list, model and tokenizer from paths under this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Download the NLTK corpora used by this cell.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stop-word list with 'not' taken out of it, so the negation word is
# not treated as a stop word wherever this list is used for filtering.
stop_words = stopwords.words('english')
stop_words.remove('not')

lemmatizer = WordNetLemmatizer()

class MyanmarTextPreprocessor():
    """Segment text into dictionary words and drop stopwords.

    The pipeline in :meth:`preprocessing` is:

    1. insert a space after every regex match (see ``_SYLLABLE_PATTERN``) and
       split on whitespace to obtain the base units ("syllables");
    2. greedily merge consecutive units into the longest entry found in the
       dictionary (:meth:`merge_with_dictionary`);
    3. remove tokens that appear in the stopword set.

    Attributes:
        dictionary: set of known words loaded from ``dict_path``.
        stopwords: set of stopwords loaded from ``stop_path``.

    Note:
        The length of the longest dictionary entry is cached at construction
        time to bound the merge search. If ``dictionary`` is mutated after
        construction, refresh ``_max_word_len`` accordingly.
    """

    # Break pattern, compiled once instead of on every call. Each match is one
    # of: a run of ASCII letters/digits; a character from the leading class
    # followed by any number of the listed trailing sequences; or any other
    # single character. (Presumably this approximates Myanmar syllable
    # boundaries — the pattern itself is kept exactly as originally written.)
    _SYLLABLE_PATTERN = re.compile(
        r"(([A-Za-z0-9]+)|[က-အ|ဥ|ဦ](င်္|[က-အ][ှ]*[့း]*[်]|္[က-အ]|[ါ-ှႏꩻ][ꩻ]*){0,}|.)"
    )

    def __init__(self, dict_path: str, stop_path: str):
        """Load the word dictionary and the stopword list from UTF-8 text files.

        Args:
            dict_path: path to a file with one dictionary word per line.
            stop_path: path to a file with one stopword per line.
        """
        self.dictionary = self.load_dictionary(dict_path)
        self.stopwords = self.load_stopwords(stop_path)
        # No merged candidate longer than this can be a dictionary hit.
        self._max_word_len = max(map(len, self.dictionary), default=0)

    @staticmethod
    def _load_word_set(path):
        """Return the set of non-empty, whitespace-stripped lines in ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return {word for word in (line.strip() for line in f) if word}

    # Load dictionary into a set
    def load_dictionary(self, dict_path):
        """Read ``dict_path`` (one word per line) into a set of words."""
        return self._load_word_set(dict_path)

    # Load stopwords into a set
    def load_stopwords(self, stopword_path):
        """Read ``stopword_path`` (one word per line) into a set of stopwords."""
        return self._load_word_set(stopword_path)

    # Merge syllables based on dictionary
    def merge_with_dictionary(self, syllables):
        """Greedily merge consecutive syllables into the longest dictionary word.

        Starting at each position, the longest run of syllables whose
        concatenation is in ``self.dictionary`` becomes one token; when no run
        matches, the single syllable is emitted unchanged.

        Args:
            syllables: list of strings to merge.

        Returns:
            List of merged tokens, in order.
        """
        merged_tokens = []
        total = len(syllables)
        i = 0
        while i < total:
            best_end = None
            combined = ''
            # Grow the candidate one syllable at a time and remember the last
            # (i.e. longest) hit. Stop as soon as the candidate is longer than
            # every dictionary entry: no further extension can match.
            for j in range(i + 1, total + 1):
                combined += syllables[j - 1]
                if len(combined) > self._max_word_len:
                    break
                if combined in self.dictionary:
                    best_end = j
            if best_end is None:
                merged_tokens.append(syllables[i])
                i += 1
            else:
                merged_tokens.append(''.join(syllables[i:best_end]))
                i = best_end
        return merged_tokens

    def preprocessing(self, text: str) -> str:
        """Segment ``text``, merge dictionary words and strip stopwords.

        Args:
            text: raw input string.

        Returns:
            The remaining tokens joined by single spaces.
        """
        # A space after every match turns the string into whitespace-separated
        # units; split() also discards the leading/trailing whitespace.
        syllables = self._SYLLABLE_PATTERN.sub(r"\1 ", text).split()
        merged_tokens = self.merge_with_dictionary(syllables)
        return ' '.join(token for token in merged_tokens if token not in self.stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
# Word dictionary and stopword list, both stored on the mounted Drive.
DICT_PATH = '/content/drive/MyDrive/Datasets/dict-words.txt'
STOPWORD_PATH = '/content/drive/MyDrive/Datasets/sw.txt'

preprocessor = MyanmarTextPreprocessor(dict_path=DICT_PATH, stop_path=STOPWORD_PATH)


In [None]:
# NOTE(review): `df` is not defined in any cell visible in this notebook; it
# must be loaded (with 'Sentence' and 'Label' columns) before this cell runs.
sentences, label = df['Sentence'], df['Label']

# 80/20 split, stratified on the label column, with a fixed seed so the
# partition is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    sentences,
    label,
    test_size=0.2,
    stratify=label,
    random_state=40,
)

In [None]:
# Run the same preprocessing over both splits.
X_train_clean, X_val_clean = (
    split.apply(preprocessor.preprocessing) for split in (X_train, X_val)
)

In [None]:
# Validation sentences whose cleaned text also occurs in the training split
# would leak training data into the validation metrics, so they are dropped.
train_texts = set(X_train_clean)
overlap = train_texts.intersection(X_val_clean)
print(f"Number of overlapping samples: {len(overlap)}")

# `x in series` tests the Series *index*, not its values, so membership is
# checked against the set of training texts instead. The labels are filtered
# with the same mask so sentences and labels stay aligned row for row.
# X_val_clean stays a pandas Series (it was previously rebuilt as a list).
keep_mask = ~X_val_clean.isin(train_texts)
X_val_clean = X_val_clean[keep_mask]
y_val = y_val[keep_mask]

In [None]:
# Class name -> integer id used as the training target.
map_label = {
    'Social': 0,
    'Entertainment': 1,
    'Product&Service': 2,
    'Business': 3,
    'Sports': 4,
    'Science&Technology': 5,
    'Education': 6,
    'Culture&History': 7,
    'Health': 8,
    'Environmental': 9,
    'Political': 10,
    'Gambling': 11,
    'Adult Content': 12,
}


def _encode_labels(labels, split_name):
    """Map class names to integer ids, failing loudly on unknown names.

    ``Series.map`` with a dict yields NaN for any value missing from the
    dict; those NaNs would otherwise flow silently into training/evaluation.

    Args:
        labels: pandas Series of class-name strings.
        split_name: name of the split, used only in the error message.

    Returns:
        pandas Series of integer class ids with the same index as ``labels``.

    Raises:
        ValueError: if ``labels`` contains a value that is not a key of
            ``map_label``.
    """
    encoded = labels.map(map_label)
    unmapped = labels[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"{split_name} contains labels missing from map_label: {list(unmapped)}"
        )
    return encoded


y_train_encoded = _encode_labels(y_train, 'y_train')
y_val_encoded = _encode_labels(y_val, 'y_val')

In [5]:
class TransformerEncoder(layers.Layer):
    """One Transformer encoder block.

    Multi-head self-attention followed by a two-layer feed-forward network;
    each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: width of the feed-forward output; also passed to the
            attention layer as ``key_dim``.
        heads: number of attention heads.
        neurons: hidden width of the feed-forward network.
        dropout_rate: dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, embed_dim, heads, neurons, dropout_rate=0.5,**kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can serialize the constructor arguments.
        self.embed_dim = embed_dim
        self.heads = heads
        self.neurons = neurons
        self.dropout_rate = dropout_rate

        self.att = layers.MultiHeadAttention(num_heads=heads, key_dim=embed_dim)
        self.ffn = Sequential([
            layers.Dense(neurons, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, mask=None, training=False):
        """Apply self-attention and the feed-forward network to ``inputs``.

        Args:
            inputs: tensor attended over (used as both query and value).
            mask: optional mask. A rank-2 mask is treated as a per-token
                ``(batch, seq_len)`` mask and expanded so it broadcasts over
                the query axis; any other rank is passed through unchanged.
            training: whether dropout is active.

        Returns:
            Tensor produced by the second layer normalization.
        """
        # MultiHeadAttention expects attention_mask shaped (B, T, S); a
        # (B, S) mask is lifted to (B, 1, S) so every query position shares it.
        if mask is not None and len(mask.shape) == 2:
            mask = mask[:, tf.newaxis, :]
        attn_output = self.att(inputs, inputs, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "heads": self.heads,
            "neurons": self.neurons,
            "dropout_rate": self.dropout_rate,
        })
        return config

# Token + Position Embedding
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: number of rows in the position-embedding table, i.e. the
            largest sequence length that can be embedded.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: output width of both embeddings.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim,**kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can serialize the constructor arguments.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Positions 0..seq_len-1 are taken from the last axis of the input.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

In [6]:
# Presumably the padded sequence length the model expects — TODO confirm.
maxlen = 100

MODEL_PATH = '/content/drive/MyDrive/Datasets/my_transformer_model_1.h5'
TOKENIZER_PATH = '/content/drive/MyDrive/Datasets/tokenizer_1.pkl'

# The saved model contains the two custom layers defined in this notebook, so
# load_model must be told how to resolve their class names.
custom_layers = {
    'TransformerEncoder': TransformerEncoder,
    'TokenAndPositionEmbedding': TokenAndPositionEmbedding,
}
model = load_model(MODEL_PATH, custom_objects=custom_layers)

# NOTE: pickle.load can execute arbitrary code; only load tokenizer files
# that come from a trusted source.
with open(TOKENIZER_PATH, 'rb') as f:
    tokenizer = pickle.load(f)



In [None]:
# NOTE(review): `X_train_pad`, `X_val_pad` and `early_stopping` are not
# defined in any cell visible in this notebook; they must exist before this
# cell runs.
fit_options = dict(
    validation_data=(X_val_pad, y_val_encoded),
    epochs=25,
    batch_size=32,
    callbacks=[early_stopping],
)
history = model.fit(X_train_pad, y_train_encoded, **fit_options)

In [None]:
# Score the model on the validation split. Unpacking into two names assumes
# evaluate() returns exactly [loss, one metric] — confirm against compile().
eval_scores = model.evaluate(X_val_pad, y_val_encoded, batch_size=32)
val_loss, val_acc = eval_scores
print(f"Validation Accuracy: {val_acc:.4f}")

In [None]:
# Per-class precision / recall / F1 on the validation split.
# (`classification_report` is imported in the notebook's import cell.)
y_pred_prob = model.predict(X_val_pad)          # Predict probabilities
y_pred = y_pred_prob.argmax(axis=1)             # Convert to predicted class indices
y_true = y_val_encoded

# Passing the ids and names from map_label labels each report row with its
# class name instead of a bare integer. zero_division=0 keeps the score at 0
# (the default value) for classes with no samples, without the warning.
print(classification_report(
    y_true,
    y_pred,
    labels=list(map_label.values()),
    target_names=list(map_label.keys()),
    digits=4,
    zero_division=0,
))

In [None]:
# Persist the model and its tokenizer. Both paths are relative, so the files
# are written to the current working directory.
SAVED_MODEL_PATH = 'my_transformer_model3.keras'
SAVED_TOKENIZER_PATH = 'tokenizer3.pkl'

model.save(SAVED_MODEL_PATH)
with open(SAVED_TOKENIZER_PATH, 'wb') as f:
    pickle.dump(tokenizer, f)