In [19]:
import pickle
import tensorflow as tf
tf.config.run_functions_eagerly(True)
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Embedding, Layer, Dense, Dropout, MultiHeadAttention, LayerNormalization, Input, GlobalAveragePooling1D
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from fastapi import FastAPI
from pydantic import BaseModel
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import re

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/fine_tune_dataset.csv",encoding='utf-8')

In [22]:
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = stopwords.words('english')
stop_words.remove('not')
lemmatizer = WordNetLemmatizer()

class MyanmarTextPreprocessor():
    def __init__(self, dict_path: str, stop_path: str):
        self.dictionary = self.load_dictionary(dict_path)
        self.stopwords = self.load_stopwords(stop_path)

    # Load dictionary into a set
    def load_dictionary(self, dict_path):
        dictionary = set()
        with open(dict_path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if word:
                    dictionary.add(word)
        return dictionary

    # Load stopwords into a set
    def load_stopwords(self, stopword_path):
        stopwords = set()
        with open(stopword_path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if word:
                    stopwords.add(word)
        return stopwords

    # Merge syllables based on dictionary
    def merge_with_dictionary(self, syllables):
        merged_tokens = []
        i = 0
        while i < len(syllables):
            matched = False
            for j in range(len(syllables), i, -1):
                combined = ''.join(syllables[i:j])
                if combined in self.dictionary:
                    merged_tokens.append(combined)
                    i = j
                    matched = True
                    break
            if not matched:
                merged_tokens.append(syllables[i])
                i += 1
        return merged_tokens

    def preprocessing(self, text: str):
        text = re.sub(r"(([A-Za-z0-9]+)|[က-အ|ဥ|ဦ](င်္|[က-အ][ှ]*[့း]*[်]|္[က-အ]|[ါ-ှႏꩻ][ꩻ]*){0,}|.)", r"\1 ", text)
        text = text.strip().split()
        merged_tokens = self.merge_with_dictionary(text)
        filtered_tokens = [token for token in merged_tokens if token not in self.stopwords]
        return ' '.join(filtered_tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
preprocessor = MyanmarTextPreprocessor('/content/drive/MyDrive/Datasets/dict-words.txt', '/content/drive/MyDrive/Datasets/sw.txt')


In [24]:
sentences = df['Sentence']
label = df['Label']
X_train, X_val, y_train, y_val = train_test_split(sentences, label, test_size=0.2, stratify=label, random_state=40)

In [25]:
X_train_clean = X_train.apply(preprocessor.preprocessing)
X_val_clean = X_val.apply(preprocessor.preprocessing)

In [26]:
overlap = set(X_train_clean).intersection(set(X_val_clean))
print(f"Number of overlapping samples: {len(overlap)}")
X_val_clean = [x for x in X_val_clean if x not in X_train_clean]

Number of overlapping samples: 470


In [27]:
map_label = {
    'Social': 0,
    'Entertainment': 1,
    'Product&Service': 2,
    'Business': 3,
    'Sports': 4,
    'Science&Technology': 5,
    'Education': 6,
    'Culture&History': 7,
    'Health': 8,
    'Environmental': 9,
    'Political': 10,
    'Gambling': 11,
    'Adult Content': 12,
}
y_train_encoded = y_train.map(map_label)
y_val_encoded = y_val.map(map_label)

In [28]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, heads, neurons, dropout_rate=0.5,**kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=heads, key_dim=embed_dim)
        self.ffn = Sequential([
            layers.Dense(neurons, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, mask=None, training=False):
        # Multi-head self-attention with mask
        attn_output = self.att(inputs, inputs, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Token + Position Embedding
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim,**kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [29]:
model = load_model('/content/drive/MyDrive/Datasets/my_transformer_model_1.h5', custom_objects={
    'TransformerEncoder': TransformerEncoder,
    'TokenAndPositionEmbedding': TokenAndPositionEmbedding,
})
with open('/content/drive/MyDrive/Datasets/tokenizer_1.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

X_train_seq = tokenizer.texts_to_sequences(X_train_clean)
X_val_seq = tokenizer.texts_to_sequences(X_val_clean)

MAX_LEN = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post', truncating='post')

vocab_size = len(tokenizer.word_index) + 1



In [30]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.00003), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [31]:
early_stopping = EarlyStopping(monitor = "val_loss",min_delta = 0.0001,patience = 4,verbose = 1)
history = model.fit(X_train_pad,y_train_encoded,
                    validation_data=(X_val_pad,y_val_encoded),
                    epochs=25,
                    batch_size=32,
                    callbacks=[early_stopping])

Epoch 1/25




[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m410s[0m 254ms/step - accuracy: 0.8369 - loss: 0.5697 - val_accuracy: 0.8803 - val_loss: 0.4068
Epoch 2/25
[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m428s[0m 245ms/step - accuracy: 0.8996 - loss: 0.3284 - val_accuracy: 0.8940 - val_loss: 0.3677
Epoch 3/25
[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 245ms/step - accuracy: 0.9292 - loss: 0.2332 - val_accuracy: 0.8987 - val_loss: 0.3650
Epoch 4/25
[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 245ms/step - accuracy: 0.9494 - loss: 0.1660 - val_accuracy: 0.9048 - val_loss: 0.3541
Epoch 5/25
[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 245ms/step - accuracy: 0.9638 - loss: 0.1238 - val_accuracy: 0.9023 - val_loss: 0.3642
Epoch 6/25
[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[0m 247ms/step - accuracy: 0.9726 - loss: 0.0949 - val_accuracy: 0.9048 - val_loss: 0.3830
Epo

In [35]:
val_loss, val_acc = model.evaluate(X_val_pad, y_val_encoded, batch_size=32)
print(f"Validation Accuracy: {val_acc:.4f}")

[1m  3/405[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m17s[0m 44ms/step - accuracy: 0.8663 - loss: 0.5665



[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 51ms/step - accuracy: 0.8965 - loss: 0.4674
Validation Accuracy: 0.9014


In [36]:
from sklearn.metrics import classification_report
y_pred_prob = model.predict(X_val_pad)          # Predict probabilities
y_pred = y_pred_prob.argmax(axis=1)             # Convert to predicted class indices
y_true = y_val_encoded
print(classification_report(y_true, y_pred, digits=4))

[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 39ms/step
              precision    recall  f1-score   support

           0     0.8311    0.6150    0.7069      1000
           1     0.8420    0.9060    0.8728      1000
           2     0.8707    0.8620    0.8663      1000
           3     0.9333    0.8820    0.9069      1000
           4     0.9620    0.9360    0.9488      1000
           5     0.7910    0.8970    0.8407      1000
           6     0.9182    0.9650    0.9410      1000
           7     0.8721    0.9140    0.8926      1000
           8     0.9348    0.9170    0.9258      1000
           9     0.9432    0.9300    0.9366      1000
          10     0.9049    0.9520    0.9279      1000
          11     0.9707    0.9582    0.9644       932
          12     0.9537    0.9880    0.9705      1000

    accuracy                         0.9014     12932
   macro avg     0.9021    0.9017    0.9001     12932
weighted avg     0.9018    0.9014    0.8998     12932



In [37]:
model.save('finetune_transformer_ver1.2.keras')
with open('finetune_tokenizer_ver1.2.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [39]:
from google.colab import files
files.download('finetune_transformer_ver1.2.keras')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>