In [None]:
# Updated imports for latest versions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc

# TensorFlow/Keras for LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Transformers for BERT
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

In [None]:
# Read the combined comment dataset from CSV into a DataFrame (no cleaning happens in this cell)
DATASET_PATH = '/content/combined_dataset.csv'
df = pd.read_csv(DATASET_PATH, encoding='utf8')


In [None]:
df

Unnamed: 0,id,comment,label
0,1,The weather is nice today.,0
1,2,I need to buy some groceries.,0
2,3,What time does the store open?,0
3,4,She is reading a book.,0
4,5,The train arrives at 5 PM.,0
...,...,...,...
11340,6341,ෆට්ට ඒත් ඉතින් ඔහේ ඔක්සිජන් ඔනී නැතී සෙට් එකක්...,0
11341,6342,ෆයිනලි,0
11342,6343,ෆුකෙන් හිනා මේවට...,1
11343,6344,"ෆෝන් එක චාර්ජ් කරගනින්,29%. පට්ට.....",0


In [None]:
# Text cleaning functions
def clean_text(text):
    """Strip ASCII punctuation and numeric characters from a string.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with every character in ``string.punctuation`` deleted,
        every character for which ``str.isnumeric()`` is true deleted, and
        leading/trailing whitespace stripped.
    """
    # Imported here because the notebook's import cell never imports `string`;
    # without this the function raised NameError on its first call.
    import string

    # Remove punctuation (deleted outright, not replaced by a space)
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    without_numbers = ''.join(c for c in without_punct if not c.isnumeric())
    return without_numbers.strip()

In [None]:
df

Unnamed: 0,id,comment,label
0,1,The weather is nice today.,0
1,2,I need to buy some groceries.,0
2,3,What time does the store open?,0
3,4,She is reading a book.,0
4,5,The train arrives at 5 PM.,0
...,...,...,...
11340,6341,ෆට්ට ඒත් ඉතින් ඔහේ ඔක්සිජන් ඔනී නැතී සෙට් එකක්...,0
11341,6342,ෆයිනලි,0
11342,6343,ෆුකෙන් හිනා මේවට...,1
11343,6344,"ෆෝන් එක චාර්ජ් කරගනින්,29%. පට්ට.....",0


In [None]:
df.columns


Index(['id', 'comment', 'label'], dtype='object')

In [None]:
df.head()



Unnamed: 0,id,comment,label
0,1,The weather is nice today.,0
1,2,I need to buy some groceries.,0
2,3,What time does the store open?,0
3,4,She is reading a book.,0
4,5,The train arrives at 5 PM.,0


In [None]:
# Store the character count of each raw comment in a new 'length' column
df['length'] = df['comment'].map(len)
df.head()

Unnamed: 0,id,comment,label,length
0,1,The weather is nice today.,0,26
1,2,I need to buy some groceries.,0,29
2,3,What time does the store open?,0,30
3,4,She is reading a book.,0,22
4,5,The train arrives at 5 PM.,0,26


In [None]:
# Characters that removePunctuation() replaces with a single space.
exclude = set(",.:;'\"-?!/´`%")


def removePunctuation(txt):
    """Return ``txt`` with each character found in ``exclude`` swapped for a space.

    Characters are replaced one-for-one, so the result has the same length
    as the input and may contain leading, trailing, or repeated spaces.
    """
    pieces = []
    for ch in txt:
        pieces.append(" " if ch in exclude else ch)
    return ''.join(pieces)

def removeNumbers(txt):
    """Return ``txt`` without any character for which ``str.isnumeric()`` is true.

    Numeric characters are deleted (not replaced), so surrounding spaces are
    left as they were.
    """
    kept = filter(lambda ch: not ch.isnumeric(), txt)
    return ''.join(kept)


In [None]:
# Build the 'cleaned' column from the raw comments; the 'comment' column itself is left unchanged
df['cleaned'] = df['comment'].apply(removePunctuation)
df.head()

Unnamed: 0,id,comment,label,length,cleaned
0,1,The weather is nice today.,0,26,The weather is nice today
1,2,I need to buy some groceries.,0,29,I need to buy some groceries
2,3,What time does the store open?,0,30,What time does the store open
3,4,She is reading a book.,0,22,She is reading a book
4,5,The train arrives at 5 PM.,0,26,The train arrives at 5 PM


In [None]:
df


Unnamed: 0,id,comment,label,length,cleaned
0,1,The weather is nice today.,0,26,The weather is nice today
1,2,I need to buy some groceries.,0,29,I need to buy some groceries
2,3,What time does the store open?,0,30,What time does the store open
3,4,She is reading a book.,0,22,She is reading a book
4,5,The train arrives at 5 PM.,0,26,The train arrives at 5 PM
...,...,...,...,...,...
11340,6341,ෆට්ට ඒත් ඉතින් ඔහේ ඔක්සිජන් ඔනී නැතී සෙට් එකක්...,0,57,ෆට්ට ඒත් ඉතින් ඔහේ ඔක්සිජන් ඔනී නැතී සෙට් එකක්...
11341,6342,ෆයිනලි,0,6,ෆයිනලි
11342,6343,ෆුකෙන් හිනා මේවට...,1,19,ෆුකෙන් හිනා මේවට
11343,6344,"ෆෝන් එක චාර්ජ් කරගනින්,29%. පට්ට.....",0,37,ෆෝන් එක චාර්ජ් කරගනින් 29 පට්ට


In [None]:
# Second cleaning pass: drop numeric characters from the already punctuation-stripped text
df['cleaned'] = df['cleaned'].apply(removeNumbers)
df.head()

Unnamed: 0,id,comment,label,length,cleaned
0,1,The weather is nice today.,0,26,The weather is nice today
1,2,I need to buy some groceries.,0,29,I need to buy some groceries
2,3,What time does the store open?,0,30,What time does the store open
3,4,She is reading a book.,0,22,She is reading a book
4,5,The train arrives at 5 PM.,0,26,The train arrives at PM


In [None]:
from collections import Counter

# NOTE(review): the sample rows shown above look like Sinhala script; confirm that a
# Devanagari font family is really what later plots need.
plt.rc('font', family='Lohit Devanagari')

# Tally every whitespace-separated token across all cleaned comments (case-sensitive)
results = Counter()
for tokens in df.cleaned.str.split():
    results.update(tokens)

most = results.most_common()
print(most[:10])

[('I', 1997), ('you', 1500), ('a', 1126), ('the', 1125), ('to', 1000), ('You', 999), ('මේ', 933), ('The', 875), ('will', 752), ('is', 750)]


In [None]:
# Flatten the tokens of every cleaned comment whose label is 1 into a single list
label_one_tokens = df.loc[df['label'] == 1, 'cleaned'].str.split()
words = [token for tokens in label_one_tokens for token in tokens]

# Twenty most frequent tokens within that subset
counter = Counter(words)
most = counter.most_common(20)

most

[('you', 1500),
 ('I', 1247),
 ('You', 999),
 ('will', 752),
 ('and', 629),
 ('a', 626),
 ('are', 625),
 ('මේ', 586),
 ('තෝ', 465),
 ('වගේ', 441),
 ('එපා', 429),
 ('be', 377),
 ('make', 376),
 ('should', 375),
 ('the', 375),
 ('ලංකාවට', 332),
 ('නෑ', 275),
 ('තොට', 255),
 ('from', 251),
 ('People', 250)]

In [None]:
# Model inputs come from the cleaned text, targets from the label column
X, y = df['cleaned'].values, df['label'].values

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# 1. LSTM Model
# One vocabulary size shared by the tokenizer and the embedding layer, so the
# two can never drift apart (it was previously the literal 10000 in both places).
vocab_size = 10000

# Fit the tokenizer on the training texts only, then encode both splits with it.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad / truncate every sequence at the end to a fixed length of max_len token ids.
max_len = 128
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Two stacked bidirectional LSTMs over a learned embedding, ending in a single
# sigmoid unit for the binary label.
model_lstm = Sequential([
    # `input_length` is no longer passed: recent Keras releases deprecate it,
    # and the sequence length is taken from the padded input at fit time.
    Embedding(input_dim=vocab_size, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# restore_best_weights=True: without it the model keeps the weights of the LAST
# epoch run, i.e. `patience` epochs past the best val_loss. In the recorded run
# the best val_loss was at epoch 1, but the epoch-4 weights were what got saved.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history_lstm = model_lstm.fit(X_train_pad, y_train,
                             epochs=10,
                             batch_size=32,
                             validation_split=0.1,
                             callbacks=[early_stop])



Epoch 1/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 26ms/step - accuracy: 0.7869 - loss: 0.4044 - val_accuracy: 0.9295 - val_loss: 0.1651
Epoch 2/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.9528 - loss: 0.1324 - val_accuracy: 0.9141 - val_loss: 0.1858
Epoch 3/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.9840 - loss: 0.0528 - val_accuracy: 0.9273 - val_loss: 0.2035
Epoch 4/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9896 - loss: 0.0339 - val_accuracy: 0.9251 - val_loss: 0.2280


In [None]:
from tensorflow.keras.models import Sequential, load_model



# Save model in modern .keras format
# Writes the current state of `model_lstm` to 'lstm_model.keras' (relative path,
# so it lands in the kernel's working directory).
model_lstm.save('lstm_model.keras')  # Recommended over .h5


In [None]:
import pickle

# Persist the tokenizer that was fitted in the LSTM cell, i.e. the exact object
# used to encode the training data. It is deliberately not re-created and refit
# here: a second Tokenizer could silently diverge from the one the model was
# trained with if X_train or the tokenizer settings change between cell runs.
# NOTE: pickle files execute code when loaded, so only load tokenizer.pkl from a trusted source.
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Tokenizer saved successfully!")


Tokenizer saved successfully!


In [None]:
df

Unnamed: 0,id,comment,label,length,cleaned
0,1,The weather is nice today.,0,26,The weather is nice today
1,2,I need to buy some groceries.,0,29,I need to buy some groceries
2,3,What time does the store open?,0,30,What time does the store open
3,4,She is reading a book.,0,22,She is reading a book
4,5,The train arrives at 5 PM.,0,26,The train arrives at PM
...,...,...,...,...,...
11340,6341,ෆට්ට ඒත් ඉතින් ඔහේ ඔක්සිජන් ඔනී නැතී සෙට් එකක්...,0,57,ෆට්ට ඒත් ඉතින් ඔහේ ඔක්සිජන් ඔනී නැතී සෙට් එකක්...
11341,6342,ෆයිනලි,0,6,ෆයිනලි
11342,6343,ෆුකෙන් හිනා මේවට...,1,19,ෆුකෙන් හිනා මේවට
11343,6344,"ෆෝන් එක චාර්ජ් කරගනින්,29%. පට්ට.....",0,37,ෆෝන් එක චාර්ජ් කරගනින් පට්ට


In [None]:
# Writes `model_lstm` to "lstm_model.keras" again — the same file name the earlier
# save cell used, so this overwrites that file if the working directory is unchanged.
model_lstm.save("lstm_model.keras")


In [None]:
# Save the LSTM model in HDF5 format
# NOTE(review): the earlier save cell's comment recommends `.keras` over `.h5`;
# keep this extra copy only if some consumer specifically needs the HDF5 file.
model_lstm.save("lstm_model.h5")


