In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import load_model

# Load dataset
df = pd.read_csv("combined_data.csv")


In [4]:
VOCAB_SIZE = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])

sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

labels = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [5]:
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=MAX_LEN),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)




Epoch 1/5
[1m1669/1669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 61ms/step - accuracy: 0.8136 - loss: 0.4241 - val_accuracy: 0.7938 - val_loss: 0.4111
Epoch 2/5
[1m1669/1669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 61ms/step - accuracy: 0.9188 - loss: 0.2473 - val_accuracy: 0.9676 - val_loss: 0.0995
Epoch 3/5
[1m1669/1669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 61ms/step - accuracy: 0.9673 - loss: 0.1066 - val_accuracy: 0.9726 - val_loss: 0.0896
Epoch 4/5
[1m1669/1669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 60ms/step - accuracy: 0.9765 - loss: 0.0805 - val_accuracy: 0.9770 - val_loss: 0.0820
Epoch 5/5
[1m1669/1669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 59ms/step - accuracy: 0.9848 - loss: 0.0531 - val_accuracy: 0.9752 - val_loss: 0.0805


In [7]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.9757 - loss: 0.0794
Test Accuracy: 0.98


In [9]:
joblib.dump(tokenizer, 'tokenizer.pkl')
model.save('rnn_model.h5')

