# In [None]:
# --- Cell 1: Imports ---
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Paths: the dataset CSV and the saved model both sit under the parent of the
# current working directory.
_PARENT_DIR = ".."
DATA_PATH = os.path.join(_PARENT_DIR, "data", "final_dataset.csv")
MODEL_PATH = os.path.join(_PARENT_DIR, "model", "my_model.keras")

# --- Cell 2: Load dataset ---
# Read the CSV and keep only rows usable for training. The rest of the
# pipeline reads just the "text" and "label" columns, so a row is discarded
# only when one of those two is missing; a NaN in any other column no longer
# throws away an otherwise valid example.
df = pd.read_csv(DATA_PATH)

required_columns = ["text", "label"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    # Fail here with the file name rather than later with a bare KeyError.
    raise KeyError(
        f"{DATA_PATH} is missing required column(s): {missing_columns}"
    )

df = df.dropna(subset=required_columns)
print("✅ Dataset loaded:", df.shape)
df.head()

# --- Cell 3: Tokenization ---
# The vocabulary is learned from the training rows only, so word frequencies
# of the held-out rows cannot influence the token ids the model is trained on.
# To make that possible the row indices are partitioned *before* the tokenizer
# is fitted. test_size and random_state are the same as before, so each row
# ends up in the same partition as when the arrays were split directly.
texts = df["text"].astype(str).to_numpy()  # str() guards against non-string cells
y = df["label"].values

train_idx, test_idx = train_test_split(
    np.arange(len(texts)), test_size=0.2, random_state=42
)

# num_words / maxlen must stay in sync with the Embedding layer of the model
# (5000 embedding rows, sequences of length 200).
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts[train_idx])

# Words unseen during fitting (including test-only words) map to "<OOV>".
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=200, padding='post', truncating='post')

# --- Cell 4: Train-test split ---
# Apply the partition computed above to the padded sequences and the labels.
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# --- Cell 5: Model architecture ---
# Binary text classifier: token ids -> 128-d embeddings -> one LSTM layer ->
# dense head with a single sigmoid output.
#
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which newer Keras releases deprecate.
# The model is still built at construction time, so summary() below can
# report output shapes and parameter counts.
#
# 5000 (embedding rows) and 200 (sequence length) must match the tokenizer's
# num_words and the maxlen used when padding the sequences.
model = Sequential([
    tf.keras.Input(shape=(200,)),
    Embedding(5000, 128),
    # NOTE: per the Keras LSTM docs, a non-zero recurrent_dropout rules out
    # the fused cuDNN kernel, so this layer trains on the slower generic path.
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Sigmoid output + binary cross-entropy: labels are expected to be 0/1.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# --- Cell 6: Train the model ---
# Validation metrics are computed on a slice carved out of the training data
# (Keras takes the last 10% of the arrays passed to fit) rather than on
# X_test / y_test. That keeps the test set unseen until the evaluation step,
# so the accuracy reported there is a genuine hold-out figure.
# `history.history` still contains the val_loss / val_accuracy curves.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=4,
    batch_size=64,
    verbose=1
)

# --- Cell 7: Evaluate the model ---
# Score the trained model on the held-out split. The model was compiled with
# one loss and one metric, so evaluate() yields exactly two numbers:
# the loss first, then the accuracy.
evaluation = model.evaluate(x=X_test, y=y_test)
loss, acc = evaluation
print(f"✅ Test Accuracy: {acc:.4f}")

# --- Cell 8: Save the model ---
# Create the target directory from MODEL_PATH itself so the directory and the
# file path cannot drift apart if MODEL_PATH is ever changed.
model_dir = os.path.dirname(MODEL_PATH) or "."
os.makedirs(model_dir, exist_ok=True)
model.save(MODEL_PATH)
print(f"✅ Model saved successfully at {MODEL_PATH}")

# The saved network only understands the integer ids produced by the fitted
# tokenizer, so the tokenizer is stored next to it. Without this file the
# model cannot be applied to new text. It can be restored from the JSON
# (e.g. with Keras' tokenizer_from_json).
TOKENIZER_PATH = os.path.join(model_dir, "tokenizer.json")
with open(TOKENIZER_PATH, "w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())
print(f"✅ Tokenizer saved successfully at {TOKENIZER_PATH}")
