In [None]:
# CELL 1 – Imports
import os
import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import joblib
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize needs Punkt sentence-tokenizer data. Older NLTK releases ship
# it as "punkt"; NLTK >= 3.8.2 looks for "punkt_tab" instead, so request both.
# A package the installed NLTK does not know about makes download() return
# False rather than raise, so asking for both is safe on either version.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)

print("All imports successful!")


In [None]:
# CELL 2 – Create training dataset for summarization
# Toy corpus: each entry is a (source passage, reference summary) pair.
_passage_summary_pairs = [
    (
        (
            "Machine learning is a field of artificial intelligence that "
            "focuses on building systems that learn from data. These systems "
            "can improve their performance on tasks over time without being "
            "explicitly programmed for every rule."
        ),
        (
            "Machine learning is AI where systems learn from data and improve "
            "performance over time."
        ),
    ),
    (
        (
            "Python is a high-level, interpreted programming language known "
            "for its readability and large ecosystem of libraries. It is "
            "widely used in web development, data science, automation, and "
            "machine learning."
        ),
        (
            "Python is a readable, high-level language used in web, data "
            "science, automation, and ML."
        ),
    ),
    (
        (
            "Supervised learning uses labeled data to train models, meaning "
            "each input comes with the correct output. The model learns to "
            "map inputs to outputs so it can make predictions on new, "
            "unseen data."
        ),
        (
            "Supervised learning trains models on labeled data so they can "
            "predict outputs for new inputs."
        ),
    ),
    (
        (
            "Neural networks are computational models inspired by the human "
            "brain. They consist of layers of interconnected nodes that can "
            "learn complex patterns from data through training."
        ),
        (
            "Neural networks are layered models that learn complex patterns "
            "from data, inspired by the brain."
        ),
    ),
    (
        (
            "Deep learning is a subset of machine learning using neural networks "
            "with multiple layers to learn representations of data. It has "
            "achieved remarkable success in image recognition, natural language "
            "processing, and other complex tasks."
        ),
        (
            "Deep learning uses multi-layer neural networks to learn data "
            "representations, succeeding in vision and NLP tasks."
        ),
    ),
]

# One record per pair; key order fixes the DataFrame column order.
data = [
    {"input_text": passage, "target_summary": summary}
    for passage, summary in _passage_summary_pairs
]

df = pd.DataFrame(data)
print(f"Created training dataset with {len(df)} samples")
df.head()


In [None]:
# CELL 3 – Prepare tokenizers
# Settings shared by the input-side and target-side tokenizers.
num_words = 5000
oov_token = "<OOV>"

input_texts = df["input_text"].to_list()
target_texts = df["target_summary"].to_list()


def _fitted_tokenizer(corpus):
    """Build a Tokenizer with the shared settings and fit it on `corpus`."""
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(corpus)
    return tokenizer


# Passages and summaries each get their own vocabulary.
input_tokenizer = _fitted_tokenizer(input_texts)
target_tokenizer = _fitted_tokenizer(target_texts)

input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

# Longest sequence on each side; used as the padding length later on.
max_input_len = max(map(len, input_sequences))
max_target_len = max(map(len, target_sequences))

print(f"Max input length: {max_input_len}")
print(f"Max target length: {max_target_len}")


In [None]:
# CELL 4 – Pad sequences
# Pad (and, if ever needed, truncate) at the end of each sequence so every row
# has the same length as the longest sequence on its side.
_post_pad = {"padding": "post", "truncating": "post"}

X = pad_sequences(input_sequences, maxlen=max_input_len, **_post_pad)
y = pad_sequences(target_sequences, maxlen=max_target_len, **_post_pad)

for _label, _array in (("X", X), ("y", y)):
    print(f"{_label} shape: {_array.shape}")


In [None]:
# CELL 5 – Build Keras embedding + LSTM model
embedding_dim = 64
latent_dim = 128
# Both tokenizers were created with num_words, so the same size is reused for
# the embedding's input range and for the width of the softmax output.
vocab_size = num_words

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# weight initialisation and batch shuffling repeat on Restart & Run All.
seed = 42
keras.utils.set_random_seed(seed)

# Token ids -> embeddings (id 0 is masked as padding) -> BiLSTM encoding ->
# dense layer -> probability distribution over vocab_size token ids.
inputs = keras.Input(shape=(max_input_len,))
x = layers.Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(latent_dim))(x)
x = layers.Dense(128, activation="relu")(x)
outputs = layers.Dense(vocab_size, activation="softmax")(x)

model = keras.Model(inputs, outputs)
# Sparse loss: the training targets are integer token ids, not one-hot rows.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

print("Model summary:")
model.summary()


In [None]:
# CELL 6 – Prepare targets (first token of each summary)
# One label per sample: the id of the summary's first token, or 0 when the
# tokenized summary is empty.
y_first_token = np.array([next(iter(token_ids), 0) for token_ids in target_sequences])
print(f"Target shape: {y_first_token.shape}")


In [None]:
# CELL 7 – Train Keras model
# Training hyper-parameters gathered in one place.
fit_options = {"epochs": 50, "batch_size": 2, "verbose": 1}

# Fit the model to predict each summary's first token from its padded passage.
history = model.fit(X, y_first_token, **fit_options)

print("\nModel training complete!")


In [None]:
# CELL 8 – Plot training history
import matplotlib.pyplot as plt

plot_path = "../data/summarizer_training.png"
# savefig does not create missing parent directories, so make sure the
# output folder exists before writing the figure.
os.makedirs(os.path.dirname(plot_path), exist_ok=True)

# Per-epoch training loss recorded by model.fit.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history.history["loss"], label="Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Keras Model Training Loss")
ax.legend()
fig.savefig(plot_path)
plt.show()

# Report the path that was actually written.
print(f"Training plot saved to {plot_path}")


In [None]:
# CELL 9 – Define extractive summarizer function
def simple_extractive_summary(text: str, max_sentences: int = 2) -> str:
    """
    Simple extractive summarizer: returns the first N sentences.

    Args:
        text: Text to summarize. ``None`` and whitespace-only input are
            treated as empty.
        max_sentences: Maximum number of leading sentences to keep. Values
            of zero or below produce an empty summary.

    Returns:
        The stripped input unchanged when it has at most ``max_sentences``
        sentences; otherwise the first ``max_sentences`` sentences joined by
        single spaces. An empty string when there is nothing to summarize.
    """
    text = (text or "").strip()
    # A non-positive limit must not reach the slice below: a negative stop
    # would drop sentences from the end instead of limiting the count.
    if not text or max_sentences <= 0:
        return ""

    sentences = sent_tokenize(text)

    # Short input: return it as-is so the original spacing is preserved.
    if len(sentences) <= max_sentences:
        return text

    return " ".join(sentences[:max_sentences])

# Smoke test: summarize the first passage of the training set.
test_text = df["input_text"].iat[0]
print("Original text:", test_text, "\n\nSummarized:", sep="\n")
print(simple_extractive_summary(test_text, max_sentences=2))


In [None]:
# CELL 10 – Save Keras model and tokenizers
models_dir = "../backend/models"
os.makedirs(models_dir, exist_ok=True)

# Destination paths for the model and the two tokenizers.
model_path, input_tok_path, target_tok_path = (
    os.path.join(models_dir, filename)
    for filename in (
        "summarizer_keras_model.h5",
        "summarizer_input_tokenizer.joblib",
        "summarizer_target_tokenizer.joblib",
    )
)

# Persist the model first, then each tokenizer next to it.
model.save(model_path)
for _tokenizer, _destination in (
    (input_tokenizer, input_tok_path),
    (target_tokenizer, target_tok_path),
):
    joblib.dump(_tokenizer, _destination)

print(f"Keras model saved to: {model_path}")
print(f"Input tokenizer saved to: {input_tok_path}")
print(f"Target tokenizer saved to: {target_tok_path}")
