In [None]:
import os
import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import joblib

In [None]:
# Toy corpus for the notebook: each entry is (source passage, reference summary).
passage_summary_pairs = [
    (
        (
            "Machine learning is a field of artificial intelligence that "
            "focuses on building systems that learn from data. These systems "
            "can improve their performance on tasks over time without being "
            "explicitly programmed for every rule."
        ),
        (
            "Machine learning is AI where systems learn from data and improve "
            "performance over time."
        ),
    ),
    (
        (
            "Python is a high-level, interpreted programming language known "
            "for its readability and large ecosystem of libraries. It is "
            "widely used in web development, data science, automation, and "
            "machine learning."
        ),
        (
            "Python is a readable, high-level language used in web, data "
            "science, automation, and ML."
        ),
    ),
    (
        (
            "Supervised learning uses labeled data to train models, meaning "
            "each input comes with the correct output. The model learns to "
            "map inputs to outputs so it can make predictions on new, "
            "unseen data."
        ),
        (
            "Supervised learning trains models on labeled data so they can "
            "predict outputs for new inputs."
        ),
    ),
    (
        (
            "Neural networks are computational models inspired by the human "
            "brain. They consist of layers of interconnected nodes that can "
            "learn complex patterns from data through training."
        ),
        (
            "Neural networks are layered models that learn complex patterns "
            "from data, inspired by the brain."
        ),
    ),
]

# Expand the pairs into the record layout the rest of the notebook reads.
data = [
    {"input_text": passage, "target_summary": summary}
    for passage, summary in passage_summary_pairs
]

# One row per example, with columns "input_text" and "target_summary".
df = pd.DataFrame(data)
df.head()


In [None]:
# Tokenizer settings shared by the source and target vocabularies.
num_words = 5000
oov_token = "<OOV>"

# Raw strings pulled from the two DataFrame columns.
input_texts = df["input_text"].tolist()
target_texts = df["target_summary"].tolist()


def _fitted_tokenizer(texts):
    """Create a Tokenizer with the shared settings and fit it on `texts`."""
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(texts)
    return tokenizer


# Passages and summaries each get their own tokenizer.
input_tokenizer = _fitted_tokenizer(input_texts)
target_tokenizer = _fitted_tokenizer(target_texts)

# Convert every text into its list of integer token ids.
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

# Longest sequence on each side; used as the padding length later on.
max_input_len = max(map(len, input_sequences))
max_target_len = max(map(len, target_sequences))

max_input_len, max_target_len

In [None]:
# Pad (and, if a sequence were ever longer than maxlen, truncate) at the end
# of each sequence so every row has the same length.
post_options = {"padding": "post", "truncating": "post"}

X = pad_sequences(input_sequences, maxlen=max_input_len, **post_options)
y = pad_sequences(target_sequences, maxlen=max_target_len, **post_options)

X.shape, y.shape


In [None]:
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# that weight initialisation and batch shuffling are repeatable across
# Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

embedding_dim = 64
latent_dim = 128
# Size of the embedding table and of the softmax output; taken from the
# tokenizer's num_words cap.
vocab_size = num_words

# Encoder-style classifier: token ids -> embeddings -> BiLSTM summary vector
# -> dense head -> distribution over the vocabulary.
inputs = keras.Input(shape=(max_input_len,))
# mask_zero=True makes downstream layers ignore the 0-valued padding positions.
x = layers.Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(latent_dim))(x)
x = layers.Dense(128, activation="relu")(x)
# A single softmax over the vocabulary per input. In this notebook the model is
# fitted on the first token of each target summary (a simplification), not on
# a full output sequence.
outputs = layers.Dense(vocab_size, activation="softmax")(x)

model = keras.Model(inputs, outputs)
# Integer class labels, hence the sparse variant of categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()


In [None]:
# Simplified classification target: the leading token id of every summary,
# falling back to 0 when a summary produced no tokens at all.
first_tokens = [seq[0] if seq else 0 for seq in target_sequences]
y_first_token = np.array(first_tokens)

(X.shape, y_first_token.shape)


In [None]:
# Training settings for this tiny dataset; verbose=1 shows per-epoch progress.
training_options = {"epochs": 50, "batch_size": 2, "verbose": 1}

# Fit the classifier on padded inputs vs. the first-token labels and keep the
# returned History object for later inspection.
history = model.fit(x=X, y=y_first_token, **training_options)


In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import string

# Fetch the sentence-tokenizer data used by sent_tokenize.
# Newer NLTK releases load the "punkt_tab" resource, while older ones load the
# original "punkt" models, so request both; a package the installed NLTK does
# not know about is reported by the downloader rather than raised.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

def simple_extractive_summary(text, max_sentences=2):
    """
    Very simple lead-sentence extractive summarizer.

    Parameters
    ----------
    text : str
        The passage to summarize. Leading/trailing whitespace is stripped.
    max_sentences : int, default 2
        Maximum number of sentences to keep. Zero or a negative value yields
        an empty summary.

    Returns
    -------
    str
        - "" when the stripped text is empty or `max_sentences` is not positive;
        - the stripped text unchanged when it has at most `max_sentences`
          sentences;
        - otherwise the first `max_sentences` sentences joined by single spaces.
    """
    text = text.strip()
    # Guard the limit before slicing: a negative value in `sentences[:n]`
    # would drop sentences from the end instead of capping the summary.
    if not text or max_sentences <= 0:
        return ""

    sentences = sent_tokenize(text)

    # Short input: nothing to cut, so return the stripped text as-is.
    if len(sentences) <= max_sentences:
        return text

    return " ".join(sentences[:max_sentences])

# Smoke check: summarize the first passage in the DataFrame and show both texts.
sample_text = df["input_text"].iat[0]
print("Original:", sample_text, sep="\n")
print("\nSummary:", simple_extractive_summary(sample_text, max_sentences=2), sep="\n")
