<a href="https://colab.research.google.com/github/EngRidhoNet/BertLite-Summarizer/blob/main/BertLin_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import tensorflow as tf
import pandas as pd
import re
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# 1. Load dataset
# The CSV is fetched over HTTP; only the "Text" (source) and "Question"
# (target) columns are kept for the rest of the pipeline.
url = "https://raw.githubusercontent.com/mxfaqih/bertlin/refs/heads/main/rawdata.csv"
df = pd.read_csv(url)
# .copy() detaches the selection from df so the column assignments that
# follow operate on an independent frame.
df_cleaned = df[["Text", "Question"]].copy()

# 2. Text normalisation
def normalize_text(text):
    """Return *text* as a trimmed, single-spaced string of printable characters.

    Args:
        text: The raw cell value. Missing values (``None``/``NaN``) are
            accepted; non-string scalars are converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Map every whitespace character (newline, tab, NBSP, ...) to a plain
    # space BEFORE dropping non-printables: str.isprintable() is False for all
    # of them, so filtering first would delete them and fuse adjacent words
    # ("a\nb" -> "ab").
    text = re.sub(r'\s', ' ', text)
    text = ''.join(c for c in text if c.isprintable())
    # Removing control characters can leave runs of spaces; collapse and trim.
    return re.sub(r' +', ' ', text).strip()

# Clean both columns element-wise; normalize_text returns "" for missing
# values, so no NaN reaches the tokenizer.
df_cleaned["Text"] = df_cleaned["Text"].apply(normalize_text)
df_cleaned["Question"] = df_cleaned["Question"].apply(normalize_text)

# 3. Initialise the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 4. Batch tokenisation
def tokenize_batch(texts, questions, max_input_length=512, max_target_length=50):
    """Tokenise source texts and target questions into fixed-length TF tensors.

    Uses the module-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that yields
            the source strings.
        questions: Object exposing ``.tolist()`` that yields the target
            strings, aligned with ``texts``.
        max_input_length: Length every source sequence is padded/truncated to.
        max_target_length: Length every target sequence is padded/truncated to.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"target_ids"``,
        or ``None`` if tokenisation raised (the error is printed). Callers
        check for ``None``.
    """
    try:
        inputs = tokenizer(
            texts.tolist(),
            padding="max_length",
            truncation=True,
            max_length=max_input_length,
            return_tensors="tf"
        )

        # Tokenise all questions in one batched call instead of encoding each
        # row separately and stacking the results; the padded ids are the same.
        targets = tokenizer(
            questions.tolist(),
            padding="max_length",
            truncation=True,
            max_length=max_target_length,
            return_tensors="tf"
        )

        return {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "target_ids": targets.input_ids
        }
    except Exception as e:
        # Deliberate notebook-level boundary: report and signal failure with
        # None rather than propagate.
        print(f"Error during tokenization: {e}")
        return None

# 5. Split dataset
# 80/20 train/validation split; random_state fixes the shuffle so the split
# is reproducible across runs.
train_texts, val_texts, train_questions, val_questions = train_test_split(
    df_cleaned["Text"], df_cleaned["Question"], test_size=0.2, random_state=42
)

# 6. Batch tokenisation
train_encodings = tokenize_batch(train_texts, train_questions)
val_encodings = tokenize_batch(val_texts, val_questions)

# tokenize_batch prints the error and returns None on failure; stop here
# instead of failing later with a less obvious error.
if train_encodings is None or val_encodings is None:
    raise ValueError("Tokenization failed. Check input data.")

# 7. Build the tf.data.Dataset
def create_tf_dataset(encodings, batch_size=8):
    """Wrap tokenised encodings in a shuffled, batched, prefetched dataset.

    Each element is ``(features, labels)``. ``features["target_ids"]`` is the
    target sequence without its last token (decoder input) and ``labels`` is
    the same sequence without its first token (next-token targets). The raw
    ids are used as labels; no -100 ignore value is substituted.

    Args:
        encodings: Mapping with ``"input_ids"``, ``"attention_mask"`` and
            ``"target_ids"`` entries.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` shuffled with a buffer of 1000.
    """
    target_ids = encodings["target_ids"]
    features = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "target_ids": target_ids[:, :-1],
    }
    labels = target_ids[:, 1:]

    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)

# Build the input pipelines for both splits (default batch_size of 8).
train_dataset = create_tf_dataset(train_encodings)
val_dataset = create_tf_dataset(val_encodings)

In [None]:
import tensorflow as tf
import pandas as pd
import re
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# 1. Load dataset
# The CSV is fetched over HTTP; only the "Text" (source) and "Question"
# (target) columns are kept for the rest of the pipeline.
url = "https://raw.githubusercontent.com/mxfaqih/bertlin/refs/heads/main/rawdata.csv"
df = pd.read_csv(url)
# .copy() detaches the selection from df so the column assignments that
# follow operate on an independent frame.
df_cleaned = df[["Text", "Question"]].copy()

# 2. Text normalisation
def normalize_text(text):
    """Return *text* as a trimmed, single-spaced string of printable characters.

    Args:
        text: The raw cell value. Missing values (``None``/``NaN``) are
            accepted; non-string scalars are converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Map every whitespace character (newline, tab, NBSP, ...) to a plain
    # space BEFORE dropping non-printables: str.isprintable() is False for all
    # of them, so filtering first would delete them and fuse adjacent words
    # ("a\nb" -> "ab").
    text = re.sub(r'\s', ' ', text)
    text = ''.join(c for c in text if c.isprintable())
    # Removing control characters can leave runs of spaces; collapse and trim.
    return re.sub(r' +', ' ', text).strip()

# Clean both columns element-wise; normalize_text returns "" for missing
# values, so no NaN reaches the tokenizer.
df_cleaned["Text"] = df_cleaned["Text"].apply(normalize_text)
df_cleaned["Question"] = df_cleaned["Question"].apply(normalize_text)

# 3. Initialise the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 4. Batch tokenisation
def tokenize_batch(texts, questions, max_input_length=512, max_target_length=50):
    """Tokenise source texts and target questions into fixed-length TF tensors.

    Uses the module-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that yields
            the source strings.
        questions: Object exposing ``.tolist()`` that yields the target
            strings, aligned with ``texts``.
        max_input_length: Length every source sequence is padded/truncated to.
        max_target_length: Length every target sequence is padded/truncated to.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"target_ids"``,
        or ``None`` if tokenisation raised (the error is printed). Callers
        check for ``None``.
    """
    try:
        inputs = tokenizer(
            texts.tolist(),
            padding="max_length",
            truncation=True,
            max_length=max_input_length,
            return_tensors="tf"
        )

        # Tokenise all questions in one batched call instead of encoding each
        # row separately and stacking the results; the padded ids are the same.
        targets = tokenizer(
            questions.tolist(),
            padding="max_length",
            truncation=True,
            max_length=max_target_length,
            return_tensors="tf"
        )

        return {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "target_ids": targets.input_ids
        }
    except Exception as e:
        # Deliberate notebook-level boundary: report and signal failure with
        # None rather than propagate.
        print(f"Error during tokenization: {e}")
        return None

# 5. Split dataset
# 80/20 train/validation split; random_state fixes the shuffle so the split
# is reproducible across runs.
train_texts, val_texts, train_questions, val_questions = train_test_split(
    df_cleaned["Text"], df_cleaned["Question"], test_size=0.2, random_state=42
)

# 6. Batch tokenisation
train_encodings = tokenize_batch(train_texts, train_questions)
val_encodings = tokenize_batch(val_texts, val_questions)

# tokenize_batch prints the error and returns None on failure; stop here
# instead of failing later with a less obvious error.
if train_encodings is None or val_encodings is None:
    raise ValueError("Tokenization failed. Check input data.")

# 7. Build the tf.data.Dataset
def create_tf_dataset(encodings, batch_size=8):
    """Build the next-token-prediction input pipeline from tokenised encodings.

    Each element is ``(features, labels)``: the decoder input
    (``features["target_ids"]``) is the target sequence minus its final token,
    and ``labels`` is the target sequence minus its first token.

    Args:
        encodings: Mapping with ``"input_ids"``, ``"attention_mask"`` and
            ``"target_ids"`` entries.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` that is shuffled (buffer 1000), batched and
        prefetched.
    """
    full_targets = encodings["target_ids"]

    examples = tf.data.Dataset.from_tensor_slices((
        {
            "input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
            "target_ids": full_targets[:, :-1],  # everything but the last token
        },
        full_targets[:, 1:],  # same sequence shifted left by one
    ))

    shuffled = examples.shuffle(1000)
    batched = shuffled.batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)

# Build the input pipelines for both splits (default batch_size of 8).
train_dataset = create_tf_dataset(train_encodings)
val_dataset = create_tf_dataset(val_encodings)

# Define model architecture
from transformers import TFBertModel

# 1. Transformer decoder layer with masked self-attention
class TransformerDecoderLayer(tf.keras.layers.Layer):
    """One post-norm Transformer decoder block.

    Sub-layers, each followed by dropout, a residual connection and layer
    normalisation: self-attention over the decoder sequence, cross-attention
    over the encoder output, and a two-layer ReLU feed-forward network.

    Args:
        d_model: Width of the decoder representations; also the output width
            of the feed-forward network.
        num_heads: Number of attention heads; each head uses
            ``d_model // num_heads`` as its key dimension.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.mha1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
        self.mha2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask=None, enc_padding_mask=None):
        """Run the block.

        Args:
            x: Decoder representations (assumed (batch, target_len, d_model);
                confirm against the caller).
            enc_output: Encoder representations used as keys and values for
                cross-attention.
            training: Whether dropout is active.
            look_ahead_mask: Optional mask for self-attention, passed to Keras
                ``MultiHeadAttention`` as ``attention_mask``. Per the Keras
                documentation, 1/True marks positions that MAY be attended
                and 0/False marks blocked positions.
            enc_padding_mask: Optional mask for cross-attention with the same
                convention, e.g. to hide padded encoder positions. ``None``
                (the default) attends to every encoder position.

        Returns:
            The transformed decoder representations, same shape as ``x``.
        """
        # Self-attention over the decoder sequence. The attention weights were
        # never used, so they are no longer requested.
        attn1 = self.mha1(query=x, key=x, value=x, attention_mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(x + attn1)

        # Cross-attention: decoder queries against encoder keys/values.
        attn2 = self.mha2(
            query=out1, key=enc_output, value=enc_output,
            attention_mask=enc_padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)

        # Position-wise feed-forward network.
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3

# 2. Revised Seq2Seq model
class Seq2SeqModel(tf.keras.Model):
    """BERT encoder with a Transformer decoder for text-to-question generation.

    The encoder is a pretrained ``bert-base-uncased``; the decoder is a stack
    of ``TransformerDecoderLayer`` blocks over a learned token embedding plus
    fixed sinusoidal positions, followed by a dense projection to the
    vocabulary.

    Args:
        vocab_size: Size of the decoder embedding table and output projection.
        d_model: Decoder width. The encoder output is used directly as
            cross-attention keys/values.
        num_heads: Attention heads per decoder layer.
        dff: Feed-forward hidden width per decoder layer.
        num_layers: Number of decoder layers.
    """

    def __init__(self, vocab_size, d_model=768, num_heads=12, dff=2048, num_layers=4):
        super(Seq2SeqModel, self).__init__()

        # BERT encoder
        self.encoder = TFBertModel.from_pretrained(
            "bert-base-uncased",
            output_hidden_states=False,
            return_dict=True,
            output_attentions=False
        )

        # Fine-tune the encoder but freeze its pooler: call() reads only
        # last_hidden_state, so the pooler output is never used. Per the
        # original author's note, freezing it avoids a training warning.
        self.encoder.trainable = True
        if hasattr(self.encoder, 'bert') and hasattr(self.encoder.bert, 'pooler'):
            self.encoder.bert.pooler.trainable = False

        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        # Precomputed for positions 0..999; call() slices the first seq_len.
        self.pos_encoding = self.positional_encoding(max_position=1000, d_model=d_model)

        # Decoder layers
        self.decoder_layers = [TransformerDecoderLayer(d_model, num_heads, dff) for _ in range(num_layers)]

        # Final projection to vocabulary
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def positional_encoding(self, max_position, d_model):
        """Return sinusoidal position encodings of shape (1, max_position, d_model).

        The sines of the even-indexed angle columns are concatenated with the
        cosines of the odd-indexed columns (halves are concatenated, not
        interleaved).
        """
        angle_rads = self.get_angles(
            tf.range(max_position, dtype=tf.float32)[:, tf.newaxis],
            tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
            d_model
        )

        # Apply sin to even indices in the array; 2i
        sines = tf.math.sin(angle_rads[:, 0::2])

        # Apply cos to odd indices in the array; 2i+1
        cosines = tf.math.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]

        return tf.cast(pos_encoding, tf.float32)

    def get_angles(self, pos, i, d_model):
        """Return ``pos / 10000 ** (2 * (i // 2) / d_model)`` (broadcast over pos and i)."""
        angle_rates = 1 / tf.pow(10000.0, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        return pos * angle_rates

    def create_padding_mask(self, seq):
        """Return a float mask that is 1.0 where ``seq`` equals 0 (padding).

        The result has two singleton axes inserted before the last axis.
        NOTE(review): not used by call(). Its polarity (1 = padding) is the
        opposite of what Keras ``MultiHeadAttention`` expects for
        ``attention_mask`` (1 = attend), so invert it before passing it there.
        """
        seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return seq[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, size):
        """Return a causal attention mask of shape (1, 1, size, size).

        Entry [q, k] is 1 when query position q may attend to key position k
        (k <= q) and 0 for future positions. This matches the
        ``attention_mask`` convention of Keras ``MultiHeadAttention`` (1 =
        attend, 0 = blocked). The previous ``1 - band_part(...)`` form had the
        opposite polarity, so each position could see only FUTURE tokens.
        Weights trained with that mask should be retrained.
        """
        mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)  # lower triangle incl. diagonal
        return mask[tf.newaxis, tf.newaxis, :, :]  # Add batch and head dims

    def call(self, inputs, training=False):
        """Compute next-token logits for the decoder sequence.

        Args:
            inputs: Mapping with ``"input_ids"`` and ``"attention_mask"`` for
                the encoder and ``"target_ids"`` for the decoder input.
            training: Forwarded to the encoder and to each decoder layer.

        Returns:
            Logits with one vocabulary distribution per decoder position.
        """
        # Unpack inputs
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        target_ids = inputs["target_ids"]

        # Get BERT encoder output
        enc_output = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            training=training
        ).last_hidden_state

        # Process decoder inputs
        seq_len = tf.shape(target_ids)[1]

        # Convert decoder input tokens to embeddings, scale by sqrt(d_model)
        # and add the positional encoding.
        dec_embedding = self.embedding(target_ids)
        dec_embedding *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        dec_embedding += self.pos_encoding[:, :seq_len, :]

        # Create causal mask for decoder self-attention
        look_ahead_mask = self.create_look_ahead_mask(seq_len)

        # Pass through decoder layers
        dec_output = dec_embedding
        for layer in self.decoder_layers:
            dec_output = layer(
                dec_output, enc_output,
                training=training, look_ahead_mask=look_ahead_mask)

        # Final projection to vocabulary
        logits = self.final_layer(dec_output)

        return logits

# Custom loss function
def custom_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over non-padding target tokens.

    Args:
        y_true: Integer target ids; id 0 is treated as padding and ignored.
        y_pred: Unnormalised logits over the vocabulary for each position.

    Returns:
        Scalar loss: summed per-token loss divided by the number of
        non-padding tokens, or 0 when the batch has no non-padding token.
    """
    # Create mask for non-padding tokens
    mask = tf.cast(tf.math.not_equal(y_true, 0), tf.float32)

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_object(y_true, y_pred)

    # Apply mask
    loss *= mask

    # divide_no_nan returns 0 instead of NaN when the mask is all zeros, so
    # one all-padding batch cannot poison the weights with NaN gradients.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

# Custom accuracy metric
def custom_accuracy(y_true, y_pred):
    """Token-level accuracy over non-padding target positions.

    Args:
        y_true: Integer target ids; id 0 is treated as padding and ignored.
        y_pred: Logits over the vocabulary for each position.

    Returns:
        Scalar fraction of non-padding positions whose arg-max prediction
        equals the target, or 0 when the batch has no non-padding token.
    """
    # argmax is requested as int64 below, so compare in int64.
    y_true = tf.cast(y_true, tf.int64)

    # Create mask for non-padding tokens
    mask = tf.cast(tf.math.not_equal(y_true, 0), tf.float32)

    # Get predicted tokens
    pred_tokens = tf.argmax(y_pred, axis=-1, output_type=tf.int64)

    # Compare predictions with ground truth, counting only non-padding tokens
    correct_predictions = tf.cast(tf.equal(pred_tokens, y_true), tf.float32)
    correct_predictions *= mask

    # divide_no_nan returns 0 instead of NaN when the mask is all zeros.
    return tf.math.divide_no_nan(
        tf.reduce_sum(correct_predictions), tf.reduce_sum(mask))

# Initialize model
# The decoder embedding and output projection are sized from the tokenizer's
# vocabulary.
vocab_size = tokenizer.vocab_size
model = Seq2SeqModel(vocab_size)

# Try a forward pass on a small batch to verify
# [0] selects the feature dict from the (features, labels) element.
sample_inputs = next(iter(train_dataset.take(1)))[0]
sample_outputs = model(sample_inputs, training=False)
print(f"Sample output shape: {sample_outputs.shape}")

# Compile model with custom loss and metrics
# Both the loss and the accuracy ignore padding positions (target id 0).
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    optimizer=optimizer,
    loss=custom_loss,
    metrics=[custom_accuracy]
)

# Create checkpoint callback
# Writes weights only, and only when val_loss improves on the best seen so far.
checkpoint_path = "./bertlin_seq2seq_tf/checkpoints"
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True
)

# Define early stopping
# Stops after 2 epochs without val_loss improvement and restores the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

# Train model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    callbacks=[checkpoint, early_stopping]
)

# Save model
# Weights only; the Seq2SeqModel class is needed to rebuild the model before loading.
model.save_weights('./bertlin_seq2seq_tf/final_weights')

# Function to generate questions from text
def generate_question(text, max_length=50):
    """Greedily decode a question for *text* with the module-level model.

    Decoding starts from the tokenizer's [CLS] id and appends the arg-max
    token at each step. It stops when the [SEP] id is predicted or after
    ``max_length - 1`` steps.

    Args:
        text: Source passage; tokenised and padded/truncated to 512 tokens.
        max_length: Upper bound on the decoder sequence length, counting the
            start token.

    Returns:
        The decoded question string with special tokens removed.
    """
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="tf"
    )
    source_ids = encoded.input_ids
    source_mask = encoded.attention_mask

    # Decoder sequence so far, shape (1, current_length); starts as [[CLS]].
    generated = tf.expand_dims([tokenizer.cls_token_id], 0)
    stop_id = tokenizer.sep_token_id

    for _ in range(max_length - 1):
        logits = model(
            {
                "input_ids": source_ids,
                "attention_mask": source_mask,
                "target_ids": generated
            },
            training=False
        )

        # Only the distribution at the last position predicts the next token.
        best = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)

        if best == stop_id:
            break

        generated = tf.concat([generated, best[tf.newaxis, :]], axis=1)

    return tokenizer.decode(generated[0].numpy(), skip_special_tokens=True)

# Test with a sample from the dataset
# Uses row 0 of the cleaned frame (before the train/validation split), so the
# row may have been seen during training.
sample_text = df_cleaned["Text"].iloc[0]
print("Sample text:", sample_text)
print("Original question:", df_cleaned["Question"].iloc[0])
print("Generated question:", generate_question(sample_text))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.

Sample output shape: (8, 49, 30522)
Epoch 1/3
Epoch 2/3
Epoch 3/3
Sample text: TMAU.. Please find a way to research TMAU And how to cure it with gene therapy. I'm tired kg smelling like fish/feces And getting talked about everyday of my life.
Original question: What is the latest research on TMAU and treatments for it?
Generated question: what are the treatments for for for for for for?


In [None]:
from rouge_score import rouge_scorer
from nltk.stem import PorterStemmer
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

# Download the NLTK data needed for METEOR
# NOTE(review): presumably 'punkt' backs word_tokenize and 'wordnet' backs
# meteor_score's synonym matching; newer NLTK releases may also require
# 'punkt_tab' (confirm against the installed version).
nltk.download('punkt')
nltk.download('wordnet')

# 1. Preprocessing helper for evaluation
def preprocess_text(text, use_stemming=True):
    """Lower-case and tokenise *text*, optionally Porter-stemming each token.

    Args:
        text: String to process. Empty or non-string values yield ``[]``.
        use_stemming: When true, every token is replaced by its Porter stem.

    Returns:
        A list of token strings.
    """
    if not (text and isinstance(text, str)):
        return []

    stemmer = PorterStemmer()
    words = word_tokenize(text.lower())
    return [stemmer.stem(word) for word in words] if use_stemming else words

# 2. ROUGE and METEOR evaluation
def evaluate_model(model, dataset, tokenizer, num_samples=100):
    """Score teacher-forced predictions against reference questions.

    For each dataset element the model is run on the element's own decoder
    input, the arg-max token is taken at every position, and the decoded text
    is compared with the decoded labels using ROUGE-1/2/L F-measure and
    METEOR. Both texts are lower-cased, tokenised and stemmed by
    ``preprocess_text`` first.

    Args:
        model: Callable taking a dict with ``"input_ids"``,
            ``"attention_mask"`` and ``"target_ids"`` and returning logits.
        dataset: Iterable ``tf.data.Dataset`` of ``(inputs, labels)`` where
            ``inputs`` holds the three keys above.
        tokenizer: Tokenizer providing ``batch_decode`` and ``pad_token_id``.
        num_samples: Number of dataset ELEMENTS consumed via
            ``dataset.take``; for a batched dataset each element is a batch.

    Returns:
        Dict with mean ``"ROUGE-1"``, ``"ROUGE-2"``, ``"ROUGE-L"`` and
        ``"METEOR"`` scores (0.0 for a metric with no scored pair). Pairs with
        an empty prediction or reference are skipped, not scored as 0.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
    rouge1_scores, rouge2_scores, rougeL_scores, meteor_scores = [], [], [], []
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    for i, (inputs, true_labels) in enumerate(dataset.take(num_samples)):
        try:
            # Feed the decoder input the dataset was built with. The labels
            # are that input shifted by one, so slicing the labels here (as
            # before) misaligned every prediction with its reference token and
            # dropped one position.
            predictions = model(
                {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                    "target_ids": inputs["target_ids"],
                },
                training=False,
            )
            predicted_ids = tf.argmax(predictions, axis=-1)

            # Positions whose label is padding are excluded from the training
            # loss, so the model's output there is arbitrary. Replace it with
            # the pad id so decoding drops it.
            predicted_ids = tf.where(
                tf.not_equal(true_labels, pad_id),
                predicted_ids,
                tf.fill(tf.shape(predicted_ids), tf.cast(pad_id, predicted_ids.dtype)),
            )

            # Decode
            pred_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
            true_texts = tokenizer.batch_decode(true_labels, skip_special_tokens=True)

            # Preprocess and score each (prediction, reference) pair
            for pred, ref in zip(pred_texts, true_texts):
                pred_tokens = preprocess_text(pred)
                ref_tokens = preprocess_text(ref)
                pred_text = ' '.join(pred_tokens)
                ref_text = ' '.join(ref_tokens)

                # ROUGE
                if pred_text and ref_text:
                    scores = scorer.score(ref_text, pred_text)
                    rouge1_scores.append(scores['rouge1'].fmeasure)
                    rouge2_scores.append(scores['rouge2'].fmeasure)
                    rougeL_scores.append(scores['rougeL'].fmeasure)

                # METEOR
                if pred_tokens and ref_tokens:
                    meteor_scores.append(meteor_score([ref_tokens], pred_tokens))
        except Exception as e:
            # Best-effort evaluation: report the failing element and continue.
            print(f"Error during evaluation of sample {i}: {e}")
            continue

    return {
        "ROUGE-1": np.mean(rouge1_scores) if rouge1_scores else 0.0,
        "ROUGE-2": np.mean(rouge2_scores) if rouge2_scores else 0.0,
        "ROUGE-L": np.mean(rougeL_scores) if rougeL_scores else 0.0,
        "METEOR": np.mean(meteor_scores) if meteor_scores else 0.0
    }

# 3. Run the evaluation
# Scores the validation split with the default num_samples of evaluate_model.
rouge_meteor_scores = evaluate_model(model, val_dataset, tokenizer)
# The header below reads: "Evaluation results (F1-score for ROUGE, score for METEOR):"
print("Hasil Evaluasi (F1-score untuk ROUGE, skor untuk METEOR):")
for metric, score in rouge_meteor_scores.items():
    print(f"{metric}: {score:.4f}")