# 1) Initial Set-Up


*   Installing packages
*   Importing packages
*   Database connection function (MongoDB)


In [2]:
# Installing necessary libraries
!pip install "pymongo[srv]"
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install -U sentencepiece
!pip install -U urllib3
!pip install py7zr
!pip install -U evaluate
!pip install rouge_score

Collecting pymongo[srv]
  Downloading pymongo-4.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (677 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m677.1/677.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.4.2 pymongo-4.6.1
Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled tr

In [3]:
# Importing libraries
import numpy as np
import pandas as pd
from pymongo import MongoClient
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
from transformers import create_optimizer, AdamWeightDecay
from huggingface_hub import notebook_login
from transformers import TFAutoModelForSeq2SeqLM
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, create_optimizer, AdamWeightDecay
from transformers import pipeline, TrainingArguments

In [4]:
# Preventing pandas from truncating dataframes while printing
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 1000)

In [5]:
# Database connection function
def get_database(database_name):
    # Assigning the connection string
    CONNECTION_STRING = "mongodb+srv://capstone-bangkit-experiment:rgnfUyMSMA4TYVVW@prognet.tnozzjt.mongodb.net/?retryWrites=true&w=majority"
    # Creating the connection
    client = MongoClient(CONNECTION_STRING)
    # Accessing a database and returning it
    return client[database_name]

# Connecting to the database
db_capstone = get_database("capstone-project")

In [None]:
# Login To Huggingface Hub
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 2) Initial data loading

In [None]:
# Loading the specified collection
raw_artikel_detik = db_capstone["article"]
df = pd.DataFrame(list(raw_artikel_detik.find()))

# 3) Data Exploration (Sandbox)

In [None]:
# Peeking dataset
df.head(1)

In [None]:
# Dataset anatomy
print("Anatomy : \n", df.info())

In [None]:
# Count how many duplicated data are there
duplicated = df[df.duplicated(subset="link_to_origin", keep=False)]
print(duplicated.shape)

(40, 10)


In [None]:
# Count how many unique categories out there
print(df['category'].nunique())

25


In [None]:
# Category count
category_count = df['category'].value_counts()
print(category_count)

In [None]:
# Inspecting data for each category
kecelakaan = df[df.category == 'Kecelakaan']
kecelakaan

In [None]:
# Location count
location_count = df['location'].value_counts()
print("Location unique values: ", df["location"].nunique())
print(location_count)

# 4) Data Pre-Processing & Feature Extraction

**4.1) Data Preparation & Splitting**


---



In [None]:
# Dropping Duplicates
df = df.drop_duplicates(subset=['link_to_origin'])
# Checking the result
duplicated = df[df.duplicated(subset="link_to_origin", keep=False)]
print(duplicated.shape)

(0, 11)


In [None]:
# Creating new columns based on the data needed
df['en_summary'] = df['summary'].apply(lambda x: x['en'])
df['en_content'] = df['content'].apply(lambda x: x['en'])
df['en_headline'] = df['headline'].apply(lambda x: x['en'])

In [None]:
# Creating a new dataframe with filtered column
selected_column = ['en_headline', 'en_summary', 'en_content']
df_filtered = df[selected_column]
print(df_filtered.shape)
df_filtered.head(1)

In [None]:
# Column rename
new_column_name = {'en_headline':'title', 'en_content':'text', 'en_summary':'summary'}
df_filtered.rename(columns=new_column_name, inplace=True)
df_filtered.head(1)

In [None]:
# Saving into database
collection_summary = db_capstone['article_summary']
collection_summary.insert_many(df_filtered.to_dict(orient='records'))

**⬇️ Pre-processing Checkpoint**

In [None]:
# Loading the dataset
article_summary = db_capstone["article_summary"]
df_filtered = pd.DataFrame(list(article_summary.find()))

In [None]:
# Removing the _id column due to incompatibility with huggingface dataset object.
selected_column = ['title', 'summary', 'text']
df_filtered = df_filtered[selected_column]
df_filtered.head(1)

Unnamed: 0,title,summary,text
0,"Initially only robbed, Gusti Mirah was killed because of the rebellion","Initially only robbed, Gusti Mirah was killed because of the rebellion","The trial in the murder case of Gusti Agung Mirah Lestari with defendants Nova Sandi Prasetia (31) and Rahman (28) at the Denpasar District Court has entered the witness examination stage, Wednesday (8/3/2023). From the testimony of the victim's brother and the police, it was concluded that Gusti Mirah was initially only going to be robbed before finally being killed because he rebelled. The legal team for the two defendants, Tyas Yuniawati Suroto, said that Nova Sandi Prasetya was the mastermind behind the robbery of Gusti Mirah, whom she had been dating for a month. Meanwhile, Rahman was Sandi's colleague who acted as the executor who robbed and killed Gusti Mirah. ""Initially, the executor and the victim's girlfriend only wanted to control the car, necklace and ring. However, because the victim resisted when he was robbed, they were killed,"" said Tyas to detikBali, Thursday (9/3/2023). Rahman, who initially only wanted to rob, had to face resistance from Gusti Mirah. Not wanting ..."


In [None]:
# Wrapping the dataset with dataset object from huggingface
dataset = Dataset.from_pandas(df_filtered)

In [None]:
# Dataset split
dataset = dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'summary', 'text'],
        num_rows: 1834
    })
    test: Dataset({
        features: ['title', 'summary', 'text'],
        num_rows: 459
    })
})

In [None]:
# Dataset sneakpeek
print("First training data:\n", dataset["train"][1])
print("Second training data:\n", dataset["test"][4])

First training data:
 {'title': 'British foreigner berates prosecutor after being sentenced to 2 years and 6 months in prison', 'summary': "British citizen (WN) Stephen Michael Jamnitzky cursed at the prosecutor when reading the verdict at the Denpasar District Court (PN), Tuesday (4/7/2023). The 39 year old foreigner cursed at the prosecutor, 'f**k you Prosecutor, f**k you.' Jamnitzky cursed after being sentenced to 2.6 years in prison.", 'text': 'British citizen (WN) Stephen Michael Jamnitzky cursed at the prosecutor when reading the verdict at the Denpasar District Court (PN), Tuesday (4/7/2023). The 39 year old foreigner cursed at the prosecutor, "f**k you Prosecutor, f**k you." Denpasar District Court spokesperson Gede Putra Astawa confirmed the incident. "According to the statement from the panel of judges, it was true (swearing) like that," he told detikBali on Tuesday evening (4/7/2023). Jamnitzky cursed after being sentenced to 2.6 years in prison. The trial with the agenda of

**4.2) Preprocessing**

Preprocessing procedure:
1. Prefixing the input with a prompt so T5 knows this is a summarization task.
2. Using the keyword "text_target" argument when tokenizing labels.
3. Truncating sequences to be no longer than the allowed maximum_length


---



In [None]:
# Model checkpoint
checkpoint = "t5-small"
# Loading the tokenizer for T5
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Preprocessing Function
prefix = "summarize: "
def preprocess_fn(data):
  # Adding the prefix
  inputs = [prefix + doc for doc in data["text"]]
  # Performing truncation & tokenizing
  model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
  # Encoding labels
  labels = tokenizer(text_target=data["summary"], max_length=128, truncation=True)
  # Corporating the encoded labels
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
# Applying preprocessing
tokenized_dataset = dataset.map(preprocess_fn, batched=True)

Map:   0%|          | 0/1834 [00:00<?, ? examples/s]

Map:   0%|          | 0/459 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'summary', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1834
    })
    test: Dataset({
        features: ['title', 'summary', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 459
    })
})

In [None]:
# Creating data collator (Tensorflow)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

# 5) Fine-Tuning & Evaluation (T5)

**5.1) Fine Tuning**


---



In [None]:
# Loading the rouge metrics
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Creating the evaluation function
def compute_metrics(eval_pred):
  # Unpack the evaluation predictions & their corresponding labels
  predictions, labels = eval_pred
  # Converting back decoded predicted token IDs to human-readable text
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  # Replacing any label with value -100 with the actual padding token ID.
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  # Decoding labels back into human text
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Calculating ROGUE score
  result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

  # Calculating the length of each generated summary
  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  # Find the average length across all generated summaries
  result["gen_len"] = np.mean(prediction_lens)

  # Iterates through all key-value pairs in the result dict, and round them into 4 decimal places.
  return {k: round(v, 4) for k, v in result.items()}


In [None]:
# Creating the optimizer
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
# Instantiating the model from the model checkpoint
model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-small')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# Model architecture
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  16449536  
                                                                 
 encoder (TFT5MainLayer)     multiple                  35330816  
                                                                 
 decoder (TFT5MainLayer)     multiple                  41625344  
                                                                 
Total params: 60506624 (230.81 MB)
Trainable params: 60506624 (230.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Preparing the training & testing dataset (converting them from huggingface dataset to tensorflow dataset)
tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator
)

In [None]:
# Compiling the model with the specified optimizer
model.compile(optimizer=optimizer)

In [None]:
# Computing the rouge sore with callback
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
    predict_with_generate=True
)

# Pushing model to huggingface hub
push_to_hub_callback = PushToHubCallback(
    output_dir="wanderwise_summary_1",
    tokenizer=tokenizer
)

# Aggregating callback
callbacks = [metric_callback, push_to_hub_callback]

In [None]:
# Model training
model.fit(
    x=tf_train_set,
    validation_data=tf_test_set,
    epochs=5,
    callbacks=callbacks)

Epoch 1/5



Epoch 2/5



Epoch 3/5



Epoch 4/5



Epoch 5/5





<keras.src.callbacks.History at 0x7e0d0a167ac0>

**5.2) Model Inference**


---



In [None]:
# Dummy text for inference
text ='''
summarize: Israeli strikes on a group of seven journalists in south Lebanon on 13 October, which killed Reuters journalist Issam Abdallah and injured six others, were likely a direct attack on civilians that must be investigated as a war crime, Amnesty International said today.
Amnesty International verified over 100 videos and photographs, analyzed weapons fragments from the site, and interviewed nine witnesses. The findings indicate that the group was visibly identifiable as journalists and that the Israeli military knew or should have known that they were civilians yet attacked them anyway in two separate strikes 37 seconds apart.
“Our investigation into the incident uncovers chilling evidence pointing to an attack on a group of international journalists who were carrying out their work by reporting on hostilities. Direct attacks on civilians and indiscriminate attacks are absolutely prohibited by international humanitarian law and can amount to war crimes,” said Aya Majzoub, Amnesty International’s Deputy Regional Director for the Middle East and North Africa.
“Those responsible for Issam Abdallah’s unlawful killing and the injuring of six other journalists must be held accountable. No journalist should ever be targeted or killed simply for carrying out their work. Israel must not be allowed to kill and attack journalists with impunity. There must be an independent and impartial investigation into this deadly attack.”
'''

In [6]:
# Instantiating pipeline (model wrapper) and pulling the model from huggingface hub.
pipe = pipeline('summarization', model="arthd24/wanderwise_summary_1")

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [17]:
text_1='''
summarize: World News - In an analysis of official figures covering England and Wales, the Liberal Democrats reveal a concerning trend with more than 200,000 shoplifting cases left unresolved in the past 12 months.
The party asserts that the government is failing to effectively combat this escalating crime wave. The statistics indicate that, out of 362,809 cases examined, a staggering 205,676 shoplifting incidents concluded without
identifying a suspect from August to July. A Home Office spokeswoman counters, stating that charging rates for shoplifting have risen by “almost a third” in the past year.
Craig Beaumont, representing the Federation of Small Businesses, expresses dismay over the situation, highlighting the impact on local small and independent businesses.
He emphasizes that the lack of police investigation and prosecution contributes to an increase in organised shoplifting and threats to staff.
Chief Constable Amanda Blakeman of the National Police Chiefs’ Council acknowledges the damaging impact of retail crime and underscores the commitment of law enforcement to address offenders and support retailers.
However, Liberal Democrat home affairs spokesman Alistair Carmichael criticises the Conservative government, asserting that as unsolved shoplifting cases surge, criminal gangs go unchecked while shopkeepers are left vulnerable.
Carmichael advocates for a practical approach, emphasising the necessity of putting police back on the streets and ensuring thorough investigations. In response, a Home Office spokeswoman reiterates the government’s
commitment to combat shoplifting, labeling it a blight on communities and emphasizing a zero-tolerance approach by the police.
As shoplifting continues to plague communities, the debate intensifies over the effectiveness of current measures and the need for proactive strategies to curb this escalating crime epidemic.
'''
pipe(text_1)

[{'summary_text': 'The Liberal Democrats reveal a concerning trend with more than 200,000 shoplifting cases left unresolved in the past 12 months. The party asserts that the government is failing to effectively combat this escalating crime wave.'}]