In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# Tải các thư viện cần thiết

In [2]:
# !pip install datasets -q
# !pip install transformers[torch] -q
# !pip install evaluate -q
# !pip install sacrebleu -q
# !pip install streamlit -q
# !pip install pyngrok -q

# mày cứ tải hết cái đống ở trên (thủ công, bằng powershell/cmd, pip)
# rồi tải cái này:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# đó là tải về pytorch có hỗ trợ cuda
# sau đó thì restart lại notebook, chạy cell bên dưới để xem có đang sử dụng GPU không, nếu nó hiện như kiểu:
# 2.3.1+cu118 -> pytorch phiên bản 2.3.1 hỗ trợ cuda 11.8
# True -> có sử dụng GPU
# thấy vậy là thành công, chạy cái đống tiếp theo như bình thường thôi

# Import các thư viện cần thiết

In [3]:
# Core numerical / deep-learning stack.
import numpy as np
import torch
# Hugging Face `datasets`: used to fetch the CNN/DailyMail corpus.
from datasets import load_dataset
# NOTE(review): `load_metric` is not referenced anywhere in the visible notebook (the metric is
# loaded through `evaluate.load` instead). It is deprecated upstream and reportedly removed in
# recent `datasets` releases, so this import may fail on a fresh install — confirm and drop it.
from datasets import load_metric
# NOTE(review): `DataCollatorForSeq2Seq` is imported on both of the next two lines (harmless duplicate).
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import evaluate

# Used in the last cell to open a public tunnel to the Streamlit app.
from pyngrok import ngrok

import warnings
# Silences every Python warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Sanity check: print the installed PyTorch build and whether a CUDA device is usable.
print(torch.__version__)
print(torch.cuda.is_available())

2024-07-06 02:11:25.750697: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.1.1+cu121
True


# Tải bộ dữ liệu có sẵn từ thư viện `datasets`

> Bộ dữ liệu sử dụng là `cnn_dailymail` phiên bản `3.0.0` từ thư viện `datasets`

In [4]:
# Load the dataset
# Config '3.0.0' of CNN/DailyMail; the cell output shows train/validation/test splits
# with the columns 'article', 'highlights' and 'id'.
# trust_remote_code=True allows `datasets` to run the dataset's own loading script.
dataset = load_dataset("cnn_dailymail", '3.0.0', trust_remote_code=True)
# Bare expression: renders the DatasetDict summary as the cell output.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

# Tạo T5 tokenizer

In [5]:
# Initialize the tokenizer
# Loads the tokenizer published with the "t5-small" checkpoint (the slow, non-fast
# implementation, as the repr in the cell output shows: is_fast=False).
tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Bare expression: renders the tokenizer repr as the cell output.
tokenizer

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5Tokenizer(name_or_path='t5-small', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_4

# Khởi tạo và triển khai hàm để áp dụng tokenizer vào bộ dữ liệu

In [6]:
# Define a function to preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of CNN/DailyMail rows for T5 summarization.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(batched=True)``;
            must provide the ``"article"`` and ``"highlights"`` columns.

    Returns:
        The tokenizer output for the prefixed articles (truncated to 512
        tokens) with an added ``"labels"`` entry holding the token ids of the
        reference highlights (truncated to 150 tokens).
    """
    # T5 is prompted with a textual task prefix, so prepend it to every article.
    inputs = ["summarize: " + str(doc) for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Tokenize the targets through `text_target=`; this replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["highlights"], max_length=150, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
# Apply preprocessing to the dataset
# batched=True passes whole batches of rows to preprocess_function on each call.
# The original columns are kept; the tokenized columns ('input_ids', 'attention_mask',
# 'labels') are added to every split, as the cell output shows.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Bare expression: renders the resulting DatasetDict as the cell output.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})

> Chỉ sử dụng một phần của bộ dữ liệu (train) cho việc demo tinh chỉnh (fine-tune) mô hình để tiết kiệm thời gian

In [8]:
# IN ORDER TO SAVE TIME
# Shuffle each split with a fixed seed (so the sample is reproducible) and keep only
# the first 1000 training rows and the first 100 validation / test rows.
sub_train = tokenized_datasets['train'].shuffle(seed=42).select(range(1000))
sub_eval = tokenized_datasets['validation'].shuffle(seed=42).select(range(100))
# NOTE(review): sub_test is not referenced by any later cell visible in this notebook.
sub_test = tokenized_datasets['test'].shuffle(seed=42).select(range(100))

# Khởi tạo mô hình T5 (`t5-small`) với thông số phù hợp (config)

In [9]:
# Setting T5Config with suitable parameters for T5-small
# The sizes below must match the "t5-small" checkpoint loaded further down
# (e.g. the printed model shows Embedding(32128, 512) and 512 -> 2048 feed-forward layers).
config = T5Config(
    vocab_size=32128,
    d_model=512,
    d_kv=64,
    d_ff=2048,
    num_layers=6,
    num_decoder_layers=6,
    num_heads=8,
    relative_attention_num_buckets=32,
    dropout_rate=0.1,
    # T5Config names this field `layer_norm_epsilon`; the previous `layer_norm_eps`
    # keyword was not a recognised argument and therefore never reached the layers.
    layer_norm_epsilon=1e-6,
    initializer_factor=1.0,
    feed_forward_proj="relu",
    is_encoder_decoder=True,
    use_cache=True,

    # T5 starts decoding from the pad token.
    decoder_start_token_id=tokenizer.pad_token_id,
    # `eos_token_id` is the config field that marks end-of-sequence; the previous
    # `decoder_end_token_id` keyword is not a recognised configuration field.
    eos_token_id=tokenizer.eos_token_id,
)

# Initializing the T5 model with the configured settings
# The weights come from the pretrained "t5-small" checkpoint; `config` only overrides
# the configuration object attached to the model.
model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config)
# Bare expression: renders the module tree as the cell output.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

# Sử dụng `sacrebleu` metric để tinh chỉnh và đánh giá mô hình  

In [10]:
# Load the SacreBLEU metric through the `evaluate` package; compute_metrics calls metric.compute().
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Normalise decoded texts into the layout the BLEU metric is fed with.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple where every prediction is stripped of
        surrounding whitespace and every reference is stripped and wrapped in
        its own single-element list (one reference per prediction).
    """
    cleaned_preds = []
    for hypothesis in preds:
        cleaned_preds.append(hypothesis.strip())

    wrapped_labels = []
    for reference in labels:
        # Each prediction gets a list of references; here there is exactly one.
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays handed
            over by the trainer. ``predictions`` may itself be a tuple, in
            which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (SacreBLEU score) and ``"gen_len"`` (mean number
        of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is used as a padding / ignore value and is not a valid vocabulary id, so it
    # cannot be decoded. Swap it for the pad token in BOTH arrays before decoding.
    # Previously only the labels were sanitised, which could break decoding of the
    # predictions and also inflated gen_len (a -100 entry counted as a real token).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad tokens in each prediction row.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Tinh chỉnh mô hình trên bộ dữ liệu `cnn_dailymail`

In [11]:
# Define data collator
# The preprocessing step tokenizes without padding, so batching is left to this collator.
# NOTE(review): presumably it pads inputs and labels per batch and uses `model` to prepare
# decoder inputs — confirm against the DataCollatorForSeq2Seq documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Show the collator's repr as the cell output.
display(data_collator)

DataCollatorForSeq2Seq(tokenizer=T5Tokenizer(name_or_path='t5-small', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42

: 

In [12]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",          # where checkpoints and trainer state are written
    eval_strategy="epoch",           # run evaluation at the end of every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    # NOTE(review): with the 1000-row training subset and batch size 4, five epochs are
    # roughly 1250 optimizer steps on one device, so a 10_000-step save interval is
    # presumably never reached during training — confirm this is intended.
    save_steps=10_000,
    save_total_limit=2,              # keep at most two checkpoints on disk
    predict_with_generate=True,      # evaluate with generate() so compute_metrics receives token ids
)

In [None]:
# Initialize the Trainer
# Uses the reduced subsets for a quick demo; the commented-out lines switch to the
# full train / validation splits.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=sub_train,
    eval_dataset=sub_eval,
    # train_dataset=tokenized_datasets["train"],
    # eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # Later cells read this tokenizer back through `trainer.tokenizer`.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model
# Runs fine-tuning with the settings in `training_args`; evaluation happens once per
# epoch (eval_strategy="epoch").
trainer.train()

In [None]:
# Evaluate the model
# Evaluates on the dataset passed as `eval_dataset` (sub_eval) and returns a dict of
# metrics, including the values produced by compute_metrics.
metrics = trainer.evaluate()
print(metrics)

# Khởi tạo hàm dự đoán (tóm tắt) cho mô hình thu được

In [None]:
# Function to summarize text
def summarize(text):
    """Summarize ``text`` with the model held by the notebook's ``trainer``.

    Args:
        text: Source text; the ``"summarize: "`` task prefix is prepended here.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Encode the prompt, truncated to 512 tokens.
    prompt = "summarize: " + text
    encoded = trainer.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    # Move every tensor to the device the model lives on.
    net = trainer.model
    on_device = {}
    for field, tensor in encoded.items():
        on_device[field] = tensor.to(net.device)

    # Beam-search decoding settings.
    generation_options = dict(
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = net.generate(on_device["input_ids"], **generation_options)

    # Only the first (and only) sequence of the batch is decoded.
    return trainer.tokenizer.decode(generated[0], skip_special_tokens=True)

# Ví dụ minh họa sử dụng mô hình thu được

In [None]:
# Example text to summarize
# Note: the literal starts and ends with a newline; it is passed to summarize() as-is.
text = """
The US has witnessed a significant shift in its political landscape over the past few decades.
There has been a growing polarization among the populace, with ideological divides becoming more pronounced.
This has been reflected in the policies and rhetoric of political leaders, leading to a more fragmented society.
Issues such as healthcare, immigration, and climate change have become contentious topics, with strong opinions on both sides.
"""

# Generate the summary
# Uses the summarize() helper defined in the previous cell and prints the result.
summary = summarize(text)
print(summary)

# Lưu cả mô hình gốc và mô hình đã tinh chỉnh

In [None]:
# Fine-tuned artefacts: `trainer.model` is the network that was just trained.
trainer.model.save_pretrained("finetune/model")
trainer.tokenizer.save_pretrained("finetune/tokenizer")

# `model` is the very object that was handed to the trainer, so after training it holds
# the fine-tuned weights as well. To store the ORIGINAL model, reload the untouched
# pretrained checkpoint instead of saving `model` again.
initial_model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config)
initial_model.save_pretrained("initial/model")
tokenizer.save_pretrained("initial/tokenizer")

> Lưu vào Drive để tải xuống

In [None]:
# Fine-tuned artefacts, copied to Google Drive (requires Drive to be mounted at /content/drive).
trainer.model.save_pretrained('/content/drive/MyDrive/finetune/model')
trainer.tokenizer.save_pretrained('/content/drive/MyDrive/finetune/tokenizer')

# `model` was trained in place by the trainer, so saving it would write the fine-tuned
# weights a second time. Reload the pretrained checkpoint to store the original model.
T5ForConditionalGeneration.from_pretrained("t5-small", config=config).save_pretrained(
    '/content/drive/MyDrive/initial/model'
)
tokenizer.save_pretrained('/content/drive/MyDrive/initial/tokenizer')

# Triển khai mô hình trong file `app.py`

In [None]:
%%writefile app.py
import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration

import google.generativeai as genai
from google.colab import userdata

model = T5ForConditionalGeneration.from_pretrained("finetune/model")
tokenizer = T5Tokenizer.from_pretrained("finetune/tokenizer")

# Function to summarize text
def summarize(text):
    # Preprocess the text
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)

    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Add this line

    # Generate the summary
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

genai.configure(api_key='AIzaSyATJ5uq68NHapHd59HJRCzO7MsRBc-mKzI')
gem = genai.GenerativeModel('gemini-1.5-flash')

def summarize_with_gemini(text):
    return gem.generate_content("Summarize this text:\n" + text).text

# Streamlit app
st.title("Text Summarization with fine-tuned T5 and Gemini API")

# Create two columns
col1, col2 = st.columns(2)

# Text input in the left column
with col1:
    st.header("Input Text")
    text = st.text_area("Enter text to summarize", height=300)
    summarize_button = st.button("Summarize")

# Display the summary in the right column
with col2:
    st.header("Summaries")
    # Create two rows in the second column
    if summarize_button and text:
        summary_t5 = summarize(text)
        summary_gemini = summarize_with_gemini(text)

        st.subheader("T5 Summary")
        st.write(summary_t5)

        st.subheader("Gemini API Summary")
        st.write(summary_gemini)

st.subheader("FIT@HCMUS - 21/22 - Statistical Learning")
st.markdown("""
> - 20120076: Mai Vinh Hiển
> - 21120070: Nhan Hữu Hiếu
> - 21120201: Bùi Đình Bảo
""")

# Xuất URL để truy cập mô hình

In [None]:
!killall ngrok

# Set authentication token if you haven't already done so
ngrok.set_auth_token("2iEfEmPIH5iwcoUdBcCoKbzhBL7_71h8ScJZKcJSUUn3ZC4XD")

# Start Streamlit server on a specific port
!nohup streamlit run app.py --server.port 5011 &

# Start ngrok tunnel to expose the Streamlit server
ngrok_tunnel = ngrok.connect(addr='5011', proto='http', bind_tls=True)

# Print the URL of the ngrok tunnel
print(' * Tunnel URL:', ngrok_tunnel.public_url)