In [1]:
!pip install rouge-chinese

Collecting rouge-chinese
  Downloading rouge_chinese-1.0.3-py3-none-any.whl (21 kB)
Installing collected packages: rouge-chinese
Successfully installed rouge-chinese-1.0.3
[0m

In [2]:
from tqdm.auto import tqdm

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import get_scheduler

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

from accelerate import Accelerator
from rouge_chinese import Rouge

import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def load_data(data_path, max_rows=25000):
    """Read the training CSV and return its cleaned `title`/`abstract` columns.

    Args:
        data_path: Location of a CSV file (anything `pd.read_csv` accepts)
            that contains at least the columns `title` and `abstract`.
        max_rows: Upper bound on the number of rows returned; `None` keeps
            every row. Defaults to 25000.

    Returns:
        A DataFrame with exactly the columns `title` and `abstract`, both
        cast to `str`, without rows where either field is missing, limited
        to the first `max_rows` rows.

    Raises:
        KeyError: If `title` or `abstract` is not a column of the CSV.
    """
    print("Load data from: {}.".format(data_path))
    # Building the frame in one chain (instead of assigning columns onto a
    # column subset of the raw frame) avoids pandas' SettingWithCopyWarning.
    df = (
        pd.read_csv(data_path)[['title', 'abstract']]
        # A missing cell would turn into the literal string "nan" in the cast
        # below and be used as a training example, so drop such rows first.
        .dropna(subset=['title', 'abstract'])
        .astype({'title': str, 'abstract': str})
    )
    print(df.head())
    return df if max_rows is None else df.iloc[:max_rows]

In [None]:
def split_data(df, val_size):
    """Partition `df` into a training part and a validation part.

    Args:
        df: DataFrame holding the examples to split.
        val_size: Passed to `train_test_split` as `test_size`, i.e. the
            share (or count) of rows that goes to the validation part.

    Returns:
        A `DatasetDict` with the keys `train` and `validation`, each built
        from the corresponding DataFrame via `Dataset.from_pandas`.
    """
    print("Split data into training and validation sets, and the validation size is {}.".format(val_size))

    # Fixed seed: the same rows land in the same part on every run.
    train_part, validation_part = train_test_split(df, test_size=val_size, random_state=42)

    # Only two parts are produced; there is no separate test split here.
    splits = DatasetDict()
    splits['train'] = Dataset.from_pandas(train_part)
    splits['validation'] = Dataset.from_pandas(validation_part)
    return splits

In [None]:
def load_model(model_checkpoint, max_retries=5):
    """Load the tokenizer and the seq2seq model of a checkpoint.

    The model download is retried when it fails, but only a bounded number
    of times, so a permanent problem (wrong checkpoint name, no network)
    surfaces as an error instead of looping forever.

    Args:
        model_checkpoint: Checkpoint identifier or path handed to
            `from_pretrained`.
        max_retries: Maximum number of attempts to load the model. Must be
            at least 1. Defaults to 5.

    Returns:
        A `(tokenizer, model)` tuple.

    Raises:
        ValueError: If `max_retries` is smaller than 1.
        Exception: Whatever the last failed `from_pretrained` attempt raised.
    """
    import time  # local import: only needed for the back-off between retries

    if max_retries < 1:
        raise ValueError("max_retries must be at least 1, got {}".format(max_retries))

    print("Load '{}' model from huggingface.".format(model_checkpoint))
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    for attempt in range(1, max_retries + 1):
        try:
            model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, resume_download=True)
            break
        except Exception as e:
            if attempt == max_retries:
                # Out of attempts: let the caller see the real failure.
                raise
            print(f"Error: {e}. Retrying download...")
            # Short, capped exponential back-off before the next attempt.
            time.sleep(min(2 ** attempt, 30))

    return tokenizer, model

In [None]:
def tokenize(tokenizer, dataset_dict, max_input_length, max_target_length):
    """Tokenize abstracts as model inputs and titles as labels.

    Args:
        tokenizer: Callable tokenizer; it is invoked with a batch of texts
            plus `max_length` and `truncation=True`.
        dataset_dict: Object exposing `map(fn, batched=True)` whose batches
            provide the columns `abstract` and `title`.
        max_input_length: Truncation length for the abstracts.
        max_target_length: Truncation length for the titles.

    Returns:
        Whatever `dataset_dict.map` returns: the tokenizer output for the
        abstracts, extended by a `labels` entry holding the `input_ids` of
        the tokenized titles.
    """
    print("Tokenize the input data")

    def _encode_batch(batch):
        # Source side: the abstracts.
        encoded = tokenizer(batch["abstract"], max_length=max_input_length, truncation=True)
        # Target side: only the token ids of the titles are kept, as labels.
        targets = tokenizer(batch["title"], max_length=max_target_length, truncation=True)
        encoded["labels"] = targets["input_ids"]
        return encoded

    return dataset_dict.map(_encode_batch, batched=True)

In [None]:
def postprocess_text(preds, labels):
    """Normalise decoded predictions and references before scoring.

    Every text is stripped of surrounding whitespace and then rewritten so
    that each sentence found by `nltk.sent_tokenize` sits on its own line.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A `(preds, labels)` tuple of two lists with the normalised strings.
    """
    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text.strip()))

    return (
        [_one_sentence_per_line(pred) for pred in preds],
        [_one_sentence_per_line(label) for label in labels],
    )

In [None]:
def _spaced_chars(text):
    """Separate every character of `text` by a space; '_' stands in for empty text."""
    return ' '.join(text) if text else '_'


def _mean_rouge(results):
    """Average the r/p/f values of every ROUGE variant over per-sample results."""
    return {
        metric: {
            stat: np.mean([result[metric][stat] for result in results])
            for stat in ('r', 'p', 'f')
        }
        for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    }


def training(model, train_dataloader, eval_dataloader, num_train_epochs, lr, tokenizer, output_dir):
    """Fine-tune `model` and report character-level ROUGE/BLEU after each epoch.

    Each epoch runs one pass over `train_dataloader` (AdamW, linear schedule
    without warm-up), then generates a prediction for every validation
    sample, prints the mean ROUGE-1/2/L and the mean smoothed sentence BLEU
    of that epoch, and finally saves the model and the tokenizer to
    `output_dir` (every epoch writes to the same directory).

    Args:
        model: Seq2seq model to train; must return an object with `.loss`
            when called on a batch and provide `generate`.
        train_dataloader: Batches used for optimisation.
        eval_dataloader: Batches with `input_ids`, `attention_mask` and
            `labels` used for the per-epoch evaluation.
        num_train_epochs: Number of passes over `train_dataloader`.
        lr: Learning rate for AdamW.
        tokenizer: Tokenizer used to decode predictions and labels; it is
            saved next to the model.
        output_dir: Directory the model and the tokenizer are saved to.

    Returns:
        None. Metrics are printed; artefacts are written to `output_dir`.
    """
    optimizer = AdamW(model.parameters(), lr=lr)
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    # Measured after `prepare`, i.e. on the dataloader that is actually iterated.
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    # Both scorers are stateless helpers, so build them once instead of once
    # per validation batch.
    rouge = Rouge()
    smoothing = nltk.translate.bleu_score.SmoothingFunction().method1

    for epoch in range(num_train_epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Evaluation
        model.eval()
        # Fresh accumulators for every epoch, so the reported means describe
        # this epoch only and not a running average over all epochs so far.
        rouge_results = []
        bleu_scores = []
        for batch in eval_dataloader:
            with torch.no_grad():
                # NOTE(review): no length argument is passed, so the model's
                # own generation defaults apply -- confirm they are long
                # enough for the target titles.
                generated_tokens = accelerator.unwrap_model(model).generate(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                )

            # Sequences are padded to a common length before being gathered.
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            # Score every sample of the batch, not just the first one, so
            # the metrics stay correct for batch sizes larger than 1.
            for pred, label in zip(decoded_preds, decoded_labels):
                # Characters are separated by spaces so that both metrics
                # work on character tokens; '_' replaces empty text because
                # the scorers cannot handle empty input.
                pred_str = _spaced_chars(pred.replace('<extra_id_0>', ''))
                label_str = _spaced_chars(label)

                rouge_results.append(rouge.get_scores(pred_str, label_str)[0])
                bleu_scores.append(
                    nltk.translate.bleu_score.sentence_bleu(
                        [label_str.split()],
                        pred_str.split(),
                        smoothing_function=smoothing,
                    )
                )

        if rouge_results:
            mean_scores = _mean_rouge(rouge_results)
            print(f"Epoch {epoch}:",f"Mean Scores: {mean_scores}")  # Print mean scores for each epoch
            epoch_score_bleu = sum(bleu_scores) / len(bleu_scores)
            print(f"Epoch {epoch}: Mean BLEU score = {epoch_score_bleu}")
        else:
            # An empty validation set must not abort training with a
            # division by zero.
            print(f"Epoch {epoch}: no validation samples, metrics skipped.")

        # Save and upload
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)

In [None]:
def run(training_data_path, val_size, model_checkpoint, max_input_length, max_target_length, batch_size, num_train_epochs, learning_rate, output_dir):
    """Run the whole fine-tuning pipeline: load, split, tokenize, train.

    Args:
        training_data_path: CSV file handed to `load_data`.
        val_size: Validation share handed to `split_data`.
        model_checkpoint: Checkpoint handed to `load_model`.
        max_input_length: Truncation length of the abstracts (model input).
        max_target_length: Truncation length of the titles (labels).
        batch_size: Batch size of the training and validation dataloaders.
        num_train_epochs: Number of training epochs.
        learning_rate: Learning rate handed to `training`.
        output_dir: Directory `training` saves the model and tokenizer to.

    Returns:
        None.
    """
    df = load_data(data_path=training_data_path)
    dataset_dict = split_data(df=df, val_size=val_size)

    tokenizer, model = load_model(model_checkpoint=model_checkpoint)

    tokenized_datasets = tokenize(tokenizer, dataset_dict, max_input_length, max_target_length)
    # Drop every column of the untokenized split, so only the columns added
    # by `tokenize` are left for the collator.
    tokenized_datasets = tokenized_datasets.remove_columns(
        dataset_dict["train"].column_names
    )
    tokenized_datasets.set_format("torch")

    # The collator pads each batch when it is assembled by the dataloaders.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=batch_size,
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], collate_fn=data_collator, batch_size=batch_size
    )

    training(model, train_dataloader, eval_dataloader, num_train_epochs, learning_rate, tokenizer, output_dir)

In [None]:
# Upper limit on the title length; passed to `run` as `max_target_length`.
new_var = 20

# All settings of this training run, collected in one place.
run_config = dict(
    training_data_path='/kaggle/input/thesis-title-generator-data/training_data.csv',
    val_size=0.3,
    model_checkpoint="google/mt5-small",
    max_input_length=1024,  # upper limit on the abstract length
    max_target_length=new_var,  # upper limit on the title length
    batch_size=1,
    num_train_epochs=10,
    learning_rate=2e-5,
    output_dir="/kaggle/working/results-mt5-finetuned-squad-accelerate_v1",
)
run(**run_config)

# Testing

## Setting the Kaggle API token in a Kaggle Notebook

In [3]:
import json
import os

# Destination of the token copy.
kaggle_config_dir = '/root/.kaggle'
kaggle_token_path = os.path.join(kaggle_config_dir, 'kaggle.json')

# `exist_ok=True` keeps the cell re-runnable; a plain `mkdir` fails as soon
# as the directory already exists.
os.makedirs(kaggle_config_dir, exist_ok=True)

with open('/kaggle/input/kaggle-token/kaggle.json', 'r') as api_token_file:
    api_token = json.load(api_token_file)

# Create the file with owner-only permissions from the start, so the token
# is never on disk with a more permissive mode.
token_fd = os.open(kaggle_token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(token_fd, 'w') as file:
    json.dump(api_token, file)

# A file that already existed keeps its old mode through os.open, so the
# mode is enforced explicitly as well.
os.chmod(kaggle_token_path, 0o600)

## Download saved model via Kaggle API

In [4]:
# Download the output files of the `czwinusa/thesis-title-generator` kernel
# into /kaggle/working/ (this includes the saved model directory loaded below).
!kaggle kernels output czwinusa/thesis-title-generator -p /kaggle/working/

Output file downloaded to /kaggle/working/results-mt5-finetuned-squad-accelerate_v1/config.json
Output file downloaded to /kaggle/working/results-mt5-finetuned-squad-accelerate_v1/generation_config.json
Output file downloaded to /kaggle/working/results-mt5-finetuned-squad-accelerate_v1/pytorch_model.bin
Output file downloaded to /kaggle/working/results-mt5-finetuned-squad-accelerate_v1/special_tokens_map.json
Output file downloaded to /kaggle/working/results-mt5-finetuned-squad-accelerate_v1/spiece.model
Output file downloaded to /kaggle/working/results-mt5-finetuned-squad-accelerate_v1/tokenizer.json
Output file downloaded to /kaggle/working/results-mt5-finetuned-squad-accelerate_v1/tokenizer_config.json
Kernel log downloaded to /kaggle/working/thesis-title-generator.log 


## Start testing

In [5]:
# Directory holding the fine-tuned model and its tokenizer.
output_dir = '/kaggle/working/results-mt5-finetuned-squad-accelerate_v1'

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)
model = model.to(device)

cuda


In [6]:
# Load the test set and the submission template, in that order.
test_data, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/thesis-title-generator-data/test_data.csv',
        '/kaggle/input/thesis-title-generator-data/submission.csv',
    )
)

In [7]:
# Preview the test set; the bare last expression is rendered by the notebook.
test_data

Unnamed: 0,ID,department,major,cluster,language,chinese_keyword,foreign_keyword,abstract,foreign_abstract
0,1,生物資訊與結構生物研究所,生命科學學門,生物訊息學類,中文,小盾鱧、外來種、鱗片、成長、魚虎、多曼魚,C.micropeltes、invasive、scale、growth、GiantSnake...,淡水外來種魚類入侵，一直都是臺灣水域生態面臨的主要問題之一。外來種的入侵除了會壓縮原生物種的...,Invasive fishes has become a main problem on t...
1,2,教育心理與諮商學系教育心理與諮商碩士在職專班,教育學門,綜合教育學類,中文,生涯發展任務、已婚中年職業婦女、在職進修、中年婦女,career development tasks、married middle-aged p...,本研究旨在由生涯發展任務探討己婚中年職業婦女之在職進修歷程，對四位45-57歲之已婚中年職業...,This study aims to investigate the in-service ...
2,3,藝術與設計學系所,藝術學門,應用藝術學類,中文,城市形象、圖文再現,city image、pictorial and verbal representation,\n由於竹科的發展，竹北移入人口逐年攀升，在都市規劃下竹北作為新興城市有著嶄新的樣貌，然竹北...,\nDue to the development of Hsinchu Science Pa...
3,4,光電工程研究所,工程學門,電資工程學類,中文,點雲、物件辨識、卷積神經網路、光達,Pointcloud、Object detection、Convolutional Neur...,在這篇論文中，我們使用卷積神經網絡建構三維點雲多物件辨識系統~(3D point cloud...,"In this thesis, we have developed a multi-obje..."
4,5,哲學研究所,人文學門,哲學學類,中文,後設倫理學、演化、價值、道德、道德實在主義、道德反實在主義、道德知識、道德懷疑主義,meta-ethics、evolution、value、morality、moral rea...,\n演化式揭穿者認為，對於道德心理現象(例如，重視生命)的最佳說明，只需要單純描述性的社會科...,\nEvolutionary debunkers argue that the best e...
...,...,...,...,...,...,...,...,...,...
7412,7413,環境與職業安全衛生系環境管理碩士班,環境保護學門,環境資源學類,中文,海洋廢棄物、淨灘淨海、減塑行為、永續發展,Marine debris、Beach and sea cleanup、Plastic re...,屏東縣擁有得天獨厚的海洋資源，多樣的海洋生態吸引大量的觀光遊客，但同時也帶來了日益增加的海洋...,"Pingtung County has unique marine resources, a..."
7413,7414,環境與職業安全衛生系環境管理碩士班,環境保護學門,環境資源學類,中文,多刺裸腹水蚤、生殖量、生殖條件,Moina macrocopa、reproductive mass、reproductive...,水蚤是水域中，最為普通的小型節肢動物之一，一般水蚤是浮游動物中種類最多，體型小，將其投入水中...,Daphnia is one of the most common small arthro...
7414,7415,藥學系碩士班,醫藥衛生學門,藥學學類,中文,浮萍、高尿酸血症、痛風、黃嘌呤氧化酶,Spirodela polyrrhiza、Hyperuricemia、Gout、Xanthi...,浮萍為浮萍科Lemnaceae水生草本植物紫萍Spirodela polyrhiza (L....,Spirodela polyrhiza (L.) Schleid. is a herbace...
7415,7416,藥學系碩士班,醫藥衛生學門,藥學學類,中文,新型冠狀病毒、社區藥局藥師、網路問卷、新冠肺炎快篩劑、國產新冠疫苗、變異病毒株、疫苗混打、冠狀病毒,COVID-19、Community Pharmacies、Online Questionn...,2019年底爆發新型冠病毒的疫情，衛福部疾病管制署立即啟動緊急應變開設中央流行疫情指揮中心。...,"At the end of 2019, the Center for Disease Con..."


In [8]:
# Preview the submission template; its `title` column is still empty here.
submission

Unnamed: 0,ID,title
0,1,
1,2,
2,3,
3,4,
4,5,
...,...,...
7412,7413,
7413,7414,
7414,7415,
7415,7416,


In [10]:
# Cast to str like the training loader does, so a missing abstract cannot
# reach the tokenizer as a float NaN.
input_data = test_data['abstract'].astype(str).tolist()

# Generated titles, in the same order as `input_data`.
results = []

# Iterating over the tqdm object advances the bar and closes it when the
# loop is done, so no manual update()/close() calls are needed.
pbar = tqdm(input_data, desc='Predicting')
for text in pbar:
    # The abstracts are processed one at a time, so there is nothing to pad
    # against; padding every input to 1024 tokens would only add masked
    # positions for the model to process.
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=1024,
        return_tensors='pt',
    )

    with torch.no_grad():
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    results.append(decoded_output)

    # Show the most recent title next to the progress bar.
    pbar.set_postfix({'Result': decoded_output})

Predicting:   0%|          | 0/7417 [00:00<?, ?it/s]



In [13]:
# Fill the `title` column with the generated titles, show the frame, and
# export it as submission.csv.
submission_path = '/kaggle/working/submission.csv'
submission['title'] = results
print(submission)
submission.to_csv(submission_path, index=False, encoding='utf-8')

        ID                   title
0        1         淡水外來種小盾鱧移除與樣本研究
1        2  生涯發展任務探討己婚中年職業婦女在職進修歷程
2        3      以班雅明文化批評概念探討新興城市形象
3        4     卷積神經網絡建構三維點雲多物件辨識系統
4        5     演化式揭穿對於道德實在主義的演化式揭穿
...    ...                     ...
7412  7413       屏東縣海岸淨灘及水下淨海資料之研究
7413  7414           多刺裸腹水蚤對生殖量之影響
7414  7415    浮萍對化學誘導高尿酸血症小鼠降尿酸之影響
7415  7416    新型冠狀病毒防疫之社區藥師參與感、熱忱感
7416  7417     屏東縣籃球休閒參與者認真休閒與幸福感之

[7417 rows x 2 columns]
