Training a Summarization Model to Predict Title

# [1] Mount Drive

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# [2] Install requirements, Load Lib, Set WD

In [2]:
%%capture
!pip install transformers
!pip install datasets
!pip install rouge_score

In [None]:
import pandas as pd
import numpy as np
import os
import gc
import torch

import torch
from transformers import DataCollatorWithPadding
from datasets import Dataset

In [None]:
path_wd = '/content/drive/MyDrive/Github/Content'
path_NRTA = '/content/drive/MyDrive/Github/Content/sources/NRTA'

# [3] Prepare Data for Classification with Pandas

## Intantiate tokenizer for summarization

In [None]:
from transformers import AutoTokenizer
# Instantiate tokenizer
checkpoint = "uer/bart-base-chinese-cluecorpussmall"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

##  Import Records, Combine and Clean

In [None]:
# NRTA
path_NRTA = '/content/drive/MyDrive/Github/Content/sources/NRTA'
dftv = pd.read_json(path_NRTA + '/records/contents_of_registrations.json')
dftv = dftv[['剧名', '内容提要']]
dftv.columns=['title', 'summary']

# ChinaFilm
path_ChinaFilm = '/content/drive/MyDrive/Github/Content/sources/ChinaFilm'
dfmovie = pd.read_csv(path_ChinaFilm + '/records/contents_of_registrations.csv', 
                      index_col=0, encoding='utf-8-sig')
dfmovie = dfmovie[['片名', '梗概']]
dfmovie.columns=['title', 'summary']

# Combine dftv and dfmovie
df = pd.concat([dftv, dfmovie], ignore_index=True)
df['title'] = df['title'].str.lstrip('《').str.rstrip('》')
df = df.dropna()

# Calculate token numbers for title and summary
# describe title token  count: range=[1,29], mean=4.9
df['title_tc'] = df['title'].apply(lambda x: len(tokenizer.tokenize(x)))
# describe summary token count: range=[1,446], mean=140, 
df['summary_tc'] = df['summary'].apply(lambda x: len(tokenizer.tokenize(x)))


df.info()

Token indices sequence length is longer than the specified maximum sequence length for this model (299 > 128). Running this sequence through the model will result in indexing errors


<class 'pandas.core.frame.DataFrame'>
Int64Index: 42759 entries, 0 to 42758
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       42759 non-null  object
 1   summary     42759 non-null  object
 2   title_tc    42759 non-null  int64 
 3   summary_tc  42759 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.6+ MB


## Separate df into train, val and test, stratified

In [None]:
# Separate title into groups by tc
intervals_title_tc = pd.cut(df['title_tc'], [0 ,2, 4, 8, 32])
df['title_group'] = df.groupby(intervals_title_tc).keys

# Separate sumamry into groups by tc
intervals_summary_tc = pd.cut(df['summary_tc'], [0, 128, 256, 512])
df['summary_group'] = df.groupby(intervals_summary_tc).keys

In [None]:
# Split dataset into train, val and test with balanced groups
df_train = df.groupby(['summary_group', 'title_group']).sample(
    frac=0.9, random_state=42)[['title', 'summary']].copy()
df_not_train = df[~df.index.isin(df_train.index)]
df_val = df_not_train.groupby(['summary_group', 'title_group']).sample(
  frac=0.5, random_state=42)[['title', 'summary']].copy()
df_test = df_not_train[~df_not_train.index.isin(df_val.index)][
  ['title', 'summary']].copy()

In [None]:
# Sanity Check
df_test.head(2)

Unnamed: 0,title,summary
19,大三女生,故事发生在一所北方大学校园。人工智能专业大三女生赵嘉欣和白凌同时入选“类人足球机器人”研发团...
72,谁家的孩子,夏子鹏的“虎妈”要求严格，他努力地使自己更优秀，但又经常碰上困扰


## Clean Memory

In [None]:
del dftv, dfmovie, df
gc.collect()

NameError: ignored

In [None]:
# Free up some memory
torch.cuda.empty_cache()

# [4] Setup For Finetuning - Summarization

## Define Key Training Parameters

In [None]:
#########################
PATH_SAVE = '/content/drive/MyDrive/Github/Content/tools/models/'
BATCH_SIZE = 4
DFTRAIN = df_train # df_genre_train
DFVAL = df_val # df_genre_val
DFTEST = df_test
MAX_INPUT_LENGTH = 448
MAX_OUTPUT_LENGTH = 36
MIN_OUTPUT_LENGTH = 1
#########################

## Instaniate tokenizer and model

"uer/bart-base-chinese-cluecorpussmall"

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,BartForConditionalGeneration

# assign device
if torch.cuda.device_count() > 0:
  device = 'cuda:' + str(torch.cuda.current_device())
else:
  device = 'cpu'

# Instantiate tokenizer and model
checkpoint = "uer/bart-base-chinese-cluecorpussmall"
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint, 
    problem_type="summarization")
model = BartForConditionalGeneration.from_pretrained(
    checkpoint, 
    problem_type="summarization",
)
model.to(device)

loading configuration file https://huggingface.co/uer/bart-base-chinese-cluecorpussmall/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/302d5765382fb3187d28afa2e6ea793a91f10ce34163e57678373af3f5194d7c.aae8bfff1f5703ae7f17be35d056b7bb21839c9bed053a2f0759f9c0a43ce4d0
Model config BartConfig {
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.1,
  "decoder_layers": 6,
  "decoder_start_token_id": 101,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.1,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(21128, 768, padding_idx=0)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(21128, 768, padding_idx=0)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        

## Feed pandas df through DataSet

In [None]:
# instaniate data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# define function to process pd.DataFrame to datasets.Datasets
def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    inputs = tokenizer(
        batch["summary"],
        #padding='max_length', use data_collater for dynamic padding
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
    )

    outputs = tokenizer(
        batch["title"],
        padding='max_length',
        truncation=True,
        max_length=MAX_OUTPUT_LENGTH,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = outputs.input_ids

    # We have to make sure that the PAD token is ignored
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels]
        for labels in batch["labels"]
    ]

    return batch

In [None]:
# Load Train, Val into DataSet

dataset_train = Dataset.from_pandas(DFTRAIN)
dataset_train = dataset_train.map(
  process_data_to_model_inputs,
  batched=True,
  batch_size=BATCH_SIZE,
  remove_columns=['__index_level_0__', 'title', 'summary'],
)
dataset_train.set_format(
  type="torch",
  columns=["input_ids", "attention_mask", "labels"],
)
dataset_val = Dataset.from_pandas(DFVAL)
dataset_val = dataset_val.map(
  process_data_to_model_inputs,
  batched=True,
  batch_size=BATCH_SIZE,
  remove_columns=['__index_level_0__', 'title', 'summary'],
)
dataset_val.set_format(
  type="torch",
  columns=["input_ids", "attention_mask", "labels"],
)

HBox(children=(FloatProgress(value=0.0, max=9621.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=535.0), HTML(value='')))




In [None]:
# Sanity Check
dataset_train[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([ 101, 3926, 1045, 5328,  753, 1282, 2399, 1724, 3299, 8024,  711, 6375,
         2495, 1744, 6863, 5670,  683, 2157, 2128, 1059, 2850, 6809, 4886, 2336,
         5670, 3124, 2229, 8024, 7259, 2360, 7357, 3633, 7599, 2372, 7566, 3696,
         7313,  721, 1894, 6566, 6569, 2844, 6843, 2339,  868, 8024,  830,  782,
         1728, 2824, 6437, 6825, 5310, 1762,  671, 6629, 8024,  676, 1921,  676,
         1915, 2894, 3818,  671, 5579, 4178, 6117, 8024,  809, 2673, 4164, 4295,
         4291,  711,  807,  817, 8024, 5106, 4810,  749, 3189, 3315, 3324, 2797,
         4638, 3266, 3324, 6369, 1153, 8024, 3297

## define compute_metric function

In [None]:
from datasets import load_metric
rouge = load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge1"]
    )["rouge1"].mid

    return {
        "rouge1_precision": round(rouge_output.precision, 4),
        "rouge1_recall": round(rouge_output.recall, 4),
        "rouge1_fmeasure": round(rouge_output.fmeasure, 4),
    }

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2170.0, style=ProgressStyle(description…




## Instaniate training_args and Trainer

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir= PATH_SAVE,
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    gradient_accumulation_steps=4,
    load_best_model_at_end=True,
    save_total_limit=2,
    num_train_epochs=1,
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

ValueError: ignored

# [5] Train and SAVE

In [None]:
trainer.train()
#resume_from_checkpoint=True

In [None]:
# Save Model

trainer.save_model(PATH_SAVE + '/' + 'bart-base-chinese-cluecorpussmall-FilmChina-and-NRTA-titleprediction')

Saving model checkpoint to /content/drive/MyDrive/Github/Content/tools/models//bart-base-chinese-cluecorpussmall-FilmChina-and-NRTA-titleprediction
Configuration saved in /content/drive/MyDrive/Github/Content/tools/models//bart-base-chinese-cluecorpussmall-FilmChina-and-NRTA-titleprediction/config.json
Model weights saved in /content/drive/MyDrive/Github/Content/tools/models//bart-base-chinese-cluecorpussmall-FilmChina-and-NRTA-titleprediction/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Github/Content/tools/models//bart-base-chinese-cluecorpussmall-FilmChina-and-NRTA-titleprediction/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Github/Content/tools/models//bart-base-chinese-cluecorpussmall-FilmChina-and-NRTA-titleprediction/special_tokens_map.json


#[6] TEST

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,BartForConditionalGeneration

# assign device
if torch.cuda.device_count() > 0:
  device = 'cuda:' + str(torch.cuda.current_device())
else:
  device = 'cpu'

# Instantiate tokenizer and model
model_name = "/content/drive/MyDrive/Github/Content/tools/models/bart-base-chinese-cluecorpussmall-FilmChina-and-NRTA-titleprediction"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)


ValueError: ignored

In [None]:
model.eval()

In [None]:
s = df_test.loc[19,'summary']

In [None]:
def str_predict_title(s: str) -> str:
  tokens = tokenizer.tokenize(s)
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  predicted = model.generate(input_ids)
  tokenize.decode(predicted)
  return predicted_title

str_predict_title(s)

In [None]:
input = tokenizer(s)
model.name_or_path

'uer/bart-base-chinese-cluecorpussmall'

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,BartForConditionalGeneration
import torch
############
model_name = PATH_SAVE + '/' + m_name + '-FilmChina-and-NRTA-titleprediction'
############
if torch.cuda.device_count() > 0:
  device = 'cuda:' + str(torch.cuda.current_device())
else:
  device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(
  model_name,
  problem_type="summarization",
).to(device)
model.eval() # set model to eval mode for faster prediction

NameError: ignored

In [None]:
batch_size = 8
i = 0
ls = df_test['summary'].tolist()
softmax = torch.nn.Softmax(dim=-1)
L = df_genre_test.shape[0]
test_predictions = []

while i < L:
  batch_test = tokenizer(ls[i:i+batch_size],
                           padding=True,
                           max_length=512, 
                           truncation=True, 
                           return_tensors='pt')
  batch_test.to(device)
  batch_outputs = model(**batch_test)
  batch_logtis = batch_outputs.logits
  batch_softmax = softmax(batch_logtis)
  batch_results = torch.argmax(batch_softmax, dim=1).cpu().numpy()
  test_predictions.extend(list(batch_results))
  i += batch_size