Training a genere classification model using the NRTA Database for Genere Classification, with the transferred knowledge, use the model to predict movie genres.

# [1] Mount Drive

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# [2] Install requirements, Load Lib, Set WD

In [58]:
%%capture
!pip install transformers
!pip install datasets
!pip install rouge_score
!pip install sentencepiece
!pip install rouge

In [100]:
import pandas as pd
import numpy as np
import os
import gc
import torch
import re
from IPython.display import display, HTML

import torch
from transformers import AdamW, BartForConditionalGeneration
from transformers import DataCollatorWithPadding
from datasets import Dataset
from transformers import TrainingArguments, Trainer

In [None]:
path_wd = '/content/drive/MyDrive/Github/Content'
path_NRTA = '/content/drive/MyDrive/Github/Content/sources/NRTA'
path_ChinaFilm= '/content/drive/MyDrive/Github/Content/sources/ChinaFilm'

# [3] Prepare Data for Classification with Pandas

In [None]:
# import database as df
dfnrta = pd.read_json(path_NRTA + '/records/contents_of_registrations.json')
dfcf =pd.read_csv(path_ChinaFilm + '/records/contents_of_registrations.csv', 
                  encoding='utf-8-sig', index_col=0)

dft = dfcf[['片名', '梗概']].copy()
dft['来源'] = 'ChinaFilm'
tmp = dfnrta[['剧名', '内容提要']].copy()
tmp.columns = ['片名', '梗概']
tmp['来源'] = 'NRTA'
dft = pd.concat([dft, tmp], ignore_index=True)

idxval = dft.groupby('来源').sample(n=500, random_state=42).index
idxtest = dft[~dft.index.isin(idxval)].groupby('来源').sample(n=500, random_state=42).index

dft['usage'] = 'train'
dft.loc[dft.index.isin(idxval), 'usage'] = 'val'
dft.loc[dft.index.isin(idxtest), 'usage'] = 'test'

## Instaniate tokenizer and model

"hfl/chinese-bert-wwm"
"adamlin/bert-distil-chinese"

In [None]:
from transformers import BertTokenizer, BartForConditionalGeneration

# assign device
if torch.cuda.device_count() > 0:
  device = 'cuda:' + str(torch.cuda.current_device())
else:
  device = 'cpu'

# Instantiate tokenizer and model
checkpoint = "uer/bart-base-chinese-cluecorpussmall"

  
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint,
                                            gradient_checkpointing=True, 
                                            use_cache=False)
model.to(device)

loading file https://huggingface.co/uer/bart-base-chinese-cluecorpussmall/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/68a98551f47464131ddeff325158aaa18253dfe99fe69ff5eda453ae76e3c176.accd894ff58c6ff7bd4f3072890776c14f4ea34fcc08e79cd88c2d157756dceb
loading file https://huggingface.co/uer/bart-base-chinese-cluecorpussmall/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/uer/bart-base-chinese-cluecorpussmall/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/1255f13ee487ed54af54ff4250645a5682556603f07af2a0c7f0da1af7c7c238.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/uer/bart-base-chinese-cluecorpussmall/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/019394350641d3bec31d4b5db84db0f172f3493e813e82fe0444c54821fe0f91.ee60dc4fce36b9f8af761aa570e77d91cd1ef3698907d5a98dcc367c2464a73f
loading file 

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(21128, 768, padding_idx=0)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(21128, 768, padding_idx=0)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        

In [None]:
dft['片名'][0]

'侠胆医心之致命玄机'

## Clean Memory

In [None]:
# Free up some memory
torch.cuda.empty_cache()

In [None]:
del df
gc.collect()

# [4] Setup For Finetuning - SequenceClassification

## Define Key Training Parameters

In [None]:
#########################
PATH_SAVE = '/content/drive/MyDrive/Github/Content/tools/models/'
BATCH_SIZE = 4
MAX_INPUT_LENGTH = 512
MAX_OUTPUT_LENGTH = 32
#########################

In [None]:
dft['梗概'] = dft['梗概'].apply(lambda x: '[MASK] ' + x) 


## Feed pandas df through DataSet

In [None]:
# instaniate data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Defube Dataet processing function
def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    inputs = tokenizer(
        batch["梗概"], 
        padding='max_length', #leave to data collator for dynamic padding
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
    )
    with tokenizer.as_target_tokenizer():
      outputs = tokenizer(
        batch["片名"],
        padding='max_length', #leave to data collater to pad dynamically
        truncation=True,
        max_length=MAX_OUTPUT_LENGTH,
      )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    #batch['max_length'] = [int(tc*1.05) for tc in batch['gold_tc']]

    batch["labels"] = outputs.input_ids

    return batch

In [None]:
# Load Train, Val into DataSet

dataset_train = Dataset.from_pandas(dft[dft.usage=='train'])
dataset_train = dataset_train.map(
  process_data_to_model_inputs,
  batched=True,
  batch_size=BATCH_SIZE,
  remove_columns=['__index_level_0__', '梗概', '片名', 'usage', '来源'],
)
dataset_train.set_format(
  type="torch",
  columns=["input_ids", "attention_mask", "labels"],
)
dataset_val = Dataset.from_pandas(dft[dft.usage=='val'])
dataset_val = dataset_val.map(
  process_data_to_model_inputs,
  batched=True,
  batch_size=BATCH_SIZE,
  remove_columns=['__index_level_0__', '梗概', '片名', 'usage', '来源'],
)
dataset_val.set_format(
  type="torch",
  columns=["input_ids", "attention_mask", "labels"],
)

  0%|          | 0/10257 [00:00<?, ?ba/s]

  0%|          | 0/250 [00:00<?, ?ba/s]

In [None]:
# Sanity Check
dataset_train[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0

## define compute_metric function

In [None]:
from datasets import load_metric
rouge = load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

## Instaniate training_args and Trainer

In [None]:
from transformers import AutoModelForSeq2SeqLM

training_args = TrainingArguments(
    output_dir= PATH_SAVE,
    evaluation_strategy='no',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=2000,
    eval_steps=2000,
    save_steps=2000,
    #gradient_accumulation_steps=2,
    #load_best_model_at_end=True,
    save_total_limit=2,
    num_train_epochs=1,
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# [5] Train and SAVE

In [None]:
trainer.train()
#resume_from_checkpoint=True

***** Running training *****
  Num examples = 41028
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10257


Step,Training Loss
2000,0.7573
4000,0.6392
6000,0.625
8000,0.6137
10000,0.5867


Saving model checkpoint to /content/drive/MyDrive/Github/Content/tools/models/checkpoint-2000
Configuration saved in /content/drive/MyDrive/Github/Content/tools/models/checkpoint-2000/config.json
Model weights saved in /content/drive/MyDrive/Github/Content/tools/models/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Github/Content/tools/models/checkpoint-2000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Github/Content/tools/models/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/Github/Content/tools/models/checkpoint-4000
Configuration saved in /content/drive/MyDrive/Github/Content/tools/models/checkpoint-4000/config.json
Model weights saved in /content/drive/MyDrive/Github/Content/tools/models/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Github/Content/tools/models/checkpoint-4000/tokenizer_config.json
Special tokens file saved in /co

TrainOutput(global_step=10257, training_loss=0.6430762767838237, metrics={'train_runtime': 3907.2024, 'train_samples_per_second': 10.501, 'train_steps_per_second': 2.625, 'total_flos': 1.4553264867508224e+16, 'train_loss': 0.6430762767838237, 'epoch': 1.0})

In [None]:
# Save Model
trainer.save_model(PATH_SAVE + '/' + 'PredCNTitle)

#[6] TEST

In [None]:
from transformers import BertTokenizer, BartForConditionalGeneration

# assign device
if torch.cuda.device_count() > 0:
  device = 'cuda:' + str(torch.cuda.current_device())
else:
  device = 'cpu'

# Instantiate tokenizer and model
checkpoint = "/content/drive/MyDrive/Github/Content/tools/models/PredTitle-10000"

  
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)
model.to(device)
model.eval()

In [None]:
dft.loc[dft['usage'] == 'test', '梗概']

97       小孩古粒偶然得到一条被训练过偷钱的狗，一时花钱大手大脚享受到伙伴们的讨好，自己误入歧途，但最...
212      路过青春系列一讲述了一段朝气蓬勃的校园故事。性格迥异的几个人从小到大都是好朋友，各自摩擦出了...
219      退伍军人出身的村支书石大臣实施乡村振兴。杨靖楠想从中套取国家投资。石大臣识破杨靖楠诡计，乡村...
253      随父回乡躲债的陈然结识了被称为“妖怪”的女孩谭霖雯，两个年轻人逐渐对彼此敞开心扉。讨债团伙上...
283      该片讲述三个年轻人为了证明自己回到父辈的家乡，在那里，他们用自己的热情帮助乡亲们一起实现乡村...
                               ...                        
42883    90年代初，漂亮的斯琴与恋人牧人即将结婚之时，父亲沙力丙和筹集来的生意款一起失身火海，斯琴在...
42887    故事发生在解放前夕的上海滩，国民党最后撤离前正在密谋一次最大的破坏计划。在这种空前混乱的大暗...
42897    1917年，第一次世界大战期间，欧洲战场激战正酣，美国经过长久权衡，终于向德国宣战。而在美国...
42965    清末民初，江南小镇。方凌霜儿时家境寒微，酒徒兼赌徒的父亲为凑赌资将她卖入青楼。父亲意外而亡，...
43007    秦二世昏庸无道，楚国大将韩公父在保卫楚国的战争中牺牲，公父的妻子带着年幼的儿子韩信流落他乡。...
Name: 梗概, Length: 1000, dtype: object

In [None]:
batch_size = 8
i = 0
ls = dft.loc[dft['usage'] == 'test', '梗概'].tolist()
L = dft.loc[dft['usage'] == 'test', '梗概'].shape[0]
test_predictions = []

while i < L:
  inputs = tokenizer(ls[i:i+batch_size],
                           padding=True,
                           max_length=512, 
                           truncation=True, 
                           return_tensors='pt')
  inputs.to(device)
  summary_ids = model.generate(input_ids=inputs['input_ids'],
                             num_beams=4,
                             min_length=0,
                             max_length=32
                             )
  
  ret = [tokenizer.decode(g, 
                         skip_specical_tokens=True, 
                         clean_up_tokenization_spaces=True) for g in summary_ids]
  test_predictions.extend(ret)
  i += batch_size

In [None]:
dftest = dft.loc[dft['usage'] == 'test'].copy()
dftest['预测片名'] = test_predictions

def remove_specials(x):
  x = re.sub(' ', '', x)
  x = re.sub('\[CLS\]', '', x)
  x = re.sub('\[PAD\]', '', x)
  x = re.sub('\[SEP\]', '', x)
  return x

dftest['预测片名'] = dftest['预测片名'].apply(remove_specials)

In [75]:
## Define Evaluation Functions

from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Evaluation Functions
rouge = Rouge()
smooth = SmoothingFunction().method1
best_bleu = 0.

def evaluate_single(x: pd.DataFrame, topk=1):
  """a functions that calculates rouge-1, rouge-2, rouge-l and bleu scores.
  """
  rouge_1, rouge_2, bleu = 0, 0, 0
  title_actual = ' '.join(list(x['片名'])) # actual title
  title_predicted = ' '.join(list(x['预测片名'])) # predicted title
  if title_predicted.strip():
    scores = rouge.get_scores(hyps=title_predicted, refs=title_actual)
    rouge_1 = scores[0]['rouge-1']['f']
    rouge_2 = scores[0]['rouge-2']['f']
    rouge_L = scores[0]['rouge-l']['f'] # rouge-L
    bleu = sentence_bleu(
        references=[title_actual.split(' ')],
        hypothesis=title_predicted.split(' '),
        smoothing_function=smooth
    )
  else:
    rouge_1, rouge_2, rouge_L, bleu = 0.,0.,0.,0.
  return {'rouge_1': rouge_1, 
          'rouge_2': rouge_2,
          'rouge_L': rouge_L, 
          'bleu': bleu
  }

In [87]:
dftest['R1'] = dftest.apply(lambda x: evaluate_single(x)['rouge_1'], axis=1)

In [108]:
view[view.index.isin([97, 31154, 1184, 3874, 4193, 5277, 6924, 9021, 9411, 9557])]

Unnamed: 0,片名,预测片名,梗概
97,萌犬大盗,偷钱的狗,小孩古粒偶然得到一条被训练过偷钱的狗，一时花钱大手大脚享受到伙伴们的讨好，自己误入歧途，但最...
3874,极速旋风,摩托车女王,帅性美丽的季云，她是顶尖摩托车赛手，征服无数的赛道，却有一条从来不敢踏上的巅峰赛道—20年前...
4193,十八岁的后旋踢,跆拳道,高中生林逸为了考入理想中的大学，开始苦练跆拳道。进入大学后，面对爱情的失利，跆拳道成了林逸的...
5277,失恋日志,拍真人秀,杨言是一名影视公司的真人秀导演。和女朋友分手后，他开始像拍真人秀一样给自己录视频日记。通过这...
6924,醉江湖,酒局,大学毕业的王勇不愿承袭父业从事制酒行业，与家庭决裂后踏入大城市追逐自己的“梦想”。从小炼就了...
9021,真假美猴王之战神归来,悟空传奇,西行万里，唐僧师徒团结一心，经过千难万险，到达了白虎岭。孙悟空忠心保护唐僧，却经常被师父责骂...
9411,迷途老爸,奇葩旅行,老爸心疼儿子吴离大城市的生活压力，误入非法直销组织。为了让父亲迷途知返，吴离利用职务之便带上...
9557,从黄土高原到伊朗高原,走西口,陕北小伙王贵城追寻父祖的足迹跳起了秧歌，当留学生波斯女孩Mobina告诉他伊朗也有“秧歌”后...
31154,来吧宝贝,三个女婿,刘一涵年过三十，家庭和谐，事业正处于上升期，除了和丈夫赵达庭还没有孩子这件事，夫妻俩的生活总...


In [102]:
view = dftest[dftest.R1 == 0][['片名', '预测片名', '梗概']]
HTML(view.to_html())

Unnamed: 0,片名,预测片名,梗概
97,萌犬大盗,偷钱的狗,小孩古粒偶然得到一条被训练过偷钱的狗，一时花钱大手大脚享受到伙伴们的讨好，自己误入歧途，但最终认识到自己的错误，将钱还了回去。古粒爷爷癌症做手术没钱，小狗为救爷爷只得再次偷盗，最终献出生命。
219,几字弯人家,石大臣,退伍军人出身的村支书石大臣实施乡村振兴。杨靖楠想从中套取国家投资。石大臣识破杨靖楠诡计，乡村振兴工作全面推进。
283,那条路,我们的家乡,该片讲述三个年轻人为了证明自己回到父辈的家乡，在那里，他们用自己的热情帮助乡亲们一起实现乡村振兴的故事。
436,东武樱花,井上的谎言,初到日本的中国女孩林月，因为一次意外送错的外卖救下了一个不得志的画家井上，为了让井上重燃生活的信心，林月编造了一个谎言和不存在的目的地，但是两人心灵却真实的被这次旅程所净化
633,月明梨花上,侯玉兰,侯玉兰十八岁参军，复员后又支边新疆，风风火火把一辈子热情精力全都奉献给了工作和身边每个人，自己却一直单身。退休后被已成家的养子接到了北京，老战友老同事都说她有福气，玉兰却为自己的身后事忧心忡忡……
744,牛大力的本命年,时代顺风车,2021牛年，是牛大力的本命年，也是在这一年他遭遇了事业与爱情的双重挫败，然后后遇到老同学王学铭，两人反思自己究竟错在哪里，最后赶上网红直播带货这趟时代顺风车，并通过不懈努力终获成功！
758,妈，抱一下,海边的声音,母亲因癌症入院治疗，儿子在二十多天里悉心陪护。可进入晚期的疾病让所有人都束手无策。这时候，儿子决心带母亲逃离病床，带她去完成一个此生未了的心愿。母亲虽然病逝路上，但儿子依然带着她来到了海边。
834,风吹烛,初中生周怡,初中生周怡身患白血病，她对党老师崇拜而有些许懵懂的喜欢。在党老师婚姻破散弟弟牺牲的低谷时期，善解人意的周怡使其重新振作，师生亲如父女。党想尽办法让周怡到大城市接受更好的治疗，却意外得到周怡病逝的消息。
1141,1184,一夜惊魂,余敏一觉醒来，发现身边躺着一个陌生女孩的尸体。同时，电视机开始播放这个房间十分钟以后发生的事情。余敏崩溃了，这到底是怎么回事？一场阴谋还是置身于科幻世界？
1265,金童玉女,布丁和米粒,一对可爱的小狗兄弟，分别被一个小女孩和一个小男孩抱回了家，各自取名布丁和米粒。一年后，一场意外将两只狗调了包。女孩和男孩不仅要面对眼前这只性格突变的狗狗，同时还要面对他们各自的成长变化。


In [68]:
list('萌犬大盗')

['萌', '犬', '大', '盗']

In [73]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], 
                                  use_stemmer=True)
rouge = Rouge()

def calc_scores(x):
  gold = list(x['片名'])
  pred = list(x['预测片名'])


97       {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
212      {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
219      {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
253      {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
283      {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
                               ...                        
42883    {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
42887    {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
42897    {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
42965    {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
43007    {'rouge1': (0.0, 0.0, 0.0), 'rouge2': (0.0, 0....
Length: 1000, dtype: object