Training a genere classification model using the NRTA Database for Genere Classification, with the transferred knowledge, use the model to predict movie genres.

# [1] Mount Drive

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# [2] Install requirements, Load Lib, Set WD

In [None]:
%%capture
!pip install transformers
!pip install datasets

In [None]:
import pandas as pd
import numpy as np
import os
import gc
import torch

import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
from transformers import TrainingArguments, Trainer

In [None]:
path_wd = '/content/drive/MyDrive/Github/Content'
path_NRTA = '/content/drive/MyDrive/Github/Content/sources/NRTA'

# [3] Prepare Data for Classification with Pandas

In [2]:
# import database as df
df = pd.read_json(path_NRTA + '/records/contents_of_registrations.json')

# drop samples without genre
df = df[~((df.题材 == '') | (df.题材.isna()))]

# drop unused columns
df = df.drop(['集数', '报备机构', '公示年月', '许可证号', 
         '体裁', '拍摄日期', '制作周期','省级管理部门备案意见', 
         '相关部门意见', '备注'], axis=1)

# separate category into time category and genre category
df['label_time'] = df['题材'].apply(lambda x: x[:2])
df['label_genre'] = df['题材'].apply(lambda x: x[2:])

# Concat TV Series Title and Summary
df['summary'] = df[['剧名', '内容提要']].apply(lambda x: 
      u'{}。{}'.format(x['剧名'], x['内容提要']), axis=1)

# assign catetory id to time and genre
df['catid_time'] = df['label_time'].astype('category').cat.codes
df['catid_genre'] = df['label_genre'].astype('category').cat.codes

# Drop unecessary columns for ML
df = df.drop(['剧名', '内容提要'], axis=1)

NameError: ignored

In [None]:
df.head(2)

Unnamed: 0,题材,label_time,label_genre,summary,catid_time,catid_genre
0,当代军旅,当代,军旅,铁军。1984年，二十万铁道兵脱下军装成为一支不穿军装的“铁军”。尚武和他儿子尚志同共同负责...,1,3
1,近代革命,近代,革命,雪影追踪。1945年8月15日日本投降，共同抗日的国共双方特工从战友变成敌人，在东北雪城展开...,3,12


##  Time category

In [None]:
# drop cat_time == '重大' from df
df_time = df[(df.catid_time !=4)].copy()
df_time = df_time.drop(['label_genre', 'catid_genre', '题材'], axis=1)

# calcualte total numbers of categories
n_cat_time = df_time['catid_time'].nunique()

# create onehot representation
df_time['onehot'] = df_time['catid_time'].apply(lambda x: 
    [float(1) if i==x else float(0) for i in range(n_cat_time)])

# create dictionary for id2label and label2id
id2label_time = dict(zip(df_time['catid_time'].tolist(), df_time['label_time'].tolist()))
label2id_time = dict(zip(df_time['label_time'].tolist(), df_time['catid_time'].tolist()))

# Split df into train, val and test at 80%, 15% and 5% stratified.
df_time_train = df_time.groupby('catid_time').sample(frac=0.80, random_state=42)
df_time_not_train = df_time[~df_time.index.isin(df_time_train.index)]
df_time_val = df_time_not_train.groupby('catid_time').sample(frac=0.75, random_state=42)
df_time_test = df_time_not_train[~df_time_not_train.index.isin(df_time_val.index)]

In [None]:
# Sanity Check...
df_time_test.head(2)

Unnamed: 0,label_time,summary,catid_time,onehot
3,当代,公诉精英。江城检察院检察官安旎受检察长许爱琳指派，提前介入调查一起大学生投湖“自尽”失踪案，...,1,"[0.0, 1.0, 0.0, 0.0]"
6,近代,征途万里。1934年10月，红军团长陆海川在一次战斗中被炸成重伤被送进医院治疗，医院却被敌人...,3,"[0.0, 0.0, 0.0, 1.0]"


In [None]:
label2id_time

{'古代': 0, '当代': 1, '现代': 2, '近代': 3}

## Genre Category

In [None]:
# create df_genre
df_genre = df[['label_genre', 'summary', 'catid_genre']].copy()

# calcualte total numbers of categories
n_cat_genre = df['catid_genre'].nunique()

# create onehot representation
df_genre['onehot'] = df_genre['catid_genre'].apply(lambda x: 
    [float(1) if i==x else float(0) for i in range(n_cat_genre)])

# create dictionary for id2label and label2id
id2label_genre = dict(zip(df_genre['catid_genre'].tolist(), df_genre['label_genre'].tolist()))
label2id_genre = dict(zip(df_genre['label_genre'].tolist(), df_genre['catid_genre'].tolist()))

# Split df into train, val and test at 80%, 15% and 5%
df_genre_train = df_genre.groupby('catid_genre').sample(frac=0.80, random_state=42)
df_genre_not_train = df_genre[~df_genre.index.isin(df_genre_train.index)]
df_genre_val = df_genre_not_train.groupby('catid_genre').sample(frac=0.75, random_state=42)
df_genre_test = df_genre_not_train[~df_genre_not_train.index.isin(df_genre_val.index)]

In [None]:
label2id_genre

{'传奇': 0,
 '传记': 1,
 '其它': 2,
 '军旅': 3,
 '农村': 4,
 '宫廷': 5,
 '武打': 6,
 '涉案': 7,
 '神话': 8,
 '科幻': 9,
 '都市': 10,
 '青少': 11,
 '革命': 12}

In [None]:
# Sanity Check...
df_genre_test.head(2)

Unnamed: 0,label_genre,summary,catid_genre,onehot
8,都市,更好的青春。学生处主任决心要成立一个教师团队解决较为棘手的大学生心理和就业问题，对于这个特殊...,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
13,其它,走出时间。人到中年的职业摄影师厉志出生在一个林奇综合征家族，因为长期忙着照顾家人，不光荒废了...,2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Clean Memory

In [None]:
# Free up some memory
torch.cuda.empty_cache()

In [None]:
del df
gc.collect()

194

# [4] Setup For Finetuning - SequenceClassification

## Define Key Training Parameters

In [None]:
#########################
PATH_SAVE = '/content/drive/MyDrive/Github/Content/tools/models/'
N_LABELS = n_cat_genre
ID2LABEL = id2label_genre #  id2label_genre
LABEL2ID = label2id_genre # label2id_genre
BATCH_SIZE = 12
DFTRAIN = df_genre_train # df_genre_train
DFVAL = df_genre_val # df_genre_val
DFTEST = df_genre_test
#########################

## Instaniate tokenizer and model

"hfl/chinese-bert-wwm"
"adamlin/bert-distil-chinese"

In [None]:
# assign device
if torch.cuda.device_count() > 0:
  device = 'cuda:' + str(torch.cuda.current_device())
else:
  device = 'cpu'

# Instantiate tokenizer and model
checkpoint = "hfl/chinese-bert-wwm"
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint, 
    problem_type="multi_label_classification")
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, 
    num_labels=N_LABELS,
    problem_type="multi_label_classification",
    id2label=ID2LABEL,
    label2id=LABEL2ID,
).to(device)

Some weights of the model checkpoint at hfl/chinese-bert-wwm were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint

## Feed pandas df through DataSet

In [None]:
# instaniate data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# define function to process pd.DataFrame to datasets.Datasets
def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    inputs = tokenizer(
        batch["summary"],
        #padding='max_length', use data_collater for dynamic padding
        truncation=True,
        max_length=512,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    #batch["token_type_ids"] = inputs.token_type_ids
    batch["labels"] = batch['onehot']

    return batch

In [None]:
# Load Train, Val into DataSet

dataset_train = Dataset.from_pandas(DFTRAIN)
dataset_train = dataset_train.map(
  process_data_to_model_inputs,
  batched=True,
  batch_size=BATCH_SIZE,
  remove_columns=['__index_level_0__', 'onehot', 'summary',
                  'catid_genre','label_genre'],
)
dataset_train.set_format(
  type="torch",
  columns=["input_ids", "attention_mask", "labels"],
)
dataset_val = Dataset.from_pandas(DFVAL)
dataset_val = dataset_val.map(
  process_data_to_model_inputs,
  batched=True,
  batch_size=BATCH_SIZE,
  remove_columns=['__index_level_0__', 'onehot', 'summary',
                  'catid_genre','label_genre'],
)
dataset_val.set_format(
  type="torch",
  columns=["input_ids", "attention_mask", "labels"],
)

HBox(children=(FloatProgress(value=0.0, max=826.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=155.0), HTML(value='')))




In [None]:
# Sanity Check
dataset_train[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1

## define compute_metric function

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)
    gold = np.argmax(labels, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

## Instaniate training_args and Trainer

In [None]:
training_args = TrainingArguments(
    output_dir= PATH_SAVE,
    evaluation_strategy='steps',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=100,
    eval_steps=500,
    save_steps=500,
    #gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    save_total_limit=4,
    num_train_epochs=8,
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics
)

# [5] Train and SAVE

In [None]:
trainer.train()
#resume_from_checkpoint=True

In [None]:
# Save Model
trainer.save_model(PATH_SAVE + '/' + 'chinese-bert-wwm-classification-NRTAgenre2')

Saving model checkpoint to /content/drive/MyDrive/Github/Content/tools/models//chinese-bert-wwm-classification-NRTAgenre2
Configuration saved in /content/drive/MyDrive/Github/Content/tools/models//chinese-bert-wwm-classification-NRTAgenre2/config.json
Model weights saved in /content/drive/MyDrive/Github/Content/tools/models//chinese-bert-wwm-classification-NRTAgenre2/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Github/Content/tools/models//chinese-bert-wwm-classification-NRTAgenre2/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Github/Content/tools/models//chinese-bert-wwm-classification-NRTAgenre2/special_tokens_map.json


#[6] TEST

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
############
model_name = '/content/drive/MyDrive/Github/Content/tools/models/chinese-bert-wwm-classification-NRTAgenre2'
############
if torch.cuda.device_count() > 0:
  device = 'cuda:' + str(torch.cuda.current_device())
else:
  device = 'cpu'

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
  model_name,
  num_labels=N_LABELS,
  problem_type="multi_label_classification",
  id2label=ID2LABEL,
  label2id=LABEL2ID,  
).to(device)
model.eval() # set model to eval mode for faster prediction

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [1]:
ID2LABEL

NameError: ignored

In [None]:
batch_size = 8
i = 0
ls = df_genre_test['summary'].tolist()
softmax = torch.nn.Softmax(dim=-1)
L = df_genre_test.shape[0]
test_predictions = []

while i < L:
  batch_test = tokenizer(ls[i:i+batch_size],
                           padding=True,
                           max_length=512, 
                           truncation=True, 
                           return_tensors='pt')
  batch_test.to(device)
  batch_outputs = model(**batch_test)
  batch_logtis = batch_outputs.logits
  batch_softmax = softmax(batch_logtis)
  batch_results = torch.argmax(batch_softmax, dim=1).cpu().numpy()
  test_predictions.extend(list(batch_results))
  i += batch_size

In [None]:
test_golds = df_genre_test['catid_genre'].tolist()

In [None]:
df_genre_test['pred'] = test_predictions
df_genre_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,label_genre,summary,catid_genre,onehot,pred
8,都市,更好的青春。学生处主任决心要成立一个教师团队解决较为棘手的大学生心理和就业问题，对于这个特殊...,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
13,其它,走出时间。人到中年的职业摄影师厉志出生在一个林奇综合征家族，因为长期忙着照顾家人，不光荒废了...,2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
37,都市,不完美女人。职业女性欧阳娜一心想升职，却因想同时照顾好家庭和女儿而焦头烂额。职场竞争中落败后...,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
56,涉案,女子中队。柳城市公安局女刑警景莉受命组建第一支交警女子中队，上岗前一次堵截任务让女子中队颜面...,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",7
58,都市,云淡风轻。成功的音乐人蓝天即将举家搬迁，在整理东西时，他发现了一个尘封已久的皮箱，一封封信、...,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
...,...,...,...,...,...
12293,都市,生命的职责。某学院研修班学员真子，拿着自己的《生命的职责》著述，向导师汇报。导师认为真子对生...,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
12297,传奇,米脂婆姨绥德汉。少年蛮娃与父亲为土财主梁老大运送的货物遭到抢劫，梁老大四处捉拿蛮娃父子，蛮娃...,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
12308,军旅,大兵团。1951年夏天，新疆解放战役结束，进疆官兵面临着新的选择。原十四团团长罗正雄脱下军装...,3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
12333,传奇,迷局。上海“孤岛”时期，百乐门舞厅的舞后茉莉在疯狂一夜后，尸体被发现在家中。私家侦探吴飞调查...,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

f1 = f1_score(test_golds, test_predictions, average='micro')
accuracy = accuracy_score(test_golds, test_predictions)
print('f1: {}, accuracy: {}'.format(f1, accuracy))
# f1: 0.7625201938610662, accuracy: 0.7625201938610663 #genre KEEP
# f1: 0.7463651050080776, accuracy: 0.7463651050080775 #genre2

f1: 0.7463651050080776, accuracy: 0.7463651050080775
