# Train DistilBERT for extractive summarization and NER

In [2]:
import argparse
from time import time
import torch
import numpy as np
import random
import pandas as pd
from sklearn.utils import shuffle
from transformers import DistilBertTokenizer

In [3]:
def is_notebook() -> bool:
  """Tell whether the code is running inside a Jupyter kernel.

  Returns True only when the IPython shell class is ``ZMQInteractiveShell``
  (Jupyter notebook or qtconsole). Terminal IPython, any other shell type,
  and a plain Python interpreter (where ``get_ipython`` is not defined) all
  give False.
  """
  try:
    shell_name = get_ipython().__class__.__name__
  except NameError:
    # get_ipython does not exist: probably the standard Python interpreter
    return False
  return shell_name == 'ZMQInteractiveShell'

## (Hyper-)parameters

In [4]:
# Command-line options (read from the command line in script mode)
parser = argparse.ArgumentParser(description='extractive summary and ner using bert')

parser.add_argument('-is_graphic', type=int, default=1, choices=[0, 1])
parser.add_argument('-gpu_num', type=int, default=0)
parser.add_argument('-batch_size', type=int, default=32)
parser.add_argument('-epochs', type=int, default=100)
parser.add_argument('-dataset', type=str, default="data/wiki_geo_preprocessed.json")

# In a notebook an empty command line is parsed, so every option takes its
# default; in script mode parse_args(None) falls back to sys.argv.
args = parser.parse_args("" if is_notebook() else None)

In [5]:
# parameters
is_graphic = bool(args.is_graphic)
cuda_num = args.gpu_num

# hyper-parameters
batch_size, epochs = args.batch_size, args.epochs
learning_rate, early_stopping = 5e-4, 3
model_name = "DistBERT_ExtSUM_NER"

# Checkpoint sub-folder name: embeds the settings above and the current timestamp
sub_folder_name = "model_name__{}__time__{}__lr__{}__batch_size__{}__cuda_num__{}__early_stopping__{}".format(
  model_name, time(), learning_rate, batch_size, cuda_num, early_stopping
)
checkpoints_folder = "./checkpoints/" + sub_folder_name

# print
print("parse:")
for label, value in (
  ("is_graphic:", is_graphic),
  ("cuda_num:", cuda_num),
  ("epochs", epochs),
  ("batch_size", batch_size),
):
  print(label, value)

parse:
is_graphic: True
cuda_num: 0
epochs 100
batch_size 32


## PyTorch initialisation

In [6]:
# Seed every random number generator in use (PyTorch, random, NumPy) with 0
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
  seed_rng(0)

In [7]:
# Report which accelerator backend PyTorch can see
if not torch.cuda.is_available():
  # No CUDA: fall back to reporting MPS, or nothing at all
  print("MPS available." if torch.backends.mps.is_available() else "No GPU available.")
else:
  # Display the number of available GPUs, then the name of each one
  gpu_count = torch.cuda.device_count()
  print(f"Number of available GPUs: {gpu_count}")
  for gpu_index in range(gpu_count):
    print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

Number of available GPUs: 1
GPU 0: NVIDIA GeForce RTX 3060


In [8]:
# Pick the compute device: the CUDA GPU numbered cuda_num, else MPS, else CPU
if not torch.cuda.is_available():
  dev = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
else:
  dev = f"cuda:{cuda_num}"

device = torch.device(dev)
device

device(type='cuda', index=0)

## Functions

In [9]:
def average(l):
  """Return the arithmetic mean of the items in ``l``.

  ``l`` must support both ``sum`` and ``len``. An empty ``l`` raises
  ZeroDivisionError.
  """
  total, count = sum(l), len(l)
  return total / count

## Load data

In [10]:
# Load the dataset and shuffle it with a fixed random_state
df = shuffle(pd.read_json(args.dataset), random_state=0)

# Three consecutive 100-row slices of the shuffled frame: test, validation, train.
# NOTE(review): rows from position 300 onwards are not used by any split - confirm this is intended.
df_test, df_val, df_train = (df.iloc[start:start + 100] for start in range(0, 300, 100))

In [11]:
# Preview the first two rows of the shuffled frame
df.head(2)

Unnamed: 0,categories,titles,summaries,contents,entities,flat_contents,trunc_contents,labels_entities,labels_sentences
21430,géologie,Sismicité au Japon,La sismicité au Japon est particulièrement imp...,La sismicité au Japon est particulièrement imp...,"[10 septembre, 11 mars, 12 janvier, 1399, 1400...","\n\n\n\nLe Japon est un archipel volcanique, s...",\n\n\n== Plaques et fosses ==\n\nLe Japon est ...,"[[0, E, 0, 0, E, 0, 0, 0, 0, 0, L, C, C, R, C,...","[1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, ..."
7651,géographie générale,Voyage d'études,Voyage d'études est un roman inachevé de l'écr...,Voyage d'études est un roman inachevé de l'écr...,"[1905 en littérature, 1991 en littérature, Abe...","\n\n\n\nEn 1903, Théophile Cart prononce un di...","\n\n\n== Historique ==\n\nEn 1903, Théophile C...","[[0, 0, R, C, 0, 0, 0, 0, E, 0, 0, 0, 0, 0, 0,...","[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [34]:
# URLs:
# * preprocessing 1 : https://towardsdatascience.com/nlp-preprocessing-with-nltk-3c04ee00edc0
# * preprocessing 2 : https://www.nltk.org/api/nltk.tokenize.html

import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
# Run the line below only the first time you use nltk
# nltk.download()

def preprocess_text(text, sent_tokenizer, bert_tokenizer, padding_trunc_doc):
  """Turn one raw document into fixed-length tokenizer inputs.

  The document is split into sentences, the sentences are re-joined with a
  literal " [SEP] " marker, and the result is encoded with padding and
  truncation to ``padding_trunc_doc`` tokens.

  Args:
    text: raw document string.
    sent_tokenizer: callable mapping a string to a list of sentence strings.
    bert_tokenizer: tokenizer object exposing ``encode_plus``.
    padding_trunc_doc: value passed as ``max_length`` (pad / truncate target).

  Returns:
    Tuple ``(input_ids, attention_mask)``: the two encoded tensors with their
    leading dimension removed (presumably the batch dimension of size 1 that
    ``return_tensors="pt"`` adds - confirm against the tokenizer docs).
  """
  # tokenize sentence
  sentences = sent_tokenizer(text)

  # Add [SEP] between consecutive sentences
  document = " [SEP] ".join(sentences)

  # tokenize with bert tokenizer
  encoding = bert_tokenizer.encode_plus(
    document,
    add_special_tokens=True,
    padding="max_length",
    max_length=padding_trunc_doc,
    return_tensors="pt",
    truncation=True
  )

  # squeeze(0) removes only the leading dimension; a bare squeeze() would also
  # collapse a length-1 sequence into a 0-d tensor.
  input_ids = encoding['input_ids'].squeeze(0)
  attention_mask = encoding['attention_mask'].squeeze(0)

  return input_ids, attention_mask

In [35]:
from nltk.tokenize import LineTokenizer, sent_tokenize

# preprocess df
def preprocess_df(df, bert_tokenizer, padding_trunc_doc, doc_column_name="docs", labels_sum_column_name="labels_sum", labels_ner_column_name=None, is_sep_n=False):
  """Encode every document of ``df`` and pair it with its labels.

  Args:
    df: frame holding the documents and label columns.
    bert_tokenizer: tokenizer forwarded to ``preprocess_text``.
    padding_trunc_doc: sequence length forwarded to ``preprocess_text``.
    doc_column_name: column containing the document text.
    labels_sum_column_name: column containing the summarization labels.
    labels_ner_column_name: column containing the NER labels, or None.
    is_sep_n: if True, split documents into lines with nltk's LineTokenizer;
      otherwise split with nltk's ``sent_tokenize``.

  Returns:
    A list with one dict per row of ``df``. Each dict has "idx", "input_ids"
    and "attention_mask". With NER labels it also has "labels_sum" and
    "labels_ner"; without them the summarization labels are stored under
    "labels" instead.
  """
  if is_sep_n:
    sent_tokenizer = LineTokenizer().tokenize
  else:
    sent_tokenizer = sent_tokenize

  with_ner = labels_ner_column_name is not None
  # The summarization labels use a different key depending on the NER option
  sum_key = "labels_sum" if with_ner else "labels"

  samples = []
  for row_id in df.index:
    input_ids, attention_mask = preprocess_text(
      df[doc_column_name][row_id],
      sent_tokenizer=sent_tokenizer,
      bert_tokenizer=bert_tokenizer,
      padding_trunc_doc=padding_trunc_doc,
    )
    sample = {
      "idx": row_id,
      "input_ids": input_ids,
      "attention_mask": attention_mask,
      sum_key: df[labels_sum_column_name][row_id],
    }
    if with_ner:
      sample["labels_ner"] = df[labels_ner_column_name][row_id]
    samples.append(sample)

  return samples

In [44]:
# Load the tokenizer once and share it across the three splits
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# preprocess_df names its length parameter `padding_trunc_doc`; any other
# keyword for it raises a TypeError.
train_dataset, val_dataset, test_dataset = (
  preprocess_df(
    df=split,
    bert_tokenizer=distilbert_tokenizer,
    padding_trunc_doc=512,
    doc_column_name="flat_contents",
    labels_sum_column_name="labels_sentences",
    labels_ner_column_name="labels_entities",
    is_sep_n=False,
  )
  for split in (df_train, df_val, df_test)
)

In [37]:
# Count the tokens with id 102 in the first training sample
(train_dataset[0]["input_ids"] == 102).sum()

tensor(10)

In [38]:
# Number of entries stored under "labels_sum" for the first training sample
len(train_dataset[0]["labels_sum"])

39

In [42]:
# Inspect the encoded token ids of the first training sample
train_dataset[0]["input_ids"]

tensor([  101,  2474,  5715,  9765,  8740, 13926,  1011,  9765,  4241, 17155,
        16778,  2078,  1012,   102,  2365,  8945, 12514,  9765,  1037,  1016,
         1010,  1019,  2463,  1037,  1048,  1005,  9765,  2139,  3002,  1011,
         5578,  1011,  1041, 25394,  3366,  3802,  1037,  1022,  2463,  1037,
         1048,  1005, 15068,  4355,  2139,  3347, 21031,  3126,  1012,   102,
         3393, 18856,  9581,  2102, 21864, 14418, 21162,  5562,  2474,  5715,
         9765, 24209, 11475,  8873,  2063,  1010,  4372,  2230,  1010,  2139,
         1077, 18856,  9581,  2102,  4153,  7413, 23151,  2278,  1090,  1010,
         7367,  7811,  2474,  5939, 18155,  8649,  2666,  4078, 18856,  9581,
         3215,  2139,  2474,  2605, 21864,  4012, 13876,  2063,  2632,  5668,
        17504,  2102,  2882,  2015,  4127,  2139, 18856,  9581,  3215,  4372,
         6005, 15049,  1012,   102,  4372, 12609,  1010,  2474,  5715, 24501,
        21748,  2102,  4241,  2828,  1077, 18856,  9581,  2102, 

In [40]:
# Show the raw text of the first training document (label-based lookup)
df_train.loc[df_train.index[0], "flat_contents"]

"\n\n\nLa commune est au nord-est du Cotentin. Son bourg est à 2,5 km à l'est de Saint-Pierre-Église et à 8 km à l'ouest de Barfleur.\n\n\nLe climat qui caractérise la commune est qualifié, en 2010, de « climat océanique franc », selon la typologie des climats de la France qui compte alors huit grands types de climats en métropole. En 2020, la commune ressort du type « climat océanique » dans la classification établie par Météo-France, qui ne compte désormais, en première approche, que cinq grands types de climats en métropole. Ce type de climat se traduit par des températures douces et une pluviométrie relativement abondante (en liaison avec les perturbations venant de l'Atlantique), répartie tout au long de l'année avec un léger maximum d'octobre à février.\nLes paramètres climatiques qui ont permis d’établir la typologie de 2010 comportent six variables pour les températures et huit pour les précipitations, dont les valeurs correspondent à la normale 1971-2000. Les sept principales 

In [41]:
# Load the tokenizer and read the id it assigns to its separator token
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
sep_token_id = tokenizer.sep_token_id

# Print the separator token id (the message text is in French)
print(f"ID du token [SEP]: {sep_token_id}")

ID du token [SEP]: 102


In [43]:
# Print the encoded length of every training sample; index with i so each
# sample is checked, not only the first one.
for i in range(len(train_dataset)):
  print(len(train_dataset[i]["input_ids"]))

512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
