### Installing BookNLP

In [None]:
!pip install booknlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting booknlp
  Downloading booknlp-1.0.7.tar.gz (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=4.11.3 (from booknlp)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers>=4.11.3->booknlp)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers>=4.11.3->booknlp)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

### Loading the datasets

In [None]:
# Mounting Google Drive at /content/drive so the dataset files stored in Drive can be
# read and written through ordinary file paths
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Importing necessary libraries for preprocessing
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Both source datasets live in the same Drive folder; build each path from that folder
DATASET_DIR = '/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation'

df_movie = pd.read_csv(f'{DATASET_DIR}/CMU_Movie_Dataset')
df_books = pd.read_csv(f'{DATASET_DIR}/CMU_Book_Dataset')

In [None]:
# Add the (initially empty) output columns that the processing cells below fill in
df_movie = df_movie.assign(Characters="", Relations="")
df_books = df_books.assign(Characters="", Relations="")

### BookNLP Implementation + Sentiment Extraction

1) Running BookNLP on both datasets \\
2) Extracting the sentences in which two people are connected by a verb \\
3) Scoring the sentiment of those sentences with the NLTK (VADER) sentiment analyzer

In [None]:
# Build the BookNLP pipeline once; constructing it downloads the models it needs
from booknlp.booknlp import BookNLP

model_params = dict(
    pipeline="entity,quote,supersense,event,coref",
    model="big",
)

booknlp = BookNLP("en", model_params)

using device cpu
{'pipeline': 'entity,quote,supersense,event,coref', 'model': 'big'}
downloading entities_google_bert_uncased_L-6_H-768_A-12-v1.0.model
downloading coref_google_bert_uncased_L-12_H-768_A-12-v1.0.model
downloading speaker_google_bert_uncased_L-12_H-768_A-12-v1.0.1.model


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/270M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

--- startup: 41.313 seconds ---


In [None]:
# NLTK's VADER sentiment analyzer, plus the 'vader_lexicon' resource downloaded for it
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

### CMU Movie Summary Dataset Processing

In [None]:
def apply_booknlp_movies(x):
  """Run BookNLP on one movie summary and derive its characters and their relations.

  Args:
    x: One row of the movie dataset. Must provide 'Wikipedia movie ID' and 'Summary'.

  Returns:
    A tuple (characters, relations) of two strings: the extracted character names
    joined with ", ", and one "<A> and <B> have <sentiment> relationship" statement
    per character pair joined with ". ".

  Side effects:
    Writes '<movie id>.txt' to the working directory and has BookNLP write its
    output files under './create_dataset/movie/'.
  """
  book_id = x['Wikipedia movie ID']

  # BookNLP processes a file on disk, so write the summary out first
  input_file = f'{book_id}.txt'
  with open(input_file, 'w') as f:
    f.write(x['Summary'])

  # Files within this directory are named <book_id>.entities, <book_id>.tokens, etc.
  output_directory = "./create_dataset/movie/"
  booknlp.process(input_file, output_directory, book_id)

  # Read the results back from the directory BookNLP was told to write to, instead of
  # a separate hard-coded absolute path that only matches for one working directory
  output_prefix = f'{output_directory}{book_id}'

  # Rebuild the sentences from the .tokens table
  # NOTE(review): read_table uses pandas' default quote/NA handling; a token that is a
  # bare double quote or a word such as "NA" may be misparsed - check a sample file and
  # consider quoting=csv.QUOTE_NONE / keep_default_na=False
  df_sentences = pd.read_table(f'{output_prefix}.tokens')
  sentences = df_sentences.groupby(['sentence_ID'])['word'].apply(lambda words: ' '.join(words)).tolist()

  # Half-open token span [start, end) of every sentence, from cumulative word counts.
  # The first sentence starts at 0; no sentence shares a boundary token with the next.
  sentence_spans = []
  span_start = 0
  for sentence in sentences:
    span_end = span_start + len(sentence.split())
    sentence_spans.append((span_start, span_end))
    span_start = span_end

  # .entities: keep only PER (person) mentions that are PROP (proper nouns),
  # as (start token, mention text) pairs
  df = pd.read_table(f'{output_prefix}.entities')
  people = [
      (token, name)
      for token, name, category, kind in zip(df['start_token'], df['text'], df['cat'], df['prop'])
      if category == 'PER' and kind == "PROP" and isinstance(name, str)
  ]

  # .supersense: start tokens of everything tagged verb.social, collected once
  df_verb = pd.read_table(f'{output_prefix}.supersense')
  social_verb_tokens = [
      token
      for token, category in zip(df_verb['start_token'], df_verb['supersense_category'])
      if "verb.social" in str(category)
  ]

  # Dictionary to hold identity pairs along with the sentences in which they were referenced
  paired_sent = {}

  for (span_start, span_end), sentence in zip(sentence_spans, sentences):
    # People mentioned inside this sentence's token span
    names = [name for token, name in people if span_start <= token < span_end and name in sentence]

    # Only sentences with exactly two, differently named, people are of interest
    if len(names) != 2 or names[0] == names[1]:
      continue

    # ...and only if the sentence also contains a social verb
    if not any(span_start <= token < span_end for token in social_verb_tokens):
      continue

    # Collect the sentence under the pair's key, reusing the reversed key if that is
    # how the pair was first recorded
    first, second = names
    key = f'{first} and {second}'
    reverse_key = f'{second} and {first}'
    if key not in paired_sent and reverse_key in paired_sent:
      key = reverse_key
    paired_sent[key] = paired_sent.get(key, "") + " " + sentence

  # Extracting character names
  # NOTE(review): this depends on the layout of BookNLP's .book.html - every second child
  # of the first tag is presumably a text line whose name sits between the first and last
  # word before the first "/"; confirm against a sample file
  with open(f'{output_prefix}.book.html', 'r') as f:
    contents = f.read()
  doc = BeautifulSoup(contents, "html.parser")
  characters = []
  tag = doc.findAll(True)[0]
  for idx, i_tag in enumerate(tag):
    if idx>0 and idx%2==0 and idx<len(tag)-1:
      char_name = ' '.join(i_tag.split("/")[0].split()[1:-1])
      if char_name!="":
        characters.append(char_name)

  # Getting a sentiment for the pairs, according to the sentences extracted.
  # The analyzer is built once and reused for every pair.
  relations = {'neu':"neutral",'pos':"positive",'neg':"negative"}
  analyzer = SentimentIntensityAnalyzer()
  inter_char_relations = []
  for key, pair_text in paired_sent.items():
    ss = analyzer.polarity_scores(pair_text)
    # Drop the aggregate score so the label is the largest of neg / neu / pos
    del ss['compound']
    inter_char_relations.append(f'{key} have {relations[max(ss, key=ss.get)]} relationship')

  return (", ".join(characters),". ".join(inter_char_relations))

In [None]:
# REMOVE THIS CELL AT THE END AND CHANGE THE LOCATION FILE NAME
# Temporary batching: keep rows 6000-6999 only and renumber them from 0
df_movie = df_movie.iloc[6000:7000].reset_index(drop=True)

In [None]:
# Processing for character extraction and relation analysis
# Index labels of the rows whose summary could not be processed
labels = []
for idx, row in df_movie.iterrows():
  try:
    characters, relations = apply_booknlp_movies(row)
  except Exception:
    # Best effort: remember the failing row and carry on with the next summary.
    # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
    labels.append(idx)
  else:
    # Assign through .at on the frame itself; the chained df[col].iloc[idx] = ...
    # form can end up writing to a temporary copy instead of df_movie
    df_movie.at[idx, 'Characters'] = characters
    df_movie.at[idx, 'Relations'] = relations

In [None]:
# Removing all the rows that could not be processed, then renumbering the remaining rows
df_movie = df_movie.drop(index=labels).reset_index(drop=True)

In [None]:
# Saving the updated dataset (CSV content, written without the index column)
movie_output_path = '/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/M_Dataset_6K_7K'
df_movie.to_csv(movie_output_path, index=False)

### CMU Books Summary Dataset Processing

In [None]:
def apply_booknlp_books(x):
  """Run BookNLP on one book plot summary and derive its characters and their relations.

  Args:
    x: One row of the book dataset. Must provide 'Wikipedia article ID' and 'Plot summary'.

  Returns:
    A tuple (characters, relations) of two strings: the extracted character names
    joined with ", ", and one "<A> and <B> have <sentiment> relationship" statement
    per character pair joined with ". ".

  Side effects:
    Writes '<article id>.txt' to the working directory and has BookNLP write its
    output files under './create_dataset/book/'.
  """
  book_id = x['Wikipedia article ID']

  # BookNLP processes a file on disk, so write the summary out first
  input_file = f'{book_id}.txt'
  with open(input_file, 'w') as f:
    f.write(x['Plot summary'])

  # Files within this directory are named <book_id>.entities, <book_id>.tokens, etc.
  output_directory = "./create_dataset/book/"
  booknlp.process(input_file, output_directory, book_id)

  # Read the results back from the directory BookNLP was told to write to, instead of
  # a separate hard-coded absolute path that only matches for one working directory
  output_prefix = f'{output_directory}{book_id}'

  # Rebuild the sentences from the .tokens table
  # NOTE(review): read_table uses pandas' default quote/NA handling; a token that is a
  # bare double quote or a word such as "NA" may be misparsed - check a sample file and
  # consider quoting=csv.QUOTE_NONE / keep_default_na=False
  df_sentences = pd.read_table(f'{output_prefix}.tokens')
  sentences = df_sentences.groupby(['sentence_ID'])['word'].apply(lambda words: ' '.join(words)).tolist()

  # Half-open token span [start, end) of every sentence, from cumulative word counts.
  # The first sentence starts at 0; no sentence shares a boundary token with the next.
  sentence_spans = []
  span_start = 0
  for sentence in sentences:
    span_end = span_start + len(sentence.split())
    sentence_spans.append((span_start, span_end))
    span_start = span_end

  # .entities: keep only PER (person) mentions that are PROP (proper nouns),
  # as (start token, mention text) pairs
  df = pd.read_table(f'{output_prefix}.entities')
  people = [
      (token, name)
      for token, name, category, kind in zip(df['start_token'], df['text'], df['cat'], df['prop'])
      if category == 'PER' and kind == "PROP" and isinstance(name, str)
  ]

  # .supersense: start tokens of everything tagged verb.social, collected once
  df_verb = pd.read_table(f'{output_prefix}.supersense')
  social_verb_tokens = [
      token
      for token, category in zip(df_verb['start_token'], df_verb['supersense_category'])
      if "verb.social" in str(category)
  ]

  # Dictionary to hold identity pairs along with the sentences in which they were referenced
  paired_sent = {}

  for (span_start, span_end), sentence in zip(sentence_spans, sentences):
    # People mentioned inside this sentence's token span
    names = [name for token, name in people if span_start <= token < span_end and name in sentence]

    # Only sentences with exactly two, differently named, people are of interest
    if len(names) != 2 or names[0] == names[1]:
      continue

    # ...and only if the sentence also contains a social verb
    if not any(span_start <= token < span_end for token in social_verb_tokens):
      continue

    # Collect the sentence under the pair's key, reusing the reversed key if that is
    # how the pair was first recorded
    first, second = names
    key = f'{first} and {second}'
    reverse_key = f'{second} and {first}'
    if key not in paired_sent and reverse_key in paired_sent:
      key = reverse_key
    paired_sent[key] = paired_sent.get(key, "") + " " + sentence

  # Extracting character names
  # NOTE(review): this depends on the layout of BookNLP's .book.html - every second child
  # of the first tag is presumably a text line whose name sits between the first and last
  # word before the first "/"; confirm against a sample file
  with open(f'{output_prefix}.book.html', 'r') as f:
    contents = f.read()
  doc = BeautifulSoup(contents, "html.parser")
  characters = []
  tag = doc.findAll(True)[0]
  for idx, i_tag in enumerate(tag):
    if idx>0 and idx%2==0 and idx<len(tag)-1:
      char_name = ' '.join(i_tag.split("/")[0].split()[1:-1])
      if char_name!="":
        characters.append(char_name)

  # Getting a sentiment for the pairs, according to the sentences extracted.
  # The analyzer is built once and reused for every pair.
  relations = {'neu':"neutral",'pos':"positive",'neg':"negative"}
  analyzer = SentimentIntensityAnalyzer()
  inter_char_relations = []
  for key, pair_text in paired_sent.items():
    ss = analyzer.polarity_scores(pair_text)
    # Drop the aggregate score so the label is the largest of neg / neu / pos
    del ss['compound']
    inter_char_relations.append(f'{key} have {relations[max(ss, key=ss.get)]} relationship')

  return (", ".join(characters),". ".join(inter_char_relations))

In [None]:
# REMOVE THIS CELL AT THE END AND CHANGE THE LOCATION FILE NAME
# Temporary batching: keep rows 6000-7999 only and renumber them from 0
df_books = df_books.iloc[6000:8000].reset_index(drop=True)

In [None]:
# Preprocessing of the book summaries
# Index labels of the rows whose summary could not be processed
labels = []
for idx, row in df_books.iterrows():
  try:
    characters, relations = apply_booknlp_books(row)
  except Exception:
    # Best effort: remember the failing row and carry on with the next summary.
    # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
    labels.append(idx)
  else:
    # Assign through .at on the frame itself; the chained df[col].iloc[idx] = ...
    # form can end up writing to a temporary copy instead of df_books
    df_books.at[idx, 'Characters'] = characters
    df_books.at[idx, 'Relations'] = relations

In [None]:
# Remove the rows that could not be processed, then renumber the remaining rows
df_books = df_books.drop(index=labels).reset_index(drop=True)

In [None]:
# Saving the updated dataset (CSV content, written without the index column)
book_output_path = '/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/B_Dataset_6K_8K'
df_books.to_csv(book_output_path, index=False)