## Setup

In [1]:
# Mount Google Drive at /content/drive so that the files under
# /content/drive/MyDrive used by this notebook can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import ast
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from gensim.utils import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

### Define parameters

In [70]:
class Config:
  """Tunable parameters of the notebook, gathered in one place."""

  # Input book: location of the text file and the title used in its page footer
  bookpath: str = "/content/drive/MyDrive/NLP/Book5.txt"
  booktitle: str = "harry potter the order of the phoenix"

  # Number of keywords requested from the keyword model for each sentence
  kw_per_sentence: int = 5

  # Bigrams counted strictly more than this many times seed the input set
  bigfreq_threshold: int = 14
  # Vocabulary bigrams counted at most this many times are added afterwards
  smallfreq_threshold: int = 14


opts = Config()

## Read and clean the file text
In this section, we read the raw text, remove punctuation marks (periods, commas and hyphens are kept, since they are needed to tell sentences apart), strip the page-footer text, and finally extract the list of sentences as well as the cleaned corpus.

In [4]:
def clean_text(path, title):
  '''
  Read a book from a text file and return its sentences and a cleaned corpus.

  The text is lowercased, newlines are deleted, and every punctuation mark
  except '.', ',' and '-' is stripped. The result is split into sentences
  with NLTK, and the page-footer text is removed from each sentence to build
  the corpus.

  Parameters
  ----------
  path : str
      Path of the plain-text book file.
  title : str
      Book title as it appears in the page footer. It is matched literally
      against the lowercased text, so it should be passed in lowercase.

  Returns
  -------
  tuple
      (sentence, corpus): sentence is the list of sentences exactly as the
      tokenizer produced them (footer text is NOT removed from this list);
      corpus is one string made of the footer-free sentences concatenated
      without any separator.
  '''
  # Read the text file
  with open(path) as f:
    book = f.read()

  # Lowercase the text and delete the newlines that were added to the raw file
  book_cln = re.sub('\n', '', book.lower())

  # Strip punctuation in one pass. '.', ',' and '-' are kept because they are
  # used to differentiate the sentences.
  removable = ''.join(p for p in string.punctuation if p not in ('.', ',', '-'))
  book_cln = book_cln.translate(str.maketrans('', '', removable))

  # Tokenize the corpus into a list of sentences
  nltk.download('punkt')
  sentence = sent_tokenize(book_cln)

  # The page footer is part of the text. Build its pattern once, as a raw
  # string so that \d is a regex digit class rather than an invalid string
  # escape, and escape the title so it is matched as literal text.
  # The dots in "j.k. rowling" are deliberately left as regex wildcards so the
  # pattern matches exactly what it matched before.
  footer_pattern = re.compile(r"page  \d+" + re.escape(title) + "j.k. rowling")
  sent_clean = [footer_pattern.sub('', s) for s in sentence]

  # Join the footer-free sentences to get the cleaned corpus.
  # NOTE(review): sentences are joined without a separator, so the last token
  # of one sentence touches the first token of the next - confirm this is
  # intended before changing it, since it affects the word counts downstream.
  corpus = ''.join(sent_clean)

  # NOTE(review): the un-cleaned sentence list is returned, not sent_clean;
  # kept as-is because the keyword extraction step consumes this value.
  return (sentence, corpus)


In [5]:
# Run the cleaning step on the configured book. The title is lowercased
# because clean_text matches it against the lowercased book text.
sentence, corpus = clean_text(opts.bookpath, opts.booktitle.lower())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Words extraction
Let's now break our corpus into a list of words.

In [6]:
# Split the cleaned corpus into word tokens
words = word_tokenize(corpus)

# English stopwords plus the punctuation tokens that should not count as words
nltk.download('stopwords')
stop_words = list(stopwords.words('english'))
my_stopwords = stop_words + ['.',',','“','’','”','—','...']

# Filter through a set: testing membership against the list would scan the
# whole list for every token of the book. The kept words are the same.
stopword_lookup = set(my_stopwords)
words = [w for w in words if w not in stopword_lookup]

# length of words
print('\n')
print("Total number of words in this book - ",len(words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Total number of words in this book -  143721


In [7]:
# Tally how many times each distinct word occurs
count_dict = Counter()
count_dict.update(words)

# A Counter has one entry per distinct word, so its length is the vocabulary size
print("Number of unique words in the book -",len(count_dict))

Number of unique words in the book - 20291


## Keyword extraction 
Here we extract keywords from the text using a BERT model. Such keywords are then saved into a .txt file.

In [None]:
!pip install keybert
from keybert import KeyBERT

In [None]:
# Build the KeyBERT keyword extractor, backed by the 'all-mpnet-base-v2'
# sentence-embedding (SBERT) model.
kw_model = KeyBERT(model='all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
######## Extract keywords with their weights and save them to a text file

kw_per_sentence = opts.kw_per_sentence

# One entry per sentence: the list of (keyword, weight) pairs returned by the
# model, restricted to single words (ngram range 1-1)
sent_keyword = [
  kw_model.extract_keywords(sent,
                            keyphrase_ngram_range=(1,1),
                            stop_words='english',
                            highlight=False,
                            top_n=kw_per_sentence)
  for sent in sentence
]

# Write one line per sentence, each line being the text form of that
# sentence's keyword list. The next section reloads the keywords from this file.
with open('/content/drive/MyDrive/NLP/listfile.txt', 'w') as filehandle:
  filehandle.writelines('%s\n' % listitem for listitem in sent_keyword)

## Distribution estimation

In [71]:
### Load keywords file (generated in the previous section)
with open("/content/drive/MyDrive/NLP/listfile.txt") as file:
    sent_keyword = [line.strip() for line in file]

# Each line is the text form of one sentence's list of (keyword, weight) tuples.
# Parse it with ast.literal_eval, which only accepts Python literals, rather
# than eval, which would execute any code found in the file. Blank lines are
# skipped instead of raising a SyntaxError.
sent_keywords_freq = [
    item
    for sublist in sent_keyword
    if sublist
    for item in ast.literal_eval(sublist)
]

# Keep only the keyword of each (keyword, weight) pair
sent_keywords = [x[0] for x in sent_keywords_freq]

# Clean the keywords by removing single letters and digits (words of length 1)
sent_keywords = [w for w in sent_keywords if len(w)>1]

In [72]:
bigfreq_threshold = opts.bigfreq_threshold
smallfreq_threshold = opts.smallfreq_threshold

# Pair up consecutive keywords of the flat keyword list and count each pair
bigrm = nltk.bigrams(sent_keywords)
bigram_fd = nltk.FreqDist(bigrm)

# The input set starts with the (bigram, count) entries whose count is
# strictly above the big-frequency threshold
input_set = [
  (bigram, count)
  for bigram, count in bigram_fd.items()
  if count > bigfreq_threshold
]

In [73]:
### Extract keywords that appear in the input set. We will use this subset of keywords as our vocabulary corpora

# Collect both words of every selected bigram; the set drops duplicates
vocabulary = set()
for (first_word, second_word), _count in input_set:
  vocabulary.update((first_word, second_word))

# Turn the set into a list because later cells look words up by position.
# The list is sorted because the iteration order of a set of strings can
# change from one kernel session to the next (string hashing is randomized),
# which would make the word indices written to the distribution file differ
# between runs. Sorting makes them reproducible.
keywords_extract = sorted(vocabulary)

print(f"there are {len(keywords_extract)} unique words in the vocabulary set")

there are 39 unique words in the vocabulary set


In [74]:
### Create a list of all possible bigrams made out of our corpora
compositions = [(x,y) for x in keywords_extract for y in keywords_extract if y!=x]

# Same pairs as a set, so that each membership test below is a hash lookup
# instead of a scan of the whole list for every counted bigram
composition_lookup = set(compositions)

# Keep the vocabulary bigrams whose count is at or below the small-frequency threshold
small_freqs = [
  x for x in bigram_fd.items()
  if x[0] in composition_lookup and x[1] <= smallfreq_threshold
]

### We add to our input set bigrams that appear less frequently to make it bigger
# Only bigrams not already present are appended. This keeps the cell safe to
# re-run (it would otherwise append the same entries again) and prevents a
# bigram from being counted twice if the two thresholds overlap.
already_selected = {x[0] for x in input_set}
input_set = input_set + [x for x in small_freqs if x[0] not in already_selected]

In [75]:
### Create distribution file
freqs = [count for _bigram, count in input_set]

# Total count over the input set, computed once rather than on every iteration
total_count = sum(freqs)

# Position of each vocabulary word; setdefault keeps the first position,
# which is what list.index would return
word_index = {}
for position, word in enumerate(keywords_extract):
  word_index.setdefault(word, position)

# Each entry pairs the indexes of the two words of a bigram with the bigram's
# share of the total count
distribution = [
  ((word_index[first_word], word_index[second_word]), count / total_count)
  for (first_word, second_word), count in input_set
]

print(f"there are {len(distribution)} items in the distribution set.")

there are 734 items in the distribution set.


In [76]:
### Save distribution file
# One line per entry, written as the text form of [(idx_1, idx_2), frequency]
with open('/content/drive/MyDrive/NLP/distribution.txt', 'w') as filehandle:
  filehandle.writelines(
    '%s\n' % [listitem, frequency] for listitem, frequency in distribution
  )
