In [8]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from PyPDF2 import PdfFileReader
import pandas as pd
import numpy as np  # was misspelled "numby", which breaks every later use of np
# iterate over the files in a folder
import glob
#preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

#keyword libraries
import gensim
#!pip install rake-nltk
from rake_nltk import Rake
#!pip install git+https://github.com/LIAAD/yake
import yake
#!pip install git+https://github.com/boudinfl/pke.git
import pke

In [58]:
# Loop over the PDF files in E:\books and record, for each one, its metadata
# title and the comma-separated titles of its outline (bookmark) entries.
# BookTitles[i] and contents[i] always describe the same file.
from pdfminer.pdfdocument import PDFNoOutlines  # raised by get_outlines() when a PDF has no bookmarks

BookTitles=[]
contents=[]
for filepath in glob.iglob(r'E:\books\*.pdf'):
    with open(filepath, 'rb') as fp:
        pdf = PdfFileReader(fp)
        # getDocumentInfo() returns None when the PDF has no info dictionary;
        # store a missing title as None instead of crashing on `.title`.
        info = pdf.getDocumentInfo()
        BookTitle = info.title if info is not None else None

        parser = PDFParser(fp)
        document = PDFDocument(parser)
        try:
            # Materialize while the file is still open: outline entries are
            # read from `fp` as they are iterated.
            outlines = list(document.get_outlines())
        except PDFNoOutlines:
            # No bookmarks: keep the book with an empty table of contents so
            # one outline-less PDF does not abort the whole scan.
            outlines = []

    # Each outline entry is a (level, title, dest, a, se) tuple; every title is
    # followed by ", " (including the last one), exactly as before.
    content_title = "".join(title + ", " for (level, title, dest, a, se) in outlines)

    BookTitles.append(BookTitle)
    contents.append(content_title)

In [60]:
# One (title, table-of-contents) row per book. The list of tuples is handed to
# pandas directly: going through np.array adds nothing here and would store the
# strings in a fixed-width array before pandas copies them out again.
data = list(zip(BookTitles, contents))
df = pd.DataFrame(data, columns=['BookTitle', 'TableContents'])

In [61]:
# Preview the first rows (BookTitle, TableContents) of the assembled frame.
df.head()

Unnamed: 0,BookTitle,TableContents
0,Hands-On Machine Learning with Scikit-Learn an...,"Cover, Copyright, Table of Contents, Preface, ..."
1,,"Table of Contents, About the Author, About the..."
2,,"Table of Contents, About the Authors, About th..."
3,,"List of Figures, List of Figures, List of Figu..."
4,,"Table of Contents, About the Author, About the..."


Pre-processing

In [108]:
# Start from NLTK's English stop-word list.
stop_words = set(stopwords.words('english'))
## Custom stopwords: boilerplate that shows up in book outlines (front matter,
## structural headings, number words) and carries no topical meaning.
new_words = ["fig","figure","image","sample","using",
             "show", "result", "large",
             "first","second" ,"third","one", "two", "three",
             "four", "five", "six", "seven","eight","nine","chapter","cover",  # "six" was missing from the number words
             "copyright", "table", "content", "preface","author","part"
             ,"problem", "solution", "discussion", "see", "also","section","overview",
             "introduction","acknowledgment","edition","list","summary", "exercise"]
# Still a list, as before, but sorted so its order is the same on every kernel
# start (iteration order of a set of strings is not stable between runs).
stop_words = sorted(stop_words.union(new_words))

def pre_process(text):
    """Normalize a table-of-contents string for keyword extraction.

    Steps: lowercase, replace every run of digits / non-word characters with a
    single space, split on whitespace, lemmatize each token with WordNet, then
    drop stop words and tokens shorter than three characters.

    Args:
        text: Raw string to clean.

    Returns:
        The surviving tokens joined by single spaces ('' if nothing survives).
    """
    # lowercase, then remove special characters and digits
    tokens = re.sub(r"(\d|\W)+", " ", text.lower()).split()
    # lemmatize
    lmtzr = WordNetLemmatizer()
    # The module-level stop_words is a list; a local set makes each membership
    # test O(1) instead of a scan over the whole list for every token.
    blocked = set(stop_words)
    # remove stopwords and words of fewer than three letters in one pass
    kept = [
        lemma
        for lemma in (lmtzr.lemmatize(token) for token in tokens)
        if lemma not in blocked and len(lemma) >= 3
    ]
    return ' '.join(kept)

In [109]:
# Clean every table of contents and keep the result next to the raw text.
df['CleanTableContents'] = df['TableContents'].apply(pre_process)

In [110]:
# Preview the frame again; it now also holds the CleanTableContents column.
df.head()

Keyword extraction with gensim, rake_nltk, and yake

Keyphrase extraction with pke (TopicRank, SingleRank)

In [214]:
#gensim
def gensim_model(text, words=15, ratio=0.5):
    """Extract keywords from ``text`` via ``gensim.summarization.keywords``.

    Args:
        text: Document to extract keywords from.
        words: Number of keywords to request (default 15). The recorded run
            returned 14 entries for 15 requested, so treat it as an upper bound.
        ratio: Fraction of the text to keep. gensim only consults it when
            ``words`` is None; with the default ``words=15`` it has no effect.

    Returns:
        A list of keyword strings (``split=True``, ``scores=False``).
    """
    return gensim.summarization.keywords(
        text,
        ratio=ratio,              # ignored unless words is None
        words=words,              # number of returned words
        split=True,               # return a list instead of one joined string
        scores=False,             # do not return keyword scores
        pos_filter=('NN', 'JJ'),  # part-of-speech filter (nouns, adjectives)
        lemmatize=True,           # lemmatize words
        deacc=True,               # remove accentuation
    )
# Try it on the cleaned table of contents of the book in row 1.
print(gensim_model(df.loc[1, 'CleanTableContents']))

['ann', 'file', 'installed', 'cnn', 'model', 'feature', 'application', 'recognition', 'layer', 'optimize', 'trained', 'weight', 'project', 'python implementation']


In [232]:
#Automatic Keyword Extraction algorithm (RAKE) using NLTK
#Doesn't work with preprocessed data
def rake_model(text):
    """Return the first ten (score, phrase) pairs RAKE ranks for ``text``."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    ranked_phrases = extractor.get_ranked_phrases_with_scores()
    return ranked_phrases[:10]
# RAKE is run on the raw (uncleaned) table of contents of the book in row 1.
print(rake_model(df.loc[1, 'TableContents']))

[(35.5, 'deploying trained model using fruits 360 dataset'), (34.0, 'python locate libraries ?, manual installation'), (31.2, 'python installers locate libraries ?, preparing'), (31.166666666666664, 'backpropagation algorithm important ?, forward vs'), (28.5, 'deploying trained model using cifar10 dataset'), (24.166666666666668, 'fruits 360 dataset feature mining'), (19.666666666666664, 'image analysis using fc network'), (16.333333333333332, 'building android application using buildozer'), (16.0, 'projects using pip installer'), (15.666666666666666, 'basic application using boxlayout')]


In [233]:
#Keyword Extractor (Yake)
def get_keywords_yake(text):
    """Run YAKE on ``text`` and return the result of ``extract_keywords``.

    The extractor is configured for English, n-grams of size up to 2 and at
    most 10 keywords; in the recorded output each entry is a
    (keyword, score) tuple.
    """
    extractor_options = dict(
        lan='en',            # language
        n=2,                 # n-gram size
        dedupLim=0.9,        # deduplication threshold
        dedupFunc='seqm',    # deduplication algorithm
        windowsSize=1,
        top=10,              # number of keywords
        features=None,
    )
    extractor = yake.KeywordExtractor(**extractor_options)
    return extractor.extract_keywords(text)
# Compare YAKE on the raw and on the cleaned table of contents of row 1.
print("-------------------Before cleaning Data-------------------------")
keywords = get_keywords_yake(df.loc[1, 'TableContents'])
print(keywords)
print("---------------------After cleaning Data-----------------------")
keywords2 = get_keywords_yake(df.loc[1, 'CleanTableContents'])
print(keywords2)

-------------------Before cleaning Data-------------------------
[('hidden layer', 0.0001876858385499582), ('trained model', 0.0003367550481360212), ('python implementation', 0.00043387377481509255), ('deploying trained', 0.000539293862029752), ('feature mining', 0.0005633338619663957), ('partial derivatives', 0.0005785845502473087), ('engineered features', 0.0005854036585875834), ('image recognition', 0.0006345839249222994), ('neural networks', 0.0006400209937720095), ('complete code', 0.0006400209937720095)]
---------------------After cleaning Data-----------------------
[('hidden layer', 0.0017542211782990385), ('partial derivative', 0.0021070977872515665), ('test pypi', 0.002407176680042636), ('engineered feature', 0.0024851508133507984), ('trained model', 0.0025873971442424057), ('python implementation', 0.0030798748576603024), ('pooling operation', 0.00316064668087735), ('importing module', 0.0032586473137344294), ('max pooling', 0.0033003471701552574), ('deploying trained', 0.00

In [234]:
#Keyphrases extraction using pke (TopicRank)
def pke_TopicRank(text, n=15):
    """Extract keyphrases from ``text`` with pke's unsupervised TopicRank.

    Args:
        text: Document text passed to ``load_document``.
        n: How many of the highest-scored candidates to return (default 15).

    Returns:
        The result of ``extractor.get_n_best``; in the recorded output this is
        a list of (keyphrase, score) tuples.
    """
    # 1. create a TopicRank extractor.
    extractor = pke.unsupervised.TopicRank()
    # 2. load the content of the document.
    extractor.load_document(input=text)
    # 3. select the longest sequences of nouns and adjectives, that do
    #    not contain punctuation marks or stopwords as candidates.
    pos = {'NOUN', 'PROPN', 'ADJ'}
    stoplist = list(string.punctuation)
    # bracket placeholder tokens (-lrb-, -rrb-, ...) are excluded as well
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    extractor.candidate_selection(pos=pos, stoplist=stoplist)
    # 4. build topics by grouping candidates with HAC (average linkage,
    #    threshold=0.80), weight the topics using random walk, and select
    #    the first occurring candidate from each topic.
    #    NOTE(review): the earlier comment said "1/4 of shared stems", which
    #    appears to describe pke's documented default of 0.74 rather than
    #    0.80 -- confirm which value is intended.
    extractor.candidate_weighting(threshold=0.80, method='average')
    # 5. get the n highest-scored candidates as keyphrases
    keyphrases = extractor.get_n_best(n=n)
    return keyphrases
# Compare TopicRank on the raw and on the cleaned table of contents of row 1.
print("-------------------Before cleaning Data-------------------------")
keywords = pke_TopicRank(df.loc[1, 'TableContents'])
print(keywords)
print("---------------------After cleaning Data-----------------------")
keywords2 = pke_TopicRank(df.loc[1, 'CleanTableContents'])
print(keywords2)

-------------------Before cleaning Data-------------------------
[('anns', 0.043192441566586595), ('cnn', 0.032480254166656966), ('chapter', 0.029776977101706462), ('hidden layer', 0.02777480258548022), ('linear models', 0.02732021426843336), ('feature extraction', 0.02557925079801863), ('introduction', 0.02288111247268137), ('recognition', 0.02245413820988211), ('file upload', 0.0198146853837059), ('ann optimization', 0.019170460282572604), ('filter example', 0.018820747501588218), ('weight optimization', 0.01792119802480089), ('python implementation', 0.01656274541673227), ('backpropagation', 0.016244422789789853), ('tf core', 0.015713363777847038)]
---------------------After cleaning Data-----------------------
[('layer weight update', 0.08044962297618079), ('feature selection reduction filter wrapper', 0.0734962359728493), ('template dynamic template static file', 0.06457255648910591), ('chain partial derivative interpreting backpropagation', 0.05777641106636927), ('weight backprop

In [235]:
#Keyphrases extraction using pke (SingleRank)
def pke_SingleRank(text, n=15):
    """Extract keyphrases from ``text`` with pke's unsupervised SingleRank.

    Args:
        text: Document text passed to ``load_document`` (English, no
            normalization).
        n: How many of the highest-scored candidates to return (default 15).

    Returns:
        The result of ``extractor.get_n_best``; in the recorded output this is
        a list of (keyphrase, score) tuples.
    """
    # define the set of valid Part-of-Speeches
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a SingleRank extractor.
    extractor = pke.unsupervised.SingleRank()
    # 2. load the content of the document.
    extractor.load_document(input=text,language='en',normalization=None)
    # 3. select the longest sequences of nouns and adjectives as candidates.
    extractor.candidate_selection(pos=pos)
    # 4. weight the candidates using the sum of their words' scores, which are
    #    computed using random walk. In the graph, nodes are words of certain
    #    part-of-speech (nouns and adjectives) that are connected if they
    #    occur in a window of 10 words.
    extractor.candidate_weighting(window=10,pos=pos)
    # 5. get the n highest-scored candidates as keyphrases
    keyphrases = extractor.get_n_best(n=n)
    return keyphrases
# Compare SingleRank on the raw and on the cleaned table of contents of row 1.
print("-------------------Before cleaning Data-------------------------")
keywords = pke_SingleRank(df.loc[1, 'TableContents'])
print(keywords)
print("---------------------After cleaning Data-----------------------")
keywords2 = pke_SingleRank(df.loc[1, 'CleanTableContents'])
print(keywords2)

-------------------Before cleaning Data-------------------------
[('building android application using buildozer', 0.041164332459831976), ('image analysis using fc network', 0.03441072097251105), ('ann using ga', 0.03276208120598136), ('complete python implementation', 0.031025182217785476), ('ann implementation', 0.029133072372588255), ('tensorflow recognition application', 0.028216672032801718), ('image recognition pipeline', 0.028174204279277165), ('cnn model', 0.02623059307219868), ('kivy application life cycle', 0.02585740486818981), ('image recognition', 0.025472977686913964), ('ann optimization', 0.025467504307264466), ('training ann', 0.023708015230337613), ('graph visualization using tb', 0.023463259914996537), ('simple python project', 0.02345673550145131), ('python implementation', 0.02321515234085222)]
---------------------After cleaning Data-----------------------
[('rate training ann filter example ann architecture activation function python implementation learning rate t