In [37]:
import json
import os
import re
from os import listdir
from os.path import join, isfile
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

import pdfplumber
import spacy

# Input (PDF) and output (work) directories
data_path = os.path.join("D:/", "data", "drmkc", "pdfs")
work_path = os.path.join("D:/", "data", "drmkc", "work")

# Full paths of every regular file in data_path whose name ends with '.pdf'
filenames = [
    path
    for path in (join(data_path, name) for name in listdir(data_path))
    if isfile(path) and path.endswith('.pdf')
]

# spaCy pipeline with the parser and NER components switched off;
# max_length is raised so long documents are accepted by nlp()
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
nlp.max_length = 2500000

# Location of the Mallet launcher script -- update this path for your machine
mallet_path = os.path.join('C:\\', 'mallet', 'mallet-2.0.8', 'bin', 'mallet.bat')

In [4]:
# Accumulator for one dict per PDF; the reading loop below appends to it and
# skips filenames already present, so re-running this cell resets the progress.
data = []

In [5]:
# Read the text of every PDF into `data` (one dict per file).
# Files whose text cannot be extracted are recorded in `problem_files`; their
# entry is still appended, just without a 'text' key.
problem_files = []

# Filenames already present in `data`, so re-running this cell skips them
# without rebuilding the list on every iteration.
processed = {entry.get('filename') for entry in data}
total = len(filenames)

for c, filename in enumerate(filenames, start = 1):

    if filename in processed:
        continue

    progress = "|{0}| {1:.2f} %".format(("="*int(c/total * 50)).ljust(50), c/total * 100)

    entry = {}
    entry['filename'] = filename

    try:
        # `filename` already carries the data_path prefix (it was joined when
        # `filenames` was built), so it is opened as-is.
        with pdfplumber.open(filename) as pdf:
            # Extract each page once; pages without text yield None and are dropped.
            page_texts = (page.extract_text() for page in pdf.pages)
            entry['text'] = '\n'.join(text for text in page_texts if text is not None)
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt
        # still stops the loop.
        print(filename)
        problem_files.append(filename)

    data.append(entry)
    processed.add(filename)

    print(progress, end = "\r")



In [6]:
# Number of entries collected (entries are appended even when text extraction failed)
len(data)

122

In [29]:
# Drop the entries that have no 'text' key (the ones whose extraction failed)
data = [doc for doc in data if 'text' in doc]

In [31]:
# Number of entries left after dropping those without a 'text' key
len(data)

120

In [34]:
# Tokenizer

stop_words = list(nlp.Defaults.stop_words)

def tokenizer_custom(text, stop_words=stop_words, tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Tokenize `text` with the module-level spaCy pipeline `nlp`.

    A token is kept when its text has at least two characters, its coarse
    part-of-speech tag (`pos_`) is in `tags`, and its lower-cased text is not
    in `stop_words`.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : iterable of str, optional
        Lower-case words to exclude. Defaults to spaCy's English stop words.
        Pass an empty iterable (or None) to disable stop-word filtering.
    tags : iterable of str, optional
        Part-of-speech tags to keep.

    Returns
    -------
    list of str
        Lower-cased token texts, in document order.
    """
    # Sets give O(1) membership tests; `or ()` tolerates stop_words=None.
    excluded = frozenset(stop_words or ())
    pos_tags = frozenset(tags)

    tokens = []

    for word in nlp(text):
        if len(word.text) < 2:
            continue
        if word.pos_ not in pos_tags:
            continue
        token = word.text.lower()
        # The stop_words argument was previously accepted but never applied.
        if token in excluded:
            continue
        tokens.append(token)

    return tokens

In [35]:
# Attach a 'tokens' list to every entry and show a text progress bar
for c, entry in enumerate(data, start = 1):
    entry['tokens'] = tokenizer_custom(entry.get('text'))

    bar = ("="*int(c/len(data) * 50)).ljust(50)
    percent = c/len(data) * 100
    progress = "|{0}| {1:.2f} %".format(bar, percent)
    print(progress, end = "\r")



In [39]:
# Write the tokenized entries to a JSON file in the work directory
outname = "drmkc-pdfs_tokenized.json"
out_file = os.path.join(work_path, outname)

with open(out_file, 'w', encoding = 'utf-8') as handle:
    json.dump(data, handle)

In [None]:
# One token list per entry, in the same order as `data`
tokens_list = [doc.get('tokens') for doc in data]

# Dictionary: maps every distinct token to an integer id
id2word = corpora.Dictionary(tokens_list)

# Bag-of-words corpus: one list of (token id, count) tuples per document
corpus = list(map(id2word.doc2bow, tokens_list))

# Peek at the first document
print(corpus[:1])

In [None]:
# First document of the corpus with token ids replaced by the tokens themselves
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

In [None]:
# Train a 10-topic model with Mallet, then convert it to a gensim LdaModel.
# random_seed is fixed so repeated runs give the same topics; with the default
# (0) Mallet seeds itself from the system clock.
# The wrapper gets its own name so `lda_model` always refers to the converted model.
lda_mallet = LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=id2word, random_seed=42)
lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [None]:
# Show Topics
pprint(lda_model.show_topics(formatted=False))

# Compute Coherence Score - https://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
# `texts` takes the tokenized documents the dictionary and corpus were built
# from, i.e. `tokens_list` (the previous name `comments_tokens` is not defined
# anywhere in this notebook).
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokens_list, dictionary=id2word, coherence='c_v')
coherence_ldamodel = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_ldamodel)

In [None]:
# Pretty-print the topics, asking for 10 words per topic (num_words=10)
pprint(lda_model.print_topics(num_words=10))

In [None]:
# Function for getting dominant topic for one corpus entry (bag-of-word tuple - bowt)
def get_dominant_topic(corpus_bowt, ldamodel = lda_model):
    """Return the highest-probability topic for one bag-of-words document.

    `ldamodel[corpus_bowt]` is ranked by its second element (the probability)
    and the top pair is reported.

    Returns a dict with the keys 'dominant_topic' (topic number),
    'topic_probability' and 'topic_keywords' (the words returned by
    `ldamodel.show_topic` for that topic).
    """
    ranked = sorted(ldamodel[corpus_bowt], key = lambda pair: pair[1], reverse = True)
    best = ranked[0]

    return {
        'dominant_topic': best[0],
        'topic_probability': best[1],
        'topic_keywords': [word for word, _ in ldamodel.show_topic(best[0])],
    }

# List of dictionaries - one dictionary containing the dominant topic info for each corpus entry
corpus_dominant_topics = [get_dominant_topic(bowt) for bowt in corpus]

**Workflow**

- [x] Read in PDFs
- [x] Tokenize
- [ ] Filter keywords
- [ ] Keyword analysis
- [ ] Topic model (see reddit-miner example)