### Setting Working Directory

In [None]:
import os

# The r"..." prefix makes this a raw string, so the backslashes of the Windows path are taken literally
project_dir = r"D:\ORBA Winter Semester 2020-2021\Scientific Project\Text Mining\Recommended Materials"
os.chdir(project_dir)
print(os.getcwd())

### Installing Libraries

In [None]:
# Library needed to read the PDF file containing conference proceedings
!pip install PyPDF2

# Library needed for text pre-processing
!pip install nltk

# Libraries needed for generating word cloud (Exploratory Data Analysis of the pre-processed tokens)
!pip install wordcloud
!pip install matplotlib

# Installing GenSim- a python-based open-source framework for unsupervised topic modeling and natural language processing
!pip install gensim

# Installing Regular Expressions (RegEx) library
# RegEx is a sequence of characters that forms a search pattern. RegEx are used to match strings of text such
#as particular characters, words, or patterns of characters.
#!pip install regex

# Installing the pyldavis library for visualization
!pip install pyldavis


### Importing Library Functions

In [None]:
# Importing function to read PDF file contents
from PyPDF2 import PdfFileReader

# Importing functions needed to handle regular expressions, tokenize text (sentence & word), remove stopwords,punctuations and lemmatize (Text Pre-processing)
import re
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

# Importing POS tags function of NLTK
from nltk import pos_tag

# Importing functions for generating the term-frequency histogram and the word cloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Importing functions needed for topic modelling
import gensim
from gensim import corpora

# Importing functions needed to perform model evaluation
from gensim.models import CoherenceModel

#Importing function for topic model visualization
import pyLDAvis.gensim_models

### Reading the PDF File and Extracting Text

In [None]:
# Open the proceedings PDF in binary mode and wrap it in a PyPDF2 reader.
# The file handle is left open here because later cells read pages through pdfReader.
filename = 'INFORMS.pdf'
pdfFileObj = open(filename, mode='rb')
pdfReader = PdfFileReader(pdfFileObj, strict=False)

In [None]:
# Getting the number of pages in the file; this value bounds the page-reading loop in the next cell
num_pages = pdfReader.numPages 
print(num_pages)

In [None]:
# Start from the same two-space seed string and append the extracted text of every page in order
text = '''  '''

# count is the 1-based number of pages consumed so far (it stays 0 when the file has no pages)
count = 0
for count in range(1, num_pages + 1):
    pageObj = pdfReader.getPage(count - 1)
    text += pageObj.extractText()
    

In [None]:
# Dump the complete extracted text for a visual sanity check of the PDF extraction
print(text)

### Tokenization of Text- Using NLTK Tokenizer

In [None]:
# Performing Word Tokenization
tokens = word_tokenize(text)

# Only the final expression of a cell is echoed, so the type and the token count
# have to be printed explicitly to show up in the output.
print(type(tokens))
print(len(tokens))
print(tokens)

# Alternative Way to Tokenize 
#tokens= text.split()


In [None]:
# Keep only the tokens that consist purely of alphabetic characters
alphabetic_only = list(filter(str.isalpha, tokens))
print(alphabetic_only)

In [None]:
# Normalise the alphabetic tokens to lower case and report how many there are
lower_case_tokens = list(map(str.lower, alphabetic_only))
print(lower_case_tokens)
print(len(lower_case_tokens))

In [None]:
# Performing Parts of Speech Tagging (POS-Tagging)
# Note: the tagger is applied to the raw `tokens` list, not to the filtered / lower-cased tokens.
POS_tags=pos_tag(tokens)
print(POS_tags)


### Removal of Stopwords, Numeric Characters and Punctuations 

In [None]:
# Set of punctuation characters; used by the punctuation-removal cell further down
exclude = set(string.punctuation)
print(exclude)

# English stop-word list from the NLTK corpus, held as a set for fast membership tests
stop = set(stopwords.words('english'))
print(stop)

# Keep only the lower-cased tokens that are not stop words
tokens_without_stopwords = list(filter(lambda token: token not in stop, lower_case_tokens))
print(tokens_without_stopwords)
print(len(tokens_without_stopwords))
type(tokens_without_stopwords)

In [None]:
# Removing numeric tokens: every token that is not purely numeric is kept (as a str)
stopwords_num_free_tokens = [str(token) for token in tokens_without_stopwords if not token.isnumeric()]
print(stopwords_num_free_tokens)
        

In [None]:
# Removing punctuation: drop every token that is itself one of the characters in `exclude`
punc_free_tokens = [token for token in stopwords_num_free_tokens if token not in exclude]

print(punc_free_tokens)

### Lemmatization/Stemming of Strings

In [None]:
# Lemmatization of Tokens: each cleaned token is passed through the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
lemma = list(map(lemmatizer.lemmatize, punc_free_tokens))

print(lemma)
print(len(lemma))
type(lemma)

In [None]:
# Stemming of Tokens

#from nltk.stem.porter import PorterStemmer
#porter= PorterStemmer()
#stemmed=[porter.stem(token) for token in lemma]
#lemma=stemmed
#print(lemma)
#print(len(lemma))

### Creating a gensim corpus Using the Lemmatized Strings

In [None]:
# The gensim dictionary needs each document as a list of tokens, so every string in `lemma`
# is split on whitespace into its own token list. Without this, the error
# "doc2bow expects an array of unicode tokens on input, not a single string" is raised.
lemma_array = list(map(str.split, lemma))


# Alternative way to split strings in the lemma
#lemma_array= [item.split('-') for item in lemma]

In [None]:
# Defining the dictionary of the corpus: every unique term is assigned an integer id
dictionary = corpora.Dictionary(lemma_array)

# Converting the tokenized and cleaned documents into bag-of-words vectors
# (the document-term matrix) using the dictionary.
# The loop variable is named doc_tokens so it does not reuse the name of the extracted `text`.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in lemma_array]

# Printing corpus and dictionary.
# A bare `corpus` expression in the middle of a cell is never displayed, so a preview is printed explicitly.
# dictionary.token2id
print(corpus[:10])
#print(dictionary)

for term in dictionary.values():
    print(term)

### Exploratory Data Analysis of Pre-Processed Tokens 

In [None]:
# Plotting Term Frequency Histogram
%matplotlib inline

word_freq= nltk.FreqDist(lemma)
plt.hist(word_freq.values(), bins = range(50))
plt.show()

In [None]:
# Number of distinct lemmas recorded in the frequency distribution
len(word_freq)

In [None]:
# Hapaxes: the lemmas that occur exactly once
print(word_freq.hapaxes())

In [None]:
# Plotting of Word Cloud
# WordCloud expects plain text. Joining the tokens with spaces avoids feeding it the repr of a
# nested list (brackets, commas and quote characters), which its tokenizer can glue onto the words.
data_tokens = ' '.join(token for doc_tokens in lemma_array for token in doc_tokens)
print(type(data_tokens))
wordcloud = WordCloud(width = 800, height = 500,
                background_color ='white',
                min_font_size = 10).generate(data_tokens)

# Render the generated image without axes or padding
plt.figure(figsize = (5, 5), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

### Topic Modelling Using Latent Dirichlet Allocation (LDA) with gensim

In [None]:
# SPECIFYING FEW MODEL PARAMETERS
# Alternative definitions (3, 4 or 5 topics) trained on the document-term matrix:
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=100)
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=100)
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=100)

# SPECIFYING MORE MODEL PARAMETERS
# Active model: 3 topics, trained on the document-term matrix with a fixed random seed.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=100,
    alpha='auto',
    per_word_topics=True,
)

# Same parameter set with 4 topics:
#ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, random_state=100, update_every=1, chunksize=100,passes=100, alpha='auto',per_word_topics=True)

# Same parameter set with 5 topics:
#ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100, update_every=1, chunksize=100,passes=100, alpha='auto',per_word_topics=True)

In [None]:
# Printing the topics of the model with the weights of the top 10 keywords contributing to each topic
topics = ldamodel.print_topics(num_words=10)
# Print the results, one topic per line
for topic in topics:
    print(topic)

### Evaluating The LDA Topic Model By Computing Model Perplexity and Coherence Score

In [None]:
# Computing the perplexity figure for the trained model on the training corpus.
# NOTE(review): gensim documents log_perplexity as returning a per-word likelihood bound rather than
# the perplexity itself - confirm how this number should be interpreted before comparing models.
perplexity_bound = ldamodel.log_perplexity(corpus)
print('\nPerplexity: ', perplexity_bound)

# Computing the c_v coherence score of the model against the tokenised documents
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=lemma_array,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Determining the Optimal Number of Topics

In [None]:
# Train one LDA model per candidate topic count (1 to 10) and record its c_v coherence score
coherence_values = []
model_list = []

for i in range(1, 11):
    # A fixed seed (the same value the main model uses) makes the coherence curve reproducible across runs
    model = gensim.models.ldamodel.LdaModel(corpus, num_topics=i, id2word=dictionary, passes=20, random_state=100)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=lemma_array, dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())
    #print(coherence_values)
    #print(model_list)
      

In [None]:
# Coherence score of every candidate model plotted against its number of topics (1 to 10)
x = range(1, 11)
plt.plot(x, coherence_values)
plt.grid(True)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Scores')
plt.title('Plot of Coherence Scores Vs Topic Numbers')
plt.show()

### Visualizing the Topics

In [None]:
# Prepare the pyLDAvis data structure from the trained model, the bag-of-words corpus and the dictionary.
# NOTE(review): sort_topics=False presumably keeps the model's own topic numbering - confirm against the pyLDAvis docs.
lda_display= pyLDAvis.gensim_models.prepare(ldamodel,corpus,dictionary,sort_topics=False)

# Last expression of the cell, so its result is what the notebook renders
pyLDAvis.display(lda_display)

### Assigning Topics To The Lemmatized Strings(Documents) - Maybe Not Important

In [None]:
# Installing the Pandas library 
!pip install pandas

In [None]:
import pandas as pd

In [None]:
#  Determining What topic a given text is about by finding the topic number with the highest percentage contribution in the text.

def get_topic_details(ldamodel, corpus):
    """Return the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : trained topic model; ``ldamodel[corpus]`` must yield one result per document.
        Each result is either a list of ``(topic_id, probability)`` pairs or, when the model
        was built with ``per_word_topics=True``, a tuple whose first element is that list.
    corpus : bag-of-words corpus handed to ``ldamodel[...]``.

    Returns
    -------
    pandas.DataFrame with one row per document, in corpus order, and the float columns
    ``'Dominant_Topic'`` and ``'% Score'``. A document with no topic assignment gets a
    NaN row so the frame stays aligned with the corpus. An empty corpus gives an empty
    frame that still carries both columns.
    """
    per_word_topics = getattr(ldamodel, 'per_word_topics', False)

    # Collect plain records and build the frame once: DataFrame.append no longer exists
    # in pandas 2.x, and growing a frame row by row is quadratic.
    records = []
    for doc_result in ldamodel[corpus]:
        # With per-word topics enabled the topic distribution is the first element of the result
        topic_dist = doc_result[0] if per_word_topics else doc_result
        if not topic_dist:
            records.append((float('nan'), float('nan')))
            continue
        # max() returns the first pair with the highest probability, i.e. the dominant topic
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        records.append((topic_num, prop_topic))

    return pd.DataFrame(records, columns=['Dominant_Topic', '% Score'], dtype=float)

In [None]:
# Put each lemma next to the dominant topic found for it (frames are joined column-wise on the row index)
contents = pd.DataFrame({'Original text': lemma})
dominant_topics = get_topic_details(ldamodel, corpus)
topic_details = pd.concat([dominant_topics, contents], axis=1)
topic_details.head()

In [None]:
# Generating the topic associated with each document:
# print the document index followed by the model's output for that document.
count = 0
for count, k in enumerate(ldamodel[corpus]):
    print("doc : ", count, k)