### Step 1. Imports

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import spacy
import gensim.corpora as corpora
from pprint import pprint
import pyLDAvis.gensim_models as gensimvis
import pickle
import pyLDAvis
import os
from gensim.models import CoherenceModel

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Step 2. Data read

In [2]:
papers = pd.read_csv('papers.csv')
papers = papers.drop(columns = ['id', 'event_type', 'pdf_name'], axis = 1)
papers = papers.sample(100)

In [3]:
papers.head(2)

Unnamed: 0,year,title,abstract,paper_text
47,1995,Empirical Entropy Manipulation for Real-World ...,Abstract Missing,Empirical Entropy Manipulation for\nReal-World...
1050,1989,A Computer Modeling Approach to Understanding ...,Abstract Missing,A Computer Modeling Approach to Understanding\...


### Step 3. Remove punctuation

In [4]:
papers['clean_text'] = papers['paper_text'].map(lambda x: re.sub('[,\.!?]', '', x))

  papers['clean_text'] = papers['paper_text'].map(lambda x: re.sub('[,\.!?]', '', x))


### Step 4. Remove digits

In [5]:
papers['clean_text'] = papers['clean_text'].apply(lambda x: re.sub('\w*\d\w*', '', x))

  papers['clean_text'] = papers['clean_text'].apply(lambda x: re.sub('\w*\d\w*', '', x))


### Step 5. Lowercase the text

In [6]:
papers['clean_text'] = papers['clean_text'].map(lambda x: x.lower())

### Step 6. Create document term matrix

In [7]:
def create_document_term_matrix(dataframe, column_name):
  cv = CountVectorizer(analyzer = 'word')
  data = cv.fit_transform(dataframe[column_name])
  df_dtm = pd.DataFrame(data.toarray(), columns = cv.get_feature_names_out())
  df_dtm.index = dataframe.index
  return df_dtm

df_dtm = create_document_term_matrix(papers, 'clean_text')
df_dtm.head(3)

Unnamed: 0,__,___,____,_____,________,_a_t_n_e_t_s,_g,_ge,_l,_l_,...,zucker,zurich,zuvs,zuylen,zv,zvi,zx,zxk,zy,zz
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2716,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,3,0,0,0,0,0


### Step 7. Remove stopwords

In [23]:
stop_words = stopwords.words('english')
stop_words.extend(['has', 'been', 're', 'com', 'edu', 'use'])

def convert_sentences_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc = True))
    
def remove_all_stop_words(texts):
  return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

text_to_list = papers.clean_text.values.tolist()
text_as_words = list(convert_sentences_to_words(text_to_list))
clean_words = remove_all_stop_words(text_as_words)

### Step 8. Create Bigram and Trigram

In [9]:
bigram = gensim.models.Phrases(clean_words, min_count = 5, threshold = 100)
trigram = gensim.models.Phrases(bigram[clean_words], threshold = 100)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
  return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
  return [trigram_mod[bigram_mod[doc]] for doc in texts]

### Step 9. Lemmatization

In [16]:
def lemmatization(texts, allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']):
  texts_out = []
  for sent in texts:
    doc = nlp(' '.join(sent))
    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
  return texts_out

nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])

clean_words_bigrams = make_bigrams(clean_words)

clean_words_lemmatized = lemmatization(clean_words_bigrams)

### Step 10. Tokenization

In [25]:
id2word = corpora.Dictionary(clean_words_lemmatized)
texts = clean_words_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

### Step 11. LDA Model

In [12]:
num_topics = 5

lda_model = gensim.models.LdaMulticore(corpus = corpus, id2word = id2word, num_topics = num_topics,
                                     
                                       
                                      )

pprint(lda_model.print_topics())

[(0,
  '0.011*"model" + 0.009*"learn" + 0.009*"use" + 0.007*"set" + 0.007*"datum" + '
  '0.006*"distribution" + 0.006*"network" + 0.005*"result" + 0.005*"give" + '
  '0.005*"function"'),
 (1,
  '0.013*"model" + 0.009*"learn" + 0.008*"use" + 0.006*"function" + '
  '0.006*"datum" + 0.006*"set" + 0.005*"result" + 0.005*"show" + '
  '0.005*"network" + 0.005*"number"'),
 (2,
  '0.012*"model" + 0.008*"use" + 0.008*"learn" + 0.006*"datum" + '
  '0.005*"figure" + 0.005*"function" + 0.005*"result" + 0.005*"show" + '
  '0.005*"distribution" + 0.004*"set"'),
 (3,
  '0.017*"model" + 0.009*"use" + 0.007*"set" + 0.007*"learn" + '
  '0.006*"function" + 0.006*"show" + 0.006*"give" + 0.005*"datum" + '
  '0.005*"feature" + 0.004*"problem"'),
 (4,
  '0.014*"model" + 0.010*"use" + 0.008*"learn" + 0.007*"set" + 0.006*"network" '
  '+ 0.005*"image" + 0.005*"datum" + 0.004*"show" + 0.004*"problem" + '
  '0.004*"sample"')]


In [13]:
doc_lda = lda_model[corpus]

### Step 12. Model Results evaluation

In [14]:
pyLDAvis.enable_notebook()

model_filepath = 'LDA_Model_' + str(num_topics)

LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
with open(model_filepath, 'wb') as f:
  pickle.dump(LDAvis_prepared, f)
  
with open(model_filepath, 'rb') as f:
  LDAvis_prepared = pickle.load(f)
  
pyLDAvis.save_html(LDAvis_prepared, 'LDAvis_prepared_' + str(num_topics) + '.html')

LDAvis_prepared

  default_term_info = default_term_info.sort_values(


### Step 13. Calculate the Coherence score

In [29]:
coherence_model_lda = CoherenceModel(model = lda_model, texts = clean_words_lemmatized, dictionary = id2word, coherence = 'c_v')

coherence_lda = coherence_model_lda.get_coherence()

print(coherence_lda)

0.24360441299038746
