In [1]:
import os.path
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt

import gensim 
from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet

import gensim.corpora as corpora
from gensim.corpora import Dictionary

from gensim import matutils, models

import pyLDAvis.gensim
import string
import time
import spacy
# Show full cell contents when displaying frames: paragraphs are long and
# would otherwise be truncated in the notebook output.
pd.set_option('display.max_colwidth', None)

# English spaCy pipeline; the preprocessing function below calls `nlp` on each text.
nlp = spacy.load("en_core_web_sm")

In [115]:
# Read the paragraph-level corpus, then discard the 'Unnamed: 0' column
# (presumably a leftover index from an earlier to_csv call — verify against the producer).
paragraphs_path = "../processed_data/text_by_paragraph.csv"
text = pd.read_csv(paragraphs_path)
text = text.drop(columns=['Unnamed: 0'])

In [None]:
# Preview the first rows of the freshly loaded frame.
text.head()

In [117]:
def preprocess(text,
               min_token_len = 2,
               irrelevant_pos = ('PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'ADJ')):
    """
    Given text, min_token_len, and irrelevant_pos carry out preprocessing of the text
    and return a preprocessed string.

    The text is parsed with the module-level spaCy pipeline `nlp`. A token is kept
    only if its part-of-speech tag is not in `irrelevant_pos`, it is not a stop word,
    and it has at least `min_token_len` characters. Kept tokens are replaced by their
    lemma and joined with single spaces.

    Parameters
    -------------
    text : (str)
        the text to be preprocessed
    min_token_len : (int)
        minimum number of characters a token must have to be kept
    irrelevant_pos : (iterable of str)
        POS tags (spaCy `token.pos_` values) whose tokens are dropped

    Returns
    -------------
    (str) the preprocessed text, or the sentinel string "missing value" when
    spaCy cannot parse the input
    """
    try:
        doc = nlp(text)
    except (TypeError, ValueError):
        # Best-effort handling of unparseable cells (e.g. NaN from an empty CSV field).
        # NOTE(review): assumes spaCy signals bad input with TypeError/ValueError —
        # confirm against the installed spaCy version. Anything else is a real
        # failure and is allowed to propagate instead of being silently hidden.
        return "missing value"

    # Set membership is constant-time; also accepts any iterable from the caller.
    excluded_pos = set(irrelevant_pos)

    lemmas = [
        token.lemma_
        for token in doc
        if token.pos_ not in excluded_pos
        and not token.is_stop
        # Honour the parameter; the threshold used to be hard-coded to 2.
        and len(token) >= min_token_len
    ]

    return " ".join(lemmas)

In [118]:
# Run the spaCy-based cleaning over every raw paragraph and keep the result
# next to the original text.
preprocessed_paragraphs = text['Corpus'].apply(preprocess)
text['Preprocessed_text'] = preprocessed_paragraphs

In [None]:
# Inspect the frame again to check the new 'Preprocessed_text' column.
text.head()

## Creating the model

In [120]:
# Tokenise: every preprocessed paragraph becomes a list of whitespace-separated lemmas.
corpus = [paragraph.split() for paragraph in text['Preprocessed_text'].tolist()]

# Build the vocabulary and prune it with gensim's filter_extremes:
# no_below=10 / no_above=0.1 / keep_n=100000 bound how rare, how common and
# how numerous the retained tokens may be.
dictionary = Dictionary(corpus)
dictionary.filter_extremes(no_below=10, no_above=0.1, keep_n=100000)

# Bag-of-words representation of each paragraph against the pruned vocabulary.
doc_term_matrix = list(map(dictionary.doc2bow, corpus))

In [121]:
# Spot-check the vocabulary: look up the token stored under id 1.
dictionary.get(1)

'Mali'

In [122]:
# Hyper-parameter grid for the LDA runs: every num_topics value is combined
# with every passes value by the training loop.
experiments = dict(
    num_topics=[3, 10, 20],
    passes=[10, 20, 30, 40],
)

In [123]:
# Parallel lists filled by the training loop: position i in each list
# describes the same trained model.
results = {field: [] for field in ('model', 'model_name', 'running_time')}

In [None]:
# Train one LDA model per (num_topics, passes) combination and record the
# model, a human-readable name and the wall-clock training time.

# Start from empty lists so that re-running this cell does not append a second
# copy of every model (later cells pick a model from `results` by position).
for collected in results.values():
    collected.clear()

for num_topics in experiments['num_topics']:
    for passes in experiments['passes']:
        start = time.perf_counter()
        model = models.LdaModel(corpus=doc_term_matrix,
                                id2word=dictionary,
                                num_topics=num_topics,
                                passes=passes,
                                # Fixed seed: LDA training is stochastic, so without it
                                # topics change on every run and cannot be reproduced.
                                random_state=42)
        elapsed = time.perf_counter() - start

        title = f'LDA using {num_topics} topics and {passes} passes'

        results['model'].append(model)
        results['model_name'].append(title)
        results['running_time'].append(elapsed)

        print(title)
        print('-------------------')
        print(model.print_topics())
        print('\n')
            

## Visualizing the data

In [None]:
# Pick the model to inspect. With the experiment grid iterated as
# num_topics [3, 10, 20] x passes [10, 20, 30, 40], position 3 is the
# 3-topic / 40-pass run (provided the training cell was executed exactly once).
lda = results['model'][3]

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# sort_topics=False is passed so pyLDAvis does not reorder the topics.
vis_wpn = pyLDAvis.gensim.prepare(
    lda,
    doc_term_matrix,
    dictionary,
    sort_topics=False,
)
vis_wpn

In [176]:
def get_most_prob_topic(unseen_document, dictionary=dictionary, model = lda, ambiguity_threshold=0.6):
    """
    Given an unseen_document and a trained LDA model, find the most likely topic
    (the topic with the highest probability) in the topic distribution the model
    infers for the document, and flag whether that assignment is ambiguous.

    The document is run through `preprocess` inside this function, so callers
    should pass raw text rather than already-preprocessed text.

    Parameters
    ------------
    unseen_document : (str)
        the raw document to be labeled with a topic

    dictionary: (gensim Dictionary)
        dictionary of the LDA model; the default is whatever the global
        `dictionary` was when this function was defined

    model : (gensim ldamodel)
        the trained LDA model; the default is whatever the global `lda`
        was when this function was defined

    ambiguity_threshold : (float)
        the assignment is labeled 'Ambiguous' when the winning topic's
        probability is strictly below this value


    Returns:
    -------------
        (tuple) `(topic_id, ambiguity)` where `topic_id` is the id of the most
        probable topic and `ambiguity` is the string 'Ambiguous' or
        'Not Ambiguous'

    Examples:
    ----------
    >> get_most_prob_topic("The research uses an HMM for discovering gene sequence.",
                            model = lda)
    (2, 'Not Ambiguous')   # illustrative output; actual ids depend on the trained model
    """
    tokens = preprocess(unseen_document).split()
    bow = dictionary.doc2bow(tokens)

    # Indexing the model with a single bag-of-words returns that document's
    # list of (topic_id, probability) pairs.
    topic_distribution = model[bow]

    best_topic_id, best_probability = max(topic_distribution, key=lambda pair: pair[1])
    ambiguity = 'Ambiguous' if best_probability < ambiguity_threshold else 'Not Ambiguous'

    return best_topic_id, ambiguity

def get_first(the_tuple):
    '''Return the item at position 0 of `the_tuple`.

    Meant to be used with `apply` to unpack a column of tuples in a dataframe.
    '''
    first_item = the_tuple[0]
    return first_item

def get_second(the_tuple):
    '''Return the item at position 1 of `the_tuple`.

    Meant to be used with `apply` to unpack a column of tuples in a dataframe.
    '''
    second_item = the_tuple[1]
    return second_item

def split_freq(text):
    '''Split one `weight*"word"` term into its weight and its word.

    Helper function for the frequency dataframe.

    Parameters
    ------------
    text : (str)
        a single term of the form `weight*"word"`, optionally surrounded by
        whitespace (as left over when a topic string is split on "+")

    Returns
    ------------
    (tuple) `(weight, word)`, both as strings, with surrounding whitespace
    removed and the double quotes stripped from the word
    '''
    # Split on the first "*" only, so a word that itself contains "*" stays intact.
    parts = text.split("*", 1)
    weight, quoted_word = parts[0], parts[1]

    # Strip the padding around each piece so values such as ' 0.012' or 'word '
    # do not end up in the dataframe.
    return weight.strip(), quoted_word.replace('"', '').strip()

In [149]:
# Label every paragraph with its most probable topic and an ambiguity flag.
# get_most_prob_topic runs `preprocess` itself, so it is fed the raw 'Corpus'
# text: passing 'Preprocessed_text' would preprocess each paragraph twice and
# the model would score different tokens than the ones it was trained on.
topic_assignments = text['Corpus'].apply(get_most_prob_topic, args=(dictionary, lda))

# Each assignment is a (topic_id, ambiguity) tuple; split it into two columns.
# 'Topic' is created before 'Ambiguity' to keep the column order stable.
text['Topic'] = topic_assignments.apply(get_first)
text['Ambiguity'] = topic_assignments.apply(get_second)

In [None]:
# Inspect the frame again to check the new 'Topic' and 'Ambiguity' columns.
text.head()

In [151]:
# Export a subset of columns, selected and ordered by position.
# NOTE(review): the positions depend on the column layout of the input CSV plus
# the columns added above — confirm they still point at the intended columns.
columns_to_export = [0, 7, 8, 10, 9, 11]
text.iloc[:, columns_to_export].to_csv("../processed_data/topics.csv")

## Word importance

In [188]:
# Top 100 words of every topic, as (topic_id, 'weight*"word" + weight*"word" + ...').
# num_topics=-1 requests all topics explicitly.
# NOTE(review): the previous value 0 appeared to return all topics only as a
# side effect of gensim's internal slicing — confirm against the gensim docs.
topic_freq = lda.show_topics(num_topics=-1, num_words=100)
topic_res = {
    'Topic': [],
    'Words': []
}

for topic_id, formatted_terms in topic_freq:
    topic_res['Topic'].append(topic_id)
    # Terms are joined with " + "; splitting on the full separator avoids stray
    # spaces around each term and keeps words that contain "+" intact.
    topic_res['Words'].append(formatted_terms.split(" + "))

# One row per (topic, term), then split each term into its weight and its word.
topic_freq_df = pd.DataFrame(topic_res).explode(column = 'Words')
weight_word_pairs = topic_freq_df['Words'].apply(split_freq)
topic_freq_df['Freq'] = weight_word_pairs.apply(get_first)
topic_freq_df['Words'] = weight_word_pairs.apply(get_second)

In [190]:
# Persist the per-topic word weights next to the other processed data.
frequency_path = "../processed_data/frequency.csv"
topic_freq_df.to_csv(frequency_path)