In [2]:
import sys


# import os to capture file in the directory
import os
from os.path import join

# import np and pandas to frame the text data, plot the figure by using plt
import numpy as np, pandas as pd, matplotlib.pyplot as plt

# import re to clean text, 
import re
from pprint import pprint


# NLTK Stop words
import nltk
from nltk.corpus import stopwords

# delete the meaningles words
stop_words = stopwords.words('english')
stop_words.extend(['paper', 'research', 'study', 'literature', 'article'
                   'author' , 'purpose', 'develop',  'apply', 'make', 'set',
                   'assume', 'investigate', 'explore', 'propose', 'analyze', 'examine', 'analysis',
                   'result', 'discuss', 'aim', 'demonstrate', 'illustrate', 'show', 'find', 'suggest', 
                   'significant', 'important', 'addition', 'finally'])




# Gensim is for model construction
import gensim
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel

# spacy is for lematization
import spacy

import en_core_web_sm

# Enable logging for gensim - optional
import logging, warnings

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# visulzie the key themes by using pyLDAvis
import pyLDAvis
import pyLDAvis.gensim  # important !!!

In [4]:
content = pd.read_excel(join('F:\\Projects\\SCF\\SCF_LR\\1_data', 'SCF_Modelling_Topic_5_3_3_1.xlsx'), sheet_name='Topic_5_3_3_1').fillna('')
# read the cleaned data
df = pd.DataFrame(content['Article'] + ". " + content['Abstract'] + ". " +content['Keywords']) 
# Combine the title, abstract and keywords together
df.columns = ["content"]

In [68]:
## # Tokenize and Clean-up using gensim’s simple_preprocess()

In [17]:
# delete unwanted signs and sparate gra into single word
def para_to_words(parahs):
    for parah in parahs:
        parah = re.sub("\'", "", parah)  # remove single quotes
        parah = gensim.utils.simple_preprocess(str(parah), deacc=True) 
        # split the garah into the words
        yield(parah)

In [20]:
parahs = df.content.values.tolist()
# Convert content in the dataframe to list
parahs_words = list(para_to_words(parahs)) 
print(parahs_words[:1]) 
# list the words for the first pagragh

[['mixture', 'inventory', 'model', 'of', 'lost', 'sale', 'and', 'back', 'order', 'with', 'stochastic', 'lead', 'time', 'demand', 'on', 'permissible', 'delay', 'in', 'payments', 'it', 'is', 'seen', 'that', 'the', 'trade', 'credit', 'period', 'has', 'an', 'important', 'role', 'in', 'real', 'business', 'world', 'in', 'this', 'article', 'an', 'inventory', 'model', 'has', 'been', 'developed', 'by', 'considering', 'stochastic', 'lead', 'time', 'demand', 'with', 'lead', 'time', 'crashing', 'cost', 'here', 'also', 'to', 'get', 'the', 'impact', 'between', 'credit', 'period', 'and', 'lead', 'time', 'lead', 'time', 'dependent', 'credit', 'period', 'has', 'been', 'considered', 'in', 'this', 'model', 'considering', 'partial', 'back', 'order', 'the', 'effect', 'of', 'lost', 'sale', 'has', 'been', 'included', 'under', 'the', 'above', 'considerations', 'an', 'inventory', 'model', 'has', 'been', 'optimized', 'in', 'the', 'parlance', 'of', 'infinite', 'time', 'horizon', 'here', 'three', 'objective', 'fu

In [22]:
# Build the bigram and trigram models 
bigram = gensim.models.Phrases(parahs_words, min_count=5, threshold=1) 
# include three paramaters: 
# paragrahs need to be analyzed, 
# the minimum score for a bigram to be taken into account, higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[parahs_words], threshold=1)  
# train trigram model
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    # delete stop words 
    texts = [bigram_mod[doc] for doc in texts]
    # construct word group with two words
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    # construct word group with three words
    texts_out = []
    # construct empty list
    nlp = en_core_web_sm.load(disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

parahs_ready = process_words(parahs_words)  # processed Text Data!
print(parahs_ready[:1])



[['mixture', 'inventory_model', 'lost', 'sale', 'back', 'order', 'stochastic', 'lead_time', 'demand', 'payment', 'see', 'trade_credit', 'period', 'role', 'real', 'business', 'world', 'article', 'inventory_model', 'consider', 'stochastic', 'lead_time', 'demand', 'lead_time', 'crash', 'cost', 'also', 'get', 'impact', 'lead_time', 'lead_time', 'dependent', 'credit_period', 'consider', 'model', 'consider', 'partial', 'back', 'order', 'effect', 'lose', 'sale', 'include', 'consideration', 'inventory_model', 'optimize', 'parlance', 'infinite', 'time', 'horizon', 'objective', 'function', 'basis', 'position', 'credit_period', 'business', 'period', 'feasibility', 'model', 'different', 'sensitivity', 'respect', 'parameter', 'crash', 'cost', 'inventory_model', 'lead_time', 'stochastic', 'demand']]


In [23]:
# To do topic Modeling, we needs two elements: Dictionary and Corpus.

# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus by terming Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

In [24]:
# Build normal LDA model

In [25]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

# for detial, see https://radimrehurek.com/gensim/models/ldamodel.html

# para1 (corpus): Stream of document vectors or sparse matrix of shape; 
# para2 (id2word): The dictionary; Mapping from word IDs to words.
# para3 (num_topics): The number of requested latent topics to be extracted from the training corpus.
# para4 (random_state): Either a randomState object or a seed to generate one. Useful for reproducibility.
# para5 (update_every): Number of documents to be iterated through for each update. Set to 0 for batch learning, > 1 for online iterative learning.
# para6 (chunksize):  Number of documents to be used in each training chunk.
# para6 (passes):  Number of passes through the corpus during training.
# para7 (alpha):  Number of passes through the corpus during training.
# para8 (iterations):  Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
# para8 (per_word_topics):  If True, the model also computes a list of topics, 
# sorted in descending order of most likely topics for each word, along with their phi values multiplied by the feature length (i.e. word count).

In [96]:
# Show dominant topic and its percentage contribution in each document

In [27]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=parahs_ready)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2.0,0.7428,"dcf, full, real, objective, discount, lead_tim...","[mixture, inventory_model, lost, sale, back, o..."
1,1,1.0,0.5148,"supplier, demand, retailer, offer, credit_peri...","[retailer, optimal, ordering_policy, order, si..."
2,2,1.0,0.9887,"supplier, demand, retailer, offer, credit_peri...","[level, credit_period, coordination, periodic,..."
3,3,3.0,0.8424,"price, function, optimal, pricing, also, retai...","[effect, two_echelon, trade_credit, pricing, i..."
4,4,1.0,0.6776,"supplier, demand, retailer, offer, credit_peri...","[retailer, supplier, supply_chain, model, trad..."


In [30]:
writer = pd.ExcelWriter(join('F:\\Projects\\SCF\\SCF_LR\\1_data', 'SCFpapers_Classification_modelling_5_3_3_1.xlsx'))
# save the new file to the new directory 
df_dominant_topic.to_excel(writer,'Sheet1')
writer.save()

In [29]:
pyLDAvis.enable_notebook()
visul = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
# paramater 1: trianed model 
# paramater 2: Stream of document vectors or sparse matrix of shape
# paramater 3: dictionary
visul

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
