In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing important modules

In [2]:
import nltk 
nltk.download('stopwords')
import re 
import numpy as np
import pandas as pd
from pprint import pprint

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#Spacy for lemmatization
import spacy 

#plotting tools 
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt 
%matplotlib inline

# Enable logging for gensim 
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore" , category=DeprecationWarning)


# **Prepare Stopwords**
We have already downloaded the stopwords. Let’s import them and make it available in stop_words

In [3]:
#NLTK stopwords
from nltk.corpus import stopwords
# NLTK's English stop words plus a few extra filler tokens to drop from the corpus.
extra_stop_words = ['from', 'subject', 're', 'edu', 'use']
stop_words = stopwords.words('english') + extra_stop_words

In [4]:
pip install openpyxl


In [5]:
# Load the PubMed workbook into a DataFrame and preview the first rows.
dataset_path = '../input/dataset/Pubmed5k.xlsx'
df = pd.read_excel(dataset_path)
df.head()

In [6]:
# Show every distinct value of the Title column.
unique_titles = df['Title'].unique()
print(unique_titles)

**cleaning the data**

In [7]:
# Convert the abstracts to a list of plain strings. Missing abstracts become
# empty strings so the regex clean-up below never receives a float NaN.
data = df.Abstract.fillna('').astype(str).tolist()

# Remove e-mail addresses. Raw strings keep \S and \s from being treated as
# (invalid) string escapes, and the optional trailing \s? also removes an
# address that sits at the very end of the text.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse newlines and any other run of whitespace into a single space.
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes.
data = [sent.replace("'", "") for sent in data]

pprint(data[:1])

# Tokenize words and clean-up text
**Let’s tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.**

In [8]:
def sent_to_words(sentences):
    """Lazily yield one token list per input sentence.

    Each item is converted with ``str`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    tokenized = (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
    yield from tokenized

# Materialise the generator so the token lists can be reused by later cells.
data_words = [tokens for tokens in sent_to_words(data)]

# Peek at the tokens of the first document.
print(data_words[:1])
        

# Creating Bigram and Trigram Models

In [9]:
# Train the phrase detectors: the bigram model on the raw tokens, the trigram
# model on the tokens after bigrams have been applied.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_applied = bigram[data_words]
trigram = gensim.models.Phrases(bigram_applied, threshold=100)

# Wrap both models in Phraser objects, which are used to transform documents below.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Trigram example on the first document.
first_doc_phrases = trigram_mod[bigram_mod[data_words[0]]]
print(first_doc_phrases)

# Remove Stopwords, Make Bigrams and Lemmatize
The bigrams model is ready. Let’s define the functions to remove the stopwords, make bigrams and lemmatization and call them sequentially.

In [10]:
#define functions for stopwords , bigrams , trigrams , lemmatization
def remove_stopwords(texts):
    """Tokenize every document and drop tokens found in ``stop_words``.

    Parameters
    ----------
    texts : iterable of documents; each one is converted with ``str`` and
        re-tokenized with ``simple_preprocess`` before filtering.

    Returns
    -------
    list of token lists, one per input document, without stop words.
    """
    # Build the lookup set once: set membership is O(1), whereas the
    # notebook-level list would be scanned again for every single token.
    stopword_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the notebook-level ``bigram_mod`` to every tokenized document."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every tokenized document."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Uses the notebook-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called.

    Parameters
    ----------
    texts : iterable of token lists; each list is joined with spaces and
        parsed as one document.
    allowed_postags : collection of ``token.pos_`` values to keep. The default
        is an immutable tuple so it cannot be modified between calls.

    Returns
    -------
    list of lemma lists, one per input document, in the same order.
    """
    allowed = frozenset(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the texts as a stream and yields the parsed documents
    # in input order, instead of invoking the pipeline once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

**Let's call the functions in order to perform the operations**

In [11]:
# Load the small English spaCy pipeline with the parser and NER components
# disabled, since only POS tags and lemmas are used below.
# If the model is missing: python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Remove stop words, then merge frequent word pairs into bigram tokens.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs.
kept_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=kept_postags)

print(data_lemmatized[:1])


# Create the Dictionary and Corpus needed for Topic Modeling

In [12]:
# The lemmatized documents are the texts the corpus is built from.
texts = data_lemmatized

# Dictionary over the tokens of all documents.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: one doc2bow result per document.
corpus = list(map(id2word.doc2bow, texts))

# View the first document.
print(corpus[:1])

> **Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).**

> **For example, (0, 1) above implies, word id 0 occurs once in the first document. Likewise, word id 1 occurs twice and so on.**

In [13]:
# Human-readable view of the first document: (word, frequency) instead of (word_id, frequency).
[[(id2word[word_id], count) for word_id, count in bow] for bow in corpus[:1]]

# **Building The Topic Model**
***Let's start with 10 topics, then look for the optimal number of topics.***

In [14]:
# Settings for the baseline 10-topic LDA model.
lda_params = dict(
    num_topics=10,
    random_state=100,   # fixed seed so repeated runs give the same model
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Build the LDA model on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_params)

# **View The Topics in LDA Model**
**The model above is built with 10 topics. Each topic consists of a set of keywords, and each keyword has a weight that reflects how much it contributes to the topic.**

***In the next cell, let's look at the importance score of each keyword in every topic.***

In [15]:
# Print the keywords (with weights) of the 10 topics.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

# Apply the model to the whole corpus.
doc_lda = lda_model[corpus]

# **Compute Model Perplexity and Coherence Score**

***Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is.***

In [16]:
# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself. For the bound, higher (closer to zero) is better; the perplexity is
# 2 ** (-bound), and for it lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ' , per_word_bound)
print('\nPerplexity: ' , 2 ** (-per_word_bound))

# Compute the c_v coherence score of the model's topics.
coherence_model_lda = CoherenceModel(model=lda_model , texts=data_lemmatized , dictionary=id2word , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ' , coherence_lda)

# **Let's Now Visualize The Topics-Keywords**

***Now that the LDA model is built, the next step is to examine the produced topics and the associated keywords. There is no better tool for this than the pyLDAvis package’s interactive chart, which is designed to work well with Jupyter notebooks.***

In [17]:
# Render the interactive topic/keyword chart for the 10-topic model inline.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

So how do we interpret pyLDAvis’s output?

Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

Alright, if you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.
We have successfully built a good looking topic model.

Given our prior knowledge of the number of natural topics in the document, finding the best model was fairly straightforward.

Up next, we will focus on how to arrive at the optimal number of topics given any large corpus of text. (Improving the model with Mallet’s version of the LDA algorithm is a possible extension that is not covered in this notebook.)

# **How to find the optimal number of topics for LDA?**

***My approach to finding the optimal number of topics is to build many LDA models with different values of number of topics (k) and pick the one that gives the highest coherence value.***


In [18]:
def compute_coherence_values(dictionary , corpus , texts , limit , start = 2 , step = 1):
    """
    Train one LDA model per candidate number of topics and score each one
    with c_v coherence.

    parameters :
    ------------
    - dictionary : Gensim Dictionary, used both to train the models and to score coherence
    - corpus : gensim corpus
    - texts : list of input texts
    - limit : exclusive upper bound on the number of topics
    - start : first number of topics to try
    - step : increment between successive numbers of topics

    returns:
    --------
    - model_list : list of LDA topic models, one per value in range(start, limit, step)
    - coherence_values : coherence value of the model at the same position in model_list
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start , limit , step):
        # Train with the loop's topic count and the dictionary argument so that
        # start, step and dictionary are honoured and every score belongs to
        # the topic count it is reported for.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model , texts=texts , dictionary=dictionary , coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list , coherence_values
            

In [19]:
# Can take a long time to run: trains and scores 18 LDA models.
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=data_lemmatized,
    limit=20,
    start=2,
    step=1,
)

In [20]:
# Topic counts for the x-axis; must mirror the start/limit/step passed to
# compute_coherence_values.
x = range(2 , 20 , 1)
# Label the line itself: a bare string handed to legend() is iterated character
# by character, which would label the line "c".
plt.plot(x , coherence_values , label="coherence_values")
plt.xlabel("Num of topics")
plt.ylabel("Coherence Score")
plt.legend(loc='best')
plt.title("Choosing the optimal model with coherence score")
plt.show()

In [21]:
# List the coherence score next to the topic count it is plotted against.
for topic_count, score in zip(x, coherence_values):
    rounded_score = round(score, 4)
    print("Num Topics =", topic_count, " has Coherence Value of", rounded_score)

# **As required**
**I'll continue using the top 3 topics.**

In [22]:
# Pick the 3-topic model by its topic count rather than by list position, so
# the choice does not depend on the start/step used to build model_list.
optimal_model = next(model for model in model_list if model.num_topics == 3)
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

# Let's build the model with the required number of topics
**and visualize it**

In [23]:
# Settings for the 3-topic LDA model.
lda_opt_params = dict(
    num_topics=3,
    random_state=100,   # fixed seed so repeated runs give the same model
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Build the LDA model on the bag-of-words corpus.
lda_model_opt = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_opt_params)

In [24]:
# Print the keywords (with weights) of the 3-topic model.
opt_topic_summaries = lda_model_opt.print_topics()
pprint(opt_topic_summaries)

# Apply the 3-topic model to the whole corpus.
doc_lda = lda_model_opt[corpus]

# **Let's again Compute Model Perplexity and Coherence Score**
***Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is.***

In [25]:
# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself. For the bound, higher (closer to zero) is better; the perplexity is
# 2 ** (-bound), and for it lower is better.
per_word_bound_opt = lda_model_opt.log_perplexity(corpus)
print('\nPer-word likelihood bound: ' , per_word_bound_opt)
print('\nPerplexity: ' , 2 ** (-per_word_bound_opt))

# Compute the c_v coherence score of the model's topics.
coherence_model_lda_opt = CoherenceModel(model=lda_model_opt , texts=data_lemmatized , dictionary=id2word , coherence='c_v')
coherence_lda_opt = coherence_model_lda_opt.get_coherence()
print('\nCoherence Score: ' , coherence_lda_opt)

# **Let's Now Visualize The Topics-Keywords**

***Now that the LDA model is built, the next step is to examine the produced topics and the associated keywords. There is no better tool for this than the pyLDAvis package’s interactive chart, which is designed to work well with Jupyter notebooks.***

In [26]:
# Render the interactive topic/keyword chart for the 3-topic model inline.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda_model_opt, corpus=corpus, dictionary=id2word)
vis

# Let's Find out the dominant topic in each sentence
**One of the practical applications of topic modeling is to determine what topic a given document is about.**

**To find that, we find the topic number that has the highest percentage contribution in that document.**

In [27]:
def format_topics_sentences(ldamodel = lda_model_opt , corpus = corpus, texts=data):
    """Build a table with the dominant topic of every document.

    Parameters
    ----------
    ldamodel : LDA model that is applied to ``corpus``
    corpus : bag-of-words corpus, one entry per document
    texts : original documents, in the same order as ``corpus``

    Returns
    -------
    DataFrame with the columns ``Dominant_Topic``, ``Perc_Contribution`` and
    ``Topic_Keywords`` followed by one unnamed column holding ``texts``.
    ``Dominant_Topic`` holds plain integers whenever every document has at
    least one topic.
    """
    keywords_by_topic = {}  # topic id -> comma-separated keywords, computed once per topic
    rows = []

    for doc_topics in ldamodel[corpus]:
        # A model built with per_word_topics=True yields a tuple whose first
        # element is the (topic_id, probability) list; otherwise the item is
        # presumably that list itself -- verify against the gensim version in use.
        topic_dist = doc_topics[0] if isinstance(doc_topics, tuple) else doc_topics

        if not topic_dist:
            # Still emit a row so the table stays aligned with `texts`.
            rows.append((float('nan'), float('nan'), ''))
            continue

        # max() returns the first pair with the highest probability, i.e. the
        # same pair a stable descending sort would place first.
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        rows.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Create the frame once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add the original text as the last column of the output.
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

# Dominant topic of every document according to the 3-topic model.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model_opt,
    corpus=corpus,
    texts=data,
)

# Expose the row index as a document-number column and name every column.
dominant_topic_columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns

# Show the first ten documents.
df_dominant_topic.head(10)

# **Conclusion**

> **First things first, I want to thank everyone who gave me the honor of proceeding to this step.**

> **Second, I really hope this simple notebook meets your expectations.**

> **And I am really happy to say that I am honored by this chance; whatever the result, I accept it happily.**

# **About the next steps to improve The Model**
> **We could expand the number of topics to get a better coherence score.**

> **Expand the dataset and get better-organised documents for better results.**

> ***Finally, I want to say something: maybe this work isn't ideal for your needs, but to be honest, this is the first time I have worked with this algorithm (LDA). Shame on me, of course, but I really tried my best to learn on the fly and get the best results I could. I am really happy to have dealt with a new algorithm, and of course I'll add this valuable task to my portfolio.***