In [3]:
#https://github.com/kapadias/mediumposts/blob/master/natural_language_processing/topic_modeling/notebooks/Introduction%20to%20Topic%20Modeling.ipynb

# Importing modules
import pandas as pd
import os
import numpy as np
import re


# NOTE(review): this moves the kernel's working directory up one level every
# time the cell runs, so re-running the cell is not idempotent. The paths used
# in this notebook are absolute, so confirm whether the chdir is still needed.
os.chdir('..')

# Read data into papers.
# Raw string: the backslashes in the Windows path stay literal instead of
# being parsed as (invalid) escape sequences such as "\A", "\p" and "\s".
papers = pd.read_csv(r'D:\Anaconda\python\scopus.csv')

# Show the first rows as the cell output
papers.head()

Unnamed: 0,Authors,Title,Cited by,Link,Abstract
0,"Pupysheva I., Kiss I.",Two Medical Cases - Two Ways of Self-Realization,,https://www.scopus.com/inward/record.uri?eid=2...,The paper compares two works of art as two med...
1,"Kofod F., Crane A.",The body and the verb Emotion in Gija,,https://www.scopus.com/inward/record.uri?eid=2...,This paper explores the figurative expression ...
2,"Hackman C.L., Bettergarcia J.N., Wedell E., Si...",Qualitative Exploration of Perceptions of Sexu...,8.0,https://www.scopus.com/inward/record.uri?eid=2...,"Lesbian, Gay, Bisexual, Transgender and Queer/..."
3,"Barroso C.V., Arguedas-Morales M., Sánchez R.M...",Adolescent psychological strength: Relationshi...,1.0,https://www.scopus.com/inward/record.uri?eid=2...,"Protective factors of mental health, such as e..."
4,Torres-Hostench O.,Translator training outdoors,,https://www.scopus.com/inward/record.uri?eid=2...,"Before the COVID-19 pandemic, there was no rea..."


In [4]:
# (number of rows, number of columns) of the loaded frame
papers.shape

(2000, 5)

In [6]:
# Count the missing values in each column
papers.isna().sum()

Authors       0
Title         0
Cited by    402
Link          0
Abstract      0
dtype: int64

In [7]:
import gensim
from gensim.utils import simple_preprocess
import nltk
# Make sure the NLTK "stopwords" corpus is available locally before importing it
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words; this name is extended and used for filtering further down
stop_words = stopwords.words('english')
# Dumps the entire list to the cell output
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Corpus-specific stop words to add on top of NLTK's English list.
# stop_words was already populated from stopwords.words('english'), so
# re-listing those standard words here only produced duplicate entries.
custom_stop_words = ['two', 'paper']

# Append only the words that are not present yet, so re-running this cell
# does not keep growing the list.
stop_words.extend([word for word in custom_stop_words if word not in stop_words])

def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is coerced with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``. One token list is yielded per
    item, in input order.
    """
    # deacc=True strips accent marks from tokens; punctuation is discarded by
    # simple_preprocess' tokenizer itself.
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

def remove_stopwords(texts):
    """Return one token list per document with stop words filtered out.

    Each element of ``texts`` is converted with ``str()`` and tokenized again
    by ``simple_preprocess``; tokens found in the module-level ``stop_words``
    are dropped.

    NOTE(review): the caller in this notebook passes documents that are
    already token lists, so ``str(doc)`` is the list's repr being re-tokenized.
    Confirm this round trip is intended.
    """
    # Build the lookup set once per call: set membership is O(1), whereas
    # testing against the stop_words list scans it for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]


# Pull every abstract out of the frame as a plain Python list
data = papers['Abstract'].tolist()

# Tokenize each abstract, then strip the stop words from the token lists
data_words = remove_stopwords(list(sent_to_words(data)))

# Peek at the first 30 tokens of the first document
print(data_words[0][:30])

['compares', 'works', 'art', 'medical', 'cases', 'novel', 'hungarian', 'writer', 'frigyes', 'karinthy', 'journey', 'round', 'skull', 'tells', 'detection', 'diagnosis', 'operation', 'brain', 'tumour', 'solzhenitsyn', 'novel', 'cancer', 'ward', 'treatment', 'throat', 'cancer', 'oncological', 'cases', 'disease', 'seen']


In [10]:
import gensim.corpora as corpora

# Token -> integer id mapping built from all tokenized documents
id2word = corpora.Dictionary(data_words)

# Documents to vectorize
texts = data_words

# Bag-of-words form: each document becomes a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))

# Show the first 30 (token_id, count) pairs of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 3), (20, 1), (21, 1), (22, 2), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]


In [11]:
# Model training
from pprint import pprint

# number of topics
num_topics = 20

# Build LDA model.
# random_state pins the model's random initialisation so a re-run does not
# start from a different random state each time.
# NOTE(review): with the multicore trainer, results may still vary slightly
# between runs depending on worker scheduling - confirm if exact
# reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)

# Print the keywords of every topic. num_topics is passed explicitly so the
# listing still covers all topics if num_topics is changed above.
pprint(lda_model.print_topics(num_topics=num_topics))

# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]

[(0,
  '0.011*"health" + 0.009*"study" + 0.007*"children" + 0.006*"family" + '
  '0.006*"mental" + 0.005*"research" + 0.005*"associated" + 0.004*"anxiety" + '
  '0.004*"treatment" + 0.004*"used"'),
 (1,
  '0.025*"health" + 0.023*"mental" + 0.005*"social" + 0.005*"study" + '
  '0.005*"physical" + 0.004*"among" + 0.004*"well" + 0.004*"research" + '
  '0.004*"based" + 0.003*"findings"'),
 (2,
  '0.008*"health" + 0.008*"study" + 0.007*"school" + 0.007*"social" + '
  '0.005*"results" + 0.005*"based" + 0.005*"mental" + 0.005*"education" + '
  '0.004*"data" + 0.004*"well"'),
 (3,
  '0.017*"health" + 0.014*"mental" + 0.006*"students" + 0.006*"study" + '
  '0.005*"among" + 0.004*"social" + 0.004*"depression" + 0.004*"support" + '
  '0.004*"people" + 0.004*"symptoms"'),
 (4,
  '0.017*"health" + 0.014*"mental" + 0.009*"social" + 0.007*"study" + '
  '0.005*"well" + 0.005*"group" + 0.004*"results" + 0.004*"symptoms" + '
  '0.004*"among" + 0.004*"associated"'),
 (5,
  '0.018*"health" + 0.011*"mental

In [12]:
import pyLDAvis.gensim as gensimvis
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

# Raw string: the backslashes in the Windows path stay literal instead of
# being parsed as (invalid) escape sequences such as "\A" and "\p".
# NOTE(review): the topic count is appended directly to "python" with no
# separator, so the file is e.g. D:\Anaconda\python20 - confirm that this
# location is intended.
LDAvis_data_filepath = r'D:\Anaconda\python' + str(num_topics)

# Preparing the visualization is a bit time consuming. Leave this flag True
# to (re)compute it and refresh the cached pickle; set it to False to reuse
# the file that is already on disk.
recompute_ldavis = True
if recompute_ldavis:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the pre-prepared pyLDAvis data from disk.
# pickle.load can execute arbitrary code: only load a file this notebook
# wrote itself.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# Also export a standalone HTML page next to the pickle
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')

LDAvis_prepared