In [3]:
import os
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import string
import spacy
import pprint

from gensim.models import LdaModel,CoherenceModel
from gensim.corpora import Dictionary

import pyLDAvis.gensim
import pickle
import pyLDAvis
# Single random seed reused by every stochastic step in this notebook.
SEED = 7

In [4]:
# Load the raw IEEE papers table from the Kaggle input directory.
raw_csv_path = '/kaggle/input/ieee-research-papers-dataset/database.csv'
data_raw = pd.read_csv(raw_csv_path)

In [6]:
# Peek at the first five rows of the freshly loaded frame.
data_raw.head(n=5)

Unnamed: 0.1,Unnamed: 0,Id,title,link,year,authors,citations,abstract
0,261,261,Interplay Between γ–Ray Irradiation and 3DEG f...,/document/9724229/,2022,"['Khushwant Sehra', 'Vandana Kumari', 'Poonam ...",0,This work investigates the cumulative dose \n6...
1,132,132,Practical Methods for Efficient Resource Utili...,/document/9281038/,2020,"['George Koutitas', 'Shashwat Vyas', 'Chaitany...",0,This work presents a novel approach that adopt...
2,576,576,Application of Non-Negative Tensor Factorizati...,/document/8911489/,2019,"['Han Zhong', 'Geqi Qi', 'Wei Guan', 'Xiaochen...",1,With the rapid development of civil aviation t...
3,454,454,Novel Method for Magnetic Flux Density Estimat...,/document/9705564/,2022,"['Adnan Mujezinovic', 'Emir Turajlic', 'Ajdin ...",0,"In this paper, a novel method for the magnetic..."
4,337,337,Separation Behaviour Difference Between Gelati...,/document/8918179/,2019,"['Chao Cao', 'Jiyun Zhao', 'Guilin Li', 'Haiga...",2,Different from the rigid separation of biologi...


In [5]:
# Keep one row per paper title, then shuffle the rows reproducibly (seeded by
# SEED) and renumber the index from zero.
data_raw = (
    data_raw
    .drop_duplicates(subset=['title'])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
data_raw.head(3)

Unnamed: 0.1,Unnamed: 0,Id,title,link,year,authors,citations,abstract
0,261,261,Interplay Between γ–Ray Irradiation and 3DEG f...,/document/9724229/,2022,"['Khushwant Sehra', 'Vandana Kumari', 'Poonam ...",0,This work investigates the cumulative dose \n6...
1,132,132,Practical Methods for Efficient Resource Utili...,/document/9281038/,2020,"['George Koutitas', 'Shashwat Vyas', 'Chaitany...",0,This work presents a novel approach that adopt...
2,576,576,Application of Non-Negative Tensor Factorizati...,/document/8911489/,2019,"['Han Zhong', 'Geqi Qi', 'Wei Guan', 'Xiaochen...",1,With the rapid development of civil aviation t...


In [7]:
# spaCy pipeline used by the lemmatization step.
nlp = spacy.load('en_core_web_sm')
# English stop-word list from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [9]:
def text_preprocessing(text):
    """Normalise one raw document into lower-case, letters-only text.

    Steps, in order: lower-case; drop ``[...]`` bracketed spans, ``@mentions``,
    URLs, any leftover ``http``, ``<...>`` tags, ASCII punctuation, newlines,
    every word that contains a digit, and a set of emoji code-point ranges.
    Each removed span is replaced by a single space, so the result may contain
    runs of spaces; callers are expected to tokenise with ``str.split()``.

    Parameters
    ----------
    text : object
        The raw document. Non-string values are converted with ``str()``;
        ``None`` and float NaN (a missing cell) yield an empty string.

    Returns
    -------
    str
        The cleaned text.
    """
    # A missing abstract would otherwise be stringified into the bogus tokens
    # "none" / "nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text=str(text)
    text=text.lower()
    text=re.sub(r'\[.*?\]',' ',text)
    text=re.sub(r'@\w+\s*',' ',text)
    # The former r'\\W' substitution was removed: it matched a literal
    # backslash followed by an upper-case "W", which cannot occur in text that
    # has just been lower-cased, so it never did anything.
    # "\S+" (non-whitespace run) - the previous pattern had a bare "S+" and
    # therefore never matched a lower-cased URL.
    text=re.sub(r'https?://\S+|www\.\S+',' ',text)
    text=re.sub(r'http',' ',text)
    text=re.sub(r'<.*?>+',' ',text)
    text=re.sub(r'[%s]' %re.escape(string.punctuation),' ',text)
    text=re.sub(r'\n',' ',text)
    # Drop whole words containing at least one digit (e.g. "3deg", "2022").
    text=re.sub(r'\w*\d\w*',' ',text)
    text=re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]',' ',text)
    return text

In [10]:
def drop_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)

In [11]:
def lemmatization(text):
    """Run ``text`` through the ``nlp`` pipeline and return its lemmas.

    The lemma of every token is taken in document order and the lemmas are
    joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [14]:
def delete_one_characters(text):
    """Remove every single-character word from ``text``.

    The text is split on whitespace, words of length one are discarded, and
    the remaining words are joined with single spaces. Discarded words are
    filtered out rather than replaced by empty strings, so the result has no
    leading, trailing or doubled spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated words.

    Returns
    -------
    str
        The words longer than one character, joined by single spaces
        (an empty string when none remain).
    """
    return ' '.join(word for word in text.split() if len(word) > 1)

In [15]:
# Work on a copy so the loaded frame keeps its original columns.
data = data_raw.copy()

# Cleaning stages applied to each abstract, in this exact order.
cleaning_steps = (
    text_preprocessing,
    drop_stopwords,
    lemmatization,
    delete_one_characters,
)
abstracts = data['abstract']
for step in cleaning_steps:
    abstracts = abstracts.apply(step)
data['preprocessed_abstract'] = abstracts

In [16]:
data.head()

Unnamed: 0.1,Unnamed: 0,Id,title,link,year,authors,citations,abstract,preprocessed_abstract
0,261,261,Interplay Between γ–Ray Irradiation and 3DEG f...,/document/9724229/,2022,"['Khushwant Sehra', 'Vandana Kumari', 'Poonam ...",0,This work investigates the cumulative dose \n6...,work investigate cumulative dose co gamma gam...
1,132,132,Practical Methods for Efficient Resource Utili...,/document/9281038/,2020,"['George Koutitas', 'Shashwat Vyas', 'Chaitany...",0,This work presents a novel approach that adopt...,work present novel approach adopt content cach...
2,576,576,Application of Non-Negative Tensor Factorizati...,/document/8911489/,2019,"['Han Zhong', 'Geqi Qi', 'Wei Guan', 'Xiaochen...",1,With the rapid development of civil aviation t...,rapid development civil aviation transportatio...
3,454,454,Novel Method for Magnetic Flux Density Estimat...,/document/9705564/,2022,"['Adnan Mujezinovic', 'Emir Turajlic', 'Ajdin ...",0,"In this paper, a novel method for the magnetic...",paper novel method magnetic flux density estim...
4,337,337,Separation Behaviour Difference Between Gelati...,/document/8918179/,2019,"['Chao Cao', 'Jiyun Zhao', 'Guilin Li', 'Haiga...",2,Different from the rigid separation of biologi...,different rigid separation biological tissue s...


In [17]:
# One whitespace-tokenised list of words per preprocessed abstract.
text_corpus = data['preprocessed_abstract'].values
nested_document_tokens = [document.split() for document in text_corpus]

# Vocabulary mapping; prune tokens using the document-frequency bounds below
# (no_below=15, no_above=0.8).
id2word = Dictionary(nested_document_tokens)
id2word.filter_extremes(no_below=15, no_above=0.8)

# Bag-of-words representation of every document against the pruned vocabulary.
corpus = list(map(id2word.doc2bow, nested_document_tokens))

In [18]:
# Number of latent topics the LDA model is asked to find.
NUM_TOPICS = 6

# Seeded with SEED so repeated runs produce the same topics.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=NUM_TOPICS,
    passes=15,
    iterations=300,
    random_state=SEED,
)

In [19]:
# Show each topic's highest-weighted words.
topic_summaries = lda_model.print_topics()
pprint.pprint(topic_summaries)

[(0,
  '0.038*"image" + 0.025*"feature" + 0.017*"use" + 0.017*"method" + '
  '0.015*"propose" + 0.013*"detection" + 0.010*"result" + 0.010*"base" + '
  '0.009*"performance" + 0.009*"model"'),
 (1,
  '0.021*"method" + 0.021*"propose" + 0.020*"system" + 0.017*"base" + '
  '0.015*"control" + 0.013*"use" + 0.010*"approach" + 0.009*"user" + '
  '0.008*"study" + 0.008*"set"'),
 (2,
  '0.026*"algorithm" + 0.020*"propose" + 0.019*"use" + 0.016*"method" + '
  '0.016*"base" + 0.015*"network" + 0.014*"problem" + 0.012*"system" + '
  '0.012*"optimization" + 0.011*"datum"'),
 (3,
  '0.047*"model" + 0.020*"propose" + 0.020*"network" + 0.019*"use" + '
  '0.018*"datum" + 0.016*"method" + 0.015*"base" + 0.013*"prediction" + '
  '0.013*"result" + 0.012*"performance"'),
 (4,
  '0.024*"power" + 0.016*"high" + 0.016*"voltage" + 0.013*"result" + '
  '0.013*"use" + 0.012*"antenna" + 0.011*"frequency" + 0.011*"propose" + '
  '0.011*"system" + 0.011*"low"'),
 (5,
  '0.022*"propose" + 0.022*"signal" + 0.015*"sy

In [20]:
pyLDAvis.enable_notebook()
# Destination of the standalone interactive visualisation.
LDAvis_data_filepath = '/kaggle/working/ldavis.html'

# Build the visualisation data directly from the fitted model.
# The previous version guarded this with an always-true `if 1 == 1:` and
# pickled the result into the very same ".html" path before re-loading it and
# overwriting that file with save_html; the round trip added nothing, so the
# prepared object is now used as-is.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

# Write the standalone HTML page, then display the widget inline as the
# cell's output.
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath)
LDAvis_prepared

In [21]:
# Score the fitted topics with the 'c_v' coherence measure, computed over the
# tokenised documents and the pruned dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=nested_document_tokens,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

coherence_report = "Coherence Score \t:{0:.7f}".format(coherence_lda)
print(coherence_report)

Coherence Score 	:0.3914033
