# Topic Modeling 

In [1]:
#Install Required Libraries
!python -m pip install gensim nltk pyLDAvis


Collecting pyLDAvis
  Obtaining dependency information for pyLDAvis from https://files.pythonhosted.org/packages/6b/5a/66364c6799f2362bfb9b7100bc1ce6ffcdfe7f17e8d2e85a591bfe427643/pyLDAvis-3.4.1-py3-none-any.whl.metadata
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting FuzzyTM>=0.4.0 (from gensim)
  Obtaining dependency information for FuzzyTM>=0.4.0 from https://files.pythonhosted.org/packages/2d/30/074bac7a25866a2807c1005c7852c0139ac22ba837871fc01f16df29b9dc/FuzzyTM-2.0.9-py3-none-any.whl.metadata
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting funcy (from pyLDAvis)
  Obtaining dependency information for funcy from https://files.pythonhosted.org/packages/d5/08/c2409cb01d5368dcfedcbaffa7d044cc8957d57a9d0855244a5eb4709d30/funcy-2.0-py2.py3-none-any.whl.metadata
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Obtaining dependency information for pyfume from https://files.

In [2]:
# Import required libraries
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK resources used by this notebook: the stop-word lists and
# the Punkt tokenizer data that word_tokenize relies on.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt',
# so both are fetched to keep the notebook runnable across NLTK versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Data Preprocessing

In [3]:
import pandas as pd
import csv
import re

# Read the training CSV (path is relative to the notebook's working directory)
# into a DataFrame, then show its first rows as the cell output.
train_csv_path = './Train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [4]:
# Number of abstracts taken from the top of the DataFrame for topic modeling.
NUM_DOCUMENTS = 500

# Extract the first NUM_DOCUMENTS entries of the 'ABSTRACT' column as an array.
documents = df['ABSTRACT'].values[:NUM_DOCUMENTS]

# Preview a few entries only; echoing the whole array floods the cell output.
documents[:3]

array(["  Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at two levels: global, i.e. identifiying condition presence for the\nsubject, and local, i.e. detecting condition effect on each individual\nmeasurement extracted from the subject's data. While global inference is widely\nused, local inference, which can be used to form subject-specific effect maps,\nis rarely used because existing models often yield noisy detections composed of\ndispersed isolated islands. In this article, we propose a reconstruction\nmethod, named RSM, to improve subject-specific detections of predictive\nmodeling approaches and in particular, binary classifiers. RSM specifically\naims to reduce noise due to sampling error associated with using a finite\nsample of examples to train classifiers. The proposed method is a wrapper-type\nalgorithm that can be used with different binary classifiers in 

In [5]:
# Lowercase and tokenize every abstract, keeping only alphanumeric tokens that
# are not English stop words.
stop_words = set(stopwords.words('english'))
processed_docs = [
    [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    for text in documents
]

# Build the token dictionary, then encode each document as a bag-of-words.
dictionary = corpora.Dictionary(processed_docs)
corpus = list(map(dictionary.doc2bow, processed_docs))

### Apply LDA 

In [9]:
# Number of topics
num_topics = 20

# Fixed seed: LDA training is randomized, so without it the topics (and the
# output printed below) change on every run.
random_seed = 42

# Build the LDA model over the bag-of-words corpus, making 15 passes over it.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=random_seed,
)

# Print every topic (-1 means all topics) with its top weighted words.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.007*"graded" + 0.006*"theory" + 0.006*"learning" + 0.006*"local" + 0.006*"method" + 0.005*"new" + 0.005*"model" + 0.004*"metal" + 0.004*"based" + 0.004*"l"
Topic: 1 
Words: 0.007*"algorithm" + 0.005*"linear" + 0.005*"problem" + 0.004*"used" + 0.004*"proposed" + 0.004*"image" + 0.004*"code" + 0.004*"method" + 0.004*"way" + 0.004*"propagation"
Topic: 2 
Words: 0.006*"g" + 0.005*"method" + 0.005*"also" + 0.005*"case" + 0.005*"q" + 0.005*"2" + 0.004*"results" + 0.004*"transition" + 0.004*"p" + 0.004*"network"
Topic: 3 
Words: 0.008*"data" + 0.006*"group" + 0.006*"model" + 0.005*"one" + 0.004*"method" + 0.004*"paper" + 0.004*"formation" + 0.004*"star" + 0.004*"problems" + 0.004*"1"
Topic: 4 
Words: 0.009*"model" + 0.008*"algorithm" + 0.006*"problem" + 0.005*"two" + 0.005*"system" + 0.005*"data" + 0.004*"paper" + 0.004*"k" + 0.004*"show" + 0.004*"based"
Topic: 5 
Words: 0.019*"n" + 0.013*"p" + 0.012*"x" + 0.009*"k" + 0.008*"conjecture" + 0.007*"functions" + 0.007*"prove" +

### Visualize the topics

In [10]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Convert the trained model, corpus and dictionary into pyLDAvis' prepared
# data structure, then render it as the cell output.
lda_vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(lda_vis)
