## Topic Modeling

In [1]:
import pandas as pd

In [2]:
 dataset = pd.read_csv('datasets/Lezione_7-Topic_modeling/dataset_Research_Article.csv')

In [3]:
# Preview the loaded frame; the rendered output shows only the first and last rows.
dataset

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
20967,20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0
20968,20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0
20969,20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0
20970,20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0


In [4]:
# NLP tooling used by the rest of the notebook: gensim for tokenization and LDA,
# NLTK for the English stop-word list.
import gensim
import nltk
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Fetch the stop-word corpus (a no-op when it is already up to date), then read it.
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lavoro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# One text per article: title followed by abstract.
# Missing values are replaced with "" first; otherwise a NaN in either column would
# make the whole concatenation NaN and the tokenizer would later receive a float.
documents = dataset['TITLE'].fillna("") + " " + dataset['ABSTRACT'].fillna("")

In [8]:
def sent_to_words(items):
    """Lazily tokenize every text in ``items``.

    Yields one token list per input text, produced by gensim's
    ``simple_preprocess`` called with ``deacc=True``.
    """
    yield from (simple_preprocess(text, deacc=True) for text in items)
        
def remove_stopwords(texts, stop_list=None, min_length=5):
    """Drop stop words and short tokens from every tokenized document.

    Args:
        texts: iterable of token lists (one list per document).
        stop_list: optional iterable of words to exclude. When omitted, the
            notebook-level ``stop_words`` list is used.
        min_length: minimum number of characters a token needs to be kept
            (default 5).

    Returns:
        A list with one filtered token list per input document, in input order.
    """
    # A frozenset gives constant-time membership tests; checking against the raw
    # list would rescan it for every single token of the corpus.
    excluded = frozenset(stop_words if stop_list is None else stop_list)
    return [
        [word for word in words if len(word) >= min_length and word not in excluded]
        for words in texts
    ]

# Tokenize every document, then strip stop words and short tokens.
data_words = remove_stopwords(list(sent_to_words(documents)))

In [9]:
import gensim.corpora as corpora

# Vocabulary built from the filtered tokens.
id2word = corpora.Dictionary(data_words)

# Bag-of-words encoding of each document, in the same order as data_words.
corpus = list(map(id2word.doc2bow, data_words))

In [11]:
from pprint import pprint
num_topics = 10
# Fixed seed: without it the topic ids and word weights change on every run, so any
# topic number quoted elsewhere would be meaningless after a re-run.
# NOTE(review): with the multicore trainer, worker scheduling may still cause small
# run-to-run differences -- confirm against the gensim documentation.
random_seed = 42

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       passes=3,
                                       random_state=random_seed
                                      )

# Show the top words of each topic, then wrap the corpus with the trained model.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.007*"model" + 0.005*"analysis" + 0.004*"using" + 0.004*"process" + '
  '0.004*"distribution" + 0.004*"magnetic" + 0.004*"causal" + 0.003*"based" + '
  '0.003*"state" + 0.003*"structure"'),
 (1,
  '0.007*"order" + 0.006*"paper" + 0.005*"model" + 0.005*"theory" + '
  '0.004*"results" + 0.004*"algebra" + 0.004*"study" + 0.004*"functions" + '
  '0.004*"method" + 0.004*"using"'),
 (2,
  '0.015*"networks" + 0.012*"learning" + 0.011*"network" + 0.008*"based" + '
  '0.006*"neural" + 0.005*"results" + 0.005*"algorithm" + 0.005*"graph" + '
  '0.005*"using" + 0.005*"problem"'),
 (3,
  '0.008*"model" + 0.008*"algorithm" + 0.007*"matrix" + 0.005*"methods" + '
  '0.005*"problem" + 0.005*"stochastic" + 0.005*"number" + '
  '0.004*"distribution" + 0.004*"optimal" + 0.004*"estimator"'),
 (4,
  '0.008*"mathbb" + 0.007*"prove" + 0.007*"problem" + 0.006*"graphs" + '
  '0.006*"number" + 0.005*"results" + 0.005*"graph" + 0.005*"mathcal" + '
  '0.005*"group" + 0.005*"groups"'),
 (5,
  '0.011*"learn

In [30]:
unseen_document1 = "NETHIC: A system for automatic text classification using neural networks and hierarchical taxonomies. This paper presents NETHIC, a software system for the automatic classification of textual documents based on hierarchical taxonomies and artificial neural networks. This approach combines the advantages of highly-structured hierarchies of textual labels with the versatility and scalability of neural networks, thus bringing about a textual classifier that displays high levels of performance in terms of both effectiveness and efficiency. The system has first been tested as a general-purpose classifier on a generic document corpus, and then applied to the specific domain tackled by DANTE, a European project that is meant to address criminal and terrorist-related online contents, showing consistent results across both application domains."
# Tokenize the unseen text with the same pipeline used for the training corpus
# (sent_to_words + remove_stopwords) so that it is mapped onto the same vocabulary;
# calling simple_preprocess directly would skip accent stripping and token filtering.
unseen_tokens1 = remove_stopwords(sent_to_words([unseen_document1]))[0]
document = id2word.doc2bow(unseen_tokens1)
# Print every topic the model assigns to the document together with its score.
for index,score in lda_model[document]:
    print("TOPIC: "+ str(index))
    print("SCORE: "+str(score))

TOPIC: 5
SCORE: 0.98747003


In [31]:
unseen_document2 = "An Automatic Text Classification Method Based on Hierarchical Taxonomies, Neural Networks and Document Embedding: The NETHIC Tool. This work describes an automatic text classification method implemented in a software tool called NETHIC, which takes advantage of the inner capabilities of highly-scalable neural networks combined with the expressiveness of hierarchical taxonomies. As such, NETHIC succeeds in bringing about a mechanism for text classification that proves to be significantly effective as well as efficient. The tool had undergone an experimentation process against both a generic and a domain-specific corpus, outputting promising results. On the basis of this experimentation, NETHIC has been now further refined and extended by adding a document embedding mechanism, which has shown improvements in terms of performance on the individual networks and on the whole hierarchical model."
# Tokenize the unseen text with the same pipeline used for the training corpus
# (sent_to_words + remove_stopwords) so that it is mapped onto the same vocabulary;
# calling simple_preprocess directly would skip accent stripping and token filtering.
unseen_tokens2 = remove_stopwords(sent_to_words([unseen_document2]))[0]
document = id2word.doc2bow(unseen_tokens2)
# Print every topic the model assigns to the document together with its score.
for index,score in lda_model[document]:
    print("TOPIC: "+ str(index))
    print("SCORE: "+str(score))

TOPIC: 2
SCORE: 0.11601843
TOPIC: 3
SCORE: 0.02957998
TOPIC: 5
SCORE: 0.84356415
