In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('datasets/Lezione_7-Topic_modeling/dataset_Research_Article.csv')

In [3]:
titles = dataset['TITLE'] + dataset['ABSTRACT']

In [4]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aless\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def sent_to_words(items):
    for item in items:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(item), deacc=True))

def remove_stopwords(texts):
    return [[word for word in words if word not in stop_words and len(word) >=5] for words in texts]

data_words = list(sent_to_words(titles))
data_words = remove_stopwords(data_words)

In [6]:
data_words

[['reconstructing',
  'specific',
  'effect',
  'predictive',
  'models',
  'allow',
  'specific',
  'inference',
  'analyzing',
  'disease',
  'related',
  'alterations',
  'neuroimaging',
  'given',
  'inference',
  'levels',
  'global',
  'identifiying',
  'condition',
  'presence',
  'local',
  'detecting',
  'condition',
  'effect',
  'individual',
  'measurement',
  'extracted',
  'global',
  'inference',
  'widely',
  'local',
  'inference',
  'specific',
  'effect',
  'rarely',
  'existing',
  'models',
  'often',
  'yield',
  'noisy',
  'detections',
  'composed',
  'dispersed',
  'isolated',
  'islands',
  'article',
  'propose',
  'reconstruction',
  'method',
  'named',
  'improve',
  'specific',
  'detections',
  'predictive',
  'modeling',
  'approaches',
  'particular',
  'binary',
  'classifiers',
  'specifically',
  'reduce',
  'noise',
  'sampling',
  'error',
  'associated',
  'using',
  'finite',
  'sample',
  'examples',
  'train',
  'classifiers',
  'proposed',
  

In [7]:
len(data_words)

20972

In [8]:
import gensim.corpora as corpora 
# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_words]

In [9]:
len(corpus)

20972

In [12]:
import gensim
from pprint import pprint

# number of topics
num_topics = 10

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       passes = 3)
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.012*"graph" + 0.010*"graphs" + 0.007*"problem" + 0.007*"number" + '
  '0.006*"paper" + 0.005*"results" + 0.005*"study" + 0.004*"space" + '
  '0.004*"given" + 0.004*"theory"'),
 (1,
  '0.006*"field" + 0.006*"model" + 0.006*"phase" + 0.005*"magnetic" + '
  '0.005*"energy" + 0.005*"quantum" + 0.005*"using" + 0.004*"temperature" + '
  '0.004*"state" + 0.004*"density"'),
 (2,
  '0.008*"equations" + 0.008*"problem" + 0.007*"method" + 0.006*"algorithm" + '
  '0.006*"equation" + 0.005*"solutions" + 0.005*"learning" + 0.005*"using" + '
  '0.005*"paper" + 0.005*"results"'),
 (3,
  '0.013*"model" + 0.009*"algorithm" + 0.008*"problem" + 0.007*"based" + '
  '0.007*"method" + 0.006*"models" + 0.006*"paper" + 0.006*"learning" + '
  '0.006*"approach" + 0.006*"using"'),
 (4,
  '0.013*"model" + 0.010*"based" + 0.008*"estimation" + 0.007*"models" + '
  '0.007*"proposed" + 0.007*"method" + 0.007*"learning" + 0.006*"using" + '
  '0.006*"methods" + 0.006*"approach"'),
 (5,
  '0.011*"system" + 0.00

In [13]:
unseen_document1 = "NETHIC: A system for automatic text classification using neural networks and hierarchical taxonomies. This paper presents NETHIC, a software system for the automatic classification of textual documents based on hierarchical taxonomies and artificial neural networks. This approach combines the advantages of highly-structured hierarchies of textual labels with the versatility and scalability of neural networks, thus bringing about a textual classifier that displays high levels of performance in terms of both effectiveness and efficiency. The system has first been tested as a general-purpose classifier on a generic document corpus, and then applied to the specific domain tackled by DANTE, a European project that is meant to address criminal and terrorist-related online contents, showing consistent results across both application domains."
title = id2word.doc2bow(simple_preprocess(unseen_document1))
for index, score in lda_model[title]:
    print("TOPIC: "+str(index))
    print("SCORE:"+str(score))

TOPIC: 9
SCORE:0.98747855


In [14]:
unseen_document2 = "An Automatic Text Classification Method Based on Hierarchical Taxonomies, Neural Networks and Document Embedding: The NETHIC Tool. This work describes an automatic text classification method implemented in a software tool called NETHIC, which takes advantage of the inner capabilities of highly-scalable neural networks combined with the expressiveness of hierarchical taxonomies. As such, NETHIC succeeds in bringing about a mechanism for text classification that proves to be significantly effective as well as efficient. The tool had undergone an experimentation process against both a generic and a domain-specific corpus, outputting promising results. On the basis of this experimentation, NETHIC has been now further refined and extended by adding a document embedding mechanism, which has shown improvements in terms of performance on the individual networks and on the whole hierarchical model."
title = id2word.doc2bow(simple_preprocess(unseen_document2))
for index, score in lda_model[title]:
    print("TOPIC: "+str(index))
    print("SCORE:"+str(score))

TOPIC: 2
SCORE:0.07083743
TOPIC: 9
SCORE:0.91681325
