In [1394]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import spacy
from spacy import displacy
from spacy.language import Language
import pyLDAvis.gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import yake
import fasttext
import re

In [1239]:
# read csv
authors = pd.read_csv('input/top_20_authors.csv')
publications = pd.read_csv('input/publications-top_20_authors.csv', sep=',')

In [1356]:
authors = pd.read_csv('input/some_authors.csv')
publications = pd.read_csv('input/publications-some_authors.csv', sep=',')

In [1357]:
# make a dictionary containing the combined abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))
texts = {author_id: [] for author_id in authors["id"]}

# load fastText model
model = fasttext.load_model('lid.176.bin')
for author_id, abstract in author_publication_pairs:
    if abstract and isinstance(abstract, str) and re.match('^(?=.*[a-zA-Z])', abstract):
        # predict the language
        predictions = model.predict(abstract)
        language = predictions[0][0].replace('__label__', '')
        # keep only texts written in English
        if language == 'en':
            texts[author_id].append(abstract)



In [1249]:
stop_words = ['abstract', 'al', 'amount', 'approach', 'article', 'available', 'base', 'based', 'benefit',
              'bucharest',
              'case', 'category', 'condition', 'conference', 'context', 'copyright', 'datum', 'demonstrate',
              'demonstrates', 'demonstrated',
              'different', 'difficult', 'et', 'experiment', 'experimental', 'faculty', 'helpful', 'high',
              'ieee', 'importance', 'important', 'inconvenience', 'interest', 'interested', 'interests', 'jat',
              'jats', 'laboratory',
              'main', 'multiple', 'new', 'obtain', 'obtained', 'obtains', 'old', 'order', 'organization', 'paper',
              'people', 'policy', 'politehnica', 'polytechnic',
              'present', 'presents', 'presented', 'privacy', 'professor', 'propose', 'proposes', 'proposed',
              'quality', 'range', 'ranges', 'real',
              'recent', 'research', 'researcher', 'result', 'scale', 'show', 'shows', 'showed', 'student', 'study',
              'subject', 'studies', 'studied', 'task',
              'teacher', 'term', 'text', 'title', 'type', 'unavailable', 'university', 'useful',
              'workshop']

EXTRACT KEYWORDS/KEYPHRASES WITH YAKE

In [1391]:
nlp = spacy.load('en_core_web_lg', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])

In [1389]:
nlp = spacy.load('en_core_web_trf', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])

In [1376]:
abstract_list = texts[549]

In [1378]:
# remove abstracts that contain mostly person and organization names
clean_abstract_list = []

for abstract in abstract_list:
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)
    person_orgs_count = 0
    other_words_count = 0

    for token in doc:
        if token.ent_type_ == 'PERSON' or token.ent_type_ == 'ORG':
            person_orgs_count += 1
        elif token.is_alpha:
            other_words_count += 1

    if person_orgs_count < other_words_count:
        clean_abstract_list.append(abstract)

In [1379]:
# concatenate the abstracts into a single string
text = '\n'.join(clean_abstract_list)

In [1380]:
from time import perf_counter

In [1393]:
# remove some named entities
t1_start = perf_counter()
nlp.max_length = len(text) + 1000
doc = nlp(text)
remove_entities = ['PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'DATE', 'TIME', 'PERCENT', 'MONEY',
                   'QUANTITY', 'CARDINAL', 'ORDINAL']

transformed_text = ' '.join([token.text for token in doc if token.ent_type_ not in remove_entities])
t1_stop = perf_counter()
print(t1_stop - t1_start)

0.10986020002746955


In [1382]:
# set parameters for yake keyword extractor
max_ngram = 3
deduplication_threshold = 0.5
keywords_nr = 15
windows_size = 1

kw_extractor = yake.KeywordExtractor(lan="en", n=max_ngram, dedupLim=deduplication_threshold,
                                     top=keywords_nr, windowsSize=windows_size)
# add custom stop words to the default set from yake
kw_extractor.stopword_set.update(set(stop_words))

# extract keywords
keywords = kw_extractor.extract_keywords(transformed_text)
for kw, _ in keywords:
    print(kw)

monitoring project budgets
project budget monitoring
percentage execution budgets
external drive magnet
cold plastic deformation
Human Resources Development
efficient project management
Sectoral Operational Program
magnetic drive pumps
plastic deformation equipments
total manufacturing cost
product procurement price
induced magnetic field
imposed major penalties
clauses imposed major


In [1383]:
print(len(doc.ents))
# print(doc.ents)
for tok in doc:
    if tok.ent_type_ == "ORG":
        print(tok.text + " " + tok.ent_type_)

35
Agricultural ORG
the ORG
Sectoral ORG
Operational ORG
Program ORG
for ORG
Human ORG
Resources ORG
Development ORG
National ORG
Institute ORG
of ORG
Optoelectronics ORG
TiAlN ORG
PVD ORG
CATIA ORG


In [1384]:
org_count = 0
for ent in doc.ents:
    if ent.label_ == 'ORG':
        print(ent.text + ' ' + ent.label_)
        org_count += 1

Agricultural ORG
the Sectoral Operational Program for Human Resources Development ORG
National Institute of Optoelectronics ORG
TiAlN PVD ORG
CATIA ORG


In [1385]:
print(org_count)

5


In [1386]:
for ent in doc.ents:
    if ent.label_ == 'ORG' and ('Big Data' in ent.text or 'Cloud' in ent.text):
        print(ent.text + ' ' + ent.label_)

In [1387]:
displacy.render(doc, style="ent")

  from IPython.core.display import HTML, display


In [1388]:
from termcolor import colored

# Funcție pentru evidențierea cuvintelor cheie
def highlight_keywords(text, keywords):
    for kw, _ in keywords:
        text = text.replace(kw, colored(kw, 'red', attrs=['bold']))
    return text

# Evidențierea cuvintelor cheie în text
highlighted_text = highlight_keywords(text, keywords)
print(highlighted_text)

The paper presents the current stage of constructive-technological and functional researches on [1m[31mmagnetic drive pumps[0m. This type of pump is provided with a closed system for driving and pumping the fluid. It is envisaged that the rotation of the motor shaft is done by induction of a magnetic field without external drive from an electric motor, as in the case of classical hydraulic pumps. The studied pumps use a construction system containing an [1m[31mexternal drive magnet[0m and an internal magnet that is connected to an impeller that moves the fluid. The drive magnet and the inner magnet are separated by a housing, resulting in less sealing. Driving the pump with the [1m[31minduced magnetic field[0m helps in avoiding fluid leakage and in performance optimization.
<jats:p>The paper presents a method for analysing the variance of manufacturing cost and price components using a scenario method. The analysis model is based on the usual relations for calculus of the [1m

EXTRACT KEYWORDS WITH LDA

In [1221]:
abstract_list = texts[1146]

In [1222]:
nlp = spacy.load('en_core_web_lg')

In [1223]:
# remove abstracts that contain mostly person and organization names
clean_abstract_list = []

for abstract in abstract_list:
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)
    person_orgs_count = 0
    other_words_count = 0

    for token in doc:
        if token.ent_type_ == 'PERSON' or token.ent_type_ == 'ORG':
            person_orgs_count += 1
        elif token.is_alpha:
            other_words_count += 1

    if person_orgs_count < other_words_count:
        clean_abstract_list.append(abstract)

In [1224]:
# keep only adjectives and nouns
remove_pos = ['ADV', 'PRON', 'PART', 'DET', 'SPACE', 'NUM', 'SYM', 'ADP', 'VERB', 'CCONJ', 'INTJ']
remove_entities = ['PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
                   'QUANTITY', 'CARDINAL', 'ORDINAL']

tokens = []
for abstract in clean_abstract_list:
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)
    t = [token.lemma_.upper() for token in doc if token.is_alpha and token.ent_type_ not in remove_entities
                             and token.lemma_.lower() not in stop_words and token.pos_ not in remove_pos and not token.is_stop]
    tokens.append(t)

# add bigrams to the token list
bigram = Phrases(tokens, min_count=3, delimiter=' ', threshold=1)
tokens = [bigram[text] for text in tokens]
trigram = Phrases(tokens, min_count=3, delimiter=' ', threshold=1)
tokens = [trigram[text] for text in tokens]

In [1225]:
# remove single words
ngrams = [[token for token in text if len(token.split(" ")) > 1] for text in tokens]

In [1226]:
word_count = 0
for text in ngrams:
    word_count += len(text)

if word_count > 100:
    tokens = ngrams

In [1227]:
print(word_count)

2063


In [1228]:
# create dictionary with gensim
dictionary = Dictionary(tokens)
# create corpus
corpus = [dictionary.doc2bow(text) for text in tokens]

In [1236]:
# apply LDA
num_topics = 1
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=1, num_topics=num_topics, workers=3, passes=1)

In [1237]:
for i in range(num_topics):
    topics = lda_model.show_topic(topicid=i, topn=15)
    words = [word for word, _ in topics]

    for w in words:
        print(w)
        
    print()

VIRTUAL REALITY
VIRTUAL ENVIRONMENT
SMITH CHART
VIRTUAL SPACE
SOUND SOURCE
SOUND LOCALIZATION
SOUND VISION
FEAR LEVEL
LEARNING PROCESS
VIRTUAL WORLD
HUMAN BODY
MIXED REALITY
LARGE NUMBER
SENSORY SUBSTITUTION DEVICE
HUMAN HEALTH



In [1231]:
cm = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary, coherence='c_v', topn=15)
coherence = cm.get_coherence()
print(coherence)

0.5768884219744295


In [1232]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)