In [107]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import spacy
from spacy import displacy
from spacy.language import Language
import pyLDAvis.gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import yake
from langdetect import detect, detect_langs, DetectorFactory
import fasttext
import re

In [564]:
# read csv
authors = pd.read_csv('input/top_20_authors.csv')
publications = pd.read_csv('input/publications-top_20_authors.csv', sep=',')

In [533]:
authors = pd.read_csv('input/some_authors.csv')
publications = pd.read_csv('input/publications-some_authors.csv', sep=',')

In [565]:
# make a dictionary containing the combined abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))
texts = {author_id: [] for author_id in authors["id"]}

# load fastText model
model = fasttext.load_model('lid.176.bin')
i = 0
for author_id, abstract in author_publication_pairs:
    if abstract and isinstance(abstract, str) and re.match('^(?=.*[a-zA-Z])', abstract):
        # predict the language
        predictions = model.predict(abstract)
        language = predictions[0][0].replace('__label__', '')
        # keep only texts written in English
        if language == 'en':
            texts[author_id].append(abstract)



In [247]:
stop_words = ['abstract', 'al', 'amount', 'approach', 'article', 'available', 'base', 'based', 'benefit',
              'bucharest',
              'case', 'category', 'condition', 'conference', 'context', 'copyright', 'datum', 'demonstrate',
              'demonstrates', 'demonstrated',
              'different', 'difficult', 'et', 'experiment', 'experimental', 'faculty', 'helpful', 'high',
              'ieee', 'importance', 'important', 'inconvenience', 'interest', 'interested', 'interests', 'jat',
              'jats', 'laboratory',
              'main', 'multiple', 'new', 'obtain', 'obtained', 'obtains', 'old', 'order', 'organization', 'paper',
              'people', 'policy', 'politehnica', 'polytechnic',
              'present', 'presents', 'presented', 'privacy', 'professor', 'propose', 'proposes', 'proposed',
              'quality', 'range', 'ranges', 'real',
              'recent', 'research', 'researcher', 'result', 'scale', 'show', 'shows', 'showed', 'student', 'study',
              'subject', 'studies', 'studied', 'task',
              'teacher', 'term', 'text', 'title', 'type', 'unavailable', 'university', 'useful',
              'workshop']

EXTRACT KEYWORDS/KEYPHRASES WITH YAKE

In [227]:
nlp = spacy.load('en_core_web_lg', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])

In [237]:
abstract_list = texts[584]

In [238]:
# remove abstracts that contain mostly person and organization names
clean_abstract_list = []

for abstract in abstract_list:
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)
    person_orgs_count = 0
    other_words_count = 0

    for token in doc:
        if token.ent_type_ == 'PERSON' or token.ent_type_ == 'ORG':
            person_orgs_count += 1
        elif token.is_alpha:
            other_words_count += 1

    if person_orgs_count < other_words_count:
        clean_abstract_list.append(abstract)

In [239]:
# concatenate the abstracts into a single string
text = '\n'.join(clean_abstract_list)

In [240]:
# remove some named entities
nlp.max_length = len(text) + 1000
doc = nlp(text)
remove_entities = ['PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'DATE', 'TIME', 'PERCENT', 'MONEY',
                   'QUANTITY', 'CARDINAL', 'ORDINAL']

transformed_text = ' '.join([token.text for token in doc if token.ent_type_ not in remove_entities])

In [241]:
print(len(doc.ents))
# print(doc.ents)
for tok in doc:
    if tok.ent_type_ == "ORG":
        print(tok.text + " " + tok.ent_type_)

1487
the ORG
CONPROF ORG
Project ORG
IPA ORG
SA ORG
the ORG
University ORG
POLITEHNICA ORG
of ORG
Bucharest ORG
the ORG
Romanian ORG
- ORG
American ORG
University ORG
ROCT ORG
Romanian ORG
Coordination ORG
of ORG
Training ORG
Firms ORG
IBM ORG
the ORG
Faculty ORG
of ORG
Engineering ORG
in ORG
Foreign ORG
Languages ORG
the ORG
University ORG
POLITEHNICA ORG
of ORG
Bucharest ORG
Business ORG
Administration ORG
and ORG
Engineering ORG
INNOV8 ORG
IBM ORG
ARVEE ORG
DAU ORG
Daily ORG
Active ORG
Users ORG
MAU ORG
Monthly ORG
Active ORG
Users ORG
PCU ORG
Peak ORG
Current ORG
Users ORG
UAC ORG
LTV ORG
Lifetime ORG
Network ORG
Value ORG
Coulrophobia ORG
e ORG
- ORG
Learning ORG
International ORG
Project ORG
Management ORG
Association ORG
Web ORG
of ORG
Science ORG
the ORG
Romanian ORG
Association ORG
of ORG
Project ORG
Management ORG
the ORG
International ORG
Project ORG
Management ORG
Association ORG
IPMA ORG
the ORG
Association ORG
for ORG
the ORG
preparation ORG
process ORG
University ORG
Pol

In [251]:
# set parameters for yake keyword extractor
max_ngram = 3
deduplication_threshold = 0.8
keywords_nr = 15
windows_size = 1

kw_extractor = yake.KeywordExtractor(lan="en", n=max_ngram, dedupLim=deduplication_threshold,
                                     top=keywords_nr, windowsSize=windows_size)
# add custom stop words to the default set from yake
kw_extractor.stopword_set.update(set(stop_words))

# extract keywords
keywords = kw_extractor.extract_keywords(transformed_text)
for kw, _ in keywords:
    print(kw)

Natural Language Processing
machine learning models
Natural Language
Language Processing
Language Processing techniques
language models
learning models
trained language models
learning
language processing tools
BERT language model
Cohesion Network Analysis
FPGA Spartan III
Language
game learning environment


In [190]:
displacy.render(doc, style="ent")

  from IPython.core.display import HTML, display


In [148]:
from termcolor import colored

# Funcție pentru evidențierea cuvintelor cheie
def highlight_keywords(text, keywords):
    for kw, _ in keywords:
        text = text.replace(kw, colored(kw, 'red', attrs=['bold']))
    return text

# Evidențierea cuvintelor cheie în text
highlighted_text = highlight_keywords(text, keywords)
print(highlighted_text)

Today, almost everyone is connected to the Internet and uses different [1m[31mCloud[0m solutions to store, deliver and process [1m[31mdata[0m. [1m[31m[1m[31mCloud[0m computing[0m assembles large networks of virtualized services such as hardware and software resources. The new era in which ICT penetrated almost all domains (healthcare, aged-care, social assistance, surveillance, education, etc.) creates the need of new multimedia content-driven [1m[31mapplications[0m. These [1m[31mapplications[0m generate huge amount of [1m[31mdata[0m, require gathering, processing and then aggregation in a fault-tolerant, reliable and secure heterogeneous distributed [1m[31msystem[0m created by a mixture of [1m[31mCloud[0m [1m[31msystem[0ms (public/private), mobile devices networks, desktop-based clusters, etc. In this context dynamic resource provisioning for [1m[31mBig Data[0m application [1m[31mscheduling[0m became a challenge in modern [1m[31msystem[0ms. We pr

EXTRACT TOPICS WITH GENSIM LDA

In [1201]:
abstract_list = texts[1246]

In [None]:
nlp = spacy.load('en_core_web_lg')

In [None]:
# remove abstracts that contain mostly person and organization names
clean_abstract_list = []

for abstract in abstract_list:
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)
    person_orgs_count = 0
    other_words_count = 0

    for token in doc:
        if token.ent_type_ == 'PERSON' or token.ent_type_ == 'ORG':
            person_orgs_count += 1
        elif token.is_alpha:
            other_words_count += 1

    if person_orgs_count < other_words_count:
        clean_abstract_list.append(abstract)

In [None]:
# keep only adjectives and nouns
remove_pos = ['ADV', 'PRON', 'PART', 'DET', 'SPACE', 'NUM', 'SYM', 'ADP', 'VERB', 'CCONJ', 'INTJ']
remove_entities = ['PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
                   'QUANTITY', 'CARDINAL', 'ORDINAL']

tokens = []
for abstract in clean_abstract_list:
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)
    t = [token.lemma_.upper() for token in doc if token.is_alpha and token.ent_type_ not in remove_entities
                             and token.lemma_.lower() not in stop_words and token.pos_ not in remove_pos and not token.is_stop]
    tokens.append(t)

# add bigrams to the token list
bigram = Phrases(tokens, min_count=3, delimiter=' ', threshold=1)
tokens = [bigram[text] for text in tokens]
trigram = Phrases(tokens, min_count=3, delimiter=' ', threshold=1)
tokens = [trigram[text] for text in tokens]

In [1195]:
# remove single words
ngrams = [[token for token in text if len(token.split(" ")) > 1] for text in tokens]

In [1196]:
word_count = 0
for text in ngrams:
    word_count += len(text)

if word_count > 100:
    tokens = ngrams

In [1197]:
print(word_count)

2880


In [1198]:
# create dictionary with gensim
dictionary = Dictionary(tokens)
# create corpus
corpus = [dictionary.doc2bow(text) for text in tokens]

In [1199]:
# apply LDA
num_topics = 2
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=1, num_topics=num_topics, workers=3, passes=1)

In [1200]:
duplicates = 0
word_list = []
for i in range(num_topics):
    topics = lda_model.show_topic(topicid=i, topn=15)
    words = [word for word, _ in topics]

    for w in words:
        # print(w)
        if w in word_list:
            duplicates += 1
        else:
            word_list.append(w)
        
    # print()
print(duplicates)

11


In [1025]:
cm = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary, coherence='c_v', topn=15)
coherence = cm.get_coherence()
print(coherence)

0.5953481250532997


In [1026]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)