In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import spacy
import pyLDAvis.gensim_models
import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import yake

In [2]:
# read csv
authors = pd.read_csv('top_20_authors.csv')
publications = pd.read_csv('publications-top_20_authors.csv', sep=',')

In [187]:
# make a dictionary containing the combined abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

authors_texts = {author_id:"" for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    if abstract and isinstance(abstract, str):
        authors_texts[author_id] = authors_texts[author_id] + "\n" + abstract

In [188]:
nlp = spacy.load('en_core_web_md')

EXTRACT KEYWORDS/KEYPHRASES WITH YAKE

In [190]:
kw_extractor = yake.KeywordExtractor()
max_ngram = 2
deduplication_threshold = 0.2
keywords_nr = 5
custom_kw_extractor = yake.KeywordExtractor(lan="en", n=max_ngram, dedupLim=deduplication_threshold, top=keywords_nr, features=None)

text = authors_texts[534]
doc = nlp(text)
remove_entities = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',\
                   'QUANTITY', 'CARDINAL', 'ORDINAL']
# stop_words = ['paper', 'present', 'propose', 'datum', 'people', 'result', 'solution', 'case', 'order', 'base', 'ieee', 'privacy', 'policy']
stop_words = ['paper', 'present', 'propose', 'datum', 'people', 'result', 'solution', 'case', 'order', 'base', 'ieee', 'privacy', 'policy',
             'new', 'old', 'context', 'high', 'different', 'new', 'old', 'research', 'type', 'approach', 'important', 'main', 'range',
             'helpful', 'large', 'difficult', 'available', 'amount', 'useful', 'importance', 'article', 'abstract', 'scale', 'copyright',
             'real', 'quality', 'inconvenience', 'benefit', 'unavailable', 'term', 'condition', 'interest', 'organization', 'use',
             'task', 'student', 'professor', 'teacher', 'university']

transformed_text = ' '.join([token.text for token in doc if token.is_alpha and token.ent_type_ not in remove_entities \
                             and token.lemma_.lower() not in stop_words and token.pos_ != 'ADV'])

keywords = custom_kw_extractor.extract_keywords(transformed_text)
for kw in keywords:
    print(kw)

('distributed systems', 2.901974187368726e-07)
('Opportunistic networks', 9.515088520974068e-07)
('Cloud Computing', 1.5880804478435832e-06)
('mobile device', 3.9487285378171594e-06)
('scheduling algorithms', 8.984441309101113e-06)


EXTRACT TOPICS WITH GENSIM LDA

In [6]:
# make a dictionary containing a list of abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

authors_texts = {author_id:[] for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    if abstract and isinstance(abstract, str):
        authors_texts[author_id].append(abstract)

In [182]:
texts = authors_texts[1284]

remove_pos = ['ADV', 'PRON', 'PART', 'DET', 'SPACE', 'NUM', 'SYM', 'ADP', 'VERB', 'CCONJ']
stop_words = ['paper', 'present', 'propose', 'datum', 'people', 'result', 'solution', 'case', 'order', 'base', 'ieee', 'privacy', 'policy',
             'new', 'old', 'context', 'high', 'different', 'new', 'old', 'research', 'type', 'approach', 'important', 'main', 'range',
             'helpful', 'large', 'difficult', 'available', 'amount', 'useful', 'importance', 'article', 'abstract', 'scale', 'copyright',
             'real', 'quality', 'inconvenience', 'benefit', 'unavailable', 'term', 'condition', 'interest', 'organization', 'use',
             'task', 'student', 'professor', 'teacher', 'university']

# keep only adjectives and nouns
remove_entities = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',\
                   'QUANTITY', 'CARDINAL', 'ORDINAL']

tokens = []
for abstract in texts:
    doc = nlp(abstract)
    t = [token.lemma_.lower() for token in doc if token.is_alpha and token.ent_type_ not in remove_entities \
                             and token.lemma_.lower() not in stop_words and token.pos_ not in remove_pos and not token.is_stop]
    tokens.append(t)

# add bigrams to the token list
bigram = Phrases(tokens, min_count=2, delimiter=' ', threshold=1)
tokens = [bigram[text] for text in tokens]
trigram = Phrases(bigram[tokens], min_count=2, delimiter=' ', threshold=1)
tokens = [bigram[text] for text in tokens]

In [183]:
# remove single words
tokens = [[token for token in text if len(token.split(" ")) > 1] for text in tokens]

In [184]:
print(tokens)

[['opportunistic network', 'selfish node', 'routing process', 'node routing', 'incentive mechanism', 'selfish node', 'detection algorithm', 'incentive mechanism', 'selfish node'], ['selfish node', 'opportunistic network', 'node routing', 'incentive mechanism', 'selfish node', 'detection algorithm', 'incentive mechanism', 'selfish node', 'opportunistic network'], ['need tool', 'conversation analysis'], ['recommender system', 'user experience', 'online environment', 'content collaborative', 'recommender system', 'content collaborative', 'collaborative filtering', 'semantic repository', 'domain natural'], [], ['question answer', 'challenge domain', 'natural language', 'question system', 'similarity measure', 'question answer'], ['natural language', 'processing technique'], ['traditional game', 'game user', 'medical knowledge', 'movement user', 'user movement', 'user movement', 'movement user'], ['cultural heritage'], ['way knowledge', 'website digital', 'text graphic', 'user information',

In [185]:
# create dictionary with gensim
dictionary = Dictionary(tokens)
# create corpus
corpus = [dictionary.doc2bow(text) for text in tokens]

In [186]:
# apply LDA
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=500, num_topics=1, workers=4, passes=200)
topics = lda_model.print_topics(-1)

for idx, topic in topics:
    print(f"{idx}: {topic}")

0: 0.012*"natural language" + 0.009*"e learning" + 0.007*"textual complexity" + 0.007*"chat conversation" + 0.006*"social network" + 0.004*"polyphonic model" + 0.004*"learning process" + 0.003*"artificial intelligence" + 0.003*"machine learning" + 0.003*"collaborative learning"


EXTRACT KEYWORDS FROM PUBLICATION TITLES

In [191]:
# make a dictionary containing the combined titles for each author
author_title_pairs = list(zip(publications['user_id'], publications['title']))

authors_titles = {author_id:"" for author_id in authors["id"]}
for author_id, title in author_title_pairs:
    if title and isinstance(title, str):
        authors_titles[author_id] += "\n" + title

In [234]:
kw_extractor = yake.KeywordExtractor()
max_ngram = 2
deduplication_threshold = 0.2
keywords_nr = 5
custom_kw_extractor = yake.KeywordExtractor(lan="en", n=max_ngram, dedupLim=deduplication_threshold, top=keywords_nr, features=None)

text = authors_titles[562]
doc = nlp(text)

transformed_text = ' '.join([token.text for token in doc if token.is_alpha])

keywords = custom_kw_extractor.extract_keywords(transformed_text)
for kw in keywords:
    print(kw)

('Cloud Computing', 3.0867732827543088e-06)
('Big Data', 3.115603220655388e-06)
('distributed systems', 3.5027023281787467e-06)
('International Workshop', 1.1762803994170617e-05)
('grid environments', 1.7025587706221265e-05)
