In [39]:
import re

import en_core_web_md
import fasttext
import matplotlib.pyplot as plt
import pandas as pd
import pyLDAvis.gensim_models
import seaborn as sns
import spacy
import yake
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.models import LdaMulticore
from gensim.models.phrases import Phrases, Phraser
from langdetect import detect, detect_langs, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

sns.set()

In [40]:
# Load the two input tables from CSV files in the working directory.
# Both files are comma-separated, which is pandas' default delimiter.
authors = pd.read_csv('top_20_authors.csv')
publications = pd.read_csv('publications-top_20_authors.csv')

In [41]:
# Build a dictionary mapping every author id to the concatenation of that
# author's English abstracts. Each kept abstract is prefixed with '\n';
# authors without any English abstract map to the empty string.
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

# load fastText model
model = fasttext.load_model('lid.176.bin')

# Collect abstracts in per-author lists and join once at the end instead of
# growing a string inside the loop.
english_abstracts = {author_id: [] for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    # Skip non-string values (e.g. missing cells) and texts without any ASCII
    # letter. re.search looks at the whole text, whereas the previous
    # '^(?=.*[a-zA-Z])' pattern only inspected the first line because '.'
    # does not match a newline.
    if not isinstance(abstract, str) or not re.search('[a-zA-Z]', abstract):
        continue
    # fastText's predict() raises ValueError on input containing '\n', so the
    # language is detected on a single-line copy; the stored abstract is unchanged.
    predictions = model.predict(re.sub(r'\s+', ' ', abstract))
    language = predictions[0][0].replace('__label__', '')
    if language == 'en':
        english_abstracts[author_id].append(abstract)

authors_texts = {author_id: ''.join('\n' + abstract for abstract in abstracts)
                 for author_id, abstracts in english_abstracts.items()}



In [42]:
# Load the 'en_core_web_md' spaCy pipeline once; the keyword and topic cells
# below call `nlp(...)` on the abstracts.
nlp = spacy.load(name='en_core_web_md')

In [43]:
# Custom stop words. The keyword and topic cells drop every token whose
# lower-cased lemma appears in this list, so entries are written as
# lower-case lemmas.
stop_words = [
    'paper', 'present', 'propose', 'show', 'datum', 'people',
    'result', 'solution', 'case', 'order', 'base', 'ieee',
    'privacy', 'policy', 'new', 'old', 'context', 'high',
    'different', 'research', 'type', 'approach', 'important', 'main',
    'range', 'helpful', 'large', 'difficult', 'available', 'amount',
    'useful', 'importance', 'article', 'abstract', 'scale', 'copyright',
    'real', 'quality', 'demonstrate', 'inconvenience', 'benefit', 'unavailable',
    'term', 'condition', 'interest', 'recent', 'obtain', 'title',
    'jat', 'jats', 'organization', 'task', 'student', 'professor',
    'teacher', 'university', 'workshop', 'study', 'text', 'conference',
]

EXTRACT KEYWORDS/KEYPHRASES WITH YAKE

In [44]:
# Extract keyphrases with YAKE from the combined abstracts of one author.
kw_extractor = yake.KeywordExtractor()

# Extractor settings: phrases of up to `max_ngram` words, the deduplication
# limit, and how many keywords to return.
max_ngram = 3
deduplication_threshold = 0.5
keywords_nr = 15
custom_kw_extractor = yake.KeywordExtractor(
    lan="en",
    n=max_ngram,
    dedupLim=deduplication_threshold,
    top=keywords_nr,
    features=None,
)

# Combined text of the author with id 534, parsed with spaCy.
text = authors_texts[534]
doc = nlp(text)

# Tokens belonging to any of these named-entity types are discarded.
remove_entities = [
    'PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'DATE',
    'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL',
]

# Rebuild the text from the surface form of every token that is neither part
# of a removed entity nor a custom stop word (matched on the lower-cased lemma).
transformed_text = ' '.join(
    token.text
    for token in doc
    if not (token.ent_type_ in remove_entities or token.lemma_.lower() in stop_words)
)

# Each extracted item is printed as returned by YAKE: a (phrase, score) tuple.
keywords = custom_kw_extractor.extract_keywords(transformed_text)
for kw in keywords:
    print(kw)

('distributed systems', 3.4364722657217926e-05)
('Mobile Cloud computing', 3.725239951607386e-05)
('cloud computing systems', 5.28856534941652e-05)
('mobile devices', 8.456675262873203e-05)
('Opportunistic networks', 9.144074241533435e-05)
('network management services', 0.00014725209553199654)
('mobile social networks', 0.00014988644505110576)
('systems', 0.00015569770970941547)
('network', 0.00019029201586369998)
('mobile opportunistic cloud', 0.00019605864499052693)
('Services', 0.00022863763957386274)
('mobile', 0.00023057430805365037)
('intelligent transportation systems', 0.00024168145356403626)
('service computing architectures', 0.0002616473841634621)
('distributed network services', 0.00026547025450600136)


EXTRACT KEYWORDS/KEYPHRASES WITH YAKE - from each abstract

In [45]:
# make a dictionary containing a list of English abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

# langdetect is non-deterministic by default; fixing the seed makes repeated
# runs classify every abstract the same way.
DetectorFactory.seed = 0

authors_texts = {author_id: [] for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    # Skip non-string values (e.g. missing cells) and texts without any ASCII
    # letter. re.search looks at the whole text, whereas the previous
    # '^(?=.*[a-zA-Z])' pattern only inspected the first line because '.'
    # does not match a newline.
    if not isinstance(abstract, str) or not re.search('[a-zA-Z]', abstract):
        continue
    try:
        language = detect(abstract)
    except LangDetectException:
        # langdetect could not extract any features from this text; treat it
        # as non-English instead of aborting the whole loop.
        continue
    if language == 'en':
        authors_texts[author_id].append(abstract)

In [46]:
# Concatenate the abstracts of author 534 in consecutive chunks of
# `chunk_size` abstracts, joining the abstracts of a chunk with a single space.
# The final chunk holds whatever remains and may therefore be shorter than
# `chunk_size`.
chunk_size = 100
abstracts = authors_texts[534]
abstract_count = len(abstracts)

new_abstracts = [
    ' '.join(abstracts[start:start + chunk_size])
    for start in range(0, abstract_count, chunk_size)
]


In [47]:
# Extract keyphrases with YAKE separately from every chunk in `new_abstracts`.
kw_extractor = yake.KeywordExtractor()

# Extractor settings: phrases of up to `max_ngram` words, the deduplication
# limit, and how many keywords to return per chunk.
max_ngram = 3
deduplication_threshold = 0.7
keywords_nr = 5
custom_kw_extractor = yake.KeywordExtractor(
    lan="en",
    n=max_ngram,
    dedupLim=deduplication_threshold,
    top=keywords_nr,
    features=None,
)

# Tokens belonging to any of these named-entity types are discarded.
remove_entities = [
    'PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'DATE',
    'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL',
]

for text in new_abstracts:
    doc = nlp(text)
    # Keep the surface form of every token that is neither part of a removed
    # entity nor a custom stop word (matched on the lower-cased lemma).
    transformed_text = ' '.join(
        token.text
        for token in doc
        if not (token.ent_type_ in remove_entities or token.lemma_.lower() in stop_words)
    )

    keywords = custom_kw_extractor.extract_keywords(transformed_text)
    for kw in keywords:
        print(kw)
    # Blank line separates the keyword groups of consecutive chunks.
    print()

('distributed systems', 0.00014751986914513155)
('Mobile Cloud computing', 0.0001555608069588123)
('CPC Program Library', 0.0001928830969001555)
('distributed system technologies', 0.00021192143569025227)
('Opportunistic Mobile Networks', 0.00022074089837374123)

('distributed systems', 0.00010283178410928555)
('mobile cloud computing', 0.00013430140315475136)
('mobile cloud systems', 0.00015297031532179986)
('Cloud computing systems', 0.00020430597772334866)
('distributed system technologies', 0.0002949470001309614)

('Intelligent Transportation Systems', 0.0003160180608889025)
('opportunistic networks', 0.0003567229004126022)
('network management services', 0.00036641204050500255)
('distributed systems', 0.00041184666832745853)
('Opportunistic network applications', 0.0004936438126448163)

('technical professional dedicated', 5.470772391643552e-05)
('web site signifies', 9.401472685159501e-05)
('advancing technology', 0.00026436612094685337)
('distributed systems', 0.0003682250908961

EXTRACT TOPICS WITH GENSIM LDA

In [None]:
# make a dictionary containing a list of English abstracts for each author
# (input for the LDA topic extraction below)
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

# langdetect is non-deterministic by default; fixing the seed makes repeated
# runs classify every abstract the same way.
DetectorFactory.seed = 0

authors_texts = {author_id: [] for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    # Skip non-string values (e.g. missing cells) and texts without any ASCII
    # letter. re.search looks at the whole text, whereas the previous
    # '^(?=.*[a-zA-Z])' pattern only inspected the first line because '.'
    # does not match a newline.
    if not isinstance(abstract, str) or not re.search('[a-zA-Z]', abstract):
        continue
    try:
        language = detect(abstract)
    except LangDetectException:
        # langdetect could not extract any features from this text; treat it
        # as non-English instead of aborting the whole loop.
        continue
    if language == 'en':
        authors_texts[author_id].append(abstract)

In [None]:
# Abstracts of the author with id 829.
texts = authors_texts[829]

# Tokens tagged with one of these part-of-speech tags are dropped. Tags that
# are not listed (e.g. NOUN, ADJ, PROPN) pass this filter.
remove_pos = ['ADV', 'PRON', 'PART', 'DET', 'SPACE', 'NUM', 'SYM', 'ADP', 'VERB', 'CCONJ']
# Tokens belonging to any of these named-entity types are dropped as well.
remove_entities = [
    'PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'LANGUAGE', 'DATE',
    'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL',
]

# One list of lower-cased lemmas per abstract. A token is kept only if it is
# alphabetic and is not a removed entity, a custom stop word, a removed POS
# tag or a spaCy stop word.
tokens = []
for abstract in texts:
    doc = nlp(abstract)
    t = [
        token.lemma_.lower()
        for token in doc
        if token.is_alpha
        and not (
            token.ent_type_ in remove_entities
            or token.lemma_.lower() in stop_words
            or token.pos_ in remove_pos
            or token.is_stop
        )
    ]
    tokens.append(t)

# Merge frequent word pairs into single space-delimited tokens, then run a
# second pass over the merged tokens so that longer phrases can form.
bigram = Phrases(tokens, min_count=2, delimiter=' ', threshold=1)
tokens = [bigram[doc_tokens] for doc_tokens in tokens]
trigram = Phrases(tokens, min_count=2, delimiter=' ', threshold=1)
tokens = [trigram[doc_tokens] for doc_tokens in tokens]

In [None]:
# Keep only multi-word phrases: merged phrases are space-delimited, so a token
# without a space is a single word and is removed.
tokens = [[phrase for phrase in doc_tokens if " " in phrase] for doc_tokens in tokens]

In [None]:
# Build the gensim dictionary from the per-abstract token lists.
dictionary = Dictionary(tokens)
# Convert every token list to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, tokens))

In [None]:
# Fit LDA on the bag-of-words corpus. LDA training is stochastic, so
# `random_state` is fixed to make the printed topics reproducible across runs.
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=300, num_topics=1,
                         workers=4, passes=50, random_state=42)

# print_topics(-1) returns all topics as (topic id, formatted terms) pairs;
# only the formatted terms are printed.
topics = lda_model.print_topics(-1)

for idx, topic in topics:
    print(topic)