In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import spacy
import pyLDAvis.gensim_models
import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import yake

In [2]:
# read csv
authors = pd.read_csv('top_20_authors.csv')
publications = pd.read_csv('publications-top_20_authors.csv', sep=',')

In [3]:
# make a dictionary containing the combined abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

authors_texts = {author_id:"" for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    if abstract and isinstance(abstract, str):
        authors_texts[author_id] = authors_texts[author_id] + "\n" + abstract

In [4]:
nlp = spacy.load('en_core_web_md')

EXTRACT KEYWORDS/KEYPHRASES WITH YAKE

In [5]:
kw_extractor = yake.KeywordExtractor()
max_ngram = 2
deduplication_threshold = 0.2
keywords_nr = 5
custom_kw_extractor = yake.KeywordExtractor(lan="en", n=max_ngram, dedupLim=deduplication_threshold, top=keywords_nr, features=None)

text = authors_texts[1146]
doc = nlp(text)
remove_entities = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',\
                   'QUANTITY', 'CARDINAL', 'ORDINAL']
stop_words = ['paper', 'present', 'propose', 'datum', 'people', 'result', 'solution', 'case', 'order', 'base', 'ieee', 'privacy', 'policy']

transformed_text = ' '.join([token.text for token in doc if token.is_alpha and token.ent_type_ not in remove_entities \
                             and token.lemma_.lower() not in stop_words and token.pos_ != 'ADV'])

keywords = custom_kw_extractor.extract_keywords(transformed_text)
for kw in keywords:
    print(kw)

('virtual reality', 6.378499149358472e-07)
('Online virtual', 6.796923210272136e-06)
('learning process', 1.3985233895355148e-05)
('game learning', 3.081082700689413e-05)
('computer games', 3.111750572800595e-05)


EXTRACT TOPICS WITH GENSIM LDA

In [6]:
# make a dictionary containing a list of abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

authors_texts = {author_id:[] for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    if abstract and isinstance(abstract, str):
        authors_texts[author_id].append(abstract)

In [53]:
texts = authors_texts[534]

remove_pos = ['ADV', 'PRON', 'PART', 'DET', 'SPACE', 'NUM', 'SYM', 'ADP', 'VERB', 'CCONJ']
stop_words = ['paper', 'present', 'propose', 'datum', 'people', 'result', 'solution', 'case', 'order', 'base', 'ieee', 'privacy', 'policy',
             'new', 'old', 'context']

# processed_texts = []
# for abstract in texts:
#     abstract_nlp = nlp(abstract)
#     t = [token.text.lower() for token in abstract_nlp if token.pos_ not in remove_pos and token.is_alpha \
#          and token.lemma_.lower() not in stop_words]
#     processed_texts.append(' '.join(t))

# tokens = []
# for abstract in processed_texts:
#     abstract_nlp = nlp(abstract)
#     t = [token.lemma_.lower() for token in abstract_nlp.noun_chunks]
#     tokens.append(t)

# keep only adjectives and nouns
remove_entities = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',\
                   'QUANTITY', 'CARDINAL', 'ORDINAL']

tokens = []
for abstract in texts:
    doc = nlp(abstract)
    t = [token.lemma_.lower() for token in doc if token.is_alpha and token.ent_type_ not in remove_entities \
                             and token.lemma_.lower() not in stop_words and token.pos_ not in remove_pos and not token.is_stop]
    tokens.append(t)

# add bigrams to the token list
bigram = Phrases(tokens, min_count=2, delimiter=' ', threshold=1)
tokens = [bigram[text] for text in tokens]
trigram = Phrases(bigram[tokens], min_count=2, delimiter=' ', threshold=1)
tokens = [bigram[text] for text in tokens]

In [54]:
# remove single words
tokens = [[token for token in text if len(token.split(" ")) > 1] for text in tokens]

In [55]:
print(tokens)

[['opportunistic network', 'selfish node', 'routing process', 'node routing', 'incentive mechanism', 'selfish node', 'detection algorithm', 'incentive mechanism', 'selfish node'], ['selfish node', 'opportunistic network', 'node routing', 'incentive mechanism', 'selfish node', 'detection algorithm', 'incentive mechanism', 'selfish node', 'opportunistic network'], ['need tool', 'conversation analysis', 'approach actual'], ['recommender system', 'user experience', 'online environment', 'content collaborative', 'recommender system', 'content collaborative', 'similar article', 'collaborative filtering', 'semantic repository', 'domain natural'], [], ['question answer', 'challenge domain', 'natural language', 'question system', 'similarity measure', 'question answer'], ['natural language', 'processing technique'], ['traditional game', 'use computer', 'different type', 'game user', 'medical knowledge', 'movement user', 'type exergame', 'user movement', 'user movement', 'movement user', 'type e

In [56]:
# create dictionary with gensim
dictionary = Dictionary(tokens)
# create corpus
corpus = [dictionary.doc2bow(text) for text in tokens]

In [57]:
# apply LDA
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=500, num_topics=1, workers=4, passes=100)
topics = lda_model.print_topics(-1)

for idx, topic in topics:
    print(f"{idx}: {topic}")

0: 0.011*"natural language" + 0.008*"e learning" + 0.006*"textual complexity" + 0.006*"chat conversation" + 0.005*"social network" + 0.004*"learning process" + 0.003*"polyphonic model" + 0.003*"machine learning" + 0.003*"artificial intelligence" + 0.003*"web page"


TRY TO REMOVE PREPOSITIONS AFTER BUILDING NGRAMS