In [65]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import spacy
import pyLDAvis.gensim_models
import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import yake

In [92]:
# Load the two input tables from the working directory: the author table
# (its "id" column is used below) and the publication table (its "user_id"
# and "abstract_text" columns are used below).
authors = pd.read_csv('top_20_authors.csv')
publications = pd.read_csv(
    'publications-top_20_authors.csv',
    sep=',',
)

In [159]:
# Build {author id -> one string holding all of that author's abstracts}.
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

# Every author from the author table starts with an empty text, so authors
# without any usable abstract still have an entry.
authors_texts = dict.fromkeys(authors["id"], "")
for author_id, abstract in author_publication_pairs:
    # Ignore empty values and anything that is not a string.
    if not (abstract and isinstance(abstract, str)):
        continue
    # Abstracts are separated by newlines; the combined text therefore
    # begins with a leading "\n".
    authors_texts[author_id] += "\n" + abstract

In [160]:
# Load the 'en_core_web_md' spaCy pipeline once; the cells below reuse `nlp`
# for POS tags, lemmas, entity types and noun chunks.
nlp = spacy.load(name='en_core_web_md')

EXTRACT KEYWORDS/KEYPHRASES WITH YAKE

In [244]:
# Extract the top YAKE keyphrases for one author (id 872) from the combined
# abstract text, after stripping named entities and domain stop words.
kw_extractor = yake.KeywordExtractor()
max_ngram = 2
deduplication_threshold = 0.2
keywords_nr = 5
custom_kw_extractor = yake.KeywordExtractor(
    lan="en",
    n=max_ngram,
    dedupLim=deduplication_threshold,
    top=keywords_nr,
    features=None,
)

# spaCy entity labels whose tokens are removed before keyword extraction.
remove_entities = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
                   'QUANTITY', 'CARDINAL', 'ORDINAL']
# Lemmas that are filtered out (compared against the lower-cased token lemma).
stop_words = ['paper', 'present', 'propose', 'datum', 'people', 'result', 'solution', 'case', 'order', 'base', 'ieee', 'privacy', 'policy']

text = authors_texts[872]
doc = nlp(text)

# Keep the surface form of alphabetic tokens that are neither part of an
# unwanted entity nor a stop-word lemma.
kept_words = (
    token.text
    for token in doc
    if token.is_alpha
    and not (token.ent_type_ in remove_entities or token.lemma_.lower() in stop_words)
)
transformed_text = ' '.join(kept_words)

# Print each extracted keyword entry on its own line.
keywords = custom_kw_extractor.extract_keywords(transformed_text)
for kw in keywords:
    print(kw)

('neural networks', 1.7430880205276488e-06)
('control system', 2.5896852356877334e-06)
('image processing', 2.8179481095091054e-06)
('process control', 5.760868421795922e-06)
('Unmanned Aerial', 9.571797529239613e-06)


EXTRACT TOPICS WITH GENSIM LDA

In [131]:
# Build {author id -> list of that author's abstracts}, one entry per usable abstract.
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

# Give every author from the author table its own (initially empty) list.
authors_texts = {}
for author_id in authors["id"]:
    authors_texts[author_id] = []

for author_id, abstract in author_publication_pairs:
    # Ignore empty values and anything that is not a string.
    if not (abstract and isinstance(abstract, str)):
        continue
    authors_texts[author_id].append(abstract)

In [153]:
# Prepare the LDA input for one author (id 534).
# Expects authors_texts[534] to be a list of abstract strings (one per publication).
texts = authors_texts[534]

# POS tags that are removed before noun chunks are extracted.
remove_pos = ['ADV', 'PRON', 'PART', 'DET', 'SPACE', 'NUM', 'SYM', 'PROPN']
# Lemmas that are filtered out (compared against the lower-cased token lemma).
stop_words = ['paper', 'present', 'propose', 'datum', 'people', 'result', 'solution', 'case', 'area', 'that', 'order']

# Pass 1: per abstract, keep lower-cased alphabetic tokens whose POS tag and
# lemma are not excluded, and join them back into a plain string.
processed_texts = []
for abstract in texts:
    abstract_nlp = nlp(abstract)
    t = [token.text.lower() for token in abstract_nlp if token.pos_ not in remove_pos and token.is_alpha
         and token.lemma_.lower() not in stop_words]
    processed_texts.append(' '.join(t))

# Pass 2: re-parse the filtered text and use the lower-cased lemma of every
# noun chunk as a token (a token may therefore contain several words).
tokens = []
for abstract in processed_texts:
    abstract_nlp = nlp(abstract)
    t = [chunk.lemma_.lower() for chunk in abstract_nlp.noun_chunks]
    tokens.append(t)

# add bigrams and trigrams to the token list
bigram = Phrases(tokens, min_count=2, delimiter=' ', threshold=1)
tokens = [bigram[text] for text in tokens]
# Train the second-level model on the bigram-merged tokens and apply *that*
# model; applying `bigram` again here would leave `trigram` unused and never
# merge three-part phrases.
trigram = Phrases(tokens, min_count=2, delimiter=' ', threshold=1)
tokens = [trigram[text] for text in tokens]

In [154]:
# Inspect the token lists (one inner list per abstract) before they are
# turned into a gensim dictionary and corpus.
print(tokens)



In [155]:
# Build the gensim Dictionary over all token lists.
dictionary = Dictionary(tokens)
# Convert each token list into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, tokens))

In [157]:
# Fit a single-topic LDA model on the author's corpus and print its topic.
# LDA training is stochastic: a fixed seed makes the printed topic repeatable
# across re-runs. NOTE(review): with several worker processes gensim may still
# show small run-to-run variation — confirm, or switch to LdaModel if exact
# reproducibility is required.
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    iterations=100,
    num_topics=1,
    workers=4,
    passes=15,
    random_state=42,
)
# -1 requests all topics.
topics = lda_model.print_topics(-1)

for idx, topic in topics:
    print(f"{idx}: {topic}")

0: 0.003*"application" + 0.003*"user" + 0.003*"node" + 0.003*"service" + 0.003*"device" + 0.003*"information" + 0.003*"system" + 0.002*"technology" + 0.002*"mobile device" + 0.002*"algorithm"


EXTRACT KEYWORDS/KEYPHRASES WITH SPACY NOUN CHUNKS AND YAKE

In [28]:
# get most frequent noun_chunks
from collections import Counter
from spacy.language import Language
import spacy_cleaner
from spacy_cleaner import processing, Cleaner

# `authors_texts` is built in two shapes in this notebook: one combined string
# per author, or a list of abstracts per author. The cleaner and `nlp` need a
# single string, so join the list form instead of failing on it.
author_abstracts = authors_texts[534]
if isinstance(author_abstracts, str):
    text = author_abstracts
else:
    text = "\n".join(author_abstracts)

# Cleaning pipeline: drop stop words, punctuation and numbers, then replace
# the remaining tokens by their lemma.
cleaner = Cleaner(
    nlp,
    processing.remove_stopword_token,
    processing.remove_punctuation_token,
    processing.remove_number_token,
    processing.mutate_lemma_token,
)

# clean() takes a list of texts; a single text is passed, so take result [0].
text = cleaner.clean([text])[0]
doc = nlp(text)

# Count the lower-cased lemma of every noun chunk, most frequent first.
tokens = [chunk.lemma_.lower() for chunk in doc.noun_chunks]
print(Counter(tokens).most_common())

Cleaning Progress: 100%|██████████████████████████████| 1/1 [00:12<00:00, 12.85s/it]




In [26]:
# extract keywords
kw_extractor = yake.KeywordExtractor()
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
numOfKeywords = 100
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
        
for author_id in authors_texts:
    texts = authors_texts[author_id]
    print(author_id)
    
    for text in texts:
        # doc = nlp(text)
        # print(doc.ents)

        keywords = custom_kw_extractor.extract_keywords(text)
        for kw in keywords:
            print(kw)

    print()

829

1672


KeyboardInterrupt: 