In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import spacy
import pyLDAvis.gensim_models
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import yake
from langdetect import detect, detect_langs, DetectorFactory
import fasttext
import re

In [49]:
# Input files: the author list and the publications of those authors.
# An alternative dataset pair exists ('some_authors.csv' with
# 'publications-some_authors.csv'); point the two constants at it to switch.
authors_csv = 'top_20_authors.csv'
publications_csv = 'publications-top_20_authors.csv'

authors = pd.read_csv(authors_csv)
# ',' is already pandas' default separator, so it is not passed explicitly
publications = pd.read_csv(publications_csv)

In [80]:
# make a dictionary containing the combined English abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

# load fastText model
model = fasttext.load_model('lid.176.bin')

authors_texts = {author_id: "" for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    # Skip non-string cells (presumably NaN for missing abstracts - TODO confirm)
    # and strings without any ASCII letter. re.search scans the whole abstract;
    # the previous '^(?=.*[a-zA-Z])' pattern only looked at the first line
    # because '.' does not match a newline.
    if not isinstance(abstract, str) or not re.search('[a-zA-Z]', abstract):
        continue

    # fastText's predict() raises ValueError when the input contains '\n',
    # so language detection runs on a single-line copy of the abstract.
    predictions = model.predict(abstract.replace('\n', ' '))
    language = predictions[0][0].replace('__label__', '')
    if language == 'en':
        # the original (multi-line) abstract is what gets stored
        authors_texts[author_id] += '\n' + abstract



In [81]:
# Load the spaCy pipeline 'en_core_web_lg' once; later cells call `nlp(...)` and
# read entity types, part-of-speech tags and lemmas from the resulting tokens.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Inspect which entity label spaCy assigns to institution-related words
# in the combined abstracts of one author.
text = authors_texts[802]
doc = nlp(text)

# Substrings looked for in the lower-cased token text.
# ("university" was previously misspelled as "univeristy" and never matched.)
institution_terms = ("politehnica", "polytechnic", "faculty", "university")

print(len(doc.ents))
for token in doc:
    token_lower = token.text.lower()
    if any(term in token_lower for term in institution_terms):
        print(token.text, token.ent_type_)

In [87]:
# Custom stop words used on top of the library defaults.
# Same entries and same order as before, laid out with one initial letter per group.
stop_words = [
    # a
    'abstract', 'amount', 'approach', 'article', 'available',
    # b
    'base', 'based', 'benefit', 'bucharest',
    # c
    'case', 'condition', 'conference', 'context', 'copyright',
    # d
    'datum', 'demonstrate', 'demonstrates', 'demonstrated', 'different', 'difficult',
    # e
    'experiment', 'experimental',
    # f
    'faculty',
    # h
    'helpful', 'high',
    # i
    'ieee', 'importance', 'important', 'inconvenience', 'interest', 'interested', 'interests',
    # j
    'jat', 'jats',
    # l
    'laboratory',
    # m
    'main',
    # n
    'new',
    # o
    'obtain', 'obtained', 'obtains', 'old', 'order', 'organization',
    # p
    'paper', 'people', 'policy', 'politehnica', 'polytechnic',
    'present', 'presents', 'presented', 'privacy', 'professor',
    'propose', 'proposes', 'proposed',
    # q
    'quality',
    # r
    'range', 'ranges', 'real', 'recent', 'research', 'researcher', 'result',
    # s
    'scale', 'show', 'shows', 'showed', 'student', 'study', 'studies', 'studied',
    # t
    'task', 'teacher', 'term', 'text', 'title', 'type',
    # u
    'unavailable', 'university', 'useful',
    # w
    'workshop',
]

EXTRACT KEYWORDS/KEYPHRASES WITH YAKE

In [85]:
# YAKE configuration
max_ngram = 3                    # passed to YAKE as `n`
deduplication_threshold = 0.5    # passed to YAKE as `dedupLim`
keywords_nr = 15                 # passed to YAKE as `top`
windowsSize = 1

kw_extractor = yake.KeywordExtractor(
    lan="en",
    n=max_ngram,
    dedupLim=deduplication_threshold,
    top=keywords_nr,
    windowsSize=windowsSize,
)
# add the notebook's custom stop words to the extractor's own set
kw_extractor.stopword_set.update(set(stop_words))

# entity labels whose tokens are removed before keyword extraction
remove_entities = [
    'PERSON', 'NORP', 'FAC', 'GPE', 'LOC',
    'DATE', 'TIME', 'PERCENT', 'MONEY',
    'QUANTITY', 'CARDINAL', 'ORDINAL',
]

text = authors_texts[534]
doc = nlp(text)

# rebuild the text from the tokens that are not part of an unwanted entity
kept_words = []
for token in doc:
    if token.ent_type_ in remove_entities:
        continue
    kept_words.append(token.text)
transformed_text = ' '.join(kept_words)

keywords = kw_extractor.extract_keywords(transformed_text)
for keyword, _score in keywords:
    print(keyword)

distributed systems
mobile cloud computing
cloud computing systems
Data
mobile big data
Opportunistic networks
mobile devices
systems
network management services
big data processing
mobile social networks
network
Services
mobile data traffic
mobile opportunistic cloud


In [58]:
from termcolor import colored

# Function for highlighting keywords
def highlight_keywords(text, keywords):
    """Return `text` with every keyword occurrence wrapped in bold red.

    Args:
        text: The string in which keywords are highlighted.
        keywords: Iterable of ``(keyword, score)`` pairs; only the keyword
            part is used.

    Returns:
        A copy of `text` where each case-sensitive occurrence of a keyword is
        replaced by ``colored(occurrence, 'red', attrs=['bold'])``. `text` is
        returned unchanged when there is no non-empty keyword.
    """
    # Longest phrases first, so that at a given position "big data" is
    # preferred over "data"; ties are ordered alphabetically for determinism.
    phrases = sorted({kw for kw, _ in keywords if kw}, key=lambda kw: (-len(kw), kw))
    if not phrases:
        return text

    # A single regex pass never re-scans text that has already been wrapped.
    # Chained str.replace calls did, so a short keyword found inside an
    # already highlighted phrase injected a nested reset code and cut the
    # highlighting of the longer phrase short.
    pattern = re.compile('|'.join(re.escape(phrase) for phrase in phrases))
    return pattern.sub(lambda match: colored(match.group(0), 'red', attrs=['bold']), text)

# Highlight the extracted keywords in the text and print the result
highlighted_text = highlight_keywords(text, keywords)
print(highlighted_text)


Cities are areas where [1m[31mBig Data[0m is having a real impact. Town planners and administration bodies just need the right tools at their fingertips to consume all the data points that a town or city generates and then be able to turn that into actions that improve peoples’ lives. In this case, [1m[31mBig Data[0m is definitely a phenomenon that has a direct impact on the quality of life for those of us that choose to live in a town or city. Smart Cities of tomorrow will rely not only on sensors within the city infrastructure, but also on a large number of devices that will willingly sense and integrate their data into technological platforms used for introspection into the habits and situations of individuals and city-large communities. Predictions say that cities will generate over 4.1 terabytes per day per square kilometer of urbanized land area by 2016. Handling efficiently such amounts of data is already a challenge. In this paper we present our solutions …
The MonALISA 

EXTRACT TOPICS WITH GENSIM LDA

In [78]:
# make a dictionary containing a list of English abstracts for each author
# (note: this rebinds `authors_texts`, which earlier held one string per author)
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))

# langdetect is non-deterministic unless its seed is fixed; without this the
# set of abstracts classified as English can change from run to run.
DetectorFactory.seed = 0

authors_texts = {author_id: [] for author_id in authors["id"]}
for author_id, abstract in author_publication_pairs:
    # Keep only string cells that contain at least one ASCII letter.
    # re.search scans the whole abstract; the previous '^(?=.*[a-zA-Z])'
    # pattern only looked at the first line because '.' stops at a newline.
    if isinstance(abstract, str) and re.search('[a-zA-Z]', abstract):
        language = detect(abstract)
        if language == 'en':
            authors_texts[author_id].append(abstract)

In [None]:
texts = authors_texts[562]

# Intended to keep nouns and adjectives: tokens tagged with any of these
# part-of-speech labels are discarded.
remove_pos = [
    'ADV', 'PRON', 'PART', 'DET', 'SPACE',
    'NUM', 'SYM', 'ADP', 'VERB', 'CCONJ',
]
# Tokens belonging to an entity with one of these labels are discarded too.
remove_entities = [
    'PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'LANGUAGE',
    'DATE', 'TIME', 'PERCENT', 'MONEY',
    'QUANTITY', 'CARDINAL', 'ORDINAL',
]


def _keep_token(token):
    """Tell whether a spaCy token survives the filters (checked in the original order)."""
    if not token.is_alpha:
        return False
    if token.ent_type_ in remove_entities:
        return False
    if token.lemma_.lower() in stop_words:
        return False
    if token.pos_ in remove_pos:
        return False
    return not token.is_stop


# one list of lower-cased lemmas per abstract
tokens = []
for abstract in texts:
    doc = nlp(abstract)
    tokens.append([token.lemma_.lower() for token in doc if _keep_token(token)])

# Merge frequently co-occurring neighbours into space-joined phrases.
# The second pass runs on the output of the first, so longer phrases can form.
bigram = Phrases(sentences=tokens, min_count=2, threshold=1, delimiter=' ')
tokens = [bigram[document] for document in tokens]
trigram = Phrases(sentences=tokens, min_count=2, threshold=1, delimiter=' ')
tokens = [trigram[document] for document in tokens]

In [None]:
# Keep only multi-word phrases: phrases are joined with a space,
# so any entry containing a space consists of more than one word.
ngrams = [[phrase for phrase in document if " " in phrase] for document in tokens]

In [None]:
# Minimum number of multi-word phrases (summed over all documents) required
# before the phrase-only documents replace the mixed word/phrase tokens.
MIN_NGRAM_COUNT = 100

# sum() over a generator keeps its loop variable local. The previous
# `for text in ngrams` loop overwrote the notebook-level `text` string
# (used by the keyword cells) with a list of phrases.
word_count = sum(len(phrases) for phrases in ngrams)

if word_count > MIN_NGRAM_COUNT:
    tokens = ngrams

In [None]:
# gensim dictionary built from the tokenised documents
dictionary = Dictionary(documents=tokens)
# bag-of-words corpus: one doc2bow result per document
corpus = list(map(dictionary.doc2bow, tokens))

In [None]:
# apply LDA
# Fixed seed so that re-running the notebook yields repeatable topics
# (NOTE(review): with several workers small run-to-run differences may
# still occur - confirm against the gensim documentation).
LDA_RANDOM_STATE = 42

lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=1,
    iterations=300,
    passes=50,
    workers=4,
    random_state=LDA_RANDOM_STATE,
)
# -1 asks for all topics; each entry is an (id, description) pair
topics = lda_model.print_topics(-1)

for _topic_id, topic in topics:
    print(topic)