In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import spacy
import pyLDAvis.gensim_models
import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from gensim.models import CoherenceModel

In [2]:
# read csv
authors = pd.read_csv('top_20_authors.csv')

publications = pd.read_csv('publications-top_20_authors.csv', sep=',')
print(publications['abstract_text'][0])

Cities are areas where Big Data is having a real impact. Town planners and administration bodies just need the right tools at their fingertips to consume all the data points that a town or city generates and then be able to turn that into actions that improve peoples’ lives. In this case, Big Data is definitely a phenomenon that has a direct impact on the quality of life for those of us that choose to live in a town or city. Smart Cities of tomorrow will rely not only on sensors within the city infrastructure, but also on a large number of devices that will willingly sense and integrate their data into technological platforms used for introspection into the habits and situations of individuals and city-large communities. Predictions say that cities will generate over 4.1 terabytes per day per square kilometer of urbanized land area by 2016. Handling efficiently such amounts of data is already a challenge. In this paper we present our solutions …


In [3]:
# tokenize, lemmatize, remove stop words
nlp = spacy.load('en_core_web_md')
remove_pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM']

tokens = []
for abstract in publications['abstract_text']:
    if abstract and isinstance(abstract, str):
        abstract_nlp = nlp(abstract)

        t = []
        for tok in abstract_nlp:
            if tok.pos_ not in remove_pos and not tok.is_stop and tok.is_alpha:
                t.append(tok.lemma_.lower())

        tokens.append(t)

print(tokens)



In [4]:
# create dictionary with gensim
dictionary = Dictionary(tokens)
print(dictionary.token2id)



In [5]:
corpus = [dictionary.doc2bow(document) for document in tokens]

In [6]:
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

In [7]:
lda_model.print_topics(-1)

[(0,
  '0.037*"de" + 0.009*"si" + 0.008*"în" + 0.008*"cu" + 0.007*"la" + 0.007*"şi" + 0.006*"o" + 0.006*"pentru" + 0.006*"fost" + 0.006*"au"'),
 (1,
  '0.011*"model" + 0.010*"base" + 0.009*"system" + 0.008*"language" + 0.008*"analysis" + 0.008*"paper" + 0.008*"learning" + 0.006*"present" + 0.006*"text" + 0.005*"network"'),
 (2,
  '0.014*"system" + 0.011*"present" + 0.010*"network" + 0.010*"paper" + 0.009*"datum" + 0.008*"application" + 0.007*"base" + 0.006*"solution" + 0.006*"mobile" + 0.006*"device"'),
 (3,
  '0.012*"extract" + 0.007*"acid" + 0.007*"compound" + 0.006*"study" + 0.006*"activity" + 0.005*"antioxidant" + 0.005*"de" + 0.005*"result" + 0.005*"cell" + 0.004*"method"'),
 (4,
  '0.013*"material" + 0.009*"obtain" + 0.009*"nanoparticle" + 0.008*"study" + 0.008*"property" + 0.007*"drug" + 0.006*"cell" + 0.006*"base" + 0.005*"application" + 0.005*"method"'),
 (5,
  '0.007*"property" + 0.007*"obtain" + 0.007*"temperature" + 0.006*"result" + 0.006*"high" + 0.006*"ceramic" + 0.005*"p

In [None]:
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=300, num_topics=10, workers = 4, passes=50)

In [None]:
lda_model.print_topics(-1)


In [None]:
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
lda_model.show_topics()

In [None]:
bigram = Phrases(tokens)
tokens = [bigram[line] for line in tokens]

In [None]:
print(tokens)


In [None]:
for i, abstract in enumerate(tokens):
    for j, token in enumerate(abstract):
        tokens[i][j] = token.replace("_", " ")

print(tokens)

In [None]:
new_tokens = []

for abstract in tokens:
    new_abstract = []
    for token in abstract:
        new_abstract.append(token.replace("_", " "))
    new_tokens.append(new_abstract)

print(new_tokens)

In [None]:
# create dictionary with gensim
dictionary = Dictionary(new_tokens)
print(dictionary.token2id)

In [None]:
corpus = [dictionary.doc2bow(document) for document in new_tokens]

In [None]:
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

In [None]:
lda_model.print_topics(-1)




GET EACH AUTHOR'S PUBLICATIONS AND DO THE STEPS ABOVE TO FIND A TOPIC FOR EACH AUTHOR

In [8]:
print(authors[["last_name", "first_name"]])

       last_name              first_name
0          DATCU                   Mihai
1     GRUMEZESCU         Alexandru Mihai
2     ANDRONESCU               Ecaterina
3   TRAUSAN-MATU                  STEFAN
4          VOICU                Gheorghe
5          FICAI                   Anton
6          DOBRE           Ciprian Mihai
7      SEMENESCU                Augustin
8           VLAD               MAGDALENA
9            POP                  Florin
10     UNGUREANU                Nicoleta
11      PETRESCU             Florian Ion
12       STASTNY                   PETER
13         BIRIS            SORIN STEFAN
14       DASCALU                   Mihai
15       POPESCU                     Dan
16          RADU          Gabriel Lucian
17          IOVU                   Horia
18        Meghea                 Aurelia
19    MOLDOVEANU  ALIN - DRAGOS - BOGDAN


In [9]:
authors_names = list(zip(authors["last_name"], authors["first_name"], authors["id"]))
print(authors_names)

[('DATCU', 'Mihai', 829), ('GRUMEZESCU', 'Alexandru Mihai', 1672), ('ANDRONESCU', 'Ecaterina', 841), ('TRAUSAN-MATU', 'STEFAN', 1284), ('VOICU', 'Gheorghe', 1225), ('FICAI', 'Anton', 1849), ('DOBRE', 'Ciprian Mihai', 534), ('SEMENESCU', 'Augustin', 733), ('VLAD', 'MAGDALENA', 69354), ('POP', 'Florin', 562), ('UNGUREANU', 'Nicoleta', 1541), ('PETRESCU', 'Florian Ion', 1297), ('STASTNY', 'PETER', 38845), ('BIRIS', 'SORIN STEFAN', 1047), ('DASCALU', 'Mihai', 584), ('POPESCU', 'Dan', 872), ('RADU', 'Gabriel Lucian', 1292), ('IOVU', 'Horia', 1246), ('Meghea', 'Aurelia', 68995), ('MOLDOVEANU', 'ALIN - DRAGOS - BOGDAN', 1146)]


In [10]:
authors_names = [(name[0].upper(), name[1].split(" ")[0].upper(), name[2]) for name in list(authors_names)]
print(authors_names)

[('DATCU', 'MIHAI', 829), ('GRUMEZESCU', 'ALEXANDRU', 1672), ('ANDRONESCU', 'ECATERINA', 841), ('TRAUSAN-MATU', 'STEFAN', 1284), ('VOICU', 'GHEORGHE', 1225), ('FICAI', 'ANTON', 1849), ('DOBRE', 'CIPRIAN', 534), ('SEMENESCU', 'AUGUSTIN', 733), ('VLAD', 'MAGDALENA', 69354), ('POP', 'FLORIN', 562), ('UNGUREANU', 'NICOLETA', 1541), ('PETRESCU', 'FLORIAN', 1297), ('STASTNY', 'PETER', 38845), ('BIRIS', 'SORIN', 1047), ('DASCALU', 'MIHAI', 584), ('POPESCU', 'DAN', 872), ('RADU', 'GABRIEL', 1292), ('IOVU', 'HORIA', 1246), ('MEGHEA', 'AURELIA', 68995), ('MOLDOVEANU', 'ALIN', 1146)]


In [13]:
authors_texts = {}
all_texts = list(zip(publications['abstract_text'], publications['authors']))

# find each author's publications
for last_name, first_name, author_id in authors_names:
    abstracts = []
    for abstract, authors in all_texts:
        if abstract and isinstance(abstract, str) and authors and isinstance(authors, str):
            # check if the current author is one of the authors of this publication
            authors = authors.split(";")
            for a in authors:
                a = a.upper()
                if last_name in a and (first_name in a or (first_name[0] + ".") in a):
                    abstracts.append(abstract)
                    break
    authors_texts[author_id] = abstracts

for id in authors_texts:
    print(f"{id} {len(authors_texts[id])}")

829 715
1672 598
841 712
1284 532
1225 367
1849 482
534 521
733 239
69354 9
562 449
1541 164
1297 267
38845 47
1047 207
584 432
872 393
1292 338
1246 333
68995 344
1146 335


In [14]:
# get one topic for each author
authors_topics = {}
nlp = spacy.load('en_core_web_md')
remove_pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM']

for author_id in authors_texts:
    texts = authors_texts[author_id]

    # tokenize, lemmatize, remove stop words
    tokens = []
    for abstract in texts:
        if abstract and isinstance(abstract, str):
            abstract_nlp = nlp(abstract)

            t = []
            for tok in abstract_nlp:
                if tok.pos_ not in remove_pos and not tok.is_stop and tok.is_alpha:
                    t.append(tok.lemma_.lower())

            tokens.append(t)

    print(author_id)
    print()

    # add bigrams to the token list
    # bigram = Phrases(tokens)
    # tokens = [bigram[text] for text in tokens]
    #
    # for i, abstract in enumerate(tokens):
    #     for j, token in enumerate(abstract):
    #         tokens[i][j] = token.replace("_", " ")

    # create dictionary with gensim
    dictionary = Dictionary(tokens)

    # create corpus
    corpus = [dictionary.doc2bow(text) for text in tokens]

    # apply lda
    # lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=1, workers=4, passes=10)
    # lda_model.print_topics(-1)
    lda_model = LdaModel(corpus=corpus, num_topics=1, id2word=dictionary)
    lda_model.show_topics()
    print()
    

829


1672


841


1284


1225


1849


534


733


69354


562


1541


1297


38845


1047


584


872


1292


1246


68995


1146




In [6]:
author_id = 829
texts = authors_texts[author_id]

# tokenize, lemmatize, remove stop words
nlp = spacy.load('en_core_web_md')
remove_pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM']

tokens = []
for abstract in texts:
    if abstract and isinstance(abstract, str):
        abstract_nlp = nlp(abstract)

        t = []
        for tok in abstract_nlp:
            if tok.pos_ not in remove_pos and not tok.is_stop and tok.is_alpha:
                t.append(tok.lemma_.lower())

        tokens.append(t)

# add bigrams to the token list
bigram = Phrases(tokens)
tokens = [bigram[text] for text in tokens]

for i, abstract in enumerate(tokens):
    for j, token in enumerate(abstract):
        tokens[i][j] = token.replace("_", " ")

# create dictionary with gensim
dictionary = Dictionary(tokens)

# create corpus
corpus = [dictionary.doc2bow(text) for text in tokens]

# apply lda
print(author_id)

# lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=1, workers=4, passes=10)
# lda_model.print_topics(-1)
lda_model = LdaModel(corpus=corpus, num_topics=1, id2word=dictionary)
lda_model.show_topics()
print()


829

