In [107]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import spacy
from spacy import displacy
from spacy.language import Language
import pyLDAvis.gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import yake
from langdetect import detect, detect_langs, DetectorFactory
import fasttext
import re

In [24]:
# read the input tables; later cells use authors["id"] and the
# 'user_id' / 'abstract_text' columns of publications
authors = pd.read_csv('input/top_20_authors.csv')
publications = pd.read_csv('input/publications-top_20_authors.csv', sep=',')
# alternative input files (disabled):
# authors = pd.read_csv('input/some_authors.csv')
# publications = pd.read_csv('input/publications-some_authors.csv', sep=',')

In [91]:
# make a dictionary mapping each author id to the list of that author's English abstracts
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))
texts = {author_id: [] for author_id in authors["id"]}

# load fastText language-identification model
model = fasttext.load_model('lid.176.bin')
for author_id, abstract in author_publication_pairs:
    # skip missing values (non-strings) and abstracts without a single ASCII letter;
    # re.search scans the whole string, whereas a '^(?=.*[a-zA-Z])' match only
    # inspects the first line because '.' does not cross a newline
    if not isinstance(abstract, str) or not re.search('[a-zA-Z]', abstract):
        continue

    # fastText's predict() processes one line at a time and rejects text containing
    # '\n', so the language is predicted on a single-line copy of the abstract
    single_line = ' '.join(abstract.splitlines())
    labels, _ = model.predict(single_line)
    language = labels[0].replace('__label__', '')

    # keep only texts written in English (the original, unmodified abstract is stored)
    if language == 'en':
        texts[author_id].append(abstract)



In [32]:
# Custom stop words, grouped by initial letter for easier maintenance.
# The list is added to YAKE's stop-word set and used to filter lemmas before LDA.
stop_words = [
    # a - b
    'abstract', 'al', 'amount', 'approach', 'article', 'available',
    'base', 'based', 'benefit', 'bucharest',
    # c - d
    'case', 'category', 'condition', 'conference', 'context', 'copyright',
    'datum', 'demonstrate', 'demonstrates', 'demonstrated', 'different', 'difficult',
    # e - h
    'et', 'experiment', 'experimental',
    'faculty',
    'helpful', 'high',
    # i - l
    'ieee', 'importance', 'important', 'inconvenience', 'interest', 'interested', 'interests',
    'jat', 'jats',
    'laboratory',
    # m - o
    'main',
    'new',
    'obtain', 'obtained', 'obtains', 'old', 'order', 'organization',
    # p - q
    'paper', 'people', 'policy', 'politehnica', 'polytechnic',
    'present', 'presents', 'presented', 'privacy', 'professor',
    'propose', 'proposes', 'proposed',
    'quality',
    # r
    'range', 'ranges', 'real', 'recent', 'research', 'researcher', 'result',
    # s
    'scale', 'show', 'shows', 'showed', 'student', 'study', 'subject', 'studies', 'studied',
    # t - w
    'task', 'teacher', 'term', 'text', 'title', 'type',
    'unavailable', 'university', 'useful',
    'workshop',
]

EXTRACT KEYWORDS/KEYPHRASES WITH YAKE

In [33]:
# load the spaCy model without the components listed in `exclude`;
# the cells of this section only read token.text, token.is_alpha and token.ent_type_
nlp = spacy.load('en_core_web_lg', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])

In [114]:
# English abstracts stored under key 841 of `texts` (keys are the ids from authors["id"])
abstract_list = texts[841]

In [115]:
# discard abstracts in which PERSON/ORG entity tokens are at least as numerous
# as the remaining alphabetic tokens
clean_abstract_list = []

for abstract in abstract_list:
    # raise spaCy's length limit so the current abstract is always accepted
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)

    # tokens that belong to a person or organization entity
    person_orgs_count = sum(1 for token in doc if token.ent_type_ in ('PERSON', 'ORG'))
    # purely alphabetic tokens that are not part of such an entity
    other_words_count = sum(
        1 for token in doc
        if token.ent_type_ not in ('PERSON', 'ORG') and token.is_alpha
    )

    # keep the abstract only when ordinary words outnumber the name tokens
    if other_words_count > person_orgs_count:
        clean_abstract_list.append(abstract)

In [116]:
# concatenate the retained abstracts into a single string, one abstract per line
text = '\n'.join(clean_abstract_list)

In [117]:
# drop every token whose entity label is in `remove_entities`, then rebuild the
# text by joining the surviving tokens with single spaces
nlp.max_length = len(text) + 1000
doc = nlp(text)
remove_entities = [
    'PERSON', 'NORP', 'FAC', 'GPE', 'LOC',
    'DATE', 'TIME', 'PERCENT', 'MONEY',
    'QUANTITY', 'CARDINAL', 'ORDINAL',
]

kept_token_texts = []
for token in doc:
    if token.ent_type_ in remove_entities:
        continue
    kept_token_texts.append(token.text)

transformed_text = ' '.join(kept_token_texts)


In [118]:
# number of entities spaCy found in the concatenated text
print(len(doc.ents))

# list every token labelled ORG, to inspect what the model treats as an organization
org_tokens = (tok for tok in doc if tok.ent_type_ == "ORG")
for tok in org_tokens:
    print(f"{tok.text} {tok.ent_type_}")

4565
Artificial ORG
Neural ORG
Network ORG
PCA ORG
HAP ORG
HAP+ ORG
SBF ORG
SBF ORG
Ti ORG
- ORG
Sm ORG
The ORG
Ti ORG
- ORG
Sm ORG
: ORG
HAp_7 ORG
Ti ORG
- ORG
Sm ORG
the ORG
Ti ORG
- ORG
Sm ORG
: ORG
HAp_7 ORG
Ti ORG
- ORG
Sm ORG
Candida ORG
CLSM ORG
Epitaxial ORG
SBN ORG
SBN ORG
XRD ORG
SBN ORG
XRD ORG
PVA ORG
Arthur ORG
EDS ORG
Polysulfone ORG
FTlR ORG
ZnO ORG
Bentham ORG
Science ORG
Publishers ORG
XRD ORG
DSC ORG
- ORG
TG ORG
TEM ORG
aminosilanes ORG
Ag ORG
Bentham ORG
Science ORG
Publishers ORG
aminotrimethoxysilane ORG
FeCl ORG
SiO ORG
TEM ORG
XRD ORG
COLL)/ ORG
PVA ORG
PVA ORG
HC ORG
PVA ORG
PVA ORG
XRD ORG
FTIR ORG
Chitosan ORG
PP ORG
XRD ORG
PP ORG
BET ORG
EDS ORG
BET ORG
FTIR ORG
XRD ORG
s11532 ORG
XRD ORG
MSN ORG
Scanning ORG
Electron ORG
Microscopy ORG
XRD ORG
corbicula ORG
XRD ORG
DTA ORG
CMC ORG
CMC ORG
DDS ORG
Escherichia ORG
MSN ORG
FT ORG
- ORG
IR ORG
XRD ORG
BET ORG
DTA ORG
Ag ORG
polyacrylates ORG
cytostatics ORG
BET ORG
TEM ORG
CTAB ORG
DDS ORG
TEM ORG
BET ORG
HPLC

In [119]:
# YAKE configuration values
max_ngram = 3
deduplication_threshold = 0.5
keywords_nr = 15
windowsSize = 1

kw_extractor = yake.KeywordExtractor(
    lan="en",
    n=max_ngram,
    dedupLim=deduplication_threshold,
    top=keywords_nr,
    windowsSize=windowsSize,
)
# extend YAKE's own stop-word set with the custom stop words
kw_extractor.stopword_set.update(stop_words)

# run the extraction on the entity-filtered text and print one keyword per line
keywords = kw_extractor.extract_keywords(transformed_text)
for keyword_entry in keywords:
    print(keyword_entry[0])

scanning electron microscopy
ray diffraction
composite materials
materials
SEM
XRD
drug delivery systems
properties
dielectric properties
iron oxide nanoparticles
BMT ceramic material
transmission electron
microscopy
method
electron


In [120]:
# visualize the entity annotations of `doc` with displaCy's "ent" style
displacy.render(doc, style="ent")

  from IPython.core.display import HTML, display


In [106]:
from termcolor import colored

# Function for highlighting the keywords
def highlight_keywords(text, keywords, highlight=None):
    """Return `text` with every keyword occurrence wrapped by `highlight`.

    Args:
        text: the string to annotate.
        keywords: iterable of (keyword, score) pairs; only the keyword is used.
        highlight: optional callable taking the matched fragment and returning
            its highlighted form. Defaults to bold red terminal colouring via
            termcolor's `colored`.

    Returns:
        The annotated string. `text` is returned unchanged when there is no
        non-empty keyword.

    Matching is a case-sensitive substring search done in a single pass, so
    inserted markup is never rescanned and a short keyword contained in a
    longer one (e.g. "electron" in "scanning electron microscopy") is not
    wrapped a second time.
    """
    if highlight is None:
        def highlight(fragment):
            return colored(fragment, 'red', attrs=['bold'])

    # Empty keywords are dropped: an empty pattern would match between every character.
    # Longest first, because regex alternation takes the first alternative that matches.
    phrases = sorted({kw for kw, _ in keywords if kw}, key=lambda p: (-len(p), p))
    if not phrases:
        return text

    pattern = re.compile('|'.join(re.escape(p) for p in phrases))
    return pattern.sub(lambda match: highlight(match.group(0)), text)

# Highlight the extracted keywords in the concatenated abstracts and print the result
highlighted_text = highlight_keywords(text, keywords)
print(highlighted_text)

Ferrofluids consist of circa 100Angstrom diameter particles of a magnetic solid (usually magnetite, Fe3O4) colloidally suspended in a carrier fluid. Typical carrier fluids include hydrocarbons, water, fluorocarbons, esters, diesters, organometallics, polyphenyl ethers and silicones. Surfactants such as oleic acid and other small molecules as well as a few polymers have been used to coat the surface of microcrystalline Fe3O4 to help prevent particle flocculation. It is of considerable interest, thus, to develop better synthetic approaches for ferrofluids, particularly since their unusual magnetic [1m[31mproperties[0m are finding applications in loudspeakers, zero-leakage seals, damping liquids, non-wearing electrical switches, artificial muscles, magnetic inks, environmental protection and energy converters. The synthesis and the systematic study of the [1m[31mproperties[0m of magnetic fluids (magnetic dispersions) were started a few years ago. In …
The liquid membranes based on f

EXTRACT TOPICS WITH GENSIM LDA

In [80]:
# English abstracts stored under key 1284 of `texts` (keys are the ids from authors["id"])
abstract_list = texts[1284]

In [81]:
# reload the model with no components excluded: the LDA preprocessing below
# reads token.lemma_ and token.pos_ in addition to the entity labels
nlp = spacy.load('en_core_web_lg')

In [82]:
# discard abstracts in which PERSON/ORG entity tokens are at least as numerous
# as the remaining alphabetic tokens
clean_abstract_list = []

for abstract in abstract_list:
    # raise spaCy's length limit so the current abstract is always accepted
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)

    # tokens that belong to a person or organization entity
    person_orgs_count = sum(1 for token in doc if token.ent_type_ in ('PERSON', 'ORG'))
    # purely alphabetic tokens that are not part of such an entity
    other_words_count = sum(
        1 for token in doc
        if token.ent_type_ not in ('PERSON', 'ORG') and token.is_alpha
    )

    # keep the abstract only when ordinary words outnumber the name tokens
    if other_words_count > person_orgs_count:
        clean_abstract_list.append(abstract)

In [83]:
# part-of-speech tags and entity labels whose tokens are discarded before topic modelling
remove_pos = ['ADV', 'PRON', 'PART', 'DET', 'SPACE', 'NUM', 'SYM', 'ADP', 'VERB', 'CCONJ']
remove_entities = ['PERSON', 'NORP', 'FAC', 'GPE', 'LOC', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
                   'QUANTITY', 'CARDINAL', 'ORDINAL']

# one list of lower-cased lemmas per abstract
tokens = []
for abstract in clean_abstract_list:
    nlp.max_length = len(abstract) + 1000
    doc = nlp(abstract)

    kept_lemmas = []
    for token in doc:
        # alphabetic, non-stop-word tokens only
        if not token.is_alpha or token.is_stop:
            continue
        # skip unwanted entity types and parts of speech
        if token.ent_type_ in remove_entities or token.pos_ in remove_pos:
            continue
        lemma = token.lemma_.lower()
        # skip the custom stop words
        if lemma in stop_words:
            continue
        kept_lemmas.append(lemma)

    tokens.append(kept_lemmas)

# merge frequent word pairs into space-delimited bigrams ...
bigram = Phrases(tokens, min_count=3, delimiter=' ', threshold=1)
tokens = [bigram[doc_tokens] for doc_tokens in tokens]
# ... then run a second pass over the merged tokens to form longer phrases
trigram = Phrases(tokens, min_count=3, delimiter=' ', threshold=1)
tokens = [trigram[doc_tokens] for doc_tokens in tokens]

In [68]:
# keep only multi-word entries, i.e. those containing the space delimiter
ngrams = [[entry for entry in doc_tokens if " " in entry] for doc_tokens in tokens]

In [69]:
# Total number of multi-word phrases over all documents.
# A generator expression is used deliberately: a top-level `for text in ngrams:` loop
# would rebind the notebook-level name `text` (the concatenated abstracts string)
# to a list of tokens and break later cells that still expect a string.
word_count = sum(len(doc_ngrams) for doc_ngrams in ngrams)

# use the phrase-only documents only when there are more than 100 phrases in total;
# otherwise `tokens` keeps the single words as well
if word_count > 100:
    tokens = ngrams

In [84]:
# build the gensim dictionary from the tokenized documents
dictionary = Dictionary(tokens)
# convert every document with dictionary.doc2bow to obtain the corpus
corpus = list(map(dictionary.doc2bow, tokens))

In [85]:
# dump the token lists for inspection (the output can be very long)
print(tokens)



In [86]:
# apply LDA
num_topics = 1
# random_state pins the seed so that re-running the notebook trains the same model.
# NOTE(review): with several workers small run-to-run differences may remain — confirm;
# use workers=1 if exact reproducibility is required.
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=300, num_topics=num_topics, workers=3,
                         passes=50, random_state=42)


In [87]:
# print the 15 words returned by show_topic for every topic, one word per line,
# with an empty line after each topic
for topic_id in range(num_topics):
    for word, _ in lda_model.show_topic(topicid=topic_id, topn=15):
        print(word)

    print()

system
model
method
de
analysis
knowledge
language
user
information
tool
application
learner
process
learning
time



In [None]:
# switch pyLDAvis to notebook display, then build the visualization of the fitted
# model; as the last expression of the cell, the prepared object is displayed
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

In [None]:
# topic coherence with the 'u_mass' measure, computed from the bag-of-words corpus
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
coherence = cm.get_coherence()
print(coherence)

In [None]:
# topic coherence with the 'c_v' measure, computed from the tokenized texts and the
# dictionary; `cm` and `coherence` are reassigned here, replacing any earlier values
cm = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary, coherence='c_v')
coherence = cm.get_coherence()
print(coherence)