# Imports

In [None]:
%matplotlib inline
import litstudy
import logging

logging.getLogger().setLevel(logging.INFO)

# Perform Search

In [None]:
# Seed the document set with the DBLP results for the free-text query.
dblp_query = 'exascale and gpu'
docset = litstudy.search_dblp(dblp_query)
# The return value of this call is discarded, so it only has an effect if it
# updates `docset` in place -- NOTE(review): confirm against the litstudy API.
litstudy.query_semanticscholar(docset)

# Run the corresponding Scopus query, passing the existing set along;
# `docset` is rebound to whatever search_scopus returns.
# Presumably the result combines the Scopus hits with the DBLP set -- TODO confirm.
scopus_query = 'title-abs-key(exascale and gpu)'
docset = litstudy.search_scopus(scopus_query, docset)

# Plot statistics

## Publications per year

In [None]:
# Histogram of the document set per publication year.
# prepare_plot is called first; (15, 3) is presumably the figure width and
# height -- TODO confirm against the litstudy documentation.
litstudy.prepare_plot(15, 3)
litstudy.plot_year_histogram(docset)

## Publications per source 

In [None]:
# Histogram of the document set per publication source, limited via top_k=50
# (presumably the 50 most frequent sources -- TODO confirm).
litstudy.prepare_plot(10, 10)
litstudy.plot_source_histogram(docset, top_k=50)

In [None]:
# Histogram of the document set per source type.
litstudy.prepare_plot(10, 3)
litstudy.plot_source_type_histogram(docset)

In [None]:
# Histogram of the document set per language.
# NOTE(review): this cell does not call prepare_plot itself -- confirm that is intended.
litstudy.plot_language_histogram(docset)

## Publications per author

In [None]:
# Histogram of the number of authors per document.
# NOTE(review): this cell does not call prepare_plot itself -- confirm that is intended.
litstudy.plot_number_authors_histogram(docset)

In [None]:
# Histogram of the document set per author, limited via top_k=75
# (presumably the 75 most frequent authors -- TODO confirm).
litstudy.prepare_plot(10, 15)
litstudy.plot_author_histogram(docset, top_k=75)

In [None]:
# Author/affiliation histogram for the document set, limited via top_k=75.
# NOTE(review): this cell does not call prepare_plot itself -- confirm that is intended.
litstudy.plot_author_affiliation_histogram(docset, top_k=75)

## Publications per affiliation

In [None]:
# Histogram of the document set per affiliation, limited via top_k=75.
# NOTE(review): this cell does not call prepare_plot itself -- confirm that is intended.
litstudy.plot_affiliation_histogram(docset, top_k=75)

In [None]:
# Histogram of the document set per country.
litstudy.prepare_plot(10, 3)
litstudy.plot_country_histogram(docset)

In [None]:
# Histogram of the document set per affiliation type.
# NOTE(review): this cell does not call prepare_plot itself -- confirm that is intended.
litstudy.plot_affiliation_type_histogram(docset)

# Graphs

In [None]:
# Draw the citation network of the document set after calling prepare_plot(20, 20).
litstudy.prepare_plot(20, 20)
litstudy.plot_citation_network(docset)

In [None]:
# Draw the co-author network of the document set.
# min_degree=50 presumably hides authors with fewer than 50 connections --
# TODO confirm the exact meaning against the litstudy documentation.
litstudy.plot_coauthor_network(docset, min_degree=50)

# Topic models

## Build corpus

In [None]:
# Keep only documents that have an abstract of at least 50 characters; a
# missing or shorter abstract fails the predicate below (this assumes
# `filter` retains the documents for which the predicate is true).
def has_usable_abstract(doc, min_length=50):
    """Return True when *doc* has an abstract of at least *min_length* characters."""
    return doc.abstract is not None and len(doc.abstract) >= min_length


filtered_docset = docset.filter(has_usable_abstract)

# Build the dictionary and frequency data for the remaining documents, using
# the bigram and stopword lists stored next to the notebook.
dic, freqs = litstudy.nlp.build_corpus_simple(
    filtered_docset,
    bigrams='bigrams.txt',
    stopwords='stopwords.txt',
)

In [None]:
# Word histogram built from the corpus frequencies and dictionary, limited
# via top_k=100 (presumably the 100 most frequent words -- TODO confirm).
litstudy.prepare_plot(15, 20)
litstudy.plot_words_histogram(freqs, dic, top_k=100)

## Train model

In [None]:
# Train an NMF topic model on the corpus with 9 topics; the result is reused
# by the plotting and document-listing cells below.
nmf_model = litstudy.nlp.train_nmf_model(dic, freqs, num_topics=9)

## Plot frequent words, topic clouds and distribution

In [None]:
# Plot the topic clouds of the trained model.
# cols=3 presumably lays the 9 topics out in a 3-column grid -- TODO confirm.
litstudy.prepare_plot(10, 10)
litstudy.plot_topic_clouds(nmf_model, cols=3)

In [None]:
# Plot the topic map from the trained model together with the corpus
# dictionary and frequencies.
litstudy.prepare_plot(15, 15)
litstudy.plot_topic_map(nmf_model, dic, freqs)

In [None]:
# Print the title of every document whose doc2topic value for the selected
# topic exceeds the cut-off. Row `row` of doc2topic is presumably the document
# at filtered_docset[row] -- TODO confirm against litstudy.nlp.
topic_index = 3
weight_cutoff = 0.5

for row in range(len(filtered_docset)):
    topic_weight = nmf_model.doc2topic[row, topic_index]
    if topic_weight > weight_cutoff:
        print(filtered_docset[row].title)