# Step 0. Load Packages/Libraries

In [1]:
import csv
import random
import json
import numpy as np
from Tools.keyworder import Keyworder
from Tools.languager import Languager
from Tools.sentimenter import Sentimenter
from Tools.summarizer import Summarizer
from Tools.meaninger import Meaninger
from Tools.filer import Filer

[nltk_data] Downloading package punkt to /home/dxmonteiro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Step 1. Global Variables

In [2]:
# CSV file analysed by this notebook.
# NOTE: this name shadows the built-in input(); it is kept because the
# "Load Data" cell opens the file through it.
input = '/home/dxmonteiro/Desktop/WORKSPACE/ProfExtra/scripts/data/humanidades_digitais_scopus.csv'

# Zero-based column positions of the single-valued fields in each CSV row.
title, citations, doi, link, abstract = 0, 1, 2, 3, 4

# Multi-valued fields occupy several columns: keywords sit in every second
# column from 5 to 19, authors in the contiguous columns 21 to 35.
keywords = list(range(5, 20, 2))
authors = list(range(21, 36))

# Accumulators filled while the CSV is read.
list_papers, list_authors, list_keywords = [], [], []

# Step 2b. Auxiliary Functions (defined first because Step 2a calls them)

In [3]:
def update_list(elementa, list_authors, citations):
  """Bump the tally of an already-recorded name.

  Scans ``list_authors`` (a list of dicts with the keys ``'name'``,
  ``'frequence'`` and ``'citations'``) for the first entry whose name equals
  ``elementa``. That entry is replaced by a fresh dict whose ``'frequence'``
  is one higher and whose ``'citations'`` is increased by ``citations``.

  Returns:
    True if a matching entry was found and replaced, False otherwise
    (in which case ``list_authors`` is left untouched).
  """
  for position, entry in enumerate(list_authors):
    if entry.get('name') == elementa:
      # Swap in a new record rather than mutating the existing dict.
      list_authors[position] = {
        'name': elementa,
        'frequence': entry.get('frequence') + 1,
        'citations': entry.get('citations') + citations
      }
      return True
  return False

def get_values(real_authors, list_authors, citations):
  """Fold a batch of names into the running tally.

  For every name in ``real_authors``: if ``update_list`` finds it in
  ``list_authors`` the existing record is incremented there; otherwise a new
  record with ``'frequence'`` 1 and ``'citations'`` set to ``citations`` is
  appended.

  Returns:
    The same ``list_authors`` object, modified in place.
  """
  for label in real_authors:
    already_recorded = update_list(label, list_authors, citations)
    if already_recorded:
      continue
    # First sighting of this name: start its record.
    list_authors.append({
      'name': label,
      'frequence': 1,
      'citations': citations
    })
  return list_authors

# Step 2a. Load Data

In [4]:
# Start from empty accumulators so that re-running this cell does not append
# every paper (and add every count) a second time.
list_papers = []
list_authors = []
list_keywords = []

# newline='' lets the csv module do its own newline handling, so quoted
# fields that contain line breaks are parsed correctly.
with open(input, 'r', newline='') as file:
  csvreader = csv.reader(file)
  next(csvreader, None)  # skip the header row; tolerate an empty file
  for row in csvreader:
    # Rows whose citation field is empty are left out entirely.
    if not row[citations]:
      continue
    # Keep only the non-empty cells of the multi-column fields. Plain list
    # indexing yields ordinary str values, not numpy string scalars.
    real_authors = [row[col] for col in authors if row[col]]
    real_keywords = [row[col] for col in keywords if row[col]]
    new_cit = int(row[citations])
    paper = {
        'doi': row[doi],
        'title': row[title],
        'abstract': row[abstract],
        'link': row[link],
        'citations': new_cit,
        'keywords': real_keywords,
        'authors': real_authors
    }
    list_papers.append(paper)
    # Each author and each keyword is credited with the paper's citations.
    list_authors = get_values(real_authors, list_authors, new_cit)
    list_keywords = get_values(real_keywords, list_keywords, new_cit)

# Step 3. Extract TOP10s

In [5]:
# Order the papers in place from most to least cited, then keep the first ten.
list_papers.sort(reverse=True, key=lambda entry: entry.get('citations'))
top_10_papers = list_papers[0:10]
print(top_10_papers)

[{'doi': '10.1177/2053951714528481', 'title': 'Big Data, new epistemologies and paradigm shifts', 'abstract': 'This article examines how the availability of Big Data, coupled with new data analytics, challenges established epistemologies across the sciences, social sciences and humanities, and assesses the extent to which they are engendering paradigm shifts across multiple disciplines. In particular, it critically explores new forms of empiricism that declare ‘the end of theory’, the creation of data-driven rather than knowledge-driven science, and the development of digital humanities and computational social sciences that propose radically different ways to make sense of culture, history, economy and society. It is argued that: (1) Big Data and new data analytics are disruptive innovations which are reconfiguring in many instances how research is conducted; and (2) there is an urgent need for wider critical reflection within the academy on the epistemological implications of the unf

In [6]:
# Order the author records in place by accumulated citations, highest first,
# then keep the first ten.
list_authors.sort(reverse=True, key=lambda entry: entry.get('citations'))
top_10_authors = list_authors[0:10]
print(top_10_authors)

[{'name': 'Kitchin R.', 'frequence': 1, 'citations': 1062}, {'name': 'Terras M.', 'frequence': 8, 'citations': 218}, {'name': 'Warwick C.', 'frequence': 5, 'citations': 191}, {'name': 'Holmberg K.', 'frequence': 1, 'citations': 182}, {'name': 'Thelwall M.', 'frequence': 1, 'citations': 182}, {'name': 'Anderson C.W.', 'frequence': 1, 'citations': 181}, {'name': 'Welsh A.', 'frequence': 3, 'citations': 154}, {'name': 'Kestemont M.', 'frequence': 5, 'citations': 147}, {'name': 'Hu Y.', 'frequence': 1, 'citations': 144}, {'name': 'Boyd-Graber J.', 'frequence': 1, 'citations': 144}]


In [7]:
# Order the keyword records in place by accumulated citations, highest first,
# then keep the first ten. Keywords are compared exactly as written in the
# CSV, so spellings that differ only in letter case are separate records.
list_keywords.sort(reverse=True, key=lambda entry: entry.get('citations'))
top_10_keywords = list_keywords[0:10]
print(top_10_keywords)


[{'name': 'digital humanities', 'frequence': 302, 'citations': 2835}, {'name': 'Digital humanities', 'frequence': 593, 'citations': 2692}, {'name': 'epistemology', 'frequence': 7, 'citations': 1134}, {'name': 'Big Data', 'frequence': 5, 'citations': 1131}, {'name': 'computational social sciences', 'frequence': 2, 'citations': 1062}, {'name': 'data analytics', 'frequence': 1, 'citations': 1062}, {'name': 'data-driven science', 'frequence': 1, 'citations': 1062}, {'name': 'end of theory', 'frequence': 1, 'citations': 1062}, {'name': 'paradigms', 'frequence': 1, 'citations': 1062}, {'name': 'Digital Humanities', 'frequence': 175, 'citations': 561}]


# Step 4. Save TOP10s.

In [8]:
# Writer object, also used by the later cell that saves the per-model output.
# NOTE(review): the meaning of Filer's constructor argument is not visible
# here; confirm that '' is intended.
filer = Filer('')

# Bundle the three rankings under fixed keys and hand them to the writer.
top10s = dict(
    TOP10_PAPERS=top_10_papers,
    TOP10_AUTHORS=top_10_authors,
    TOP10_KEYWORDS=top_10_keywords,
)

filer.write_file(top10s, '/home/dxmonteiro/Desktop/WORKSPACE/ProfExtra/scripts/data/top10.json')

# Step 5. Model Variants (summarization, keyword extraction, sentiment)

In [9]:
# The three lists are consumed position by position: entry i of each list
# forms one (summarizer, keyword extractor, sentiment model) combination.

# Identifiers handed to Summarizer.
summ_models = [
    'facebook/bart-large-cnn',
    'sshleifer/distilbart-cnn-12-6',
    'philschmid/bart-large-cnn-samsum',
    'google/pegasus-large',
    'sshleifer/distill-pegasus-cnn-16-4',
    'google/bigbird-pegasus-large-bigpatent',
    'csebuetnlp/mT5_multilingual_XLSum',
]

# Identifiers handed to Keyworder.
keyword_models = [
    'KEYBERT',
    'YAKE',
    'RAKE',
    'POSITION',
    'SINGLE',
    'MULTIPARTITE',
    'TOPIC',
]

# Identifiers handed to Sentimenter.
senti_models = [
    'cardiffnlp/twitter-roberta-base-sentiment',
    'finiteautomata/bertweet-base-sentiment-analysis',
    'ProsusAI/finbert',
    'pysentimiento/robertuito-sentiment-analysis',
    'Seethal/sentiment_analysis_generic_dataset',
    'unitary/toxic-bert',
    'j-hartmann/emotion-english-distilroberta-base',
]

# Step 6. Paper Text Analysis

In [10]:
def _text_profile(text, text_key, languager, sentimenter):
    """Build the statistics record stored for one piece of text.

    The text itself is stored under ``text_key``; the remaining entries are
    its character count plus whatever the ``languager`` and ``sentimenter``
    helpers return for it.
    """
    return {
        text_key: text,
        "num_chars": len(text),
        "num_words": languager.num_words(text),
        "unique_words": languager.unique_words(text),
        "points": languager.points(text),
        "word_analysis": languager.word_analysis(text),
        "sentiment_analysis": sentimenter.sentiment_analysis(text),
    }


def process_data(summ, key, senti, top_10_papers):
    """Analyse the abstract of every paper with one combination of models.

    Args:
        summ: identifier passed to ``Summarizer``.
        key: identifier passed to ``Keyworder``.
        senti: identifier passed to ``Sentimenter``.
        top_10_papers: iterable of paper dicts providing the keys ``'doi'``,
            ``'title'``, ``'link'``, ``'citations'``, ``'authors'``,
            ``'abstract'`` and ``'keywords'``.

    Returns:
        A list with one dict per paper: its bibliographic fields, a statistics
        record for the original abstract and for its summary, and the results
        of ``Meaninger.get_all_meanings`` for the author-supplied and the
        automatically extracted keywords.

    The intermediate records are printed as they are produced.

    NOTE(review): the run recorded in this notebook ended with
    ``IndexError: list index out of range`` after tokenizer warnings about
    inputs longer than the model maximum. It was presumably raised inside one
    of the model wrappers; confirm whether long abstracts need truncating.
    """
    keyworder = Keyworder(key)
    summarizer = Summarizer(summ)
    sentimenter = Sentimenter(senti)
    languager = Languager('en_core_web_sm')
    meaninger = Meaninger('en_core_web_sm')
    data = []

    for paper in top_10_papers:
        # str() turns a missing abstract (None) into the text 'None'.
        abstract = str(paper.get('abstract'))

        original_abstract = _text_profile(
            abstract, "text", languager, sentimenter)
        print(original_abstract)

        summarized_text = summarizer.get_summary(abstract)
        summarized_abstract = _text_profile(
            summarized_text, "summarized_text", languager, sentimenter)
        print(summarized_abstract)

        auto_keywords = keyworder.get_keywords(abstract)
        print(auto_keywords)

        manual_keywords = meaninger.get_all_meanings(abstract, paper.get('keywords'))
        new_auto_keys = meaninger.get_all_meanings(abstract, auto_keywords)

        # Collected under a separate name so the loop variable keeps
        # referring to the input paper.
        record = {
          'doi': paper.get('doi'),
          'title': paper.get('title'),
          'link': paper.get('link'),
          'citations': paper.get('citations'),
          'authors': paper.get('authors'),
          'original_abstract': original_abstract,
          'summarized_abstract': summarized_abstract,
          'author_keywords': manual_keywords,
          'automatic_keywords': new_auto_keys
        }
        data.append(record)
    return data

In [11]:
# Run the analysis once per model combination: the i-th summarizer, keyword
# extractor and sentiment model are used together. zip() stops at the
# shortest of the three lists.
for s, k, l in zip(summ_models, keyword_models, senti_models):
    # Derive the file-name parts before the expensive run. Taking the last
    # '/'-separated piece gives the same result as split('/')[1] for
    # "owner/name" ids and also accepts ids without an owner prefix, where
    # split('/')[1] would raise IndexError.
    sumi = s.rsplit('/', 1)[-1]
    senti = l.rsplit('/', 1)[-1]
    data = process_data(s, k, l, top_10_papers)
    filer.write_file(
        data, f'/home/dxmonteiro/Desktop/WORKSPACE/ProfExtra/scripts/data/output_{sumi}_{k}_{senti}.json')


DET
NOUN
VERB
SCONJ
DET
NOUN
ADP
PROPN
PROPN
PUNCT
VERB
ADP
ADJ
NOUN
NOUN
PUNCT
NOUN
VERB
NOUN
ADP
DET
NOUN
PUNCT
ADJ
NOUN
CCONJ
NOUN
PUNCT
CCONJ
VERB
DET
NOUN
PART
PRON
PRON
AUX
VERB
NOUN
NOUN
ADP
ADJ
NOUN
PUNCT
ADP
ADJ
PUNCT
PRON
ADV
VERB
ADJ
NOUN
ADP
NOUN
PRON
VERB
PUNCT
DET
NOUN
ADP
NOUN
PUNCT
PUNCT
DET
NOUN
ADP
NOUN
PUNCT
VERB
ADV
ADP
NOUN
PUNCT
VERB
NOUN
PUNCT
CCONJ
DET
NOUN
ADP
ADJ
NOUN
CCONJ
ADJ
ADJ
NOUN
PRON
VERB
ADV
ADJ
NOUN
PART
VERB
NOUN
ADP
NOUN
PUNCT
NOUN
PUNCT
NOUN
CCONJ
NOUN
PUNCT
PRON
AUX
VERB
SCONJ
PUNCT
PUNCT
X
PUNCT
PROPN
PROPN
CCONJ
ADJ
NOUN
NOUN
AUX
ADJ
NOUN
PRON
AUX
VERB
ADP
ADJ
NOUN
SCONJ
NOUN
AUX
VERB
PUNCT
CCONJ
PUNCT
X
PUNCT
PRON
VERB
DET
ADJ
NOUN
ADP
ADJ
ADJ
NOUN
ADP
DET
NOUN
ADP
DET
ADJ
NOUN
ADP
DET
VERB
NOUN
NOUN
PUNCT
DET
NOUN
PRON
AUX
ADV
VERB
PART
AUX
VERB
SCONJ
DET
ADJ
NOUN
ADP
NOUN
NOUN
ADV
VERB
NOUN
PUNCT
ADP
ADV
VERB
VERB
ADJ
NOUN
PUNCT
PRON
AUX
VERB
SCONJ
DET
ADV
ADJ
NOUN
AUX
AUX
DET
NOUN
ADP
DET
VERB
PUNCT
ADJ
CCONJ
ADV
VERB
NOUN
PUNCT
PROPN
DET
N

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Token indices sequence length is longer than the specified maximum sequence length for this model (280 > 128). Running this sequence through the model will result in indexing errors


DET
NOUN
VERB
SCONJ
DET
NOUN
ADP
PROPN
PROPN
PUNCT
VERB
ADP
ADJ
NOUN
NOUN
PUNCT
NOUN
VERB
NOUN
ADP
DET
NOUN
PUNCT
ADJ
NOUN
CCONJ
NOUN
PUNCT
CCONJ
VERB
DET
NOUN
PART
PRON
PRON
AUX
VERB
NOUN
NOUN
ADP
ADJ
NOUN
PUNCT
ADP
ADJ
PUNCT
PRON
ADV
VERB
ADJ
NOUN
ADP
NOUN
PRON
VERB
PUNCT
DET
NOUN
ADP
NOUN
PUNCT
PUNCT
DET
NOUN
ADP
NOUN
PUNCT
VERB
ADV
ADP
NOUN
PUNCT
VERB
NOUN
PUNCT
CCONJ
DET
NOUN
ADP
ADJ
NOUN
CCONJ
ADJ
ADJ
NOUN
PRON
VERB
ADV
ADJ
NOUN
PART
VERB
NOUN
ADP
NOUN
PUNCT
NOUN
PUNCT
NOUN
CCONJ
NOUN
PUNCT
PRON
AUX
VERB
SCONJ
PUNCT
PUNCT
X
PUNCT
PROPN
PROPN
CCONJ
ADJ
NOUN
NOUN
AUX
ADJ
NOUN
PRON
AUX
VERB
ADP
ADJ
NOUN
SCONJ
NOUN
AUX
VERB
PUNCT
CCONJ
PUNCT
X
PUNCT
PRON
VERB
DET
ADJ
NOUN
ADP
ADJ
ADJ
NOUN
ADP
DET
NOUN
ADP
DET
ADJ
NOUN
ADP
DET
VERB
NOUN
NOUN
PUNCT
DET
NOUN
PRON
AUX
ADV
VERB
PART
AUX
VERB
SCONJ
DET
ADJ
NOUN
ADP
NOUN
NOUN
ADV
VERB
NOUN
PUNCT
ADP
ADV
VERB
VERB
ADJ
NOUN
PUNCT
PRON
AUX
VERB
SCONJ
DET
ADV
ADJ
NOUN
AUX
AUX
DET
NOUN
ADP
DET
VERB
PUNCT
ADJ
CCONJ
ADV
VERB
NOUN
PUNCT
PROPN
DET
N

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

DET
NOUN
VERB
SCONJ
DET
NOUN
ADP
PROPN
PROPN
PUNCT
VERB
ADP
ADJ
NOUN
NOUN
PUNCT
NOUN
VERB
NOUN
ADP
DET
NOUN
PUNCT
ADJ
NOUN
CCONJ
NOUN
PUNCT
CCONJ
VERB
DET
NOUN
PART
PRON
PRON
AUX
VERB
NOUN
NOUN
ADP
ADJ
NOUN
PUNCT
ADP
ADJ
PUNCT
PRON
ADV
VERB
ADJ
NOUN
ADP
NOUN
PRON
VERB
PUNCT
DET
NOUN
ADP
NOUN
PUNCT
PUNCT
DET
NOUN
ADP
NOUN
PUNCT
VERB
ADV
ADP
NOUN
PUNCT
VERB
NOUN
PUNCT
CCONJ
DET
NOUN
ADP
ADJ
NOUN
CCONJ
ADJ
ADJ
NOUN
PRON
VERB
ADV
ADJ
NOUN
PART
VERB
NOUN
ADP
NOUN
PUNCT
NOUN
PUNCT
NOUN
CCONJ
NOUN
PUNCT
PRON
AUX
VERB
SCONJ
PUNCT
PUNCT
X
PUNCT
PROPN
PROPN
CCONJ
ADJ
NOUN
NOUN
AUX
ADJ
NOUN
PRON
AUX
VERB
ADP
ADJ
NOUN
SCONJ
NOUN
AUX
VERB
PUNCT
CCONJ
PUNCT
X
PUNCT
PRON
VERB
DET
ADJ
NOUN
ADP
ADJ
ADJ
NOUN
ADP
DET
NOUN
ADP
DET
ADJ
NOUN
ADP
DET
VERB
NOUN
NOUN
PUNCT
DET
NOUN
PRON
AUX
ADV
VERB
PART
AUX
VERB
SCONJ
DET
ADJ
NOUN
ADP
NOUN
NOUN
ADV
VERB
NOUN
PUNCT
ADP
ADV
VERB
VERB
ADJ
NOUN
PUNCT
PRON
AUX
VERB
SCONJ
DET
ADV
ADJ
NOUN
AUX
AUX
DET
NOUN
ADP
DET
VERB
PUNCT
ADJ
CCONJ
ADV
VERB
NOUN
PUNCT
PROPN
DET
N

Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors


NOUN
PUNCT
ADP
NOUN
PUNCT
ADJ
NOUN
AUX
AUX
VERB
PART
VERB
ADJ
SCONJ
NOUN
NOUN
AUX
VERB
ADP
CCONJ
AUX
VERB
NOUN
PUNCT
DET
NOUN
VERB
PART
VERB
DET
NOUN
ADP
PROPN
ADP
DET
ADJ
NOUN
ADP
ADJ
NOUN
NOUN
PUNCT
CCONJ
PART
VERB
DET
VERB
NOUN
PUNCT
AUX
DET
NOUN
ADP
DET
VERB
NOUN
VERB
DET
NOUN
NOUN
PUNCT
NOUN
CCONJ
DET
NOUN
ADP
NOUN
PUNCT
CCONJ
SCONJ
AUX
AUX
VERB
ADP
ADJ
NOUN
PUNCT
CCONJ
AUX
PRON
VERB
DET
NOUN
PRON
AUX
VERB
ADP
DET
NOUN
PUNCT
NOUN
SYM
NOUN
SYM
NOUN
DET
NOUN
VERB
DET
NOUN
ADP
PROPN
ADP
DET
ADJ
NOUN
ADP
DET
PROPN
PROPN
PUNCT
PROPN
PUNCT
NOUN
PUNCT
VERB
ADP
PRON
NOUN
NOUN
ADP
PROPN
ADP
NUM
ADJ
ADJ
NUM
NOUN
PUNCT
DET
VERB
NOUN
ADP
NUM
PUNCT
PROPN
PUNCT
AUX
VERB
VERB
ADJ
ADJ
CCONJ
ADJ
NOUN
PUNCT
VERB
DET
ADJ
NOUN
ADP
PROPN
NOUN
ADP
ADJ
VERB
NOUN
PUNCT
DET
ADJ
NOUN
ADP
NOUN
NOUN
PUNCT
CCONJ
NOUN
NOUN
NOUN
PUNCT
PROPN
PROPN
AUX
VERB
CCONJ
DET
ADJ
ADJ
NOUN
AUX
VERB
PART
VERB
NOUN
PART
NOUN
ADP
DET
VERB
NOUN
PUNCT
NOUN
PUNCT
PROPN
VERB
PROPN
NOUN
AUX
PART
VERB
DET
ADJ
VERB
NOUN
PUNCT
CCONJ


IndexError: list index out of range