# Step 0. Load Packages/Libraries

In [None]:
import csv
import random
import json
import numpy as np
from Tools.keyworder import Keyworder
from Tools.languager import Languager
from Tools.sentimenter import Sentimenter
from Tools.summarizer import Summarizer
from Tools.meaninger import Meaninger
from Tools.filer import Filer

# Step 1. Global Variables

In [None]:
# Path of the CSV file that the load step opens.
# NOTE: this name shadows the builtin input(); it is kept because the load
# step reads it under this name.
input = '/home/dxmonteiro/Desktop/WORKSPACE/ProfExtra/scripts/data/humanidades_digitais_scopus.csv'

# Zero-based column positions of the single-valued fields in a CSV row.
title, citations, doi, link, abstract = 0, 1, 2, 3, 4

# Keyword values are read from every second column, 5 through 19.
keywords = list(range(5, 20, 2))
# Author names are read from the contiguous columns 21 through 35.
authors = list(range(21, 36))

# Accumulators that the load step fills while reading the CSV.
list_papers, list_authors, list_keywords = [], [], []

# Step 2b. Auxiliary Functions (defined first because Step 2a calls them)

In [None]:
def update_list(elementa, list_authors, citations):
  """Bump the tally of the entry named ``elementa``, if there is one.

  ``list_authors`` is scanned in order. At the first entry whose ``'name'``
  equals ``elementa`` that slot is replaced by a new dict carrying the same
  name, the old ``'frequence'`` plus one and the old ``'citations'`` plus
  ``citations``. Later entries with the same name are left alone.

  Args:
    elementa: name to look for.
    list_authors: list of dicts with ``'name'``, ``'frequence'`` and
      ``'citations'`` keys; modified in place on a match.
    citations: amount added to the matching entry's ``'citations'``.

  Returns:
    True if an entry was found and replaced, False if the list was left
    untouched.
  """
  for position, entry in enumerate(list_authors):
    if entry.get('name') != elementa:
      continue
    # Swap in a fresh dict instead of mutating the stored one.
    list_authors[position] = {
      'name': elementa,
      'frequence': entry.get('frequence') + 1,
      'citations': entry.get('citations') + citations,
    }
    return True
  return False

def get_values(real_authors, list_authors, citations):
  """Fold a batch of names into a running frequency/citation tally.

  Each name in ``real_authors`` is first offered to ``update_list``; when
  that reports no existing entry, a new one is appended with a frequency of
  1 and ``citations`` as its starting citation total.

  Args:
    real_authors: iterable of names to record.
    list_authors: tally list of dicts; modified in place.
    citations: citation count credited to every name in the batch.

  Returns:
    The same ``list_authors`` object that was passed in.
  """
  for name in real_authors:
    if update_list(name, list_authors, citations):
      continue
    # First sighting of this name: start a new tally entry.
    list_authors.append({
      'name': name,
      'frequence': 1,
      'citations': citations,
    })
  return list_authors

# Step 2a. Load Data

In [None]:
# Read the CSV row by row. Every row that carries a citation value becomes a
# paper record and is folded into the per-author and per-keyword tallies.
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are parsed as part of the field.
with open(input, 'r', newline='') as file:
  csvreader = csv.reader(file)
  # Skip the header row; the None default keeps an empty file from raising
  # StopIteration.
  next(csvreader, None)
  for row in csvreader:
    # csv.reader yields [] for blank lines, so check the length before
    # indexing; rows with an empty citation field are skipped as well.
    if len(row) <= citations or not row[citations]:
      continue
    # A NumPy array allows picking several columns with a list of indices.
    nplist = np.array(row)
    # Drop the empty cells of unused author / keyword columns.
    real_authors = list(filter(None, nplist[authors]))
    real_keywords = list(filter(None, nplist[keywords]))
    new_cit = int(row[citations])
    paper = {
        'doi': nplist[doi],
        'title': nplist[title],
        'abstract': nplist[abstract],
        'link': nplist[link],
        'citations': new_cit,
        'keywords': real_keywords,
        'authors': real_authors
    }
    list_papers.append(paper)
    # Credit the paper's citation count to each of its authors and keywords.
    list_authors = get_values(real_authors, list_authors, new_cit)
    list_keywords = get_values(real_keywords, list_keywords, new_cit)

# Step 3. Extract TOP10s

In [None]:
# Sort the papers in place, descending by their 'citations' value, then keep
# and show the first ten.
list_papers.sort(
    key=lambda paper_entry: paper_entry.get('citations'),
    reverse=True,
)
top_10_papers = list_papers[0:10]
print(top_10_papers)

In [None]:
# Sort the author tallies in place, descending by accumulated 'citations',
# then keep and show the first ten.
list_authors.sort(
    key=lambda author_entry: author_entry.get('citations'),
    reverse=True,
)
top_10_authors = list_authors[0:10]
print(top_10_authors)

In [None]:
# Sort the keyword tallies in place, descending by accumulated 'citations',
# then keep and show the first ten.
list_keywords.sort(
    key=lambda keyword_entry: keyword_entry.get('citations'),
    reverse=True,
)
top_10_keywords = list_keywords[0:10]
print(top_10_keywords)


# Step 4. Save TOP10s.

In [None]:
# Bundle the three rankings under the keys they are saved with.
top10s = dict(
    TOP10_PAPERS=top_10_papers,
    TOP10_AUTHORS=top_10_authors,
    TOP10_KEYWORDS=top_10_keywords,
)

# Filer is the project helper imported from Tools.filer; this instance is
# reused later when the per-model outputs are written.
# NOTE(review): presumably write_file serialises to JSON, given the target
# file name — confirm in Tools.filer.
filer = Filer('')
filer.write_file(
    top10s,
    '/home/dxmonteiro/Desktop/WORKSPACE/ProfExtra/scripts/data/top10.json',
)

# Step 5. Model Variables

In [None]:
# The three already_* lists are not referenced anywhere in the visible code.
# NOTE(review): presumably these are the combinations processed in an earlier
# run — confirm before removing.
already_summ = [
    'facebook/bart-large-cnn',
    'sshleifer/distilbart-cnn-12-6',
    'philschmid/bart-large-cnn-samsum',
]
already_key = [
    'KEYBERT',
    'YAKE',
    'RAKE',
]
already_senti = [
    'cardiffnlp/twitter-roberta-base-sentiment',
    'finiteautomata/bertweet-base-sentiment-analysis',
    'ProsusAI/finbert',
]

# Identifiers used for the analysis runs. The three lists are consumed in
# parallel, so entries at the same position form one run.
summ_models = [
    'google/pegasus-large',
    'sshleifer/distill-pegasus-cnn-16-4',
    'google/bigbird-pegasus-large-bigpatent',
    'csebuetnlp/mT5_multilingual_XLSum',
]
keyword_models = [
    'POSITION',
    'SINGLE',
    'MULTIPARTITE',
    'TOPIC',
]
senti_models = [
    'pysentimiento/robertuito-sentiment-analysis',
    'Seethal/sentiment_analysis_generic_dataset',
    'unitary/toxic-bert',
    'j-hartmann/emotion-english-distilroberta-base',
]

# Step 6. Paper Text Analysis

In [None]:
def process_data(summ, key, senti, top_10_papers):
    """Run the text-analysis tools over each paper and collect the results.

    For every paper the abstract is measured, summarised, the summary is
    measured the same way, keywords are extracted automatically, and both
    the author-supplied and the automatic keywords are passed through
    ``Meaninger.get_all_meanings``. Intermediate results are printed.

    Args:
        summ: identifier handed to ``Summarizer``.
        key: identifier handed to ``Keyworder``.
        senti: identifier handed to ``Sentimenter``.
        top_10_papers: iterable of paper dicts; the keys read are
            ``'abstract'``, ``'keywords'``, ``'doi'``, ``'title'``,
            ``'link'``, ``'citations'`` and ``'authors'``.

    Returns:
        A list with one dict per input paper, holding the copied metadata
        plus ``'original_abstract'``, ``'summarized_abstract'``,
        ``'author_keywords'`` and ``'automatic_keywords'``.
    """
    keyworder = Keyworder(key)
    summarizer = Summarizer(summ)
    sentimenter = Sentimenter(senti)
    languager = Languager('en_core_web_sm')
    meaninger = Meaninger('en_core_web_sm')

    def describe(text_key, text):
        """Build the statistics dict for ``text``, stored under ``text_key``."""
        return {
            text_key: text,
            "num_chars": len(text),
            "num_words": languager.num_words(text),
            "unique_words": languager.unique_words(text),
            "points": languager.points(text),
            "word_analysis": languager.word_analysis(text),
            "sentiment_analysis": sentimenter.sentiment_analysis(text),
        }

    results = []
    for entry in top_10_papers:
        # str() turns a missing abstract (None) into the text 'None'.
        abstract_text = str(entry.get('abstract'))

        original_abstract = describe("text", abstract_text)
        print(original_abstract)

        summarized_abstract = describe(
            "summarized_text", summarizer.get_summary(abstract_text))
        print(summarized_abstract)

        auto_keywords = keyworder.get_keywords(abstract_text)
        print(auto_keywords)

        # Both keyword sets go through the same call against the abstract.
        manual_keywords = meaninger.get_all_meanings(
            abstract_text, entry.get('keywords'))
        new_auto_keys = meaninger.get_all_meanings(
            abstract_text, auto_keywords)

        results.append({
            'doi': entry.get('doi'),
            'title': entry.get('title'),
            'link': entry.get('link'),
            'citations': entry.get('citations'),
            'authors': entry.get('authors'),
            'original_abstract': original_abstract,
            'summarized_abstract': summarized_abstract,
            'author_keywords': manual_keywords,
            'automatic_keywords': new_auto_keys,
        })
    return results

In [None]:
# Run the analysis once per (summariser, keyword method, sentiment model)
# triple and save each run to its own file. zip pairs the three lists
# position by position; it does not form every combination.
for s, k, l in zip(summ_models, keyword_models, senti_models):
    data = process_data(s, k, l, top_10_papers)
    # Keep only the part after the last '/', so the organisation prefix and
    # its path separator stay out of the file name. An identifier without a
    # '/' is used whole instead of raising IndexError.
    sumi = s.rsplit('/', 1)[-1]
    senti = l.rsplit('/', 1)[-1]
    filer.write_file(
        data, f'/home/dxmonteiro/Desktop/WORKSPACE/ProfExtra/scripts/data/output_{sumi}_{k}_{senti}.json')
