In [125]:
import requests
from bs4 import BeautifulSoup
from transformers import AutoConfig,pipeline,PegasusTokenizer, TFPegasusForConditionalGeneration, BertTokenizer, TFBertModel, AutoTokenizer,AutoModelForSequenceClassification
import pandas as pd
import numpy as np
import regex as re
import googletrans
from googletrans import Translator
from google_trans_new import google_translator
from deep_translator import GoogleTranslator

import numpy as np
from scipy.special import softmax

import spacy
import pytextrank

La idea es poder scrapear artículos del día y del activo financiero de nuestra elección (empezando por los de Yahoo Finance), y conseguir un resumen en una frase y traducido al español de cada artículo. Además, obtener una puntuación del día para el "sentimiento" hacia el activo. Y si la capacidad computacional lo permite, poder scrapear los datos de varios activos, procesarlos y meterlos en un DF; luego graficar ese sentimiento a lo largo del tiempo y usarlo como feature para análisis y/o predicción.<br><br>

HuggingFace (librería **transformers**) es una plataforma colaborativa de modelos NLP preentrenados para toda clase de nichos, importo uno entrenado en Pegasus para resumir artículos financieros en una frase. </br>
**Pegasus** es un modelo de NLP desarrollado por Google para generar resúmenes con mucha calidad. El inconveniente es que pierde accuracy exponencialmente al trabajar con textos largos como input (~400 palabras o 512 tokens), y es bastante lento.<br>
Pegasus : https://ai.googleblog.com/2020/06/pegasus-state-of-art-model-for.html</br>
Transformers : https://huggingface.co/human-centered-summarization/financial-summarization-pegasus </br></br>

Para solucionar este problema pasaré primero los textos por SpaCy (otra librería NLP) con 'textrank', un método extractivo que extrae los componentes más importantes del texto **sin cambiar las palabras**. Además, es muy eficiente a nivel computacional, por lo que Pegasus podrá trabajar con menos texto.<br>
Conseguimos así un resultado superior que con cada modelo por separado.</br>
https://spacy.io/</br></br>


Finalmente, obtengo una puntuación de sentiment para el artículo con BERT, otro modelo de Google, concretamente FinBERT, que viene preentrenado con apuntes de profesionales financieros.</br>https://huggingface.co/yiyanghkust/finbert-tone</br></br>
Para traducir, parece que funciona mejor el traductor de Google, la librería **deep_translator** permite evitar lanzar peticiones directamente a la API de Google.</br>

### Empiezo ejemplificándolo con un solo artículo:

### Scraping artículo Yahoo Finance

In [134]:
url = 'https://finance.yahoo.com/news/wild-theory-price-bitcoin-being-110000608.html'

# Boilerplate snippets that are stripped verbatim from the scraped article text.
exclude = [
    ' Click here for the latest stock market news and in-depth analysis, including events that move stocks Read the latest financial and business news from Yahoo Finance Download the Yahoo Finance app for Apple or Android Follow Yahoo Finance on Twitter, Facebook, Instagram, Flipboard, LinkedIn, and YouTube Related Quotes',
    'Join the most important conversation in crypto and web3! Secure your seat today ',
    'Related Quotes',
    'For the latest earnings reports and analysis, earnings whispers and expectations, and company earnings news, click here Read the latest financial and business news from Yahoo Finance Download the Yahoo Finance app for Apple or Android Follow Yahoo Finance on Twitter, Facebook, Instagram, Flipboard, LinkedIn, and YouTube',
]

# Article extraction.
# Bind `clean_text` up front so a failed request leaves an empty string rather
# than an unbound name (fresh kernel) or the previous article's text (reused kernel).
clean_text = ''
try:
    # A timeout keeps the cell from blocking indefinitely on an unresponsive server.
    with requests.get(url, timeout=10) as response:
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
    # The article body is taken as the text of every <p> tag, joined by spaces.
    paragraph_texts = [p.text for p in soup.find_all('p')]
    clean_text = ' '.join(paragraph_texts)
    for text in exclude:
        clean_text = clean_text.replace(text, '')
except requests.exceptions.RequestException as e:
    # HTTPError is a subclass of RequestException, so one clause covers both.
    # Original note: exclude /video/ (presumably video-only URLs — confirm).
    print(f'Error: {e},{url}')

clean_text

"Back in 2017, John Griffin, a professor of finance at the University of Texas McCombs School of Business, noticed something strange. Griffin follows a totally different beat from typical business school finance profs who explore, say, how business cycles influence commodity prices or Fed policy sways the term structure of interest rates. The 6-foot-2 former high school football star views himself as a crusader for good, a moral sleuth who, as he tells Fortune, “looks to expose financial evil, to shed light on the world and expose dark things in the markets.” After the Great Financial Crisis, Griffin became a devout Christian. He has since dedicated his distinguished career to righteous forensic digging that’s unearthed abuses ranging from insider trading to mortgage fraud to the doctoring of bond ratings during the financial crisis. As Griffin and Amin Shams, then a doctoral candidate at McCombs who’s joined Griffin in several gumshoe investigations, screened for misdeeds in 2017, the

### SpaCy - Resumen extractivo

In [135]:
# Requires the small English model: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('textrank')  # load the model; textrank = extractive summarisation
doc = nlp(clean_text)

# Compute the extractive summary once so that the sentences printed below are
# exactly the ones joined into `spacy_summary` (previously the preview used a
# different sentence limit than the text passed on to the next step).
summary_sentences = [str(sentence) for sentence in doc._.textrank.summary(limit_sentences=12)]
spacy_summary = ' '.join(summary_sentences)

# First (extractive) summary; it may be useful on its own.
for sentence in summary_sentences:
    print(sentence)

The pair saw a strong and questionable pattern in Bitcoin prices.
Just before the start of each period, Bitcoin prices were under pressure.
After the paper appeared, Tether Ltd. insisted that its conclusions were flawed and maintained that Tether couldn’t be used to balloon Bitcoin prices.
The authors focused on the 1% of all one-hour intervals between the beginning of March 2017 and end of March 2018 that featured the largest combinations of large Tether issuance on Bitfinex, and big Bitcoin buys on two other exchanges, Bittrex and Poloniex.
But it’s much harder to short Bitcoin than to short stocks or bonds.
If anything, fallen players such as Celsius and FTX were selling Bitcoin and pushing down prices to prop up their own coins.”
At MicroStrategy, cofounder and executive chairman Michael Saylor loaded his hybrid software provider and Bitcoin speculator with $2.4 billion in debt to buy coins, and when prices fell below $16,000 on Nov. 11, owed far more on his loans than the value of

### Pegasus para resumir en una frase

In [158]:
# Abstractive one-sentence summary with the finance-tuned Pegasus checkpoint.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = TFPegasusForConditionalGeneration.from_pretrained(model_name)

# Truncate to the model's positional limit: the extractive summary can still be
# longer than Pegasus accepts as input.
input_ids = tokenizer(
    spacy_summary,
    return_tensors='tf',
    truncation=True,
    max_length=model.config.max_position_embeddings,
).input_ids

# Beam search with a short output budget to get a headline-length summary.
output = model.generate(
    input_ids,
    max_length=35,
    num_beams=5,
    early_stopping=True
)

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

Some layers of TFPegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [159]:
# Turn the first generated sequence back into text, dropping special tokens.
best_sequence = output[0]
summarized = tokenizer.decode(best_sequence, skip_special_tokens=True)
summarized

'MicroStrategy and CoinStrategy published paper on Bitcoin. Bitcoin prices were under pressure just before start of each period'

In [157]:
from transformers import PegasusDecoder

decoder = TFPegasusDecoder(model.config, model.get_input_embeddings(), model.get_decoder())
output = decoder.generate(input_ids, max_length=35, strategy='sampling', temperature=0.7)

ImportError: cannot import name 'PegasusDecoder' from 'transformers' (C:\ProgramData\Anaconda3\lib\site-packages\transformers\__init__.py)

## Modelo basado en BERT (Financial BERT)

In [143]:
from transformers import pipeline
from transformers import BertForSequenceClassification, BertTokenizer

# Sentiment scoring with the FinBERT-tone checkpoint (three output labels).
# NOTE(review): `tokenizer` and `nlp` are rebound here; earlier cells bound them
# to the Pegasus tokenizer and the spaCy pipeline, so re-running those cells
# after this one will pick up the FinBERT objects instead.
finbert_checkpoint = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
finbert = BertForSequenceClassification.from_pretrained(finbert_checkpoint, num_labels=3)

nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Score the generated summary together with a fixed reference sentence.
sentences = [
    summarized,
    'Analysts expect slower earnings, iPhone sales throughout 2020.',
]
results = nlp(sentences)

In [144]:
# Display the sentiment pipeline's predictions for `sentences`.
results

[{'label': 'Negative', 'score': 0.9999768733978271},
 {'label': 'Negative', 'score': 0.9999984502792358},
 {'label': 'Neutral', 'score': 0.62992924451828}]

#### Tras probar con varias librerías de ML, parece que el mejor traductor es Google

In [145]:
# Translate the English one-sentence summary into Spanish with deep_translator.
translator = GoogleTranslator(source='en', target='spanish')
translator.translate(text=summarized)

'MicroStrategy y CoinStrategy publicaron un artículo sobre Bitcoin. Los precios de Bitcoin estaban bajo presión justo antes del comienzo de cada período'