<a href="https://colab.research.google.com/github/Flychuban/Stocks-Crypto-Research/blob/main/Stocks_Crypto_Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [3]:
# Checkpoint name handed to both from_pretrained calls below.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Tokenizer and seq2seq model for that checkpoint; later cells use these two
# module-level names directly.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name) 

In [4]:
# Fetch one sample article and collect every <p> element found on the page.
url = "https://uk.finance.yahoo.com/news/d-put-2-000-tesla-043029225.html"
# A timeout keeps the cell from hanging forever if the server never answers.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [None]:
paragraphs

In [5]:
# Text content of every scraped <p> element, in page order.
text = [paragraph.text for paragraph in paragraphs]
# Keep only the first 400 space-separated words of the joined text
# (presumably to keep the model input short — TODO confirm the intended limit).
words = ' '.join(text).split(' ')[:400]
# Re-join the capped word list into the single string that gets summarized.
ARTICLE = ' '.join(words)

In [6]:
ARTICLE

'RCB’s owners have focused on Virat Kohli to leverage their commercial brand Both Tesla (NASDAQ: TSLA) and NIO (NYSE: NIO) stock declined by more than 50% in value in 2022. It was a dreadful year for most growth shares, including electric vehicle (EV) companies. But what if I’d taken a contrarian stance and decided to invest £1,000 in each of these fallen stocks as a New Year gift for myself? How much would I have today? Well, Tesla shares are up a very impressive 65% so far this year. In contrast, NIO shares have declined 17% since the end of December and now sit at just under $8 per share. This means that my Tesla holding would be worth £1,650, while the value of my position in its Chinese EV rival would have fallen to £830. So, my overall investment would be worth £2,480 today. That’s a gain of 24%, which is an exceptional return after just a few months. But what about the future? Should I buy either or both stocks today? There seem to be two big reasons why Tesla stock has come bac

In [7]:
# Tokenize the article. A cap on words does not bound the number of tokens, so
# truncation=True clips the input at the tokenizer's configured maximum length
# instead of handing an over-long sequence to the model.
input_ids = tokenizer.encode(ARTICLE, return_tensors = 'pt', truncation = True)
# Beam search with 5 beams; the generated summary is capped at 55 tokens.
output = model.generate(input_ids, max_length = 55, num_beams = 5, early_stopping = True)
# Decode the first returned sequence back to text, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [8]:
summary

'Tesla stock is up 65% so far this year, while NIO shares are down 17%.'

In [9]:
# Symbols to look up; the per-ticker dicts built in later cells are keyed by these.
monitored_tickers = ['GME', 'TSLA', 'BTC']

In [10]:
def search_stock_news_urls(ticker):
  """Return every link found on a Google news search for a ticker.

  Queries the news tab of Google search for "yahoo finance <ticker>" and
  collects the ``href`` of each anchor on the returned page. The result is
  unfiltered, so it also contains whatever non-article links the page carries.

  Args:
    ticker: Symbol to search for, e.g. ``'TSLA'``.

  Returns:
    A list of raw ``href`` strings in page order.
  """
  # Passing the query via ``params`` lets requests URL-encode it, so a ticker
  # containing characters such as '&', '^' or a space stays inside ``q``.
  r = requests.get(
      'https://www.google.com/search',
      params={'q': f'yahoo finance {ticker}', 'tbm': 'nws'},
      timeout=10,  # do not hang forever on an unresponsive server
  )
  soup = BeautifulSoup(r.text, 'html.parser')
  # href=True skips anchors that have no href attribute; indexing those with
  # link['href'] would raise KeyError.
  atags = soup.find_all('a', href=True)
  return [link['href'] for link in atags]

In [11]:
# Try the search helper on a single ticker.
# NOTE(review): the recorded output lists only Google account/consent/policy
# links and no article URLs — looks like the request was answered with a
# consent page; confirm before relying on the later cells.
search_stock_news_urls('TSLA')

['https://accounts.google.com/ServiceLogin?hl=nl&continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws&gae=cb-none',
 'https://policies.google.com/technologies/cookies?hl=nl&utm_source=ucb',
 'https://consent.google.com/dl?continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws&gl=NL&hl=nl&pc=srp&uxe=none&src=1',
 'https://policies.google.com/privacy?hl=nl&utm_source=ucb',
 'https://policies.google.com/terms?hl=nl&utm_source=ucb']

In [12]:
# Map each monitored ticker to the unfiltered list of links found for it.
raw_urls = {ticker: search_stock_news_urls(ticker) for ticker in monitored_tickers}

In [13]:
raw_urls

{'GME': ['https://accounts.google.com/ServiceLogin?hl=nl&continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws&gae=cb-none',
  'https://policies.google.com/technologies/cookies?hl=nl&utm_source=ucb',
  'https://consent.google.com/dl?continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws&gl=NL&hl=nl&pc=srp&uxe=none&src=1',
  'https://policies.google.com/privacy?hl=nl&utm_source=ucb',
  'https://policies.google.com/terms?hl=nl&utm_source=ucb'],
 'TSLA': ['https://accounts.google.com/ServiceLogin?hl=nl&continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws&gae=cb-none',
  'https://policies.google.com/technologies/cookies?hl=nl&utm_source=ucb',
  'https://consent.google.com/dl?continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws&gl=NL&hl=nl&pc=srp&uxe=none&src=1',
  'https://policies.google.com/privacy?hl=nl&utm_source=ucb',
  'https://policies.google.com/terms?hl=nl&utm_source=ucb'],
 'BTC': ['htt

In [14]:
import re

In [15]:
# Substrings passed to strip_unwanted_urls: a link containing any of them is dropped.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [16]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract de-duplicated URLs from raw hrefs, dropping excluded ones.

    An href is kept only if it contains ``'https://'`` and none of the
    substrings in ``exclude_list``. From each kept href, the first embedded
    ``http(s)://...`` run of non-whitespace characters is taken and cut at the
    first ``&``, which removes any trailing query parameters.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings that disqualify an href.

    Returns:
        A list of unique URLs in the order they were first seen.
    """
    val = []
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'https?://\S+', url)
        # A bare 'https://' with nothing after it has no usable URL; skip it
        # rather than fail on an empty match list.
        if match is None:
            continue
        val.append(match.group(0).split('&')[0])
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would return the URLs in an arbitrary order that can differ between runs.
    return list(dict.fromkeys(val))

In [17]:
# Filter each ticker's raw links down to the URLs that pass the exclude list.
# NOTE(review): in the recorded output the only URL left per ticker is a
# consent.google.com redirect, not a news article.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://consent.google.com/dl?continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws'],
 'TSLA': ['https://consent.google.com/dl?continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws'],
 'BTC': ['https://consent.google.com/dl?continue=https://www.google.com/search?q%3Dyahoo%2Bfinance%2BBTC%26tbm%3Dnws']}

In [20]:
def scrape_and_process(URLs, max_words=350):
  """Download each URL and return its paragraph text, capped in length.

  For every URL the page is fetched, the text of all ``<p>`` elements is
  joined with spaces, and only the first ``max_words`` space-separated words
  are kept.

  The HTTP status is not checked, so the body of an error page is returned
  like any other text. This keeps exactly one entry per input URL, in the
  same order.

  Args:
    URLs: Iterable of page URLs to fetch.
    max_words: Maximum number of space-separated words kept per page.

  Returns:
    A list of strings, one per input URL.
  """
  ARTICLES = []
  for url in URLs:
    # Timeout so that one unresponsive host cannot stall the whole run.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    text = [paragraph.text for paragraph in soup.find_all('p')]
    words = ' '.join(text).split(' ')[:max_words]
    ARTICLES.append(' '.join(words))
  return ARTICLES

In [21]:
# Scrape the page text for every cleaned URL, grouped by ticker.
# NOTE(review): the recorded output holds the text of a "400 ... error" page
# for each ticker rather than article text.
articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ['400. That’s an error.The server cannot process the request because it is malformed. It should not be retried. That’s all we know. The server cannot process the request because it is malformed. It should not be retried. That’s all we know.'],
 'TSLA': ['400. That’s an error.The server cannot process the request because it is malformed. It should not be retried. That’s all we know. The server cannot process the request because it is malformed. It should not be retried. That’s all we know.'],
 'BTC': ['400. That’s an error.The server cannot process the request because it is malformed. It should not be retried. That’s all we know. The server cannot process the request because it is malformed. It should not be retried. That’s all we know.']}

In [19]:
def summarize(articles):
  """Summarize each article with the module-level Pegasus tokenizer and model.

  Args:
    articles: Iterable of article strings.

  Returns:
    A list of summary strings, one per input article, in the same order.
  """
  summaries = []
  for article in articles:
    # truncation=True clips the input at the tokenizer's configured maximum
    # length; a cap on words upstream does not bound the number of tokens.
    input_ids = tokenizer.encode(article, return_tensors = 'pt', truncation = True)
    # Beam search with 5 beams; the generated summary is capped at 55 tokens.
    output = model.generate(input_ids, max_length = 55, num_beams = 5, early_stopping = True)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    summaries.append(summary)
  return summaries

In [22]:
# Summarize every scraped text, grouped by ticker.
# NOTE(review): in the recorded run the scraped text was an error page, so the
# summaries shown describe that error rather than any news.
summaries = {ticker: summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['It’s an error that’s hard to diagnose because it’s a malformed request.'],
 'TSLA': ['It’s an error that’s hard to diagnose because it’s a malformed request.'],
 'BTC': ['It’s an error that’s hard to diagnose because it’s a malformed request.']}