In [1]:

from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [2]:

model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
url = "https://au.finance.yahoo.com/news/nab-bankers-save-aussie-from-devastating-25-million-scam-several-red-flags-024613144.html"

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [4]:

paragraphs[0].text

'A wealthy Aussie has narrowly avoided losing $25 million after “several red” flags went off indicating it was an investment scam.'

In [5]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [6]:
ARTICLE

"A wealthy Aussie has narrowly avoided losing $25 million after “several red” flags went off indicating it was an investment scam. Daniel Smith* had received a phone call from what he believed was a reputable bank, offering him an attractive term-deposit rate. He decided to contact his current bank, NAB, to see if they could match the offer. While the proposed interest rate wasn’t extreme enough to raise alarm bells, the fact Daniel was looking to move $25 million in a single transaction was enough to make his banker suspicious. Have you fallen victim to a scam? Contact tamika.seeto@yahooinc.com to share your story The transaction was flagged with NAB’s private wealth banker, Amit, as well as an investment specialist with the bank. They were asked to investigate the supposed deposit rate and documents Daniel had received. “We went through the various pieces of material sent to the customer from the caller, only to discover several red flags,” Amit said. RELATED NAB customer reveals 7 w

In [7]:

input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [8]:

summary

'Banker stopped client from moving $25 million. Scamwatch received more than 8,000 reports in 2023'

In [9]:

monitored_tickers = ['GME', 'TSLA', 'BTC']

In [10]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [11]:
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQOwgC',
  '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&tbm=nws&gbv=1&sei=0ZH8ZerAMNL2seMP6Zy1uAs',
  '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQ_AUIBygC',
  '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQ_AUICCgD',
  '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQ_AUICSgE',
  '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BGME%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQiaAMCAooBQ&usg=AOvVaw0URNHJ1Pl3Fpiw5RNzhKp0',
  '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BGME%26sca_esv%3D0d7590

In [12]:
raw_urls['GME']

['/?sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQOwgC',
 '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&tbm=nws&gbv=1&sei=0ZH8ZerAMNL2seMP6Zy1uAs',
 '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQ_AUIBSgA',
 '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQ_AUIBygC',
 '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQ_AUICCgD',
 '/search?q=yahoo+finance+GME&sca_esv=0d7590debed7ac0b&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQ_AUICSgE',
 '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BGME%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiq7Z3AkoaFAxVSe2wGHWlODbcQiaAMCAooBQ&usg=AOvVaw0URNHJ1Pl3Fpiw5RNzhKp0',
 '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BGME%26sca_esv%3D0d7590debed7ac0b%26ie

In [13]:

import re

In [14]:

exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [15]:

def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [16]:

cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/bull-day-gamestop-gme-120000551.html',
  'https://finance.yahoo.com/news/gamestop-gme-upgraded-strong-buy-170006515.html',
  'https://finance.yahoo.com/news/gamestop-nyse-gme-misses-q3-211648468.html',
  'https://finance.yahoo.com/news/gamestop-corp-gme-reports-narrowed-224622965.html',
  'https://finance.yahoo.com/news/unveiling-gamestop-gme-value-really-155119864.html',
  'https://finance.yahoo.com/news/3-sorry-gaming-stocks-sell-171540757.html',
  'https://finance.yahoo.com/news/gamestop-gme-true-worth-really-163851598.html',
  'https://finance.yahoo.com/news/gamestop-gme-posts-breakeven-earnings-202700458.html',
  'https://finance.yahoo.com/news/down-19-41-4-weeks-133506409.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/gamestop-gme-strong-industry-solid-134000154.html'],
 'TSLA': ['https://finance.yahoo.com/news/tesla-stock-tumbles-7-as-shipments-slump-new-price-c

In [17]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [18]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
            r = requests.get(url, headers=headers)
            r.raise_for_status()  # Raise an error for non-200 status codes
            soup = BeautifulSoup(r.text, 'html.parser')
            paragraphs = soup.find_all('p')
            text = [paragraph.text for paragraph in paragraphs]
            words = ' '.join(text).split(' ')[:350]
            ARTICLE = ' '.join(words)
            ARTICLES.append(ARTICLE)
        except requests.HTTPError as e:
            if e.response.status_code == 404:
                print(f"Error 404: Page not found for URL: {url}")
            else:
                print(f"HTTP Error {e.response.status_code} for URL: {url}")
            ARTICLES.append(None)  # Append None to indicate failure
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            ARTICLES.append(None)  # Append None to indicate failure
    return ARTICLES



In [19]:

articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles


{'GME': ['Before everybody goes nuts over here and calls me an overzealous wannabe that must have just watched “Dumb Money” I want to hammer home the fact that this has nothing to do with Roaring Kitty. Really, it’s just based on facts and actions taken by analysts on Wall Street that Zacks happens to aggregate. All-in-all, it’s a great way to highlight the Zacks Rank by pointing out the action in today’s Bull of the Day, GameStop (GME).  GameStop Corp., a specialty retailer, provides games and entertainment products through its stores and ecommerce platforms in the United States, Canada, Australia, and Europe. The company sells new and pre-owned gaming platforms; accessories, such as controllers, gaming headsets, and virtual reality products; new and pre-owned gaming software; and in-game digital currency, digital downloadable content, and full-game downloads. It also sells collectibles comprising apparel, toys, trading cards, gadgets, and other retail products for pop culture and tec

In [20]:
articles['TSLA'][4]

"The stock market is soaring yet again. But this time, Tesla (NASDAQ: TSLA) is not leading the charge. In fact, the market is up in spite of the electric vehicle (EV) leader. Tesla shares have fallen a whopping 34% year to date (YTD) as of this writing, while the Nasdaq-100 continues to rise. The stock is now off 60% from all-time highs, while the broad market is close to all-time highs. We are at another crossroads with Tesla stock. Bulls will argue this is a perfect buying opportunity as the company prepares for its next leg of growth. Bears will argue the company is finally heading toward a normal valuation that a manufacturing-based business deserves to trade at. Which group is right? Where will Tesla stock be in three years? Tesla continued to grow its unit volumes in 2023. It delivered 1.8 million cars to customers around the globe, up from 1.3 million in 2022 and 936,000 in 2021. This is impressive growth at scale, making Tesla one of the premier manufacturers of not just EVs, b

In [21]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [22]:

summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Two analysts have increased their earnings estimates for the current year. The stock is currently a Rank #1 in the industry',
  'Rating upgrade primarily reflects upward trend in earnings estimates.',
  'Revenue falls 9.1% year-on-year, while non-commerce sales rise.',
  'SG&A expenses reduced to 27.5% of net sales, down from 32.7% in Q3 2022.',
  'Check out our latest analysis for GameStop What is GME undervalued?',
  'Three gaming stocks to sell in February while you still can.',
  'GF Value Line suggests that the stock is significantly undervalued.',
  'Higher inflationary pressures on consumers’ spending have weighed on the company. Soft sales weighed on the company’s results',
  'Relative Strength Index (RSI) reading for GME is 29.62.',
  'Your information may be shared with third parties.',
  'Retail - Consumer Electronics industry is seeing solid earnings estimate revision activity.'],
 'TSLA': ['Shipments in the world’s largest EV market fall 19% from a year ago. Tesl

In [23]:
summaries['BTC']


['MicroStrategy Executive Chairman Michael Saylor says the cryptocurrency is the winner.',
 'Standard Chartered, Fundstrat, VanEck all have targets. Some on Wall Street say it’s difficult to estimate future price',
 'MicroStrategy’s position is up nearly 100%, creating $6.2 billion. Bitcoin supply is growing but at a decreasing rate',
 "On this week's show, we looked at how crypto-adjacent stocks have reacted to the rally.",
 'Biggest gainers and losers on the show this week are listed.',
 'Pantera Capital Portfolio Manager and Crypto Asset Manager Cosmo Jiang.',
 'Bitcoin briefly hits record high as China iPhone sales slump.',
 'Halvings occur every four years after 210,000 blocks are created',
 'Stitch Fix posts wider-than-expected loss; AT&T upgrades to Outperform.',
 'Your information may be shared with third parties.',
 "Mizuho's Dolev warns investors to stay away from crypto investing."]

In [24]:

from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [25]:

sentiment(summaries['BTC'])

[{'label': 'POSITIVE', 'score': 0.9994248151779175},
 {'label': 'NEGATIVE', 'score': 0.9989762306213379},
 {'label': 'POSITIVE', 'score': 0.6712822914123535},
 {'label': 'NEGATIVE', 'score': 0.9847748875617981},
 {'label': 'NEGATIVE', 'score': 0.9909721612930298},
 {'label': 'POSITIVE', 'score': 0.5877715349197388},
 {'label': 'NEGATIVE', 'score': 0.9884768724441528},
 {'label': 'NEGATIVE', 'score': 0.9876707792282104},
 {'label': 'NEGATIVE', 'score': 0.9920054078102112},
 {'label': 'NEGATIVE', 'score': 0.9903545379638672},
 {'label': 'NEGATIVE', 'score': 0.9644929766654968}]

In [26]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.9938066005706787},
  {'label': 'POSITIVE', 'score': 0.7805719971656799},
  {'label': 'NEGATIVE', 'score': 0.9731772541999817},
  {'label': 'NEGATIVE', 'score': 0.9990413784980774},
  {'label': 'NEGATIVE', 'score': 0.9908208250999451},
  {'label': 'POSITIVE', 'score': 0.8924435377120972},
  {'label': 'NEGATIVE', 'score': 0.9977561831474304},
  {'label': 'NEGATIVE', 'score': 0.9859585165977478},
  {'label': 'NEGATIVE', 'score': 0.8664731383323669},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'POSITIVE', 'score': 0.9991598129272461}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9983276724815369},
  {'label': 'POSITIVE', 'score': 0.6840000152587891},
  {'label': 'NEGATIVE', 'score': 0.9997729659080505},
  {'label': 'POSITIVE', 'score': 0.9960147142410278},
  {'label': 'NEGATIVE', 'score': 0.9997296929359436},
  {'label': 'NEGATIVE', 'score': 0.998216450214386},
  {'label': 'NEGATIVE', 'score': 0.9773924946784973},
  {'label': '

In [27]:

print(summaries['GME'][3], scores['GME'][3]['label'], scores['GME'][3]['score'])

SG&A expenses reduced to 27.5% of net sales, down from 32.7% in Q3 2022. NEGATIVE 0.9990413784980774


In [28]:

scores['BTC'][0]['score']

0.9994248151779175

In [29]:

summaries

{'GME': ['Two analysts have increased their earnings estimates for the current year. The stock is currently a Rank #1 in the industry',
  'Rating upgrade primarily reflects upward trend in earnings estimates.',
  'Revenue falls 9.1% year-on-year, while non-commerce sales rise.',
  'SG&A expenses reduced to 27.5% of net sales, down from 32.7% in Q3 2022.',
  'Check out our latest analysis for GameStop What is GME undervalued?',
  'Three gaming stocks to sell in February while you still can.',
  'GF Value Line suggests that the stock is significantly undervalued.',
  'Higher inflationary pressures on consumers’ spending have weighed on the company. Soft sales weighed on the company’s results',
  'Relative Strength Index (RSI) reading for GME is 29.62.',
  'Your information may be shared with third parties.',
  'Retail - Consumer Electronics industry is seeing solid earnings estimate revision activity.'],
 'TSLA': ['Shipments in the world’s largest EV market fall 19% from a year ago. Tesl

In [30]:
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.9938066005706787},
  {'label': 'POSITIVE', 'score': 0.7805719971656799},
  {'label': 'NEGATIVE', 'score': 0.9731772541999817},
  {'label': 'NEGATIVE', 'score': 0.9990413784980774},
  {'label': 'NEGATIVE', 'score': 0.9908208250999451},
  {'label': 'POSITIVE', 'score': 0.8924435377120972},
  {'label': 'NEGATIVE', 'score': 0.9977561831474304},
  {'label': 'NEGATIVE', 'score': 0.9859585165977478},
  {'label': 'NEGATIVE', 'score': 0.8664731383323669},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'POSITIVE', 'score': 0.9991598129272461}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9983276724815369},
  {'label': 'POSITIVE', 'score': 0.6840000152587891},
  {'label': 'NEGATIVE', 'score': 0.9997729659080505},
  {'label': 'POSITIVE', 'score': 0.9960147142410278},
  {'label': 'NEGATIVE', 'score': 0.9997296929359436},
  {'label': 'NEGATIVE', 'score': 0.998216450214386},
  {'label': 'NEGATIVE', 'score': 0.9773924946784973},
  {'label': '

In [31]:
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/bull-day-gamestop-gme-120000551.html',
  'https://finance.yahoo.com/news/gamestop-gme-upgraded-strong-buy-170006515.html',
  'https://finance.yahoo.com/news/gamestop-nyse-gme-misses-q3-211648468.html',
  'https://finance.yahoo.com/news/gamestop-corp-gme-reports-narrowed-224622965.html',
  'https://finance.yahoo.com/news/unveiling-gamestop-gme-value-really-155119864.html',
  'https://finance.yahoo.com/news/3-sorry-gaming-stocks-sell-171540757.html',
  'https://finance.yahoo.com/news/gamestop-gme-true-worth-really-163851598.html',
  'https://finance.yahoo.com/news/gamestop-gme-posts-breakeven-earnings-202700458.html',
  'https://finance.yahoo.com/news/down-19-41-4-weeks-133506409.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/gamestop-gme-strong-industry-solid-134000154.html'],
 'TSLA': ['https://finance.yahoo.com/news/tesla-stock-tumbles-7-as-shipments-slump-new-price-c

In [32]:
range(len(summaries['GME']))

range(0, 11)

In [33]:
summaries['GME'][8]

'Relative Strength Index (RSI) reading for GME is 29.62.'

In [34]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [35]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Two analysts have increased their earnings estimates for the current year. The stock is currently a Rank #1 in the industry',
  'POSITIVE',
  0.9938066005706787,
  'https://finance.yahoo.com/news/bull-day-gamestop-gme-120000551.html'],
 ['GME',
  'Rating upgrade primarily reflects upward trend in earnings estimates.',
  'POSITIVE',
  0.7805719971656799,
  'https://finance.yahoo.com/news/gamestop-gme-upgraded-strong-buy-170006515.html'],
 ['GME',
  'Revenue falls 9.1% year-on-year, while non-commerce sales rise.',
  'NEGATIVE',
  0.9731772541999817,
  'https://finance.yahoo.com/news/gamestop-nyse-gme-misses-q3-211648468.html'],
 ['GME',
  'SG&A expenses reduced to 27.5% of net sales, down from 32.7% in Q3 2022.',
  'NEGATIVE',
  0.9990413784980774,
  'https://finance.yahoo.com/news/gamestop-corp-gme-reports-narrowed-224622965.html'],
 ['GME',
  'Check out our latest analysis for GameStop What is GME undervalued?',
  'NEGATIVE',
  0.9908208250999451,
  'https://finance.yahoo.

In [36]:
import csv
with open('assetsummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)