## Libraries and setup

In [None]:
!pip install sentencepiece

In [1]:
from bs4 import BeautifulSoup
import requests
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

In [25]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained("human-centered-summarization/financial-summarization-pegasus")
model = PegasusForConditionalGeneration.from_pretrained(model_name)

model.config.vocab_size

96103

## Scrap the web

In [5]:
URL = "https://au.finance.yahoo.com/news/abl-space-systems-scores-60-204923372.html"
r = requests.get(URL)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [6]:
paragraphs

[<p>Launch startup ABL Space Systems has landed a $60 million contract to build out its “responsive launch” operational capacity, as part of the U.S. Space Force and U.S. Air Force Strategic Funding Increase (STRATFI) program. The new funding, which equally matches government funding with private investment, comes as the company prepares for the second launch attempt of its RS1 rocket.</p>,
 <p>In a statement, ABL said that a key challenge with responsive space launches “is breaking from the assumption of a pre-defined orbit, trajectory, and launch site.” For such missions, the company said it would build new operational capacity for short-notice launches.</p>,
 <p>“We believe that operational flexibility is key to meeting the rapidly changing needs of our customers,” Eva Abramson, ABL’s head of strategic development, said in a statement. “This award will help us in further developing on-call launch capabilities to meet mission-driven payload, launch site and target orbit needs.”</p>,


## Clean the paragraph and Quick summary

In [26]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
article = ' '.join(words)

In [28]:
input_ids = tokenizer.encode(article, return_tensors = 'pt')
output = model.generate(input_ids, max_length = 70, num_beams = 3, early_stopping = True) ## Beam Search
summary = tokenizer.decode(output[0], skip_special_tokens = True)

In [29]:
summary

'Alaska rocket startup is preparing for a second launch attempt. New funding comes as company prepares for second launch attempt'

## Automated latest NEWS Search and Summary

In [30]:
search_tickers = ['GME', 'TSLA', 'BTC']

In [31]:
def search_from_tickers(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+()&tbm=nws".format(ticker)
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [33]:
search_from_tickers('GME')

['https://au.yahoo.com/',
 'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
 'https://au.news.yahoo.com/',
 'https://au.finance.yahoo.com/',
 'https://au.sports.yahoo.com/',
 'https://au.lifestyle.yahoo.com/',
 'https://au.lifestyle.yahoo.com/entertainment/',
 'https://au.news.yahoo.com/weather/',
 'https://au.yahoo.com/everything/',
 'https://au.finance.yahoo.com/',
 'https://login.yahoo.com/?.lang=en-AU',
 'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
 'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
 'https://au.finance.yahoo.com',
 'https://au.finance.yahoo.com/watchlists/',
 'https://au.finance.yahoo.com/portfolios/',
 '#',
 'https://au.finance.yahoo.com/australia/',
 'https://au.finance.yahoo.com/topic/international/',
 'https://au.finance.yahoo.com/topic/commodities/',
 'https://au.finance.yahoo.com/crypto/',
 'https://au.finance.yahoo.com/world-indices/',
 'https://au.finance.yahoo.com/currencies/',
 'https://au.finance.yahoo.com/currency-converter/',
 '//au.finance.yahoo.com/c

In [34]:
raw_url = {ticker:search_from_tickers(ticker) for ticker in search_tickers}
raw_url

{'GME': ['https://au.yahoo.com/',
  'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
  'https://au.news.yahoo.com/',
  'https://au.finance.yahoo.com/',
  'https://au.sports.yahoo.com/',
  'https://au.lifestyle.yahoo.com/',
  'https://au.lifestyle.yahoo.com/entertainment/',
  'https://au.news.yahoo.com/weather/',
  'https://au.yahoo.com/everything/',
  'https://au.finance.yahoo.com/',
  'https://login.yahoo.com/?.lang=en-AU',
  'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
  'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
  'https://au.finance.yahoo.com',
  'https://au.finance.yahoo.com/watchlists/',
  'https://au.finance.yahoo.com/portfolios/',
  '#',
  'https://au.finance.yahoo.com/australia/',
  'https://au.finance.yahoo.com/topic/international/',
  'https://au.finance.yahoo.com/topic/commodities/',
  'https://au.finance.yahoo.com/crypto/',
  'https://au.finance.yahoo.com/world-indices/',
  'https://au.finance.yahoo.com/currencies/',
  'https://au.finance.yahoo.com/currency-converte

In [35]:
raw_url['GME']

['https://au.yahoo.com/',
 'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
 'https://au.news.yahoo.com/',
 'https://au.finance.yahoo.com/',
 'https://au.sports.yahoo.com/',
 'https://au.lifestyle.yahoo.com/',
 'https://au.lifestyle.yahoo.com/entertainment/',
 'https://au.news.yahoo.com/weather/',
 'https://au.yahoo.com/everything/',
 'https://au.finance.yahoo.com/',
 'https://login.yahoo.com/?.lang=en-AU',
 'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
 'https://mail.yahoo.com/?.intl=au&.lang=en-AU',
 'https://au.finance.yahoo.com',
 'https://au.finance.yahoo.com/watchlists/',
 'https://au.finance.yahoo.com/portfolios/',
 '#',
 'https://au.finance.yahoo.com/australia/',
 'https://au.finance.yahoo.com/topic/international/',
 'https://au.finance.yahoo.com/topic/commodities/',
 'https://au.finance.yahoo.com/crypto/',
 'https://au.finance.yahoo.com/world-indices/',
 'https://au.finance.yahoo.com/currencies/',
 'https://au.finance.yahoo.com/currency-converter/',
 '//au.finance.yahoo.com/c

## Clean URLS

In [46]:
import re
exclude_list = ['facebook', 'subscription', 'techcrunch', 'login', 'twitter']

def strip_url(urls, exlude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

       res = re.findall(r'(https?://\S+)', url)[0].split('&')[0] - regular expression to scrap

In [47]:
strip_url(raw_url['GME'], exclude_list)

['https://au.news.yahoo.com/',
 'https://www.independent.co.uk/asia/east-asia/north-korea-nuclear-underwater-drone-test-b2307263.html',
 'https://uk.news.yahoo.com/inside-ukraine-scramble-game-changer-092926919.html',
 'https://au.finance.yahoo.com/work/',
 'https://au.finance.yahoo.com/property/',
 'https://au.news.yahoo.com/weather/',
 'https://au.finance.yahoo.com/news/us-launches-air-strikes-syria-050158495.html',
 'https://au.finance.yahoo.com/currencies/',
 'https://au.finance.yahoo.com/industries/industrials/',
 'https://www.independent.co.uk/news/world/europe/putin-russia-wagner-ukraine-counteroffensive-b2307272.html',
 'https://au.finance.yahoo.com/personal-finance/news/',
 'https://au.lifestyle.yahoo.com/entertainment/',
 'https://au.finance.yahoo.com/industries/real-estate/',
 'https://www.independent.co.uk/news/world/europe/new-zeland-soldier-killed-in-ukraine-b2306416.html',
 'https://www.independent.co.uk/space/asteroid-earth-moon-nasa-city-killer-b2307600.html',
 'https:

In [48]:
clean_url = {ticker:strip_url(raw_url[ticker], exclude_list) for ticker in search_tickers}
clean_url

{'GME': ['https://au.news.yahoo.com/',
  'https://www.independent.co.uk/asia/east-asia/north-korea-nuclear-underwater-drone-test-b2307263.html',
  'https://uk.news.yahoo.com/inside-ukraine-scramble-game-changer-092926919.html',
  'https://au.finance.yahoo.com/work/',
  'https://au.finance.yahoo.com/property/',
  'https://au.news.yahoo.com/weather/',
  'https://au.finance.yahoo.com/news/us-launches-air-strikes-syria-050158495.html',
  'https://au.finance.yahoo.com/currencies/',
  'https://au.finance.yahoo.com/industries/industrials/',
  'https://www.independent.co.uk/news/world/europe/putin-russia-wagner-ukraine-counteroffensive-b2307272.html',
  'https://au.finance.yahoo.com/personal-finance/news/',
  'https://au.lifestyle.yahoo.com/entertainment/',
  'https://au.finance.yahoo.com/industries/real-estate/',
  'https://www.independent.co.uk/news/world/europe/new-zeland-soldier-killed-in-ukraine-b2306416.html',
  'https://www.independent.co.uk/space/asteroid-earth-moon-nasa-city-killer-b2

In [49]:
## now we process for all the links we got like we did for the first one.

def scrap_process(URLS):
    ARTICLES = []
    for url in URLS:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:300]
        article = ' '.join(words)
        ARTICLES.append(article)
    return ARTICLES

In [50]:
articles = {ticker:scrap_process(clean_url[ticker]) for ticker in search_tickers}
articles

{'GME': ["Launch startup ABL Space Systems has landed a $60 million contract to build out its “responsive launch” operational capacity, as part of the U.S. Space Force and U.S. Air Force Strategic Funding Increase (STRATFI) program. The new funding, which equally matches government funding with private investment, comes as the company prepares for the second launch attempt of its RS1 rocket. In a statement, ABL said that a key challenge with responsive space launches “is breaking from the assumption of a pre-defined orbit, trajectory, and launch site.” For such missions, the company said it would build new operational capacity for short-notice launches. “We believe that operational flexibility is key to meeting the rapidly changing needs of our customers,” Eva Abramson, ABL’s head of strategic development, said in a statement. “This award will help us in further developing on-call launch capabilities to meet mission-driven payload, launch site and target orbit needs.” The U.S. Space Fo

In [53]:
articles["TSLA"][0]

"Launch startup ABL Space Systems has landed a $60 million contract to build out its “responsive launch” operational capacity, as part of the U.S. Space Force and U.S. Air Force Strategic Funding Increase (STRATFI) program. The new funding, which equally matches government funding with private investment, comes as the company prepares for the second launch attempt of its RS1 rocket. In a statement, ABL said that a key challenge with responsive space launches “is breaking from the assumption of a pre-defined orbit, trajectory, and launch site.” For such missions, the company said it would build new operational capacity for short-notice launches. “We believe that operational flexibility is key to meeting the rapidly changing needs of our customers,” Eva Abramson, ABL’s head of strategic development, said in a statement. “This award will help us in further developing on-call launch capabilities to meet mission-driven payload, launch site and target orbit needs.” The U.S. Space Force has m

In [54]:
def summarize_text(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors = 'pt')
        output = model.generate(input_ids, max_length = 55, num_beams = 5, early_stopping = True)
        summary = tokenizer.decode(output[0], skip_special_tokens = True)
        summaries.append(summary)
    return summaries

In [58]:
summaries = {ticker:summarize_text(articles[ticker]) for ticker in search_tickers}
summaries

{'GME': ['Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in 

In [63]:
from transformers import pipeline
sentiment = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

In [64]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in search_tickers}
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE', 'score': 0.9990478157997131},
  {'label': 'POSITIVE

In [67]:
print(summaries['TSLA'][0], scores['TSLA'][0]['label'], scores['TSLA'][0]['score'])

Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’ POSITIVE 0.9990478157997131


## FILE SAVE

## summaries, score, cleaned_url

In [76]:
def store_summary(summaries, score, urls):
    output = []
    for ticker in search_tickers:
        for count in range(len(summaries[ticker])):
            try_output = [
                ticker,
                summaries[ticker][count],
                score[ticker][count]['label'],
                score[ticker][count]['score'],
                urls[ticker][count]
            ]
            output.append(try_output)
    return output

In [77]:
final_output = store_summary(summaries, scores, clean_url)
final_output

[['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://au.news.yahoo.com/'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://www.independent.co.uk/asia/east-asia/north-korea-nuclear-underwater-drone-test-b2307263.html'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://uk.news.yahoo.com/inside-ukraine-scramble-game-changer-092926919.html'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,


In [78]:
len(final_output)

180

In [79]:
final_output[20: 25]

[['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://au.finance.yahoo.com/'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://ca.news.yahoo.com/u-carries-air-strikes-syria-030237336.html'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://au.finance.yahoo.com/news/ukrainian-forces-soon-launch-counter-073009658.html'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://mail.yahoo

In [81]:
final_output.insert(0, ["Ticker", "Summary", "Label", "Confidence", "URLs"])

In [82]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URLs'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://au.news.yahoo.com/'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://www.independent.co.uk/asia/east-asia/north-korea-nuclear-underwater-drone-test-b2307263.html'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call launch capabilities’',
  'POSITIVE',
  0.9990478157997131,
  'https://uk.news.yahoo.com/inside-ukraine-scramble-game-changer-092926919.html'],
 ['GME',
  'Company plans to launch its RS1 rocket from Alaska’s Kodiak Island. ‘This award will help us in further developing on-call laun

## Create CSV

In [84]:
import csv

with open('summary_sentiment.csv', mode = 'w', newline ='') as f:
    csv_writer = csv.writer(f, delimiter =',', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

## Checking the file

In [85]:
import pandas as pd

df = pd.read_csv('summary_sentiment.csv')
df.head(10)

Unnamed: 0,Ticker,Summary,Label,Confidence,URLs
0,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://au.news.yahoo.com/
1,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://www.independent.co.uk/asia/east-asia/n...
2,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://uk.news.yahoo.com/inside-ukraine-scram...
3,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://au.finance.yahoo.com/work/
4,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://au.finance.yahoo.com/property/
5,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://au.news.yahoo.com/weather/
6,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://au.finance.yahoo.com/news/us-launches-...
7,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://au.finance.yahoo.com/currencies/
8,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://au.finance.yahoo.com/industries/indust...
9,GME,Company plans to launch its RS1 rocket from Al...,POSITIVE,0.999048,https://www.independent.co.uk/news/world/europ...
