In [1]:
#pip install transformers


A transformer is a deep learning model that adopts the mechanism of self-attention, differentially weighting the significance of each part of the input data. It is used primarily in the fields of natural language processing (NLP) and computer vision (CV).

In [1]:
import requests

 Requests is a Python library used to send an HTTP request to a website and store the response object within a variable

In [3]:
from bs4 import BeautifulSoup 

Beautiful Soup is a great tool for extracting data from web pages but it works with the source code of the page.

In [4]:
from transformers import PegasusTokenizer,PegasusForConditionalGeneration

#Transformer is an architecture for transforming one sequence into another one with the help of two parts (Encoder and Decoder)
#The PegasusTokenizer class will convert our sentences into tokens. This is a numbered representation of our sentences. 
pegasus modole use for reading a document and producing a summary.

In [5]:
#pip install sentencepiece 
##sentence piece used for data extraction and in making of tokens

In [6]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [7]:
url = "https://www.investopedia.com/terms/f/finance.asp"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser') 
paragraphs = soup.find_all('p')

In [8]:
paragraphs[0].text

"Adam Hayes, Ph.D., CFA, is a financial writer with 15+ years Wall Street experience as a derivatives trader. Besides his extensive derivative trading expertise, Adam is an expert in economics and behavioral finance. Adam received his master's in economics from The New School for Social Research and his Ph.D. from the University of Wisconsin-Madison in sociology. He is a CFA charterholder as well as holding FINRA Series 7, 55 & 63 licenses. He currently researches and teaches economic sociology and the social studies of finance at the Hebrew University in Jerusalem."

In [9]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:200]
ARTICLE = ' '.join(words)

In [10]:
ARTICLE

'Adam Hayes, Ph.D., CFA, is a financial writer with 15+ years Wall Street experience as a derivatives trader. Besides his extensive derivative trading expertise, Adam is an expert in economics and behavioral finance. Adam received his master\'s in economics from The New School for Social Research and his Ph.D. from the University of Wisconsin-Madison in sociology. He is a CFA charterholder as well as holding FINRA Series 7, 55 & 63 licenses. He currently researches and teaches economic sociology and the social studies of finance at the Hebrew University in Jerusalem. Investopedia / Mira Norian \nFinance is a term for matters regarding the management, creation, and study of money and investments. It involves the use of credit and debt, securities, and investment to finance current projects using future income flows. Because of this temporal aspect, finance is closely linked to the time value of money, interest rates, and other related topics.\n \nFinance can be broadly divided into thre

In [11]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt') ##pytorch return array #tokenzier used to genarte unique token of each nummber
output = model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True) ##num_beams return only top 5 choices, early stopping used to avoid overfiting
summary = tokenizer.decode(output[0], skip_special_tokens=True)  ##special tokens are not derived from our input

In [12]:
input_ids #The input ids are often the only required parameters to be passed to the model as input

tensor([[ 5262, 18157,   108,  5727,   107,   470,   107,   108, 31336,   108,
           117,   114,   748,  2678,   122, 67066,   231,  2948,  1411,   306,
           130,   114, 22720, 14410,   107,  6530,   169,  2248, 22869,  2430,
          1945,   108,  5262,   117,   142,  1766,   115,  8965,   111,  9705,
          3324,   107,  5262,   915,   169,  2080,   131,   116,   115,  8965,
           135,   139,   351,   760,   118,  2480,  1810,   111,   169,  5727,
           107,   470,   107,   135,   109,   502,   113,  5581,   121, 36454,
           115, 24945,   107,   285,   117,   114, 31336,  7164, 20693,   130,
           210,   130,  2605, 60589,  2879,  6691,  5278,   259,  9860, 10088,
           107,   285,   767, 28331,   111,  6472,  1500, 24945,   111,   109,
           525,  1683,   113,  3324,   134,   109, 13048,   502,   115,  9496,
           107, 19356, 79909,   943, 23320,   566, 19621,  5227,   117,   114,
          1286,   118,  2887,  1409,   109,   603,  

In [13]:
output

tensor([[    0,  5262, 18157,   117,   114,   748,  2678,   122, 67066,   231,
          2948,  1411,   306,   107,     1]])

In [14]:
summary

'Adam Hayes is a financial writer with 15+ years Wall Street experience.'

In [15]:
monitered_tickers = ['gme','tsla','btc']

In [16]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [17]:
search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format('gme')

In [18]:
search_url

'https://www.google.com/search?q=yahoo+finance+gme&tbm=nws'

In [19]:
monitored_tickers = ['GME','TSLA','BTC']

In [20]:
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwi9yseZl_L9AhVBQ7gEHUqMAb8QOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=TlccZL3rH8GG4dUPypiG-As',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwi9yseZl_L9AhVBQ7gEHUqMAb8Q_AUIBSgA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwi9yseZl_L9AhVBQ7gEHUqMAb8Q_AUIBygC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwi9yseZl_L9AhVBQ7gEHUqMAb8Q_AUICCgD',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwi9yseZl_L9AhVBQ7gEHUqMAb8Q_AUICSgE',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwi9yseZl_L9AhVBQ7gEHUqMAb8Q_AUICigF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwi9yseZl_L9AhVBQ7gEHUqMAb8Q_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwi9yseZl_L9AhVBQ7gEHUqMAb8QpwUIDQ',
  '/search?q=yahoo+finance+GME&ie=U

In [21]:
import re


In [22]:
exclude_list = ['maps','policies','preferences','accounts','support']

In [23]:
def strip_unwanted_urls(urls,exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)',url)[0].split('&')[0]
            val.append(res)
    return list(set(val))
        

In [24]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker],exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gamestop-reports-fourth-quarter-fiscal-200500425.html',
  'https://www.fastcompany.com/90869514/gamestop-gme-stock-price-going-up-profit-retail-why',
  'https://finance.yahoo.com/news/eyes-fed-decision-key-stocks-152603931.html',
  'https://finance.yahoo.com/news/existing-home-sales-nike-earnings-162923739.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://www.shacknews.com/article/134690/gamestop-gme-stock-rockets-38-higher-on-earnings-results',
  'https://finance.yahoo.com/news/stock-market-today-dow-ends-161157972.html',
  'https://www.msn.com/en-us/money/markets/stock-market-news-today-stock-futures-waver-as-all-eyes-on-fed-meeting/ar-AA18Wusj%3Focid%3Dweather-verthp-feeds',
  'https://finance.yahoo.com/news/first-republic-rebounds-record-low-092753176.html',
  'https://finance.yahoo.com/news/stocks-moving-in-after-hours-gamestop-nike-214549776.html',
  'https://finance.yahoo.com/news/changi

In [25]:
def scrape_and_process(URLs):
    Article = []
    for URL in URLs:
        r = requests.get(URL)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p') #p is for paragraph
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        Article.append(ARTICLE)
    return Article

In [26]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ['GRAPEVINE, Texas, March 21, 2023--(BUSINESS WIRE)--GameStop Corp. (NYSE: GME) ("GameStop" or the "Company") today released financial results for the fourth quarter and fiscal year ended January 28, 2023. The Company’s condensed and consolidated financial statements, including GAAP and non-GAAP results, are below. The Company’s Form 10-K and supplemental information can be found at https://investor.gamestop.com/. FOURTH QUARTER OVERVIEW Net sales were $2.226 billion, compared to $2.254 billion in the prior year\'s fourth quarter. Selling, general and administrative ("SG&A") expenses were $453.4 million, or 20.4% of sales, compared to $538.9 million, or 23.9% of sales, in the prior year\'s fourth quarter. Net income was $48.2 million, compared to a net loss of $147.5 million for the prior year’s fourth quarter. Inventory was $682.9 million at the close of the period, compared to $915.0 million at the close of the prior year\'s fourth quarter, reflecting the Company’s ongoing fo

In [27]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt',max_length = 512,truncation=True) ##pytorch return array #tokenzier used to genarte unique token of each nummber
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True) ##num_beams return only top 5 choices, early stopping used to avoid overfiting
        summary = tokenizer.decode(output[0], skip_special_tokens=True) 
        summaries.append(summary)
    return summaries

In [28]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Net sales were $2.226 billion for the fourth quarter of fiscal year. Selling, general and administrative expenses were $453.4 million, or 20.4% of sales',
  '.',
  'Markets now pricing in a roughly 88% probability of another hike. Yellen reassures markets that backstop measures are in place',
  'Fed meeting, existing home sales, Nike earnings.',
  'All images are copyrighted.',
  'Company reports strong hardware sales, larger cash on hand.',
  'Fed chair says no rate cuts seen this year. Banks tumble as tighter lending standards seen serving same role',
  'The S&P 500, Dow Jones Industrial Average and tech-heavy Nasdaq all fell. Fed raised its benchmark rate 0.25% to a 4.75%-5% target range',
  'KBW Regional Banking Index surges the most since January 2021. First Republic’s share price has slumped over the past two weeks amid investor concern',
  'Nike and GameStop post better-than-expected results.',
  'China’s accession to the WTO in 2001 contributed to a new era of globali

#### pipeline is a function in transformers 
 sentiment show text emotion--- positive or negative

In [29]:
from transformers import pipeline
sentiment = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [30]:
sentiment(summaries['BTC'])

[{'label': 'POSITIVE', 'score': 0.6216397285461426},
 {'label': 'POSITIVE', 'score': 0.631275475025177},
 {'label': 'POSITIVE', 'score': 0.8255074620246887},
 {'label': 'POSITIVE', 'score': 0.8668016195297241},
 {'label': 'NEGATIVE', 'score': 0.9991037249565125},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9983304142951965},
 {'label': 'NEGATIVE', 'score': 0.9880996346473694},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9908422231674194},
 {'label': 'NEGATIVE', 'score': 0.9399531483650208}]

In [32]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9350910186767578},
  {'label': 'POSITIVE', 'score': 0.9668781757354736},
  {'label': 'POSITIVE', 'score': 0.9811636805534363},
  {'label': 'NEGATIVE', 'score': 0.5370712280273438},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.999259889125824},
  {'label': 'NEGATIVE', 'score': 0.9963580965995789},
  {'label': 'NEGATIVE', 'score': 0.9994004964828491},
  {'label': 'NEGATIVE', 'score': 0.9958016276359558},
  {'label': 'POSITIVE', 'score': 0.9948650002479553},
  {'label': 'POSITIVE', 'score': 0.9989142417907715}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9883734583854675},
  {'label': 'NEGATIVE', 'score': 0.9978517293930054},
  {'label': 'POSITIVE', 'score': 0.926937460899353},
  {'label': 'NEGATIVE', 'score': 0.9019278287887573},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9922189712524414},
  {'label': 'POSITIVE', 'score': 0.953042209148407},
  {'label': 'PO

In [33]:
#megign all
def create_output_array(summaries,scores,urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [35]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Net sales were $2.226 billion for the fourth quarter of fiscal year. Selling, general and administrative expenses were $453.4 million, or 20.4% of sales',
  'NEGATIVE',
  0.9350910186767578,
  'https://finance.yahoo.com/news/gamestop-reports-fourth-quarter-fiscal-200500425.html'],
 ['GME',
  '.',
  'POSITIVE',
  0.9668781757354736,
  'https://www.fastcompany.com/90869514/gamestop-gme-stock-price-going-up-profit-retail-why'],
 ['GME',
  'Markets now pricing in a roughly 88% probability of another hike. Yellen reassures markets that backstop measures are in place',
  'POSITIVE',
  0.9811636805534363,
  'https://finance.yahoo.com/news/eyes-fed-decision-key-stocks-152603931.html'],
 ['GME',
  'Fed meeting, existing home sales, Nike earnings.',
  'NEGATIVE',
  0.5370712280273438,
  'https://finance.yahoo.com/news/existing-home-sales-nike-earnings-162923739.html'],
 ['GME',
  'All images are copyrighted.',
  'NEGATIVE',
  0.9880996346473694,
  'https://www.google.com/search?q%3Dy

In [36]:
final_output[21]

['TSLA',
 'EV-maker’s credit rating upgraded to Baa3 from Ba1, outlook stable. Moody’s cites ‘considerable investments’ in new vehicle and battery production',
 'POSITIVE',
 0.9547061920166016,
 'https://finance.yahoo.com/news/tesla-scores-blue-chip-status-after-moodys-debt-upgrade-164544334.html']

In [37]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [38]:
import csv
with open('Stock.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) #delimiter for next column
    csv_writer.writerows(final_output)