# NLP Asset Analysis

## Dependencies & Imports


In [69]:
%%capture
%pip install transformers
%pip install sentencepiece
%pip install torch torchvision

In [70]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, logging # type: ignore
from bs4 import BeautifulSoup # type: ignore
import requests # type: ignore
# Restrict transformers' own logger to error-level messages.
logging.set_verbosity_error()

## Summarization Model

In [71]:
# Checkpoint identifier; the tokenizer and the model are both loaded from this same name.
model_name = 'human-centered-summarization/financial-summarization-pegasus'
# `tokenizer` and `model` are module-level objects used by summarize().
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

## News & Sentiment Pipeline

In [72]:
# Symbols to analyse; the per-ticker dictionaries built in later cells are keyed by these.
tickers = ['GOOG', 'TSLA', 'BTC']

### Search for Stock News

In [73]:
def get_stock_news_urls(ticker, timeout=10):
    """Collect the link targets found on a Google News search page for ``ticker``.

    Args:
        ticker: Symbol substituted into the "yahoo finance <ticker> news" query.
        timeout: Seconds to wait for the search request before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        A list with the ``href`` value of every anchor tag on the page that has
        one, in document order. The values are unfiltered.
    """
    search_url = 'https://www.google.com/search?q=yahoo+finance+{}+news&tbm=nws'.format(ticker)
    r = requests.get(search_url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True keeps only anchors that carry an href attribute; indexing
    # link['href'] on an anchor without one raises KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [74]:
# Run the news search once per ticker and key the raw link lists by symbol.
raw_urls = {
    symbol: get_stock_news_urls(symbol)
    for symbol in tickers
}

### Filter URLs

In [75]:
import re

In [76]:
# Substrings that disqualify a URL: filter_urls drops any link containing one of these.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support', 'search']

In [77]:
def filter_urls(urls, exclude_list, limit=5):
    """Extract unique finance article URLs from raw search-result links.

    A link is kept when it contains ``'https://'`` and ``'finance'`` and none of
    the substrings in ``exclude_list``. The first ``http(s)://`` run in the link
    is extracted and everything from the first ``'&'`` onward is dropped.

    Args:
        urls: Iterable of raw link strings.
        exclude_list: Substrings that disqualify a link.
        limit: Maximum number of URLs to return.

    Returns:
        Up to ``limit`` distinct URLs, in order of first appearance in ``urls``.
        Deduplicating through ``set`` returned an arbitrary subset that could
        differ between runs; insertion order makes the result reproducible.
    """
    kept = {}  # dict keys give an order-preserving de-duplication
    for url in urls:
        if 'https://' not in url or 'finance' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'https?://\S+', url)
        if match is None:
            # 'https://' followed by nothing (or whitespace) has no usable URL.
            continue
        kept[match.group(0).split('&')[0]] = None
    return list(kept)[:limit]


In [78]:
# Reduce each ticker's raw links to its filtered article URLs, then display the mapping.
cleaned_urls = {
    symbol: filter_urls(raw_urls[symbol], exclude_list)
    for symbol in tickers
}
cleaned_urls

{'GOOG': ['https://finance.yahoo.com/news/amazon-earnings-195332083.html',
  'https://finance.yahoo.com/news/alphabet-earnings-july-25-153933519.html',
  'https://finance.yahoo.com/news/alphabet-stock-rises-4-after-google-rolls-out-new-bard-features-international-expansion-070139824.html',
  'https://finance.yahoo.com/news/google-parent-alphabets-stock-top-175837293.html',
  'https://finance.yahoo.com/news/want-job-working-ai-ll-180044780.html'],
 'TSLA': ['https://finance.yahoo.com/news/tesla-model-y-model-3-probed-by-nhtsa-on-loss-of-steering-complaints-174245647.html',
  'https://finance.yahoo.com/news/lucid-board-member-on-ev-competition-were-not-targeting-tesla-133735484.html',
  'https://finance.yahoo.com/news/nasdaq-sinks-as-tesla-netflix-plunge-dow-gains-for-9th-day-stock-market-news-today-200246194.html',
  'https://finance.yahoo.com/news/10-best-ev-battery-autonomous-222242368.html'],
 'BTC': ['https://finance.yahoo.com/news/binance-boosts-first-digitals-stablecoin-155401419.

### Scrape URLs

In [79]:
def process(urls, max_words=350, timeout=10):
    """Download each URL and return the leading words of its paragraph text.

    Args:
        urls: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated words kept per article.
        timeout: Seconds to wait for each request before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        A list with one string per URL, in the same order as ``urls``: the text
        of all ``<p>`` elements joined with spaces and cut to ``max_words`` words.
    """
    articles = []
    for url in urls:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))
        # Split on single spaces (not arbitrary whitespace) to keep the original word count.
        words = text.split(' ')[:max_words]
        articles.append(' '.join(words))
    return articles

In [80]:
# Scrape the article text behind every filtered URL, grouped by ticker.
articles = {
    symbol: process(cleaned_urls[symbol])
    for symbol in tickers
}

### Summarize

In [81]:
def summarize(articles, max_input_tokens=None):
    """Generate one summary per article with the module-level Pegasus model.

    Args:
        articles: Iterable of article strings.
        max_input_tokens: Maximum number of input tokens per article. When
            ``None``, ``model.config.max_position_embeddings`` is used.

    Returns:
        A list of summary strings, one per article and in the same order.
    """
    if max_input_tokens is None:
        max_input_tokens = model.config.max_position_embeddings

    summaries = []
    for article in articles:
        # Truncate on the token level: an input longer than the model's position
        # limit would make generate() fail.
        input_ids = tokenizer.encode(
            article,
            return_tensors='pt',
            truncation=True,
            max_length=max_input_tokens,
        )
        output = model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [82]:
# Summarize every scraped article per ticker; the trailing expression displays the result.
summaries = {
    symbol: summarize(articles[symbol])
    for symbol in tickers
}
summaries

{'GOOG': ['Second-quarter profit beat came despite slowdown in sales of Amazon Web Services. Revenue outlook was a striking beat, coming in between $138 billion and $143 billion',
  'Google parent company reported revenue of $74.6 billion. YouTube ad revenues also topped estimates',
  'Google’s chatbot is now available in more than 40 languages. New features include audio responses, Pinned conversations',
  "AFP sues Musk's X social media platform, accusing it of neglecting. Regulatory Hurdles: Agence France-Presse (AFP) has initiated legal proceedings against Elon Musk",
  'How to make money with AI and the skills needed. Here are the types of jobs that you can find in AI'],
 'TSLA': ['NHTSA says investigation covers an estimated 280,000 Model 3 and Model Y cars. Tesla may not be able to fix issue via software update',
  'Margins came in below expectations, but CEO says demand for new truck is off the hook.',
  'Lucid says EV market ‘will explode,’ not ‘ deliberately targeting Tesla.’

## Sentiment Analysis

In [83]:
from transformers import pipeline # type: ignore
# Name the checkpoint explicitly. With no `model` argument, pipeline() selects a
# library default, so the scores would depend on the installed transformers version.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

In [84]:
# Classify each ticker's list of summaries in one pipeline call, then display the scores.
scores = {
    symbol: sentiment(summaries[symbol])
    for symbol in tickers
}
scores

{'GOOG': [{'label': 'POSITIVE', 'score': 0.9606388807296753},
  {'label': 'POSITIVE', 'score': 0.9964740872383118},
  {'label': 'POSITIVE', 'score': 0.9792576432228088},
  {'label': 'NEGATIVE', 'score': 0.9980740547180176},
  {'label': 'POSITIVE', 'score': 0.9550225734710693}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9995212554931641},
  {'label': 'NEGATIVE', 'score': 0.9878749251365662},
  {'label': 'NEGATIVE', 'score': 0.9985577464103699},
  {'label': 'NEGATIVE', 'score': 0.9997245669364929},
  {'label': 'POSITIVE', 'score': 0.998110294342041}],
 'BTC': [{'label': 'NEGATIVE', 'score': 0.9841852188110352},
  {'label': 'POSITIVE', 'score': 0.9958518743515015},
  {'label': 'NEGATIVE', 'score': 0.9990792274475098},
  {'label': 'NEGATIVE', 'score': 0.9965921640396118},
  {'label': 'NEGATIVE', 'score': 0.9975816011428833}]}

## Exporting to CSV

In [85]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment results and URLs into table rows.

    The tickers are taken from the keys of ``summaries`` rather than from a
    notebook-level variable, so the function depends only on its arguments.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys, aligned with ``summaries[ticker]``.
        urls: Dict mapping ticker -> list of URLs, aligned with
            ``summaries[ticker]``.

    Returns:
        A list of rows. The first row is the header
        ``['Ticker', 'Summary', 'Label', 'Confidence', 'URL']``; each further
        row holds one summary with its label, score and URL.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or ``urls``.
    """
    output = [['Ticker', 'Summary', 'Label', 'Confidence', 'URL']]
    for ticker, ticker_summaries in summaries.items():
        # zip pairs the aligned lists; it stops at the shortest of the three.
        for summary, score, url in zip(ticker_summaries, scores[ticker], urls[ticker]):
            output.append([ticker, summary, score['label'], score['score'], url])
    return output

In [86]:
# Build the header-plus-rows table that the export cell writes, and display it.
output = create_output_array(
    summaries=summaries,
    scores=scores,
    urls=cleaned_urls,
)
output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GOOG',
  'Second-quarter profit beat came despite slowdown in sales of Amazon Web Services. Revenue outlook was a striking beat, coming in between $138 billion and $143 billion',
  'POSITIVE',
  0.9606388807296753,
  'https://finance.yahoo.com/news/amazon-earnings-195332083.html'],
 ['GOOG',
  'Google parent company reported revenue of $74.6 billion. YouTube ad revenues also topped estimates',
  'POSITIVE',
  0.9964740872383118,
  'https://finance.yahoo.com/news/alphabet-earnings-july-25-153933519.html'],
 ['GOOG',
  'Google’s chatbot is now available in more than 40 languages. New features include audio responses, Pinned conversations',
  'POSITIVE',
  0.9792576432228088,
  'https://finance.yahoo.com/news/alphabet-stock-rises-4-after-google-rolls-out-new-bard-features-international-expansion-070139824.html'],
 ['GOOG',
  "AFP sues Musk's X social media platform, accusing it of neglecting. Regulatory Hurdles: Agence France-Presse

In [87]:
import csv
# Write UTF-8 explicitly: the summaries contain non-ASCII punctuation (curly
# quotes), which the platform's default encoding may not be able to encode.
with open('summaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(output)