In [1]:
import sys
import time
from datetime import datetime
from urllib.parse import quote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Every browser request carries a handful of headers describing the client.
# These values mimic a desktop Edge/Chrome session and are meant to accompany
# each request so the scraper looks less like a bot.
request_headers = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9",
    "referer": "https://www.google.com",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44",
}

In [3]:
def make_search_query(search_term):
    """Collect links to financial/business news from a Yahoo Finance quote page.

    Parameters
    ----------
    search_term : str
        Ticker symbol (e.g. ``"NVDA"``) substituted into the quote-page URL.

    Returns
    -------
    list of str
        One absolute URL per ``<a class="js-content-viewer">`` anchor that has
        an ``href``, in the order the anchors appear on the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (connection failure, timeout, ...).
    """
    # Percent-encode the symbol so characters such as "^" or "=" remain valid
    # inside the URL path and query string.
    symbol = quote(str(search_term), safe='')
    url = f'https://finance.yahoo.com/quote/{symbol}?p={symbol}'

    # ``headers=`` must be passed by keyword: the second positional argument of
    # requests.get is ``params``, which would append the header values to the
    # query string instead of sending them as HTTP headers.
    # The timeout (seconds) keeps a stalled connection from hanging the cell.
    # NOTE(review): request_headers advertises "br"; requests can only decode
    # Brotli responses when a Brotli library is installed -- confirm the
    # environment or drop "br" from accept-encoding.
    response = requests.get(url, headers=request_headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    tags = soup.find_all('a', {'class': "js-content-viewer"})

    # urljoin resolves site-relative hrefs against the Yahoo Finance origin and
    # leaves already-absolute hrefs untouched; anchors without href are skipped.
    return [
        urljoin("https://finance.yahoo.com", tag['href'])
        for tag in tags
        if tag.get('href')
    ]

In [4]:
# Ticker to look up; the resulting links are echoed one per line.
search_term = "NVDA"
article_urls = make_search_query(search_term)
if article_urls:
    print("\n".join(article_urls))

https://finance.yahoo.com/video/stocks-trade-higher-open-investors-134201927.html
https://finance.yahoo.com/video/nvidia-ceo-jensen-huang-weighs-214045339.html
https://finance.yahoo.com/video/nvidia-ceo-breaks-down-omniverse-212732853.html
https://finance.yahoo.com/video/today-top-headlines-fed-raises-210523344.html
https://finance.yahoo.com/video/nvidia-ai-omniverse-channels-help-210316770.html
https://finance.yahoo.com/m/26c770ad-a814-3731-b9d1-5c660c3fc239/how-to-invest-nvidia-tesla.html
https://finance.yahoo.com/m/168e6916-f6d5-3fca-8fd3-9d179217d985/top-10-s-p-500-stocks-by.html
https://finance.yahoo.com/m/e46279a8-4a3a-310d-b1e9-86d9a7ac606e/nvidia-stock-scores.html
https://finance.yahoo.com/news/medtronic-mdt-enhance-ai-innovation-164804636.html
https://finance.yahoo.com/m/4ef47c48-f44e-3bbf-ab60-6d03f021c62e/nvidia-stock-option-trade.html
https://finance.yahoo.com/m/63578db3-6ebd-3b47-8955-da66f41abda5/dow-jones-rises-300-points.html
https://finance.yahoo.com/m/0c868bdc-3fae-38

In [5]:
def _fetch_soup(url):
    """Download ``url`` with the browser-like headers and return the parsed HTML."""
    # ``headers=`` must be passed by keyword: the second positional argument of
    # requests.get is ``params``, which would append the header values to the
    # query string instead of sending them as HTTP headers.
    # The timeout (seconds) keeps one stalled site from hanging the whole run.
    # NOTE(review): request_headers advertises "br"; requests can only decode
    # Brotli responses when a Brotli library is installed -- confirm.
    response = requests.get(url, headers=request_headers, timeout=30)
    return BeautifulSoup(response.content, 'html.parser')


def explore_articles(article_urls):
    """Scrape metadata and body text for each article link.

    The search results contain three kinds of links:

    1. Videos: the URL has a ``/video/`` path segment; these are skipped.
    2. Redirects: the URL contains ``/m/``. The Yahoo Finance page shows only
       part of the article plus a link to the full one. Metadata is scraped
       from the Yahoo Finance page, then the linked page is fetched and its
       paragraphs are used as the content.
    3. Articles: hosted directly on Yahoo Finance; no further work is needed.

    Parameters
    ----------
    article_urls : iterable of str
        Links such as those returned by ``make_search_query``.

    Returns
    -------
    list of dict
        One record per successfully scraped article with the keys ``'Title'``,
        ``'Publish Date'``, ``'Source'``, ``'Keywords'`` and ``'Content'``.
        Links that fail to download or parse are reported on stderr and
        omitted, so every element has the same shape.
    """
    articles = []
    for url in article_urls:
        try:
            # Match the path segment rather than the bare word, so an article
            # whose slug merely contains "video" is not discarded.
            if "/video/" in url:
                continue

            soup = _fetch_soup(url)

            # Second <h1> on the page; presumably the first belongs to the
            # site chrome -- verify against the current page layout.
            title = soup.find_all('h1')[1].getText()
            publish_date = soup.find('time').get('datetime')
            keywords = soup.find('meta', attrs={'name': 'news_keywords'})['content']

            source = url
            if "/m/" in url:
                # Redirect page: follow its button link and take the body text
                # from the linked page instead.
                tags = soup.find_all('a', {'class': "link caas-button"})
                source = tags[0]['href']
                soup = _fetch_soup(source)

            content = ''.join(paragraph.get_text() for paragraph in soup.find_all('p'))

            articles.append({'Title': title, 'Publish Date': publish_date, 'Source': source, 'Keywords': keywords, 'Content': content})
        except Exception as error:
            # Best-effort scraping: report the real failure and move on, keeping
            # the result a homogeneous list of dicts.
            print(f"Skipping {url!r}: {type(error).__name__}: {error}", file=sys.stderr)

    return articles

In [6]:
# Scrape every collected link, then stamp the run time (taken after scraping)
# into the CSV file name so repeated runs never overwrite each other.
articles = explore_articles(article_urls)
timestamp = format(datetime.now(), "%Y-%m-%d_%H-%M-%S")
df = pd.DataFrame(data=articles)
df.to_csv(
    f'results_{search_term}_{timestamp}.csv',
    index=False,
)
df.head()

Unnamed: 0,Title,Publish Date,Source,Keywords,Content
0,"How To Invest: Nvidia, Tesla Reveal 3-Step Rou...",2023-03-23T19:07:14.000Z,https://www.investors.com/how-to-invest/invest...,"tech stocks, growth stocks, stock market, stoc...",\n Access to this page has been...
1,Top 10 S&P 500 Stocks by Index Weight,2023-03-23T19:23:39.000Z,https://www.investopedia.com/top-10-s-and-p-50...,"Index Weighting, market cap, net income, Stand...","\nSee which stocks fell in the rankings, and w..."
2,Nvidia Stock Scores Successful Breakout Amid I...,2023-03-23T17:59:02.000Z,https://www.investors.com/news/nvidia-stock-20...,"Nvidia Stock, Nvidia, NVDA stock, artificial i...",\n Access to this page has been...
3,Medtronic (MDT) to Enhance AI Innovation With ...,2023-03-23T16:48:04.000Z,https://finance.yahoo.com/news/medtronic-mdt-e...,"Medtronic, Zacks Investment Research, GI Geniu...",Detailed updates on the high-stakes House Ener...
4,Nvidia Stock Option Trade Benefits From Neutra...,2023-03-23T15:17:00.000Z,https://www.investors.com/research/options/nvi...,"Nvidia, calendar spread, implied volatility, c...",\n Access to this page has been...
