In [1]:
import sys
import time
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# A browser describes itself through request headers on every page load.
# Sending a comparable set makes the scraper look less like a bot.
request_headers = {
    'accept': '*/*',
    # 'br' is deliberately not offered: requests decodes Brotli only when an
    # optional decoder package is installed, and otherwise hands back the
    # compressed bytes, which then parse as garbage.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

In [3]:
def make_search_query(search_term, timeout=10):
    """Collect news-article links from the Yahoo Finance quote page of a ticker.

    Args:
        search_term: Ticker symbol to look up, e.g. "NVDA".
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        List of absolute article URLs in page order; empty when the page
        holds no matching links.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f'https://finance.yahoo.com/quote/{search_term}?p={search_term}'

    # The browser-like headers must go through the `headers=` keyword: the
    # second positional argument of requests.get is `params`, which would put
    # them in the query string instead.
    # 'accept-encoding' is left to requests, which only advertises encodings
    # it can decode; a hand-written 'br' comes back undecoded when no Brotli
    # decoder is installed.
    headers = {name: value for name, value in request_headers.items()
               if name.lower() != 'accept-encoding'}

    # Make a request and parse the search page
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    tags = soup.find_all('a', {'class': "js-content-viewer"})

    # Extract article links. urljoin leaves absolute hrefs untouched and
    # resolves relative ones against the site root; anchors without an href
    # are ignored rather than aborting the whole search.
    article_urls = []
    for tag in tags:
        href = tag.get('href')
        if href:
            article_urls.append(urljoin("https://finance.yahoo.com", href))

    return article_urls

In [4]:
# Ticker symbol to look up on Yahoo Finance.
search_term = "NVDA"
article_urls = make_search_query(search_term)

# List every collected link on its own line; an empty result prints nothing.
if article_urls:
    print(*article_urls, sep="\n")

https://finance.yahoo.com/m/8fed441d-da17-34cc-81e9-a8e5710e1075/microsoft-offers-to-keep.html
https://finance.yahoo.com/m/52df9dd5-5bc7-3e62-acc8-af54ae4e61a9/nvidia-stock-fomo-versus.html
https://finance.yahoo.com/m/cf71a10a-0828-33c9-99b7-d3b0b2e14bb8/3-things-about-amd-that-smart.html
https://finance.yahoo.com/m/c847bcfc-4b03-3ec1-bbdf-d5118606fc35/beyond-the-hype-examining.html
https://finance.yahoo.com/m/f3257ab8-25b9-3e95-bf18-1ff471a4c803/3-stocks-that-could-be-worth.html
https://finance.yahoo.com/m/753633bd-6faf-3dd1-acae-1d0a720ddb9f/look-beyond-ai-this-42.html
https://finance.yahoo.com/m/5ce34a5c-c296-33f4-a379-70567930a6db/nvidia-s-stock-offering-a.html
https://finance.yahoo.com/m/c35c3145-db26-3f8c-953e-3e195ec0f295/intel-s-roadmap-changes-a.html
https://finance.yahoo.com/m/e55ff48d-9785-34d2-a29b-ceb1ca745495/why-super-micro-computer.html
https://finance.yahoo.com/m/684c930d-1475-3adc-b013-ae604c056ebc/artificial-intelligence.html


In [5]:
def explore_articles(article_urls, timeout=10):
    """Scrape metadata and body text for each Yahoo Finance article link.

    The search results contain three kinds of links:
      1. videos    - "video" appears in the URL; these are skipped.
      2. redirects - "/m/" pages show part of the article plus a link to the
                     full one. Metadata is read from the Yahoo Finance page,
                     the body text from the linked article.
      3. articles  - hosted directly on Yahoo Finance; scraped as-is.

    Args:
        article_urls: Iterable of article URLs.
        timeout: Seconds to wait on each request before giving up.

    Returns:
        List of dicts with the keys 'Title', 'Publish Date', 'Source',
        'Keywords' and 'Content'. 'Source' is the URL the body text was read
        from. A link that cannot be scraped is reported on stderr and left
        out, so every entry is a complete record.
    """
    # The browser-like headers must go through the `headers=` keyword: the
    # second positional argument of requests.get is `params`, which would put
    # them in the query string instead.
    # 'accept-encoding' is left to requests, which only advertises encodings
    # it can decode; a hand-written 'br' comes back undecoded when no Brotli
    # decoder is installed.
    headers = {name: value for name, value in request_headers.items()
               if name.lower() != 'accept-encoding'}

    articles = []
    for page_url in article_urls:
        if "video" in page_url:
            continue

        try:
            response = requests.get(page_url, headers=headers, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')

            # The headline is taken from the second <h1>; presumably the first
            # one belongs to the page chrome - TODO confirm against live markup.
            title = soup.find_all('h1')[1].getText()
            publish_date = soup.find('time').get('datetime')
            keywords = soup.find('meta', attrs={'name': 'news_keywords'})['content']

            source_url = page_url
            if "/m/" in page_url:
                # Redirect page: follow its button to the full article and
                # read the body text from there.
                tags = soup.find_all('a', {'class': "link caas-button"})
                source_url = urljoin(page_url, tags[0]['href'])
                response = requests.get(source_url, headers=headers, timeout=timeout)
                soup = BeautifulSoup(response.content, 'html.parser')

            content = ''.join(paragraph.get_text() for paragraph in soup.find_all('p'))

            articles.append({'Title': title, 'Publish Date': publish_date, 'Source': source_url, 'Keywords':keywords, 'Content': content})
        except Exception as exc:
            # Best effort per link: one bad page must not abort the batch.
            # Report the actual failure, and keep plain strings out of the
            # record list so it still converts cleanly to a table.
            print(f"Skipping {page_url}: {type(exc).__name__}: {exc}", file=sys.stderr)

    return articles

In [6]:
# Scrape every collected link.
articles = explore_articles(article_urls)

# Stamp the file name with the ticker and the scrape time so repeated runs
# do not overwrite each other.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
csv_name = f'results_{search_term}_{timestamp}.csv'

# Tabulate, persist, and preview the first rows.
df = pd.DataFrame(articles)
df.to_csv(csv_name, index=False)
df.head()

Unnamed: 0,Title,Publish Date,Source,Keywords,Content
0,Microsoft offers to keep ‘Call of Duty’ on Pla...,2023-03-08T18:00:00.000Z,https://www.marketwatch.com/story/microsoft-of...,"Activision Blizzard, Microsoft Corp",Microsoft Corp. proposed a series of licenses ...
1,Nvidia Stock: FOMO Versus Fundamentals. What S...,2023-03-08T15:15:00.000Z,https://www.fool.com/investing/2023/03/08/nvid...,"Nvidia, stock trades",Founded in 1993 by brothers Tom and David Gard...
2,3 Things About AMD That Smart Investors Know,2023-03-08T15:00:00.000Z,https://www.fool.com/investing/2023/03/08/3-th...,"Nvidia, Advanced Micro Devices, Grand View Res...",Founded in 1993 by brothers Tom and David Gard...
3,Beyond the Hype -- Examining Nvidia's Biggest ...,2023-03-08T11:00:00.000Z,https://www.fool.com/investing/2023/03/08/beyo...,"Nvidia, manufacturing partners, semiconductor ...",Founded in 1993 by brothers Tom and David Gard...
4,3 Stocks That Could Be Worth More Than Tesla b...,2023-03-08T10:50:00.000Z,https://www.fool.com/investing/2023/03/08/stoc...,"Tesla, Nvidia, market cap",Founded in 1993 by brothers Tom and David Gard...
