In [4]:
import sys
import time
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
# When a browser makes a request to a website, it sends basic information about itself.
# Sending the same kind of headers makes the scraper look less like a bot.
request_headers = {
    'accept': '*/*',
    # Only advertise encodings that requests can always decode. Brotli ('br')
    # bodies are decoded only when an optional Brotli library is installed;
    # without one the HTML parser would receive still-compressed bytes.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

In [6]:
# From a Yahoo Finance search, we can pull URLs to financial/business news for a search term
def make_search_query(search_term, timeout=10):
    """Collect news-article links from the Yahoo Finance quote page of a ticker.

    Args:
        search_term: Ticker symbol (e.g. "NVDA") interpolated into the quote URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute article URLs, one per ``<a class="js-content-viewer">``
        anchor found on the page, in page order. Empty when the page contains
        no such anchors.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    base_url = 'https://finance.yahoo.com'
    url = f'{base_url}/quote/{search_term}?p={search_term}'

    # The headers must be passed by keyword: the second positional argument of
    # requests.get is `params`, which would append them to the query string
    # instead of sending them as HTTP headers.
    response = requests.get(url, headers=request_headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    tags = soup.find_all('a', {'class': "js-content-viewer"})

    # urljoin resolves site-relative hrefs against the base URL and leaves
    # hrefs that are already absolute untouched.
    return [urljoin(base_url, tag['href']) for tag in tags]

In [7]:
# Ticker symbol to look up; it is interpolated into the Yahoo Finance quote URL
# and reused in the name of the results CSV.
search_term = "NVDA"
# Collect the article links found on the quote page for this ticker.
article_urls = make_search_query(search_term)
print(article_urls)

['https://finance.yahoo.com/m/5ce34a5c-c296-33f4-a379-70567930a6db/nvidia-s-stock-offering-a.html', 'https://finance.yahoo.com/m/c35c3145-db26-3f8c-953e-3e195ec0f295/intel-s-roadmap-changes-a.html', 'https://finance.yahoo.com/m/e55ff48d-9785-34d2-a29b-ceb1ca745495/why-super-micro-computer.html', 'https://finance.yahoo.com/m/684c930d-1475-3adc-b013-ae604c056ebc/artificial-intelligence.html', 'https://finance.yahoo.com/m/2c245944-1827-365c-91cf-fe92143a279e/investors-cash-in-on-12.html', 'https://finance.yahoo.com/news/buy-nvidia-corporation-nvda-now-113025502.html', 'https://finance.yahoo.com/m/36466dfa-a386-3fec-aba1-ebc0e331ca3e/as-chip-sales-dry-up-nvidia.html', 'https://finance.yahoo.com/m/36680d0a-3391-3319-9f7b-8cd3c5718783/tech-execs-didn%E2%80%99t-just-start.html', 'https://finance.yahoo.com/m/765dc080-a51c-303c-846f-a6bcf68867ae/this-solar-ipo-nvidia-uber.html', 'https://finance.yahoo.com/m/e5ffda0e-2346-3d8c-82cb-51c76f707f08/why-did-nvidia-stock-climb.html']


In [8]:
# On Yahoo Finance, our search results contain 3 types of results:
# 1 - videos: videos contain "video" in the url, so we can reject all videos
# 2 - redirects: redirects take you to a portion of the article and have a link to the full article. We want to pull the link to the full article
# 3 - articles: the article is hosted directly on Yahoo Finance and no further work is needed
def filter_articles(article_urls, timeout=10):
    """Reduce Yahoo Finance search results to URLs of readable articles.

    URLs containing "video" are dropped. URLs containing "/m/" are fetched and
    replaced by the href of the first ``<a class="link caas-button">`` anchor
    on that page. Every other URL is passed through unchanged.

    Args:
        article_urls: Iterable of article URLs, e.g. from ``make_search_query``.
        timeout: Seconds to wait for each redirect page before giving up.

    Returns:
        A list of URLs in input order, without the rejected video links. When a
        "/m/" page cannot be downloaded or has no usable link, the original
        "/m/" URL is kept so that one bad page does not abort the whole batch.
    """
    filtered_urls = []
    for url in article_urls:
        # NOTE(review): plain substring match, so an article whose slug merely
        # contains "video" is rejected too - confirm this is acceptable.
        if "video" in url:
            continue
        if "/m/" in url:
            try:
                # Headers go by keyword: the second positional argument of
                # requests.get is `params` (query string), not headers.
                response = requests.get(url, headers=request_headers, timeout=timeout)
                soup = BeautifulSoup(response.content, 'html.parser')
                tags = soup.find_all('a', {'class': "link caas-button"})
                url = tags[0]['href']
            except (requests.RequestException, IndexError, KeyError) as error:
                # `url` still holds the original "/m/" address at this point.
                print(f'Could not resolve redirect {url}: {error!r}', file=sys.stderr)
        filtered_urls.append(url)
    return filtered_urls

In [9]:
# Drop video links and swap Yahoo "/m/" redirect pages for the full-article links they point to.
filtered_urls = filter_articles(article_urls)
print(filtered_urls)


['https://www.fool.com/investing/2023/03/07/nvidias-stock-offering-a-sign-of-confidence-or-des/?source=eptyholnk0000202&utm_source=yahoo-host&utm_medium=feed&utm_campaign=article', 'https://www.fool.com/investing/2023/03/07/intels-roadmap-changes-a-sign-of-desperation-or-a/?source=eptyholnk0000202&utm_source=yahoo-host&utm_medium=feed&utm_campaign=article', 'https://www.fool.com/investing/2023/03/07/why-super-micro-computer-gained-354-in-february/?source=eptyholnk0000202&utm_source=yahoo-host&utm_medium=feed&utm_campaign=article', 'https://www.investors.com/news/technology/artificial-intelligence-stocks/?src=A00220', 'https://www.investors.com/etfs-and-funds/sectors/sp500-investors-cash-in-on-triple-play-stocks-that-can-do-no-wrong/?src=A00220', 'https://finance.yahoo.com/news/buy-nvidia-corporation-nvda-now-113025502.html', 'https://www.marketwatch.com/story/as-chip-sales-dry-up-nvidia-cfo-says-spending-on-ai-will-save-companies-money-d94c504b?siteid=yhoof2', 'https://www.marketwatch.

In [20]:
# now that we have usable urls, we can gather the info we want from each article
def explore_articles(article_urls, timeout=10):
    """Download each article and extract its title, publish date and text.

    Args:
        article_urls: Iterable of article URLs to download.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A list with one dict per input URL, in input order, with the keys:
          - 'Title': text of the first ``<h1>``, or None if the page has none.
          - 'Publish Date': ``datetime`` attribute of the first ``<time>`` tag,
            or None if the tag or the attribute is missing.
          - 'Content': concatenated text of every ``<p>`` tag ('' if none).
        A page that cannot be downloaded yields a dict whose values are all
        None, so the result stays aligned with ``article_urls``.
    """
    articles = []
    for url in article_urls:
        try:
            # Headers go by keyword: the second positional argument of
            # requests.get is `params` (query string), not headers.
            response = requests.get(url, headers=request_headers, timeout=timeout)
        except requests.RequestException as error:
            print(f'Could not download {url}: {error!r}', file=sys.stderr)
            articles.append({'Title': None, 'Publish Date': None, 'Content': None})
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.find returns None when the tag is absent; test for that
        # explicitly instead of hiding every possible error behind a bare except.
        heading = soup.find('h1')
        title = heading.get_text() if heading is not None else None

        time_tag = soup.find('time')
        publish_date = time_tag.get('datetime') if time_tag is not None else None

        content = ''.join(paragraph.get_text() for paragraph in soup.find_all('p'))

        articles.append({'Title': title, 'Publish Date': publish_date, 'Content': content})
    return articles
        

In [21]:
# Download every filtered URL and extract its title, publish date and paragraph text.
articles = explore_articles(filtered_urls)
print(articles)

[{'Title': "Nvidia's Stock Offering -- a Sign of Confidence or Desperation?", 'Publish Date': None, 'Content': "Founded in 1993 by brothers Tom and David Gardner, The Motley Fool helps millions of people attain financial freedom through our website, podcasts, books, newspaper column, radio show, and premium investing services.Founded in 1993 by brothers Tom and David Gardner, The Motley Fool helps millions of people attain financial freedom through our website, podcasts, books, newspaper column, radio show, and premium investing services.You’re reading a free article with opinions that may differ from The Motley Fool’s Premium Investing Services. Become a Motley Fool member today to get instant access to our top analyst recommendations, in-depth research, investing resources, and more. Learn MoreNvidia's\xa0(NVDA -1.11%) stock price has increased over 60% year to date and it has been one of the best-performing semiconductor stocks to start the year off. In late February, there was an S

In [22]:
# Persist the scraped articles as a CSV whose name carries the ticker and the
# current local time, so runs made at different times get separate files.
df = pd.DataFrame(articles)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = f'results_{search_term}_{timestamp}.csv'
df.to_csv(output_path, index=False)