# Yahoo! News Scraper
Scrape news from Yahoo! based on a specific search criterion.

In [None]:
import csv
import re
from time import sleep
from urllib.parse import quote_plus, unquote

import requests
from bs4 import BeautifulSoup

# Request headers sent with every page fetch; they mimic a desktop browser.
headers = {
    'accept': '*/*',
    # 'br' (Brotli) is deliberately not advertised: requests can only decode
    # Brotli bodies when an optional brotli package is installed, and without
    # it the response text would be undecoded bytes that contain no cards.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

# Pulls the target URL out of a link shaped like "...RU=<target>/RK...".
# Compiled once at import time instead of on every call.
_REDIRECT_TARGET_RE = re.compile(r'RU=(.+)/RK')


def get_article(card):
    """Extract article information from the raw html.

    Args:
        card: A parsed result card exposing ``find(name, css_class)``; the
            caller in this file passes BeautifulSoup ``NewsArticle`` divs.

    Returns:
        A ``(headline, source, posted, description, link)`` tuple. ``link``
        is the text captured between ``RU=`` and ``/RK`` in the percent-decoded
        ``href`` of the card's first anchor, or the raw ``href`` when that
        pattern is absent.

    Raises:
        Exception: Any error hit while reading the card (for example
            ``AttributeError`` when an expected element is missing) is
            printed and then re-raised unchanged.
    """
    try:
        headline = card.find('h4', 's-title').text
        source = card.find("span", 's-source').text
        # The timestamp text carries a leading middle-dot separator.
        posted = card.find('span', 's-time').text.replace('·', '').strip()
        description = card.find('p', 's-desc').text.strip()
        raw_link = card.find('a').get('href')
        print(f"DEBUG: Raw link: {raw_link}")
        # Standard-library decoder; decode first so the captured target is a
        # readable URL rather than a percent-encoded one.
        unquoted_link = unquote(raw_link)
        print(f"DEBUG: Unquoted link: {unquoted_link}")
        match = _REDIRECT_TARGET_RE.search(unquoted_link)
        if match:
            clean_link = match.group(1)
            print(f"DEBUG: Clean link: {clean_link}")
        else:
            print("DEBUG: No match found for link pattern. Using raw link")
            clean_link = raw_link

        article = (headline, source, posted, description, clean_link)
        print(f"DEBUG: Extracted article -> {article}")
        return article
    except Exception as e:
        # Report here, but let the caller decide whether to skip the card.
        print(f"DEBUG: Exception encountered in get_article: {e}")
        raise

def get_the_news(search, *, output_path='results.csv', timeout=10):
    """Scrape Yahoo! News search results and save them to a CSV file.

    Follows the "next" link from page to page until there is none, a request
    fails, or a non-OK status is returned. Whatever was collected up to that
    point is still written out and returned.

    Args:
        search: Plain-text search term. It is URL-encoded here, so pass it
            unencoded (e.g. ``'iphone 12'``, not ``'iphone+12'``).
        output_path: File the results are written to; an existing file is
            overwritten.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of ``(headline, source, posted, description, link)`` tuples,
        de-duplicated by link, in the order they were first seen.
    """
    template = 'https://news.search.yahoo.com/search?p={}'
    # Encode the term so characters such as '&', '#' or '+' stay inside the
    # `p` parameter instead of altering the query string.
    url = template.format(quote_plus(search))
    print(f"DEBUG: Starting URL: {url}")
    articles = []
    links = set()

    while True:
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # Treat a transport failure like a bad status: stop paging but
            # keep (and save) everything scraped so far.
            print(f"DEBUG: Request raised {e!r} for URL: {url}")
            break
        if not response.ok:
            print(f"DEBUG: Request failed with status {response.status_code} for URL: {url}")
            break

        print(f"DEBUG: Fetching URL: {url}")
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'NewsArticle')
        print(f"DEBUG: Found {len(cards)} news cards")

        # extract articles from page
        for card in cards:
            try:
                article = get_article(card)
            except Exception as e:
                # Best effort: one malformed card must not abort the scrape.
                print(f"DEBUG: Error processing a card: {e}")
                continue
            link = article[-1]
            if link not in links:
                links.add(link)
                articles.append(article)

        # find the next page; an anchor without an href counts as "no next"
        next_page = soup.find('a', 'next')
        next_url = next_page.get('href') if next_page else None
        if not next_url:
            print("DEBUG: No next page found. Exiting loop.")
            break
        url = next_url
        print(f"DEBUG: Next page URL: {url}")
        sleep(1)  # pause between page requests

    # save article data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Headline', 'Source', 'Posted', 'Description', 'Link'])
        writer.writerows(articles)
        print("DEBUG: CSV file saved with scraped articles.")

    return articles

In [None]:
# Run the main program: scrape Yahoo! News results for the term 'iphone'.
# The call prints DEBUG progress lines, writes the results to results.csv and
# returns the list of article tuples kept here for inspection.
articles = get_the_news('iphone')

In [None]:
# preview the first four scraped records
articles[:4]