# Yahoo! News Scraper
Scrape news articles from Yahoo! News that match a specific search query.

In [None]:
import csv
import re
from time import sleep
from urllib.parse import quote_plus, unquote

import requests
from bs4 import BeautifulSoup

# Browser-like headers passed to every requests.get call in this file.
headers = {
    'accept': '*/*',
    # Only advertise encodings that requests can always decode. 'br' (Brotli)
    # is decoded only when an optional brotli package is installed; without it
    # a Brotli-compressed body would reach the HTML parser still compressed.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

def get_article(card):
    """Extract article information from the raw html.

    Args:
        card: One search-result element exposing ``find`` (the caller passes
            the ``NewsArticle`` divs found by BeautifulSoup).

    Returns:
        tuple: ``(headline, source, posted, description, clean_link)``.
        ``clean_link`` is the target URL pulled out of the redirect link, or
        the raw ``href`` when the redirect pattern is not present.

    Raises:
        Exception: Any error hit while reading the card (for example a
            missing element, which surfaces as ``AttributeError``) is printed
            and re-raised so the caller can decide to skip the card.
    """
    try:
        headline = card.find('h4', 's-title').text
        source = card.find('span', 's-source').text
        # The timestamp text carries a middle-dot separator (U+00B7). The
        # previous literal was the mis-decoded two-character form of it, so
        # the real dot was never removed. Strip both forms; escapes keep this
        # independent of how the file itself is encoded.
        posted = (
            card.find('span', 's-time').text
            .replace('\u00c2\u00b7', '')
            .replace('\u00b7', '')
            .strip()
        )
        description = card.find('p', 's-desc').text.strip()
        raw_link = card.find('a').get('href')
        print(f"DEBUG: Raw link: {raw_link}")
        # Percent-decode first so the embedded target URL is readable.
        unquoted_link = unquote(raw_link)
        print(f"DEBUG: Unquoted link: {unquoted_link}")
        # The target URL sits between the "RU=" and "/RK" markers.
        match = re.search(r'RU=(.+)/RK', unquoted_link)
        if match:
            clean_link = match.group(1)
            print(f"DEBUG: Clean link: {clean_link}")
        else:
            print("DEBUG: No match found for link pattern. Using raw link")
            clean_link = raw_link

        article = (headline, source, posted, description, clean_link)
        print(f"DEBUG: Extracted article -> {article}")
        return article
    except Exception as e:
        print(f"DEBUG: Exception encountered in get_article: {e}")
        raise

def get_the_news(search, max_articles=None, timeout=10):
    """Run the main program with an optional limit on the number of articles to scrape.

    Fetches Yahoo! News search result pages for ``search``, follows the
    "next" link from page to page, and collects one tuple per unique article
    link. The collected rows are written to ``results.csv`` when at least one
    article was scraped.

    Args:
        search: Raw (not yet URL-encoded) search text.
        max_articles: Stop once this many articles have been collected. A
            falsy value (``None`` or ``0``) means no limit.
        timeout: Seconds to wait for each result page before giving up.

    Returns:
        list: The article tuples produced by ``get_article``, in scrape order.
    """
    template = 'https://news.search.yahoo.com/search?p={}'
    # Percent-encode the query so spaces and reserved characters such as '&'
    # or '#' stay inside the ``p`` parameter instead of splitting the URL.
    url = template.format(quote_plus(str(search)))
    print(f"DEBUG: Starting URL: {url}")
    articles = []
    links = set()

    while True:
        try:
            # Without a timeout a stalled server would block forever.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # Stop paginating but fall through to the CSV step so articles
            # scraped from earlier pages are not lost.
            print(f"DEBUG: Request raised {e!r} for URL: {url}")
            break
        if not response.ok:
            print(f"DEBUG: Request failed with status {response.status_code} for URL: {url}")
            break

        print(f"DEBUG: Fetching URL: {url}")
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'NewsArticle')
        print(f"DEBUG: Found {len(cards)} news cards")

        # extract articles from page
        for card in cards:
            try:
                article = get_article(card)
            except Exception as e:
                # A malformed card is skipped; the rest of the page still counts.
                print(f"DEBUG: Error processing a card: {e}")
                continue

            # The link is the de-duplication key (last field for the
            # five-field tuples; see get_article for the exact layout).
            link = article[-1]
            if link in links:
                continue
            links.add(link)
            articles.append(article)
            print(f"DEBUG: Total articles scraped: {len(articles)}")
            # Stop if max_articles limit is reached
            if max_articles and len(articles) >= max_articles:
                print(f"DEBUG: Reached max_articles limit of {max_articles}. Stopping.")
                break

        # Stop if max_articles limit is reached
        if max_articles and len(articles) >= max_articles:
            break

        # find the next page
        next_page = soup.find('a', 'next')
        # An anchor without an href is treated the same as no next page.
        next_url = next_page.get('href') if next_page else None
        if next_url:
            url = next_url
            print(f"DEBUG: Next page URL: {url}")
            # Pause between pages to avoid hammering the server.
            sleep(1)
        else:
            print("DEBUG: No next page found. Exiting loop.")
            break

    # save article data
    if articles:  # Ensure articles exist before writing to CSV
        with open('results.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Headline', 'Source', 'Posted', 'Description', 'Link', 'Full Content'])
            writer.writerows(articles)
            print("DEBUG: CSV file saved with scraped articles.")

    return articles

In [None]:
def get_article(card, timeout=10):
    """Extract article information from the raw html, including the article body.

    Args:
        card: One search-result element exposing ``find`` (the caller passes
            the ``NewsArticle`` divs found by BeautifulSoup).
        timeout: Seconds to wait for the article page before giving up.

    Returns:
        tuple: ``(headline, source, posted, description, clean_link,
        full_content)``. ``full_content`` is the stripped text of the page's
        ``main-content`` div, or a placeholder string when the page could not
        be fetched or that div is absent.

    Raises:
        Exception: Any error hit while reading the card itself (for example a
            missing element) is printed and re-raised so the caller can decide
            to skip the card. Failures while fetching the article page do not
            raise.
    """
    try:
        headline = card.find('h4', 's-title').text
        source = card.find('span', 's-source').text
        # The timestamp text carries a middle-dot separator (U+00B7). The
        # previous literal was the mis-decoded two-character form of it, so
        # the real dot was never removed. Strip both forms; escapes keep this
        # independent of how the file itself is encoded.
        posted = (
            card.find('span', 's-time').text
            .replace('\u00c2\u00b7', '')
            .replace('\u00b7', '')
            .strip()
        )
        description = card.find('p', 's-desc').text.strip()
        raw_link = card.find('a').get('href')
        print(f"DEBUG: Raw link: {raw_link}")
        # Percent-decode first so the embedded target URL is readable.
        unquoted_link = unquote(raw_link)
        print(f"DEBUG: Unquoted link: {unquoted_link}")
        # The target URL sits between the "RU=" and "/RK" markers.
        match = re.search(r'RU=(.+)/RK', unquoted_link)
        if match:
            clean_link = match.group(1)
            print(f"DEBUG: Clean link: {clean_link}")
        else:
            print("DEBUG: No match found for link pattern. Using raw link")
            clean_link = raw_link

        # Fetch full article content. This step is deliberately best-effort:
        # any failure is recorded as a placeholder instead of dropping the
        # article, hence the broad except below.
        try:
            # Without a timeout a stalled article server would block forever.
            article_response = requests.get(clean_link, headers=headers, timeout=timeout)
            if article_response.ok:
                article_soup = BeautifulSoup(article_response.text, 'html.parser')
                # Update the selector based on the actual structure of the article pages
                content_node = article_soup.find('div', {'class': 'main-content'})  # Example selector
                if content_node is None:
                    # Same fallback value as any other failure, but with a
                    # message that names the real cause.
                    full_content = "Error fetching article content"
                    print(f"DEBUG: No 'main-content' container found for URL: {clean_link}")
                else:
                    full_content = content_node.text.strip()
                    print(f"DEBUG: Full content extracted -> {full_content[:100]}...")  # Print first 100 characters
            else:
                full_content = "Failed to fetch article content"
                print(f"DEBUG: Failed to fetch article content for URL: {clean_link}")
        except Exception as e:
            full_content = "Error fetching article content"
            print(f"DEBUG: Exception fetching article content: {e}")

        article = (headline, source, posted, description, clean_link, full_content)
        print(f"DEBUG: Extracted article -> {article}")
        return article
    except Exception as e:
        print(f"DEBUG: Exception encountered in get_article: {e}")
        raise

In [None]:
# Run the scraper for the query 'politics latest', stopping after 4 articles.
# get_the_news also writes the collected rows to results.csv.
articles = get_the_news('politics latest',max_articles=4)

# Show the first 4 records, one labelled field per line.
for article in articles[:4]:
    print("Headline:", article[0])
    print("Source:", article[1])
    print("Posted:", article[2])
    print("Description:", article[3])
    print("Link:", article[4])
    # Index 5 exists only on the six-field tuples returned by the
    # content-fetching get_article definition.
    print("Full Content:", article[5][:200])  # Print the first 200 characters of the full content
    print("===")

In [None]:
from transformers import GPT2Tokenizer

# Initialize the pretrained "gpt2" tokenizer once at module level so that
# count_tokens can reuse it on every call.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def count_tokens(text):
    """Return how many tokens the module-level tokenizer produces for ``text``."""
    return len(tokenizer.encode(text))

# Example: Count tokens for the first 4 articles and print each count next to
# its headline.
for article in articles[:4]:
    # Requires the six-field tuples from the content-fetching get_article.
    full_content = article[5]  # Full content is the 6th element in the tuple
    token_count = count_tokens(full_content)
    print(f"Headline: {article[0]}")
    print(f"Token Count: {token_count}")
    print("===")