In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

In [2]:
def _paragraph_texts(container):
    """Return the non-empty, whitespace-normalized text of every <p> under `container`."""
    texts = []
    for paragraph in container.find_all('p'):
        # get_text(strip=True) strips each text node and joins them with '',
        # which glues words across inline tags ("Apple <a>Inc.</a>" -> "AppleInc.").
        # Taking the raw text and collapsing whitespace keeps the original spacing.
        text = re.sub(r'\s+', ' ', paragraph.get_text()).strip()
        if text:
            texts.append(text)
    return texts


def scrape_article_content(url, timeout=10, max_chars=10000):
    """
    Scrape the main text content of an article from a given URL.

    Paragraph text is looked up in three places, in order, stopping at the
    first one that yields any text:
      1. the first <article> element,
      2. the first <main>/<div> whose class matches content|article|body,
      3. every <p> in the document.

    Args:
        url: Address of the article. Anything that is not a non-blank string
            (None, NaN from pandas, '') is skipped without a network request.
        timeout: Seconds passed to requests.get as the request timeout.
        max_chars: Maximum length of the returned text; longer content is
            truncated. None disables truncation.

    Returns:
        The paragraphs joined by single spaces with whitespace collapsed,
        or '' if the URL is invalid, the request fails, parsing fails, or no
        paragraph text is found. Errors are printed, never raised.
    """
    # Missing URLs arrive from pandas as NaN (a float); reject them up front
    # rather than letting them be stringified into a bogus request.
    if not isinstance(url, str) or not url.strip():
        print(f"Skipping invalid URL: {url!r}")
        return ''

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Send HTTP request
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise exception for bad status codes

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove scripts, styles, and other non-content elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()

        # Look for article-specific tags first
        content = []
        article = soup.find('article')
        if article:
            content = _paragraph_texts(article)

        # Fallback to main content divs
        if not content:
            main_content = soup.find(['main', 'div'], class_=re.compile('content|article|body', re.I))
            if main_content:
                content = _paragraph_texts(main_content)

        # Fallback to all paragraphs if no main content found
        if not content:
            content = _paragraph_texts(soup)

        if content:
            # Each entry is already whitespace-normalized, so a plain join suffices.
            return ' '.join(content)[:max_chars]

        print(f"No content found for {url}")
        return ''

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ''
    except Exception as e:
        # Best-effort scraper: a page that cannot be parsed must not abort a batch run.
        print(f"Error parsing {url}: {e}")
        return ''

In [3]:
def process_news_data(input_csv, output_csv, delay=1):
    """
    Read a news CSV, scrape the article text behind each URL, and write the
    result with an added Content column to output_csv.

    The input must contain the columns Ticker, Date, Title, Description,
    Source and URL. The output contains exactly those columns, in that order,
    followed by Content; any other input columns are dropped.

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path of the CSV to write.
        delay: Seconds to sleep after each request, to avoid overwhelming servers.

    Returns:
        None. Progress, errors and a summary are printed. If the input cannot
        be read or lacks a required column, an error is printed and nothing
        is scraped or written.
    """
    required_columns = ['Ticker', 'Date', 'Title', 'Description', 'Source', 'URL']

    # Read input CSV
    try:
        df = pd.read_csv(input_csv)
        print(f"Loaded {len(df)} articles from {input_csv}")
    except FileNotFoundError:
        print(f"Error: {input_csv} not found.")
        return
    except Exception as e:
        print(f"Error reading {input_csv}: {e}")
        return

    # Fail fast: discovering a missing column only after every article has
    # been fetched would throw away the whole (slow) scraping run.
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        print(f"Error: {input_csv} is missing required columns: {missing}")
        return

    # Scrape content for each URL
    total = len(df)
    contents = []
    for position, url in enumerate(df['URL'], start=1):
        print(f"Scraping article {position}/{total}: {url}")
        contents.append(scrape_article_content(url))
        time.sleep(delay)  # Delay to avoid overwhelming servers

    # Keep the expected columns in their fixed order and append the scraped text
    df = df[required_columns].copy()
    df['Content'] = contents

    # Save to output CSV
    try:
        df.to_csv(output_csv, index=False)
        print(f"Saved updated data with Content column to {output_csv}")
        print("Sample of new data:")
        print(df[['Title', 'Source', 'Content']].head())
    except Exception as e:
        print(f"Error saving {output_csv}: {e}")

    # Summary: an empty string marks an article whose scrape produced nothing
    successful = int((df['Content'] != '').sum())
    print(f"\nSummary: Successfully scraped content for {successful}/{len(df)} articles")

In [4]:
# Input and output are the same path, so this run rewrites news_data.csv
# in place with the added Content column.
input_csv = output_csv = '../data/news_data.csv'

process_news_data(input_csv=input_csv, output_csv=output_csv)

Loaded 200 articles from ../data/news_data.csv
Scraping article 1/200: https://www.etfdailynews.com/2025/05/04/apple-inc-nasdaqaapl-shares-sold-by-brown-financial-advisors/
Scraping article 2/200: https://www.etfdailynews.com/2025/05/04/apple-inc-nasdaqaapl-shares-bought-by-wrapmanager-inc/
Scraping article 3/200: https://www.etfdailynews.com/2025/05/04/hendley-co-inc-has-17-67-million-holdings-in-apple-inc-nasdaqaapl/
Scraping article 4/200: https://www.etfdailynews.com/2025/05/04/apple-inc-nasdaqaapl-is-mengis-capital-management-inc-s-largest-position/
Scraping article 5/200: https://www.etfdailynews.com/2025/05/04/pine-valley-investments-ltd-liability-co-has-50-77-million-stake-in-apple-inc-nasdaqaapl/
Scraping article 6/200: https://www.etfdailynews.com/2025/05/04/skylands-capital-llc-sells-18000-shares-of-apple-inc-nasdaqaapl/
Scraping article 7/200: https://www.etfdailynews.com/2025/05/04/impax-asset-management-group-plc-has-140-65-million-holdings-in-apple-inc-nasdaqaapl/
Scrapi