In [4]:
import requests
from bs4 import BeautifulSoup
import csv
import json
import os
import re
from datetime import datetime
from urllib.parse import urljoin

In [5]:
BASE_URL = "https://www.thehindu.com/"
CATEGORY_URL = "https://www.thehindu.com/sport/"
CSV_FILENAME = "the_hindu_articles.csv"
OUTPUT_DIR = "article_metadata"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

In [None]:

def scrape_links_to_csv():
    
    
    try:
        response = requests.get(CATEGORY_URL, headers=HEADERS)
        response.raise_for_status()  
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the category page: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    
    
    article_links = set()
    

    for a_tag in soup.select('h3.title > a, .story-card-news > h3 > a, .latest-news-list > li > a'):
        href = a_tag.get('href')
        if href and href.startswith('https://www.thehindu.com/sport/'):
            
            if href.endswith('.ece'):
                 article_links.add(href)

    if not article_links:
        print("Could not find any article links. The website structure might have changed.")
        return

    print(f"Found {len(article_links)} unique article links.")

    # Writing the data to a CSV file
    with open(CSV_FILENAME, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Write header
        writer.writerow(['website', 'category', 'article_url'])
        # Write data rows
        for url in article_links:
            writer.writerow(['The Hindu', 'Sport', url])
            
    print(f"Successfully saved article links to '{CSV_FILENAME}'")
    print("-" * 50)


In [7]:
scrape_links_to_csv()

--- Starting Part 1: Scraping links from https://www.thehindu.com/sport/ ---
Found 9 unique article links.
Successfully saved article links to 'the_hindu_articles.csv'
--------------------------------------------------


In [None]:

def sanitize_filename(name):
    name = re.sub(r'[\\/*?:"<>|]', "", name)
    name = name.replace(' ', '_')
    return name[:100]

In [None]:
def scrape_metadata_from_csv():

    if not os.path.exists(CSV_FILENAME):
        print(f"Error: CSV file '{CSV_FILENAME}' not found.")
        return

    # Create the output directory if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"JSON files will be saved in the '{OUTPUT_DIR}' directory.")

    with open(CSV_FILENAME, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        
        for i, row in enumerate(reader):
            article_url = row['article_url']
            print(f"\nProcessing article {i+1}: {article_url}")

            try:
                response = requests.get(article_url, headers=HEADERS, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"  -> Could not fetch article: {e}")
                continue

            article_soup = BeautifulSoup(response.content, 'html.parser')

            # --- Extracting Metadata (Corrected Logic) ---
            
            # 1. Title (Robust check)
            title_tag = article_soup.find('h1', class_='title')
            if title_tag:
                title = title_tag.get_text(strip=True)
            else: # Fallback to meta tag if h1 is not found
                meta_title_tag = article_soup.find('meta', {'property': 'og:title'})
                title = meta_title_tag['content'] if meta_title_tag else "Title not found"

            # 2. Summary (This was already correct)
            summary_tag = article_soup.find('meta', {'name': 'description'})
            summary = summary_tag['content'] if summary_tag else ""

            # 3. Publish Date (This was already correct)
            publish_date_tag = article_soup.find('meta', {'property': 'article:published_time'})
            publish_date = publish_date_tag['content'] if publish_date_tag else ""

            # 4. Article Image (This was already correct)
            image_tag = article_soup.find('meta', {'property': 'og:image'})
            article_image = image_tag['content'] if image_tag else ""

            # 5. Image Credit
            # Try multiple strategies to find image credit
            image_credit = ""
            # Strategy 1: Standard span
            credit_span = article_soup.find('span', class_='credit')
            if credit_span:
               image_credit = credit_span.get_text(strip=True)

            # Strategy 2: Inside figure > figcaption (used in some articles)
            if not image_credit:
               fig_caption = article_soup.select_one('figure figcaption')
               if fig_caption:
                  image_credit = fig_caption.get_text(strip=True)

            # Strategy 3: Inside div with class "caption" or "image-credit"
            if not image_credit:
               caption_div = article_soup.find('div', class_=re.compile(r'(caption|image-credit)', re.I))
               if caption_div:
                  image_credit = caption_div.get_text(strip=True)

            # 6. Article Content
            article_content = ""
            content_div = article_soup.find('div', id=re.compile(r'content-body-\d+-\d+'))
            if content_div:
             paragraphs = content_div.find_all('p')
             article_content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
            else:  
            # Fallback: try .articlebody or .content-area if present
             alt_content = article_soup.find('div', class_=re.compile(r'(articlebody|content-area)'))
            if alt_content:
             paragraphs = alt_content.find_all('p')
             article_content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
            else:
             article_content = "Article content not found."

            # 7. Tags
            # --- Tags Extraction ---
            tags = ""

            # Strategy 1: Article tags container
            tags_div = article_soup.find('div', class_='article-tags-m')
            if tags_div:
               tags = ", ".join([a.get_text(strip=True) for a in tags_div.find_all('a')])

            # Strategy 2: Fallback list structure
            if not tags:
               alt_tags = article_soup.select('ul.keywords > li > a, ul.article-keywords > li > a')
               if alt_tags:
                  tags = ", ".join([a.get_text(strip=True) for a in alt_tags])


            # --- Assembling JSON data ---
            metadata = {
                'title': title,
                'summary': summary,
                'publish_date': publish_date,
                'article_image': article_image,
                'article_content': article_content,
                'image_credit': image_credit,
                'tags': tags,
                'source_url': article_url
            }
            
            # --- Saving the JSON file ---
            timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
            sanitized_title = sanitize_filename(title)
            json_filename = f"thehindu_sport_{sanitized_title}_{timestamp}.json"
            json_filepath = os.path.join(OUTPUT_DIR, json_filename)
            
            with open(json_filepath, 'w', encoding='utf-8') as jsonfile:
                json.dump(metadata, jsonfile, indent=4, ensure_ascii=False)
            
            print(f"  -> Successfully extracted and saved to '{json_filepath}'")
            
    print("\n--- All articles processed. ---")

In [13]:
scrape_metadata_from_csv()


--- Starting Part 2: Extracting metadata for each article ---
JSON files will be saved in the 'article_metadata' directory.

Processing article 1: https://www.thehindu.com/sport/cricket/winning-at-edgbaston-will-be-one-of-my-happiest-memories-whenever-i-retire-shubman-gill/article69782541.ece
  -> Successfully extracted and saved to 'article_metadata/thehindu_sport_Winning_at_Edgbaston_will_be_one_of_my_‘happiest_memories’_whenever_I_retire_Shubman_Gill_20250707163401.json'

Processing article 2: https://www.thehindu.com/sport/cricket/vaughan-urges-inconsistent-zak-crawley-to-learn-from-shubman-gill/article69782533.ece
  -> Successfully extracted and saved to 'article_metadata/thehindu_sport_Vaughan_urges_inconsistent_Zak_Crawley_to_learn_from_Shubman_Gill_20250707163402.json'

Processing article 3: https://www.thehindu.com/sport/cricket/stokes-rues-missed-chances-after-crushing-loss/article69781250.ece
  -> Successfully extracted and saved to 'article_metadata/thehindu_sport_Stokes_r

In [14]:
# Check if the output directory and CSV file exist
if os.path.exists(OUTPUT_DIR):
    print(f"'{OUTPUT_DIR}' directory created successfully.")
    
    # List a few JSON files from the directory
    json_files = os.listdir(OUTPUT_DIR)
    if json_files:
        print(f"\nFound {len(json_files)} JSON files. Here are the first 5:")
        for filename in json_files[:5]:
            print(f" - {filename}")
    else:
        print("The output directory is empty.")

else:
    print(f"Error: '{OUTPUT_DIR}' directory not found.")

if os.path.exists(CSV_FILENAME):
    print(f"\n'{CSV_FILENAME}' created successfully.")
else:
    print(f"Error: '{CSV_FILENAME}' not found.")

'article_metadata' directory created successfully.

Found 9 JSON files. Here are the first 5:
 - thehindu_sport_ICC_appoints_Sanjog_Gupta_as_its_new_CEO_20250707163405.json
 - thehindu_sport_Winning_at_Edgbaston_will_be_one_of_my_‘happiest_memories’_whenever_I_retire_Shubman_Gill_20250707163401.json
 - thehindu_sport_Stokes_rues_missed_chances_after_crushing_loss_20250707163402.json
 - thehindu_sport_Wimbledon__Alcaraz,_Sabalenka_reach_quarterfinals_20250707163404.json
 - thehindu_sport_Wimbledon_says_a_call_on_a_shot_that_landed_out_was_missed_because_the_electronic_system_was_off_20250707163406.json

'the_hindu_articles.csv' created successfully.
