In [None]:
# Install necessary libraries
!pip install beautifulsoup4 requests pandas

import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL of Hindustan Times
BASE_URL = "https://www.hindustantimes.com"

# Function to get article links from a section page
def get_article_links(section_url):
    """Return the unique article URLs linked from a section page.

    Fetches ``section_url``, collects every ``<a href>`` whose href contains
    ``/india-news/`` and resolves it against ``BASE_URL``.

    Args:
        section_url: URL of the section listing page to fetch.

    Returns:
        A list of absolute URLs without duplicates, in the order they first
        appear on the page. An empty list is returned when the page cannot be
        fetched (network error or non-200 status).
    """
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(
            section_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10
        )
    except requests.RequestException as exc:
        print(f"Failed to fetch page: {section_url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page: {section_url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # urljoin leaves already-absolute hrefs untouched and prefixes
    # site-relative ones; plain concatenation would mangle absolute hrefs.
    links = [
        urljoin(BASE_URL, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if "/india-news/" in anchor["href"]
    ]

    # dict.fromkeys removes duplicates while keeping page order, so callers
    # that slice the result get a reproducible selection.
    return list(dict.fromkeys(links))

# Function to scrape a single article
def scrape_article(url):
    """Download one article page and extract its title and body text.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with the keys ``"Title"``, ``"URL"`` and ``"Article"``, or
        ``None`` when the page cannot be fetched (network error or non-200
        status). ``"Title"`` is the stripped text of the first ``<h1>``
        (``"No title"`` if there is none); ``"Article"`` is the stripped text
        of every ``<p>`` element joined with single spaces.
    """
    try:
        # A timeout keeps one unresponsive article from stalling the run.
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10
        )
    except requests.RequestException as exc:
        print(f"Failed to fetch article: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch article: {url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Look the heading up once instead of searching the tree twice.
    heading = soup.find("h1")
    title = heading.text.strip() if heading else "No title"

    # Every <p> on the page is included, not only those in the article body.
    article_text = " ".join(p.text.strip() for p in soup.find_all("p"))

    return {"Title": title, "URL": url, "Article": article_text}

# Define section to scrape
section_url = "https://www.hindustantimes.com/india-news"
article_links = get_article_links(section_url)

# Scrape multiple articles (limited to the first MAX_ARTICLES links)
MAX_ARTICLES = 10
selected_links = article_links[:MAX_ARTICLES]

data = []
for i, link in enumerate(selected_links, start=1):
    # Report progress against the number of articles actually being scraped,
    # not the total number of links found on the section page.
    print(f"Scraping {i}/{len(selected_links)}: {link}")
    article_data = scrape_article(link)
    if article_data:
        data.append(article_data)
    if i < len(selected_links):
        time.sleep(2)  # Delay between requests to prevent blocking

# Save scraped data to CSV. Naming the columns explicitly keeps the header
# row in the file even when no article could be scraped.
df = pd.DataFrame(data, columns=["Title", "URL", "Article"])
df.to_csv("hindustan_times_articles.csv", index=False)

print("✅ Scraping complete! Data saved as hindustan_times_articles.csv")
