In [1]:
# !pip install requests beautifulsoup4 transformers torch


In [2]:
# !pip install tensorflow keras

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

# URL to scrape (you can change this to any other category)
base_url = "https://www.prothomalo.com/business"

def get_filtered_urls(base_url, keyword):
    """
    Scrapes and filters URLs based on the given category keyword.
    """
    response = requests.get(base_url)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)

    filtered_urls = []
    for link in links:
        href = link['href']
        full_url = urljoin(base_url, href)

        # Filter by the keyword, e.g., 'sports'
        if keyword in full_url:
            filtered_urls.append(full_url)

    return filtered_urls

def extract_data_from_url(url):
    """
    Extracts data from a given URL.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract relevant data points
    data = {
        "url": url,
        "title": soup.title.string if soup.title else "No Title",  # Extract title
        "meta_description": soup.find("meta", {"name": "description"})["content"]
                              if soup.find("meta", {"name": "description"}) else "No Description",
        "content": extract_content(soup),  # Extract article content
        "author": extract_author(soup),    # Extract author if available
        "publication_date": extract_publication_date(soup)  # Extract publication date
    }
    return data

def extract_content(soup):
    """
    Extracts the main content of the article.
    """
    paragraphs = soup.find_all('p')
    content = " ".join(p.get_text() for p in paragraphs)
    return content if content else "No Content"

def extract_author(soup):
    """
    Extracts the author of the article if available.
    """
    author_tag = soup.find(class_="author-name")  # Adjust this based on page structure
    return author_tag.get_text().strip() if author_tag else "Unknown Author"

def extract_publication_date(soup):
    """
    Extracts the publication date of the article.
    """
    date_tag = soup.find("time")  # Adjust this based on page structure
    return date_tag["datetime"] if date_tag and date_tag.has_attr("datetime") else "Unknown Date"

def save_to_json(data, filename="scraped_data.json"):
    """
    Saves the extracted data into a JSON file.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main():
    # Filtered URLs based on the 'sports' keyword
    sports_urls = get_filtered_urls(base_url, "sports")

    # Extract data from each filtered URL
    all_data = []
    for i, url in enumerate(sports_urls, 1):
        print(f"Processing {i}/{len(sports_urls)}: {url}")
        data = extract_data_from_url(url)
        if data:
            all_data.append(data)

    # Save the extracted data to a JSON file
    save_to_json(all_data)

if __name__ == "__main__":
    main()


Processing 1/1: https://www.prothomalo.com/sports


In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

# Base URL to scrape
base_url = "https://www.prothomalo.com/sports"

def get_filtered_urls(base_url, keyword="sports"):
    """
    Scrape and filter URLs based on the specified category.
    """
    response = requests.get(base_url)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)

    filtered_urls = []
    for link in links:
        href = link['href']
        full_url = urljoin(base_url, href)

        if keyword in full_url:  # Filter by 'sports' keyword
            filtered_urls.append(full_url)

    return filtered_urls

def extract_data_from_url(url):
    """
    Extract all required data from a given sports article URL.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting article details
    data = {
        "url": url,
        "title": soup.title.string if soup.title else "No Title",
        "content": extract_content(soup),
        "author": extract_author(soup),
        "publication_date": extract_publication_date(soup)
    }

    return data

def extract_content(soup):
    """
    Extracts the main content from the article.
    """
    content_tag = soup.find_all('p')
    return " ".join(paragraph.get_text() for paragraph in content_tag) if content_tag else "No Content"

def extract_author(soup):
    """
    Extracts the author of the article.
    """
    author_tag = soup.find(class_="author-name")
    return author_tag.get_text().strip() if author_tag else "Unknown Author"

def extract_publication_date(soup):
    """
    Extracts the publication date of the article.
    """
    date_tag = soup.find("time")
    return date_tag["datetime"] if date_tag and date_tag.has_attr("datetime") else "Unknown Date"

def save_to_json(data, filename="scraped_data.json"):
    """
    Saves the extracted data to a JSON file.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main():
    # Scrape and filter URLs related to sports
    sports_urls = get_filtered_urls(base_url, "sports")

    # Extract data from each URL
    all_data = []
    for i, url in enumerate(sports_urls, 1):
        print(f"Processing {i}/{len(sports_urls)}: {url}")
        data = extract_data_from_url(url)
        if data:
            all_data.append(data)

    # Save data to JSON
    save_to_json(all_data)

if __name__ == "__main__":
    main()


Processing 1/3: https://www.prothomalo.com/api/auth/v1/oauth/authorize?client_id=811&response_type=code&redirect_uri=/api/auth/v1/oauth/token&callback_uri=/sports
Failed to retrieve https://www.prothomalo.com/api/auth/v1/oauth/authorize?client_id=811&response_type=code&redirect_uri=/api/auth/v1/oauth/token&callback_uri=/sports. Status code: 400
Processing 2/3: https://www.prothomalo.com/sports
Processing 3/3: https://www.prothomalo.com/api/auth/v1/oauth/authorize?client_id=811&response_type=code&redirect_uri=/api/auth/v1/oauth/token&callback_uri=/sports
Failed to retrieve https://www.prothomalo.com/api/auth/v1/oauth/authorize?client_id=811&response_type=code&redirect_uri=/api/auth/v1/oauth/token&callback_uri=/sports. Status code: 400
