In [7]:
# !pip install requests beautifulsoup4 transformers gitpython

In [9]:
!pip install tensorflow keras




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Hugging Face API token, read from the HUGGINGFACE_API_KEY environment
# variable so the secret is never stored in source control. Falls back to an
# empty string when the variable is not set.
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY", "")

# Base URL to scrape
base_url = "https://www.jugantor.com/sports"

def get_filtered_urls(base_url, keyword, timeout=30):
    """
    Scrape a page and return the absolute URLs of links whose href contains a keyword.

    Args:
        base_url: Page to download; also used as the base for resolving
            relative hrefs.
        keyword: Substring that must appear in a link's raw ``href`` value.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs in page order with duplicates removed. An
        empty list is returned when the request fails or the response status
        is not 200.
    """
    try:
        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {base_url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve {base_url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)

    # Filter URLs by keyword, resolving relative hrefs against the page URL
    matching_urls = (
        urljoin(base_url, link['href']) for link in links if keyword in link['href']
    )
    # dict.fromkeys drops repeated links while keeping first-seen order, so
    # the same article is not processed more than once.
    return list(dict.fromkeys(matching_urls))

def extract_data_from_url(url, timeout=30):
    """
    Download an article page and collect its metadata and text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``url``, ``title``, ``author``,
        ``publication_date`` and ``content``, or ``None`` when the request
        fails or the response status is not 200.
    """
    try:
        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # ``.string`` is None when <title> is empty or wraps more than one child,
    # so check it explicitly instead of letting None leak into the record.
    title_tag = soup.title
    title = title_tag.string if title_tag and title_tag.string else "No Title"

    data = {
        "url": url,
        "title": title,
        "author": extract_author(soup),
        "publication_date": extract_publication_date(soup),
        "content": extract_content(soup)
    }

    return data

def extract_content(soup):
    """
    Return the text of every <p> element on the page, joined by single spaces.

    Falls back to the string "No Content" when the joined text is empty.
    """
    paragraph_texts = [paragraph.get_text() for paragraph in soup.find_all('p')]
    joined_text = " ".join(paragraph_texts)
    if joined_text:
        return joined_text
    return "No Content"

def extract_author(soup):
    """
    Return the stripped text of the first element with class "author-name".

    Falls back to "Unknown Author" when no such element is found.
    """
    byline = soup.find(class_="author-name")
    if not byline:
        return "Unknown Author"
    return byline.get_text().strip()

def extract_publication_date(soup):
    """
    Return the ``datetime`` attribute of the first <time> element.

    Falls back to "Unknown Date" when there is no <time> element or it has
    no ``datetime`` attribute.
    """
    time_tag = soup.find("time")
    if time_tag and time_tag.has_attr("datetime"):
        return time_tag["datetime"]
    return "Unknown Date"

# Lazily created sentiment pipeline, shared by every analyze_with_llm call.
_SENTIMENT_PIPELINE = None

def _get_sentiment_pipeline():
    """Build the sentiment pipeline on first use and reuse it afterwards."""
    global _SENTIMENT_PIPELINE
    if _SENTIMENT_PIPELINE is None:
        _SENTIMENT_PIPELINE = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
        )
    return _SENTIMENT_PIPELINE

def analyze_with_llm(content):
    """
    Score an article's text for sentiment, importance and international relevance.

    Sentiment comes from a Hugging Face ``transformers`` pipeline; the other
    two values are simple heuristics computed from the text itself.

    Args:
        content: Article text.

    Returns:
        A dict with:
            sentiment: Label reported by the sentiment pipeline.
            confidence: Score reported by the sentiment pipeline.
            importance_score: ``len(content) / 100`` rounded to two decimals,
                capped at 10.0.
            international_relevance: "High" when the lower-cased text contains
                "world", "international" or "global", otherwise "Low".
    """
    # NOTE(review): the model name indicates an English SST-2 classifier;
    # results for non-English articles are presumably unreliable — confirm.
    sentiment_pipeline = _get_sentiment_pipeline()

    # Analyze sentiment. The slice only bounds the number of characters, so
    # truncation=True is also passed to keep the tokenized input within the
    # model's maximum sequence length.
    sentiment_result = sentiment_pipeline(content[:512], truncation=True)[0]

    # Generate importance score (example: based on content length)
    importance_score = min(round(len(content) / 100, 2), 10.0)  # Capped at 10.0

    # Analyze international relevance (dummy example using keywords)
    lowered_content = content.lower()  # lower-case once, not once per keyword
    has_global_keyword = any(
        kw in lowered_content for kw in ("world", "international", "global")
    )
    international_relevance = "High" if has_global_keyword else "Low"

    return {
        "sentiment": sentiment_result["label"],
        "confidence": sentiment_result["score"],
        "importance_score": importance_score,
        "international_relevance": international_relevance
    }

def save_to_json(data, filename="scraped_data.json"):
    """
    Write ``data`` to ``filename`` as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is rather than as ``\\u`` escapes.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

def main():
    """
    Run the full scrape: collect sports links, analyze each article, save JSON.

    Articles whose page could not be retrieved are skipped; everything else
    is written to the default output file of ``save_to_json``.
    """
    # Collect candidate article links from the sports landing page
    sports_urls = get_filtered_urls(base_url, "sports")
    total = len(sports_urls)

    # Fetch each article and merge the analysis into its record
    all_data = []
    for position, article_url in enumerate(sports_urls, start=1):
        print(f"Processing {position}/{total}: {article_url}")
        record = extract_data_from_url(article_url)
        if not record:
            continue
        record.update(analyze_with_llm(record["content"]))
        all_data.append(record)

    # Persist everything that was collected
    save_to_json(all_data)

if __name__ == "__main__":
    main()
