In [20]:
import json
import logging
import os
from datetime import datetime
from typing import List, Dict, Any
from urllib.parse import quote, urljoin

import pandas as pd
import requests
import sodapy
import xmltodict
from bs4 import BeautifulSoup
from dateutil import parser
from dotenv import load_dotenv
from textblob import TextBlob

# Configure root logging: INFO level, "<timestamp> - <level> - <message>" lines.
# At this level the logger.debug(...) calls in the fetchers are not emitted.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Load variables from a .env file, then read the two API keys from the environment.
# Both keys are mandatory: this cell raises ValueError as soon as one is missing.
load_dotenv()
NEWS_API_KEY = os.getenv('NEWS_API')
if not NEWS_API_KEY:
    raise ValueError("NEWS_API key not found in .env file")
DATAGOV_API = os.getenv('DATAGOV_API') 
if not DATAGOV_API:
    raise ValueError("DATAGOV_API key not found in .env file")

# Upper bounds applied by every fetcher: an item is kept only when its
# subjectivity is <= SUBJECTIVITY_THRESHOLD and |sentiment| <= SENTIMENT_THRESHOLD.
SUBJECTIVITY_THRESHOLD = 0.5
SENTIMENT_THRESHOLD = 0.3
# Date one month before "now" as YYYY-MM-DD; sent to NewsAPI as the `from` parameter.
START_DATE = (datetime.now() - pd.DateOffset(months=1)).strftime('%Y-%m-%d')

# Endpoints used by fetch_gdelt_articles and fetch_eu_data_articles.
GDELT_BASE_URL = "https://api.gdeltproject.org/api/v2/doc/doc"
EU_DATA_BASE_URL = "https://data.europa.eu/api/hub/search/datasets"

def parse_date(date_string: str) -> str:
    """Convert a free-form date string into ``YYYY-MM-DD``.

    The string is handed to ``dateutil``'s parser; whatever that parser raises
    for input it cannot interpret propagates to the caller.
    """
    parsed = parser.parse(date_string)
    return parsed.strftime('%Y-%m-%d')

def analyze_text(text: str) -> tuple[float, float]:
    """Score ``text`` with TextBlob.

    Returns:
        A ``(subjectivity, polarity)`` pair taken from the blob's sentiment.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity, sentiment.polarity

def fetch_newsapi_articles() -> List[Dict[str, Any]]:
    """Fetch recent English-language articles from NewsAPI and keep the neutral ones.

    Queries the ``/v2/everything`` endpoint for articles published since
    ``START_DATE``, scores each article's ``content`` with ``analyze_text`` and
    keeps those whose subjectivity is at most ``SUBJECTIVITY_THRESHOLD`` and
    whose absolute sentiment is at most ``SENTIMENT_THRESHOLD``.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    endpoint = 'https://newsapi.org/v2/everything'
    params = {
        'q': 'latest',
        'language': 'en',
        'from': START_DATE,
        'sortBy': 'publishedAt',
        'apiKey': NEWS_API_KEY
    }
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(endpoint, params=params, timeout=30)
    if not response.ok:
        logger.warning(f"NewsAPI request failed with status {response.status_code}")

    # Parse the body once and reuse it for logging and extraction.
    payload = response.json()
    logger.debug(f"NewsAPI Response: {payload}")
    articles = payload.get('articles') or []
    logger.info(f"NewsAPI returned {len(articles)} articles")

    data = []
    threshold_filtered = 0

    for article in articles:
        # `.get(key, default)` only covers absent keys; a key that is present
        # with a null value still yields None, so normalise with `or`.
        text = article.get('content') or ''
        source = (article.get('source') or {}).get('name') or ''
        article_url = article.get('url') or ''
        published = article.get('publishedAt') or ''

        subjectivity_score, sentiment_score = analyze_text(text)
        if subjectivity_score > SUBJECTIVITY_THRESHOLD or abs(sentiment_score) > SENTIMENT_THRESHOLD:
            continue
        threshold_filtered += 1

        try:
            date = parse_date(published)
        except (ValueError, OverflowError, TypeError):
            # One article with a missing/unparseable date should not abort the run.
            logger.debug(f"NewsAPI article skipped due to unparseable date: {published!r}")
            continue

        data.append({
            'source': source,
            'date': date,
            'text': text,
            'url': article_url,
            'subjectivity_score': subjectivity_score,
            'sentiment_score': sentiment_score
        })

    logger.info(f"NewsAPI: {len(articles)} total, {threshold_filtered} passed threshold filter")
    return data

def fetch_plos_articles() -> List[Dict[str, Any]]:
    """Scrape the PLOS ONE landing page and keep recent, neutral articles.

    For every ``main article`` element with a title link and a ``p.date``
    element, the linked page is fetched and its ``div.abstract`` text (when
    present) is appended to the title. The combined text is scored with
    ``analyze_text`` and kept only when it satisfies ``SUBJECTIVITY_THRESHOLD``
    and ``SENTIMENT_THRESHOLD``. Articles older than 90 days are skipped.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.

    Raises:
        requests.RequestException: If the landing-page request fails or times
            out. Failures on individual article pages are logged and skipped.
    """
    base_url = 'https://journals.plos.org/plosone/'
    response = requests.get(base_url, timeout=30)
    if not response.ok:
        logger.warning(f"PLOS listing request failed with status {response.status_code}")
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all articles using the main content area
    articles = soup.select('main article')
    logger.info(f"PLOS returned {len(articles)} articles")

    data = []
    threshold_filtered = 0

    for article in articles:
        # Get article title and link
        title_element = article.select_one('h2 a')
        if not title_element:
            logger.debug("Article skipped due to missing title element.")
            continue

        title = title_element.get_text(strip=True)
        href = title_element.get('href')
        if not href:
            logger.debug(f"Article '{title}' skipped due to missing href.")
            continue
        # urljoin resolves relative, root-relative and absolute hrefs against the
        # page URL; plain concatenation duplicated or mangled the path.
        link = urljoin(base_url, href)

        # Get date from the article element
        date_element = article.select_one('p.date')
        if not date_element:
            logger.debug(f"Article '{title}' skipped due to missing date element.")
            continue

        date_text = date_element.get_text(strip=True)
        try:
            date = parse_date(date_text)
        except (ValueError, OverflowError, TypeError):
            logger.debug(f"Article '{title}' skipped due to unparseable date: {date_text!r}")
            continue

        # Skip anything published more than 90 days ago
        if (datetime.now() - datetime.strptime(date, '%Y-%m-%d')).days > 90:
            logger.debug(f"Article '{title}' skipped because it is older than 3 months.")
            continue

        # Fetch full article content (best effort: a failure only drops this article)
        try:
            article_response = requests.get(link, timeout=30)
            article_soup = BeautifulSoup(article_response.content, 'html.parser')

            # Get article abstract or content
            content_element = article_soup.select_one('div.abstract')
            text = f"{title} - {content_element.get_text(strip=True)}" if content_element else title

            subjectivity_score, sentiment_score = analyze_text(text)
            if subjectivity_score <= SUBJECTIVITY_THRESHOLD and abs(sentiment_score) <= SENTIMENT_THRESHOLD:
                threshold_filtered += 1
                data.append({
                    'source': 'PLOS',
                    'date': date,
                    'text': text,
                    'url': link,
                    'subjectivity_score': subjectivity_score,
                    'sentiment_score': sentiment_score
                })
            else:
                logger.debug(f"Article '{title}' skipped due to subjectivity or sentiment score thresholds.")
        except Exception as e:
            logger.error(f"Error fetching article content for '{title}': {e}")
            continue

    logger.info(f"PLOS: {len(articles)} total, {threshold_filtered} passed threshold filter")
    return data


In [21]:

def fetch_who_articles() -> List[Dict[str, Any]]:
    """Scrape the first page of WHO news releases and keep the neutral titles.

    NOTE: later cells in this notebook define ``fetch_who_articles`` again;
    when the cells run in order, the last definition is the one that is used.

    Each ``div.list-view--item.vertical-list-item`` element contributes its
    link text (title), its ``span.timestamp`` date and its link target. Titles
    are scored with ``analyze_text`` and filtered with
    ``SUBJECTIVITY_THRESHOLD`` / ``SENTIMENT_THRESHOLD``.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'https://www.who.int/news-room/releases'
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    articles = soup.find_all('div', class_='list-view--item vertical-list-item')
    logger.info(f"WHO returned {len(articles)} articles")

    data = []
    threshold_filtered = 0

    for article in articles:
        link_elem = article.find('a')
        date_elem = article.find('span', class_='timestamp')
        # `.find` returns None when nothing matches; skip such rows instead of
        # letting one malformed item abort the whole scrape.
        if link_elem is None or date_elem is None or not link_elem.get('href'):
            logger.debug("WHO item skipped due to missing link or date element.")
            continue

        title = link_elem.get_text(strip=True)
        date_text = date_elem.get_text(strip=True)
        try:
            date = parse_date(date_text)
        except (ValueError, OverflowError, TypeError):
            logger.debug(f"WHO item '{title}' skipped due to unparseable date: {date_text!r}")
            continue

        # urljoin copes with both site-relative and absolute hrefs.
        link = urljoin("https://www.who.int", link_elem['href'])
        subjectivity_score, sentiment_score = analyze_text(title)
        if subjectivity_score <= SUBJECTIVITY_THRESHOLD and abs(sentiment_score) <= SENTIMENT_THRESHOLD:
            threshold_filtered += 1
            data.append({
                'source': 'WHO',
                'date': date,
                'text': title,
                'url': link,
                'subjectivity_score': subjectivity_score,
                'sentiment_score': sentiment_score
            })

    logger.info(f"WHO: {len(articles)} total, {threshold_filtered} passed threshold filter")
    return data

def fetch_un_articles() -> List[Dict[str, Any]]:
    """Scrape the United Nations press page and keep the neutral titles.

    Each ``div.views-row`` element contributes its link text (title), its
    ``span.date-display-single`` date and its link target. Titles are scored
    with ``analyze_text`` and filtered with ``SUBJECTIVITY_THRESHOLD`` /
    ``SENTIMENT_THRESHOLD``.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'https://www.un.org/press/en'
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    articles = soup.find_all('div', class_='views-row')
    logger.info(f"UN returned {len(articles)} articles")

    data = []
    threshold_filtered = 0

    for article in articles:
        link_elem = article.find('a')
        date_elem = article.find('span', class_='date-display-single')
        # `.find` returns None when nothing matches; skip such rows instead of
        # letting one malformed item abort the whole scrape.
        if link_elem is None or date_elem is None or not link_elem.get('href'):
            logger.debug("UN item skipped due to missing link or date element.")
            continue

        title = link_elem.get_text(strip=True)
        date_text = date_elem.get_text(strip=True)
        try:
            date = parse_date(date_text)
        except (ValueError, OverflowError, TypeError):
            logger.debug(f"UN item '{title}' skipped due to unparseable date: {date_text!r}")
            continue

        # urljoin copes with both site-relative and absolute hrefs.
        link = urljoin("https://www.un.org", link_elem['href'])
        subjectivity_score, sentiment_score = analyze_text(title)
        if subjectivity_score <= SUBJECTIVITY_THRESHOLD and abs(sentiment_score) <= SENTIMENT_THRESHOLD:
            threshold_filtered += 1
            data.append({
                'source': 'UN',
                'date': date,
                'text': title,
                'url': link,
                'subjectivity_score': subjectivity_score,
                'sentiment_score': sentiment_score
            })

    logger.info(f"UN: {len(articles)} total, {threshold_filtered} passed threshold filter")
    return data

def fetch_gdelt_articles(query: str = '') -> List[Dict[str, Any]]:
    """Fetch recent articles from the GDELT endpoint and keep the neutral ones.

    Args:
        query: Optional search expression sent as the ``query`` parameter.
            When empty (the default) no ``query`` is sent, which matches the
            original request.
            NOTE(review): confirm whether the endpoint returns articles without
            a query; non-JSON replies are logged and produce an empty result.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    params = {
        'maxrecords': 250,
        # Intended as the last 24 hours (1440 minutes).
        # NOTE(review): confirm the API accepts a unit-less timespan value.
        'timespan': '1440',
        'format': 'json'  # was listed twice in the original dict literal
    }
    if query:
        params['query'] = query

    response = requests.get(GDELT_BASE_URL, params=params, timeout=30)
    try:
        payload = response.json()
    except ValueError:
        # The body was not JSON (e.g. a plain-text error message).
        logger.error(f"GDELT returned a non-JSON response: {response.text[:200]!r}")
        return []
    articles = payload.get('articles') or []

    data = []
    for article in articles:
        text = f"{article.get('title', '')} {article.get('seentext', '')}"
        seen = article.get('seendate') or ''
        try:
            date = parse_date(seen)
        except (ValueError, OverflowError, TypeError):
            # A record without a usable date should not abort the whole fetch.
            logger.debug(f"GDELT article skipped due to unparseable date: {seen!r}")
            continue
        subjectivity_score, sentiment_score = analyze_text(text)

        if subjectivity_score <= SUBJECTIVITY_THRESHOLD and abs(sentiment_score) <= SENTIMENT_THRESHOLD:
            data.append({
                'source': 'GDELT',
                'date': date,
                'text': text,
                'url': article.get('url', ''),
                'subjectivity_score': subjectivity_score,
                'sentiment_score': sentiment_score
            })

    logger.info(f"GDELT: {len(articles)} total, {len(data)} passed threshold filter")
    return data

def fetch_datagov_articles() -> List[Dict[str, Any]]:
    """Fetch recently modified dataset records via Socrata and keep the neutral ones.

    Requests up to 100 rows of resource ``7g6j-rrh5`` on the ``data.gov``
    domain, ordered by ``modification_date`` descending, and scores each row's
    title plus description with ``analyze_text``.
    NOTE(review): the domain and resource id are taken as-is from the original
    code; verify that they identify the intended catalogue.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.
    """
    client = sodapy.Socrata("data.gov", DATAGOV_API)
    try:
        # Fetch recent datasets
        results = client.get("7g6j-rrh5", limit=100, order="modification_date DESC")
    finally:
        # Release the client's HTTP session even when the request raises.
        client.close()

    data = []
    for result in results:
        text = f"{result.get('title', '')} {result.get('description', '')}"
        modified = result.get('modification_date') or ''
        try:
            date = parse_date(modified)
        except (ValueError, OverflowError, TypeError):
            # A row without a usable date should not abort the whole fetch.
            logger.debug(f"Data.gov record skipped due to unparseable date: {modified!r}")
            continue
        subjectivity_score, sentiment_score = analyze_text(text)

        if subjectivity_score <= SUBJECTIVITY_THRESHOLD and abs(sentiment_score) <= SENTIMENT_THRESHOLD:
            data.append({
                'source': 'Data.gov',
                'date': date,
                'text': text,
                'url': result.get('landingPage', ''),
                'subjectivity_score': subjectivity_score,
                'sentiment_score': sentiment_score
            })

    logger.info(f"Data.gov: {len(results)} total, {len(data)} passed threshold filter")
    return data

def _localized_text(value: Any, lang: str = 'en') -> str:
    """Return plain text from a value that may be a ``{language: text}`` mapping."""
    if isinstance(value, dict):
        # Prefer the requested language, otherwise the first non-empty entry.
        return value.get(lang) or next((v for v in value.values() if v), '')
    return value or ''


def fetch_eu_data_articles() -> List[Dict[str, Any]]:
    """Fetch recently modified datasets from the EU open data search API.

    Requests up to 100 datasets sorted by ``modified`` descending from
    ``EU_DATA_BASE_URL`` and scores each dataset's title plus description with
    ``analyze_text``.
    NOTE(review): title/description are handled both as plain strings and as
    language-keyed mappings; confirm the actual response shape.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        'limit': 100,
        'sort': 'modified',
        'order': 'desc',
        'format': 'json'
    }

    response = requests.get(EU_DATA_BASE_URL, params=params, timeout=30)
    if not response.ok:
        logger.warning(f"EU Data Portal request failed with status {response.status_code}")
    results = (response.json().get('result') or {}).get('results') or []

    data = []
    for result in results:
        text = f"{_localized_text(result.get('title'))} {_localized_text(result.get('description'))}"
        modified = result.get('modified') or ''
        try:
            date = parse_date(modified)
        except (ValueError, OverflowError, TypeError):
            # A dataset without a usable date should not abort the whole fetch.
            logger.debug(f"EU dataset skipped due to unparseable date: {modified!r}")
            continue
        subjectivity_score, sentiment_score = analyze_text(text)

        if subjectivity_score <= SUBJECTIVITY_THRESHOLD and abs(sentiment_score) <= SENTIMENT_THRESHOLD:
            data.append({
                'source': 'EU Open Data',
                'date': date,
                'text': text,
                'url': result.get('landingPage', ''),
                'subjectivity_score': subjectivity_score,
                'sentiment_score': sentiment_score
            })

    logger.info(f"EU Data Portal: {len(results)} total, {len(data)} passed threshold filter")
    return data


In [22]:
def fetch_who_articles() -> List[Dict[str, Any]]:
    """Scrape recent WHO news releases, following pagination.

    NOTE: the next cell of this notebook defines ``fetch_who_articles`` again;
    when the cells run in order, that later definition is the one in effect.

    Pages ``?page=1, 2, ...`` are fetched until 100 articles are collected, a
    page cannot be retrieved or parsed, or an article older than 90 days is
    seen (this assumes the listing is ordered newest first). Titles are scored
    with ``analyze_text`` and filtered with ``SUBJECTIVITY_THRESHOLD`` /
    ``SENTIMENT_THRESHOLD``.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.
        Whatever was collected before a failed page request is still returned.
    """
    base_url = 'https://www.who.int/news-room/releases'
    data = []
    threshold_filtered = 0
    page = 1
    max_articles = 100
    # Set once the age cut-off or the article cap is reached, so that every
    # exit path falls through to the summary log and the single return below.
    finished = False

    while not finished and len(data) < max_articles:
        url = f"{base_url}?page={page}"
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            # Keep the pages already collected instead of losing them.
            logger.error(f"Request for WHO page {page} failed: {e}")
            break
        if response.status_code != 200:
            logger.info(f"Failed to retrieve page {page}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the main container
        main_container = soup.select_one('div.sf-list-vertical')
        if not main_container:
            logger.info("No main container found")
            break

        # Find all article rows
        article_rows = main_container.find_all('div', class_='list-view--item', recursive=False)
        if not article_rows:
            logger.info(f"No articles found on page {page}")
            break

        logger.info(f"WHO page {page} returned {len(article_rows)} articles")

        for row in article_rows:
            # Extract link
            link_elem = row.find('a')
            if not link_elem or 'href' not in link_elem.attrs:
                continue
            # urljoin copes with both site-relative and absolute hrefs.
            link = urljoin("https://www.who.int", link_elem['href'])

            # Extract date
            date_elem = row.select_one('div.timestamp span')
            if not date_elem:
                continue
            date_text = date_elem.get_text(strip=True)
            try:
                date = parse_date(date_text)
            except (ValueError, OverflowError, TypeError):
                logger.debug(f"WHO row skipped due to unparseable date: {date_text!r}")
                continue

            # The link text includes the date; strip it to leave only the title
            title = link_elem.get_text(strip=True).replace(date_text, '').strip()

            # Stop at the first article older than 90 days
            if (datetime.now() - datetime.strptime(date, '%Y-%m-%d')).days > 90:
                logger.info("Reached articles older than 3 months")
                finished = True
                break

            # Analyze text sentiment and subjectivity
            subjectivity_score, sentiment_score = analyze_text(title)

            # Filter based on thresholds
            if subjectivity_score <= SUBJECTIVITY_THRESHOLD and abs(sentiment_score) <= SENTIMENT_THRESHOLD:
                threshold_filtered += 1
                data.append({
                    'source': 'WHO',
                    'date': date,
                    'text': title,
                    'url': link,
                    'subjectivity_score': subjectivity_score,
                    'sentiment_score': sentiment_score
                })

                if len(data) >= max_articles:
                    logger.info("Reached maximum number of articles")
                    finished = True
                    break

        page += 1

    logger.info(f"WHO: Total articles processed across {page-1} pages, {threshold_filtered} passed threshold filter")
    return data

In [23]:
def fetch_who_articles() -> List[Dict[str, Any]]:
    """Scrape recent WHO news releases, following pagination.

    This is the last definition of ``fetch_who_articles`` in the notebook and
    therefore the one in effect after all cells have run.

    Pages ``?page=1, 2, ...`` are fetched until 100 articles are collected, a
    page cannot be retrieved or parsed, or an article older than 90 days is
    seen (this assumes the listing is ordered newest first). Titles are scored
    with ``analyze_text`` and filtered with ``SUBJECTIVITY_THRESHOLD`` /
    ``SENTIMENT_THRESHOLD``.

    Returns:
        A list of dicts with the keys ``source``, ``date`` (``YYYY-MM-DD``),
        ``text``, ``url``, ``subjectivity_score`` and ``sentiment_score``.
        The list is returned on every exit path, including when pagination
        stops because a page is missing or empty.
    """
    base_url = 'https://www.who.int/news-room/releases'
    data = []
    threshold_filtered = 0
    page = 1
    max_articles = 100
    # Set once the age cut-off or the article cap is reached, so that every
    # exit path falls through to the summary log and the single return below.
    finished = False

    while not finished and len(data) < max_articles:
        url = f"{base_url}?page={page}"
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            # Keep the pages already collected instead of losing them.
            logger.error(f"Request for WHO page {page} failed: {e}")
            break
        if response.status_code != 200:
            logger.info(f"Failed to retrieve page {page}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the main container
        main_container = soup.select_one('div.sf-list-vertical')
        if not main_container:
            logger.info("No main container found")
            break

        # Find all article rows
        article_rows = main_container.find_all('div', class_='list-view--item', recursive=False)
        if not article_rows:
            logger.info(f"No articles found on page {page}")
            break

        logger.info(f"WHO page {page} returned {len(article_rows)} articles")

        for row in article_rows:
            # Extract link
            link_elem = row.find('a')
            if not link_elem or 'href' not in link_elem.attrs:
                continue
            # urljoin copes with both site-relative and absolute hrefs.
            link = urljoin("https://www.who.int", link_elem['href'])

            # Extract date
            date_elem = row.select_one('div.timestamp span')
            if not date_elem:
                continue
            date_text = date_elem.get_text(strip=True)
            try:
                date = parse_date(date_text)
            except (ValueError, OverflowError, TypeError):
                logger.debug(f"WHO row skipped due to unparseable date: {date_text!r}")
                continue

            # The link text includes the date; strip it to leave only the title
            title = link_elem.get_text(strip=True).replace(date_text, '').strip()

            # Stop at the first article older than 90 days
            if (datetime.now() - datetime.strptime(date, '%Y-%m-%d')).days > 90:
                logger.info("Reached articles older than 3 months")
                finished = True
                break

            # Analyze text sentiment and subjectivity
            subjectivity_score, sentiment_score = analyze_text(title)

            # Filter based on thresholds
            if subjectivity_score <= SUBJECTIVITY_THRESHOLD and abs(sentiment_score) <= SENTIMENT_THRESHOLD:
                threshold_filtered += 1
                data.append({
                    'source': 'WHO',
                    'date': date,
                    'text': title,
                    'url': link,
                    'subjectivity_score': subjectivity_score,
                    'sentiment_score': sentiment_score
                })

                if len(data) >= max_articles:
                    logger.info("Reached maximum number of articles")
                    finished = True
                    break

        page += 1

    logger.info(f"WHO: Total articles processed across {page-1} pages, {threshold_filtered} passed threshold filter")
    # The original fell off the end here and returned None whenever the loop
    # ended via `break` or its own condition.
    return data

In [24]:
# NOTE(review): leftover debugging cell. In the visible cells `main_container`
# is only assigned inside fetch_who_articles, so nothing defines it at notebook
# scope; on a fresh kernel this is expected to raise NameError.
main_container


In [25]:
# NOTE(review): leftover debugging cell. In the visible cells `soup` is only
# assigned inside the fetch_* functions, so nothing defines it at notebook
# scope; on a fresh kernel this is expected to raise NameError.
soup.find('div', class_='k-listview-content')

In [26]:
# Directory containing the collected data files, read from the DATA_PATH
# environment variable. os.getenv returns None when the variable is unset.
DATA_PATH = os.getenv('DATA_PATH')
# IPython shell escape: list the contents of that directory.
!ls $DATA_PATH

collected_data_20241108_151603.parquet


In [40]:
# Read the collected-data parquet file from DATA_PATH into `df`, then preview
# its first rows as the cell output.
df = pd.read_parquet(
    os.path.join(DATA_PATH, 'collected_data_20241108_151603.parquet')
)
df.head()

Unnamed: 0,source,date,text,url,subjectivity_score,sentiment_score
0,Financial Post,2024-11-07,Author of the article:\r\nArticle content\r\nC...,https://financialpost.com/globe-newswire/freeh...,0.0,0.0
1,Smithsonian.com,2024-11-07,Shilo Shiv Suleman's Padma/Lotus is the first ...,https://www.smithsonianmag.com/smart-news/to-s...,0.166667,0.125
2,Bleeding Cool News,2024-11-07,"Posted in: Activision, Call of Duty, Call Of D...",https://bleedingcool.com/games/call-of-duty-bl...,0.433333,-0.166667
3,GamesRadar+,2024-11-07,Dragon Ball: Sparking Zero has become a big an...,https://www.gamesradar.com/games/fighting/drag...,0.375758,0.272727
4,Financial Post,2024-11-07,Lightspeed Commerce Inc.s chief executive says...,https://financialpost.com/pmn/lightspeed-wants...,0.435859,-0.004798


In [42]:
import re
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [45]:
# b. Clean the text
def clean_text(text):
    """Normalise raw article text to lowercase ASCII letters and single spaces.

    Steps, in order: drop ``http...`` URL tokens, discard non-ASCII characters,
    delete everything that is neither an ASCII letter nor whitespace, collapse
    whitespace runs to one space, trim both ends, and lowercase the result.
    """
    without_urls = re.sub(r'http\S+', '', text)
    ascii_only = without_urls.encode('ascii', 'ignore').decode()
    letters_and_space = re.sub(r'[^a-zA-Z\s]', '', ascii_only)
    single_spaced = re.sub(r'\s+', ' ', letters_and_space)
    return single_spaced.strip().lower()

def is_english(text):
    """Return True when ``detect`` classifies ``text`` as English (``'en'``).

    Non-string or blank input, and any failure raised by the detector, yield
    False so the function can be used directly as a row filter.
    """
    # Nothing to detect: answer directly instead of relying on the detector to fail.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # A detection failure means "not English" for filtering purposes. Unlike
        # a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate, so
        # a long `.apply` run can still be interrupted.
        return False

# Normalise the text, drop rows whose cleaned text is duplicated, then keep only
# the rows detected as English. `assign` builds new frames instead of writing
# columns into a filtered frame, and the final `.copy()` makes `df` an
# independent frame so later cells can add columns without pandas warning
# about writing to a slice.
df = (
    df.assign(clean_text=df['text'].apply(clean_text))
      .drop_duplicates(subset=['clean_text'])
)
df = df.assign(is_english=df['clean_text'].apply(is_english))
df = df[df['is_english']].copy()

# Preview the result; a bare expression is only displayed when it is the last
# statement of the cell, so the mid-cell preview in the original did nothing.
df.head()


In [46]:
def create_qa_pair(text):
    """Split ``text`` into a placeholder question/answer pair.

    The text is split on single spaces. With more than four pieces, the first
    four joined by spaces plus a trailing ``?`` become the question and the
    remainder becomes the answer. Otherwise ``None`` is returned.

    This is a simplified placeholder and needs adapting to real text patterns.
    """
    tokens = text.split(' ')
    if len(tokens) <= 4:
        return None
    head, tail = tokens[:4], tokens[4:]
    return {
        'question': ' '.join(head) + '?',
        'answer': ' '.join(tail),
    }

# Build the question/answer columns. `assign` returns new frames, so no column
# is written into a frame that came out of a filter or `dropna` (the pattern
# pandas reports with SettingWithCopyWarning).
df = df.assign(qa_pair=df['clean_text'].apply(create_qa_pair)).dropna(subset=['qa_pair'])
df = df.assign(
    question=df['qa_pair'].apply(lambda pair: pair['question']),
    answer=df['qa_pair'].apply(lambda pair: pair['answer']),
)

qa_df = df[['question', 'answer']]

# Split the QA data 80/10/10 into train/validation/test; the fixed
# random_state keeps the split reproducible across runs.
qa_train, qa_temp = train_test_split(qa_df, test_size=0.2, random_state=42)
qa_val, qa_test = train_test_split(qa_temp, test_size=0.5, random_state=42)

# # Save QA datasets
# qa_train.to_json('qa_train.jsonl', orient='records', lines=True)
# qa_val.to_json('qa_validation.jsonl', orient='records', lines=True)
# qa_test.to_json('qa_test.jsonl', orient='records', lines=True)

# # Create Hugging Face datasets for QA
# train_dataset = Dataset.from_pandas(qa_train)
# val_dataset = Dataset.from_pandas(qa_val)
# test_dataset = Dataset.from_pandas(qa_test)

# qa_dataset = DatasetDict({
#     'train': train_dataset,
#     'validation': val_dataset,
#     'test': test_dataset
# })

# # Save QA datasets
# qa_dataset.save_to_disk('qa_dataset')