In [15]:
import subprocess
import sys
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from googletrans import Translator

# Install (or pin) the third-party packages this script relies on, using pip
# from the interpreter that is currently running the script.
# NOTE: the import statements above have already executed by the time this
# block runs, so a missing package raises ImportError before the install is
# attempted; this step cannot bootstrap a fresh environment on its own.
# If pip exits with a non-zero status, the error is reported on stderr and the
# script terminates with exit status 1.
try:
    print("Installing required dependencies...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "googletrans==4.0.0rc1", "requests", "beautifulsoup4", "pandas"])
    print("Installation complete. Proceeding with scraping and translation...")
except subprocess.CalledProcessError as e:
    print(f"Error during installation: {e}", file=sys.stderr)
    sys.exit(1)

# Headers sent with every HTTP request so the request carries a desktop
# browser User-Agent string.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Topic label and keywords passed to scrape_bbc_yoruba() by the __main__ guard.
# The topic is stored in the 'topic' column and in the output file name.
# The keywords are matched as plain substrings against each candidate URL and
# page, so short keywords can match inside longer, unrelated words.
# NOTE(review): presumably these are Yoruba finance terms written without tone
# marks — confirm they match the spelling used on the site.
TARGET_TOPIC = 'finance'
TARGET_KEYWORDS = ['owo', 'aje', 'okowo']

# NOTE(review): BASE_URL is not referenced by the code visible in this file;
# scrape_bbc_yoruba() defines its own local copy of the same address.
BASE_URL = 'https://www.bbc.com/yoruba'
# Single googletrans client shared by every translate_text() call.
translator = Translator()

def _split_on_whitespace(text, limit):
    """Split text into stripped pieces of at most `limit` characters, breaking at whitespace when possible."""
    pieces = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + limit, length)
        # Only back up when the cut would land in the middle of a word.
        if end < length and not text[end].isspace():
            cut = max(text.rfind(sep, start, end) for sep in (' ', '\n'))
            if cut > start:
                end = cut
            # Otherwise the window holds a single unbroken run: hard-split it.
        piece = text[start:end].strip()
        if piece:
            pieces.append(piece)
        start = end
    return pieces


def translate_text(text):
    """Translate Yoruba text to English using the shared googletrans client.

    The text is sent in pieces of at most 4000 characters to stay under the
    service's length limit. Pieces are cut at whitespace so that no word is
    split in two and then re-joined with a stray space in the middle.

    Args:
        text: Yoruba text to translate. ``None``, empty and whitespace-only
            values are accepted.

    Returns:
        The translated pieces joined with single spaces. A piece that still
        fails after all attempts is replaced by ``'Translation not
        available.'``. Returns ``''`` when there is nothing to translate.
    """
    max_chunk_chars = 4000
    retries = 3

    # Nothing to send: skip the API call (and its delay) entirely.
    if not text or not text.strip():
        return ""

    translated_chunks = []
    for chunk in _split_on_whitespace(text, max_chunk_chars):
        for attempt in range(1, retries + 1):
            try:
                translated = translator.translate(chunk, src='yo', dest='en').text
            except Exception as e:
                # Best-effort by design: any client failure is retried with
                # exponential backoff (2s, 4s) and finally replaced by a
                # placeholder instead of aborting the whole article.
                if attempt < retries:
                    delay = 2 ** attempt
                    print(f"Translation error: {e}. Retrying in {delay} seconds...", file=sys.stderr)
                    time.sleep(delay)
                else:
                    print(f"Translation failed after {retries} attempts for a chunk.", file=sys.stderr)
                    translated_chunks.append('Translation not available.')
            else:
                translated_chunks.append(translated)
                time.sleep(1)  # Be respectful of the API
                break
    return " ".join(translated_chunks)

def get_article_content(url):
    """Download one article page and pull out its headline and paragraph text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A ``(title, text)`` tuple:
        - ``(title, text)`` when a content container was found; ``title`` is
          ``'No Title'`` if the page has no ``<h1>``, and ``text`` is the
          whitespace-normalised concatenation of the container's ``<p>`` tags.
        - ``(title, None)`` when no ``<article>``, ``<main>`` or
          ``<div role="main">`` element exists on the page.
        - ``(None, None)`` when the HTTP request fails.
    """
    try:
        page = requests.get(url, headers=HEADERS, timeout=10)
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, 'html.parser')

        heading = parsed.find('h1')
        if heading:
            title = heading.get_text(strip=True)
        else:
            title = 'No Title'

        # Try the candidate containers in order of preference.
        container = parsed.find('article')
        if not container:
            container = parsed.find('main')
        if not container:
            container = parsed.find('div', role='main')
        if not container:
            print(f"Could not find main article body for {url}", file=sys.stderr)
            return title, None

        raw_text = ' '.join([paragraph.get_text() for paragraph in container.find_all('p')])

        # Collapse every run of whitespace into a single space.
        return title, re.sub(r'\s+', ' ', raw_text).strip()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching article at {url}: {e}", file=sys.stderr)
        return None, None

def _page_matches_keywords(url, keywords):
    """Return True if any lower-case keyword occurs in the URL or in the page's HTML."""
    lowered_url = url.lower()
    if any(keyword in lowered_url for keyword in keywords):
        return True
    try:
        # Download the page once, with a timeout, no matter how many keywords
        # have to be checked against it.
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # One unreachable page must not abort the whole run.
        print(f"Error fetching {url} for keyword filtering: {e}", file=sys.stderr)
        return False
    page_text = response.text.lower()
    return any(keyword in page_text for keyword in keywords)


def scrape_bbc_yoruba(topic, keywords):
    """Scrape keyword-matching BBC Yoruba articles, translate them and save a CSV.

    Steps:
      1. Fetch the BBC Yoruba front page and collect the absolute links that
         point below it.
      2. Keep, from the first 50 links in sorted order, those whose URL or
         page HTML contains at least one keyword (case-insensitive substring
         match).
      3. Fetch each remaining page concurrently, translate title and body to
         English, and collect one row per article with more than 20
         characters of body text.
      4. Write the rows to ``yoruba_<topic>_datasets.csv``.

    Args:
        topic: Label stored in the ``topic`` column and used in the file name.
        keywords: Iterable of keyword strings used to filter the links.

    Returns:
        None. Progress is printed to stdout and problems to stderr. If the
        front page cannot be fetched the function reports it and returns
        without writing a file.
    """
    base_url = 'https://www.bbc.com/yoruba'
    link_prefix = base_url + '/'
    # Pages and URLs are compared in lower case, so the keywords must be too.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    data = []

    # Only the front-page request is guarded here, so the error message below
    # is accurate; per-article failures are handled where they occur.
    try:
        print(f"Fetching articles from {base_url}...")
        response = requests.get(base_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the main page: {e}", file=sys.stderr)
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    article_urls = {
        link['href']
        for link in soup.find_all('a', href=True)
        if link['href'].startswith(link_prefix)
    }
    # Sort before truncating so the same front page always yields the same
    # 50 candidates (set iteration order differs between runs).
    candidate_urls = sorted(article_urls)[:50]

    with ThreadPoolExecutor(max_workers=5) as executor:
        # Filter the candidates by keyword, checking pages concurrently.
        checks = [
            executor.submit(_page_matches_keywords, url, lowered_keywords)
            for url in candidate_urls
        ]
        filtered_urls = [
            url for url, check in zip(candidate_urls, checks) if check.result()
        ]

        print(f"Found {len(filtered_urls)} {topic}-related articles. Scraping...")

        future_to_url = {executor.submit(get_article_content, url): url for url in filtered_urls}
        for position, future in enumerate(as_completed(future_to_url), start=1):
            article_url = future_to_url[future]
            print(f"Processing article {position}/{len(filtered_urls)}: {article_url}")
            try:
                yoruba_title, article_content_yoruba = future.result()

                if article_content_yoruba and len(article_content_yoruba) > 20:
                    data.append({
                        'topic': topic,
                        'yoruba_title': yoruba_title,
                        'english_title': translate_text(yoruba_title),
                        'yoruba_content': article_content_yoruba,
                        'english_content': translate_text(article_content_yoruba),
                        'url': article_url
                    })
                else:
                    print(f"Skipping article at {article_url} due to empty content.", file=sys.stderr)
            except Exception as exc:
                # Keep going: one bad article should not lose the others.
                print(f'{article_url} generated an exception: {exc}', file=sys.stderr)

    if data:
        df = pd.DataFrame(data)
        output_file = f'yoruba_{topic}_datasets.csv'
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"\n--- Successfully scraped and saved data to '{output_file}' ---")
    else:
        print(f"\nNo data was scraped for {topic}. Check the website structure or your internet connection.", file=sys.stderr)

# Script entry point: run the scrape with the topic and keywords configured at
# the top of the file, but only when executed directly (not when imported).
if __name__ == '__main__':
    scrape_bbc_yoruba(TARGET_TOPIC, TARGET_KEYWORDS)


Installing required dependencies...
Installation complete. Proceeding with scraping and translation...
Fetching articles from https://www.bbc.com/yoruba...
Found 49 finance-related articles. Scraping...
Processing article 1/49: https://www.bbc.com/yoruba/articles/c05e4djvzlro
Processing article 2/49: https://www.bbc.com/yoruba/articles/ceq2xq5rw70o
Processing article 3/49: https://www.bbc.com/yoruba/articles/c7v13lj65rzo
Processing article 4/49: https://www.bbc.com/yoruba/articles/cx2p061mv3jo
Processing article 5/49: https://www.bbc.com/yoruba/articles/cd07jj2xe1ko
Processing article 6/49: https://www.bbc.com/yoruba/articles/c98l8dg40rdo
Processing article 7/49: https://www.bbc.com/yoruba/articles/cwyrxq4d0r7o
Processing article 8/49: https://www.bbc.com/yoruba/articles/cger8wp4zqyo
Processing article 9/49: https://www.bbc.com/yoruba/articles/c15lpe11qwwo
Processing article 10/49: https://www.bbc.com/yoruba/articles/cdd3lj97pddo
Processing article 11/49: https://www.bbc.com/yoruba/art

Skipping article at https://www.bbc.com/yoruba/topics/c340q0y3p5kt due to empty content.


Processing article 27/49: https://www.bbc.com/yoruba/articles/c20v3jxzzepo
Processing article 28/49: https://www.bbc.com/yoruba/articles/c0qlxp5lxj0o
Processing article 29/49: https://www.bbc.com/yoruba/articles/c931lv3ge4wo
Processing article 30/49: https://www.bbc.com/yoruba/articles/cn4lzxg14wro
Processing article 31/49: https://www.bbc.com/yoruba/articles/cx2x59n4d4jo
Processing article 32/49: https://www.bbc.com/yoruba/articles/cpv09ejglypo
Processing article 33/49: https://www.bbc.com/yoruba/articles/c3vzx66qn30o
Processing article 34/49: https://www.bbc.com/yoruba/articles/ckg2v8jwd47o
Processing article 35/49: https://www.bbc.com/yoruba/articles/cr4qlygvewxo
Processing article 36/49: https://www.bbc.com/yoruba/articles/cpwyg59jpzjo
Processing article 37/49: https://www.bbc.com/yoruba/topics/ck5rznlk6k3t
Processing article 38/49: https://www.bbc.com/yoruba/articles/cdxy704qe0zo


Skipping article at https://www.bbc.com/yoruba/topics/ck5rznlk6k3t due to empty content.


Processing article 39/49: https://www.bbc.com/yoruba/articles/czrp71rkv22o
Processing article 40/49: https://www.bbc.com/yoruba/articles/cwyw90gdxrro
Processing article 41/49: https://www.bbc.com/yoruba/articles/c9qyq3g103no
Processing article 42/49: https://www.bbc.com/yoruba/articles/ceq2x43dwn9o
Processing article 43/49: https://www.bbc.com/yoruba/articles/c7542p2qe7xo
Processing article 44/49: https://www.bbc.com/yoruba/articles/cdr6lp4r258o
Processing article 45/49: https://www.bbc.com/yoruba/articles/c4gl49q0qp7o
Processing article 46/49: https://www.bbc.com/yoruba/articles/cjr1r55821zo
Processing article 47/49: https://www.bbc.com/yoruba/articles/c8x520v41gdo
Processing article 48/49: https://www.bbc.com/yoruba/articles/cdr60zrg6m1o
Processing article 49/49: https://www.bbc.com/yoruba/articles/cpd9vdneqn9o

--- Successfully scraped and saved data to 'yoruba_finance_datasets.csv' ---
