In [9]:
import os
import re
import requests
from bs4 import BeautifulSoup
from time import sleep

# Ensure the output directory exists. exist_ok=True does this in a single
# call instead of a separate exists() check followed by makedirs().
output_dir = "Country-performers"
os.makedirs(output_dir, exist_ok=True)

# Build the list of Wikipedia page titles from the first element of every
# entry in `performers`. `performers` is defined outside this cell and must
# already exist when this runs (otherwise a NameError is raised here).
#
# Titles are normalised (trimmed, spaces -> underscores) BEFORE they are
# de-duplicated, so variants that differ only in surrounding whitespace
# collapse into a single request. Blank titles are dropped, and the result
# is sorted so that every run processes pages in the same order.
page_titles = sorted({
    performer[0].strip().replace(' ', '_')
    for performer in performers
    if performer[0].strip()
})

# Wikipedia API setup
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"
# NOTE(review): the User-Agent below is a placeholder. Replace it with a real
# application name and contact address before running against Wikipedia.
headers = {
    'User-Agent': 'YourAppName/1.0 (your_email@example.com)'
}

# Download every page in `page_titles` through the MediaWiki "parse" API,
# strip the returned HTML down to plain text, and write one
# "<title>_plain.txt" file per page into `output_dir`.

# Upper bound (seconds) for a single HTTP request. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

for title in page_titles:
    params = {
        "action": "parse",
        "page": title,
        "prop": "text",
        "format": "json"
        # "redirects" is deliberately not set, so redirects are not resolved.
    }

    try:
        response = requests.get(
            WIKIPEDIA_API_URL,
            params=params,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        data = response.json()

        # The API reports problems inside the JSON body; report and move on.
        if 'error' in data:
            error_code = data['error'].get('code', '')
            if error_code == 'missingtitle':
                print(f"Page '{title}' is missing. Skipping.")
            elif error_code == 'redirects':
                # NOTE(review): without "redirects", action=parse appears to
                # return the redirect stub itself rather than an error, so
                # this branch may never fire -- confirm against the API docs.
                print(f"Page '{title}' is a redirect. Skipping.")
            else:
                print(f"An error occurred with page '{title}': {data['error'].get('info', '')}")
            continue

        if 'parse' in data:
            html_content = data['parse']['text']['*']
            # Use BeautifulSoup to reduce the rendered HTML to plain text.
            soup = BeautifulSoup(html_content, 'html.parser')
            text = soup.get_text()

            # Titles may contain characters that are path separators or are
            # reserved in filenames; replace them so the file always lands
            # directly inside output_dir.
            safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
            filename = os.path.join(output_dir, f"{safe_title}_plain.txt")
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(text)
            print(f"Downloaded plain text for {title}")
        else:
            print(f"No content found for page '{title}'. Skipping.")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching '{title}': {e}")
    except Exception as e:
        # Best-effort batch job: report the failure and keep going with the
        # remaining titles rather than aborting the whole run.
        print(f"An unexpected error occurred with '{title}': {e}")
    finally:
        # Respectful crawling: pause after EVERY request, including the ones
        # that ended in `continue` or an exception above.
        sleep(0.5)


NameError: name 'performers' is not defined