In [12]:
import os
import re
import requests
from bs4 import BeautifulSoup
from time import sleep

# Ensure the output directory exists. exist_ok=True makes this one call that
# succeeds whether or not the directory is already present, avoiding the gap
# between a separate existence check and the creation.
output_dir = "Country-performers"
os.makedirs(output_dir, exist_ok=True)

def clean_performer_name(raw_name):
    """Extract a plain performer name from one wiki-list line.

    The line is expected to look like ``*[[Name]]``, ``*[[Target|Label]]`` or
    either of those followed by a parenthesised annotation such as birth and
    death years.

    Processing steps:
      1. Drop leading list bullets (one or more ``*``) and the whitespace
         around them, even when the bullet is not directly attached to
         the opening ``[[``.
      2. Remove ``[[``, ``]]`` and every parenthesised group.
      3. For piped links keep only the text after the last ``|``.
      4. Collapse runs of whitespace so that removing a parenthetical from
         the middle of a name does not leave a double space behind.

    Args:
        raw_name: One raw line of text.

    Returns:
        The cleaned name; an empty string if nothing remains.

    NOTE(review): for a piped link the returned text is the part after the
    pipe, not the part before it, and parentheticals are removed from both.
    ``[[Alabama (band)|Alabama]]`` therefore yields ``Alabama``. If the
    result is used as a page title this may name a different page than the
    link pointed at -- confirm against the caller.
    """
    # Leading bullets are list markup, not part of the name.
    name = re.sub(r'^\s*\*+\s*', '', raw_name)
    # Remove link brackets (with an attached '*', if any) and parentheticals.
    name = re.sub(r'\*?\[\[|\]\]|\(.*?\)', '', name)
    # "[[A|B]]" has become "A|B" at this point; keep the part after the pipe.
    name = name.split('|')[-1]
    # split()/join trims both ends and squeezes inner whitespace runs.
    return ' '.join(name.split())

# Read the raw performer entries: one per line, surrounding whitespace
# trimmed, blank lines dropped.
performer_file = "performers.txt"  # Replace this with the actual path to your performers.txt file
with open(performer_file, 'r', encoding='utf-8') as f:
    raw_performers = [entry for entry in map(str.strip, f) if entry]

# Run every raw entry through the name cleaner, preserving order.
performers = list(map(clean_performer_name, raw_performers))

# Endpoint and request headers used for every Wikipedia API call.
# NOTE(review): the User-Agent value looks like a placeholder -- confirm it
# is replaced with a real application name and contact address before use.
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"
headers = {"User-Agent": "YourAppName/1.0 (your_email@example.com)"}

# Process each page individually: request the rendered article from the
# API, reduce the HTML to plain text, and write it to a file in output_dir.
REQUEST_TIMEOUT_SECONDS = 30   # upper bound per request so one stalled call cannot hang the run
REQUEST_DELAY_SECONDS = 0.5    # pause after every request, successful or not

for title in performers:
    # An empty name cannot identify a page; skip it without spending a request.
    if not title:
        print("Empty performer name. Skipping.")
        continue

    title_formatted = title.replace(' ', '_')

    params = {
        "action": "parse",
        "page": title_formatted,
        "prop": "text",
        # Ask the API to resolve redirects so the target article is parsed
        # instead of the redirect stub.
        "redirects": 1,
        "format": "json"
    }

    try:
        response = requests.get(
            WIKIPEDIA_API_URL,
            params=params,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()

        # The API reports problems inside a normal JSON body under "error".
        if 'error' in data:
            error_code = data['error'].get('code', '')
            if error_code == 'missingtitle':
                print(f"Page '{title}' is missing. Skipping.")
            elif error_code == 'redirects':
                print(f"Page '{title}' is a redirect. Skipping.")
            else:
                print(f"An error occurred with page '{title}': {data['error'].get('info', '')}")
            continue

        if 'parse' not in data:
            print(f"No content found for page '{title}'. Skipping.")
            continue

        html_content = data['parse']['text']['*']
        # Use BeautifulSoup to strip the markup and keep only the text.
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text()

        # A path separator inside the title would make the filename point
        # into a subdirectory that does not exist, so neutralise it.
        safe_name = title_formatted.replace('/', '_').replace('\\', '_')

        # Save the text in the output_dir folder.
        filename = os.path.join(output_dir, f"{safe_name}_plain.txt")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Downloaded plain text for {title}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching '{title}': {e}")
    except Exception as e:
        # Deliberate catch-all at the batch boundary: one bad page is
        # reported and the run moves on to the next performer.
        print(f"An unexpected error occurred with '{title}': {e}")
    finally:
        # Respectful crawling: runs on every path out of the try block,
        # including the `continue` branches and the exception handlers.
        sleep(REQUEST_DELAY_SECONDS)


Downloaded plain text for The Abrams Brothers
Downloaded plain text for Ace in the Hole Band
Downloaded plain text for Roy Acuff
Downloaded plain text for Kay Adams
Downloaded plain text for Ryan Adams
Downloaded plain text for Doug Adkins
Downloaded plain text for Trace Adkins
Downloaded plain text for David "Stringbean" Akeman
Downloaded plain text for Rhett Akins
Downloaded plain text for Alabama
Downloaded plain text for Lauren Alaina
Downloaded plain text for Jason Aldean
Downloaded plain text for Alee
Downloaded plain text for Daniele Alexander
Downloaded plain text for Jessi Alexander
Downloaded plain text for Gary Allan
Downloaded plain text for Susie Allanson
Downloaded plain text for Deborah Allen
Downloaded plain text for Duane Allen
Downloaded plain text for Harley Allen
Downloaded plain text for Jimmie Allen
Downloaded plain text for Rex Allen
Downloaded plain text for Terry Allen
Downloaded plain text for Allman Brothers Band
Downloaded plain text for Gregg Allman
Downloa