In [2]:
import requests
import json
from bs4 import BeautifulSoup
import time
import re
import pandas as pd

# Read the info_actors.csv file
csv_file_path = r"C:/Users/Bas van Houten/Documents/Master/Thesis/gen/data-preparation/temp/info_actors.csv"
df = pd.read_csv(csv_file_path)

# Extract the primaryName column
names = df["primaryName"]

# Construct the URLs: lower-case each name and turn spaces into hyphens.
# pandas loads empty cells as float NaN, which has no .lower(); skip any
# non-string entry instead of crashing on it.
base_url = "https://ethnicelebs.com/"
urls = [
    base_url + name.lower().replace(" ", "-")
    for name in names
    if isinstance(name, str)
]

# Create a list to store the scraped data
data = []

# Send a browser-like User-Agent header with every request
# NOTE(review): presumably the site rejects the default requests
# User-Agent -- confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Limit the URLs to the first MAX_URLS entries
MAX_URLS = 100
urls = urls[:MAX_URLS]

# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would hang the whole run.
REQUEST_TIMEOUT = 10

# Compiled once, outside the loop: the first run of four digits in the
# "Date of Birth" paragraph is taken as the birth year.
YEAR_PATTERN = re.compile(r"\d{4}")

# Loop through the URLs
for url in urls:
    # Recover the name from the last URL segment (hyphens become spaces).
    # NOTE(review): this is lossy -- names that contained hyphens come back
    # with spaces instead; confirm that downstream matching tolerates this.
    name = url.split("/")[-1].replace("-", " ")

    # Values recorded when the page cannot be fetched or lacks the field
    ethnicity = "N/A"
    birth_year = None

    try:
        # The context manager closes the response even if parsing raises
        with requests.get(
            url, headers=headers, timeout=REQUEST_TIMEOUT
        ) as response:
            # Treat 4xx/5xx as failures so error pages are never parsed
            # as if they were celebrity profiles
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
    except requests.RequestException as error:
        # One unreachable or missing page must not abort the whole run;
        # keep the defaults for this entry and carry on
        print(f"Skipping {url}: {error}")
    else:
        # Take the birth year from the first <p> mentioning "Date of Birth"
        for p_tag in soup.find_all("p"):
            if "Date of Birth" in p_tag.text:
                year_match = YEAR_PATTERN.search(p_tag.text)
                if year_match:
                    birth_year = year_match.group(0)
                break

        # Prefer the <strong> tag that actually carries the ethnicity label;
        # fall back to the first <strong> on the page when none is labelled
        strong_tags = soup.find_all("strong")
        strong_tag = next(
            (tag for tag in strong_tags if "Ethnicity" in tag.text),
            strong_tags[0] if strong_tags else None,
        )
        if strong_tag is not None:
            ethnicity = strong_tag.text.replace("Ethnicity:", "").strip()

    # Store one record per URL, including the ones that failed
    entry = {
        "Name": name,
        "Ethnicity": ethnicity,
        "Birth Year": birth_year
    }
    data.append(entry)

    # Wait 1 second between requests, whether or not this one succeeded
    time.sleep(1)

# Serialize the collected entries and save them as a JSON file
file_name = "scraped_data_sample.json"
with open(file_name, "w") as out_handle:
    # json.dumps with indent=4 yields the same text json.dump would write
    out_handle.write(json.dumps(data, indent=4))

# Report where the output was saved
print("Data written to:", file_name)


Data written to: scraped_data_sample.json
