In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
# Root of every databank page; a per-character slug is appended to it.
BASE_URL = "https://www.starwars.com/databank/"

# Character names come from the "name" column of the local CSV.
characters = pd.read_csv("Characters.csv")
names = characters["name"].tolist()

# Slug rule: lower-case the name, drop apostrophes, then join the
# whitespace-separated words with hyphens.
URL_ADDITIONS = [
    "-".join(character_name.lower().replace("'", "").split())
    for character_name in names
]

In [3]:
# Running count of failed pages; incremented through add_one_error().
number_of_errors = 0
# URLs that could not be scraped, in the order the failures were recorded.
errors = []

def add_one_error():
    """Increase the module-level ``number_of_errors`` counter by one."""
    global number_of_errors
    number_of_errors = number_of_errors + 1

def get_page(url, timeout=30):
    """Download `url` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled connection would block the whole
            scrape indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response text, or ``None``
        when the request itself failed. On failure the URL is appended to
        ``errors`` and the error counter is incremented.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only request-level failures (connection, timeout, bad URL, ...)
        # are treated as a failed page; KeyboardInterrupt and programming
        # errors are allowed to propagate instead of being swallowed.
        errors.append(url)
        add_one_error()
        return None
    # The HTTP status is deliberately not checked here: the body is parsed
    # whatever the status code and handed back to the caller.
    return BeautifulSoup(response.text, "html.parser")

def check_if_error_404(soup):
    """Report whether `soup` lacks the description paragraph.

    A page is treated as an error page when it contains no
    ``<p class="desc">`` element.

    Args:
        soup: Parsed page, or ``None``.

    Returns:
        ``True`` when `soup` is ``None`` or has no ``<p class="desc">``
        element, ``False`` otherwise.
    """
    if soup is None:
        return True
    # Explicit emptiness check instead of indexing inside a bare ``except``,
    # which would also hide KeyboardInterrupt and genuine bugs.
    return len(soup.find_all("p", class_="desc")) == 0

def get_properties(soup):
    """Collect the stats module of a page as a heading -> values mapping.

    Args:
        soup: Parsed page.

    Returns:
        A dict whose keys are the text of each category's ``heading`` div
        and whose values are lists with the text of that category's
        ``property-name`` divs. Returns an empty dict when the page has no
        stats section; categories without a heading are skipped.
    """
    stats = soup.find_all("section", class_="module stats span-full-screen content-span-full-screen secondary-theme dark")
    if not stats:
        # No stats module on this page: report "no properties" rather than
        # raising IndexError and aborting the caller's run.
        return {}

    properties = {}
    # Only the first matching section is read.
    for category in stats[0].find_all("div", class_="category"):
        headings = category.find_all("div", class_="heading")
        if not headings:
            # Without a heading there is no key to store the values under.
            continue
        properties[headings[0].text] = [
            p.text for p in category.find_all("div", class_="property-name")
        ]
    return properties

def get_description(soup):
    """Return the text of the first ``<p class="desc">`` element in `soup`.

    Raises:
        IndexError: If the page has no such element.
    """
    desc_paragraphs = soup.find_all("p", class_="desc")
    return desc_paragraphs[0].text

def get_history(soup):
    """Extract the history text and the link texts it contains.

    Only the first ``<div class="rich-text-output">`` of the page is read.

    Args:
        soup: Parsed page.

    Returns:
        ``(None, None)`` when the page has no ``rich-text-output`` div.
        Otherwise ``(text, links)`` where ``text`` is every ``<p>`` text of
        that div, each prefixed with one space and concatenated, and
        ``links`` is the stripped text of every ``<a>`` inside those
        paragraphs, in document order.
    """
    rich_text_blocks = soup.find_all("div", class_="rich-text-output")
    if not rich_text_blocks:
        return None, None

    paragraphs = rich_text_blocks[0].find_all("p")
    link_texts = [
        anchor.text.strip()
        for paragraph in paragraphs
        for anchor in paragraph.find_all("a")
    ]
    # Each paragraph is prefixed with a space, so a non-empty result starts
    # with a leading space.
    history_text = "".join(" " + paragraph.text for paragraph in paragraphs)
    return history_text, link_texts

def get_descriptions(url):
    """Scrape one page and gather its description, links, stats and history.

    Args:
        url: Full address of the page to scrape.

    Returns:
        ``None`` when the page could not be fetched or has no description
        paragraph. Otherwise the tuple
        ``(description, referal, properties, history)`` built from
        ``get_description``, ``get_history`` and ``get_properties``;
        ``referal`` and ``history`` are both ``None`` when the page has no
        history block.
    """
    soup = get_page(url)
    if soup is None:
        # get_page() has already appended this URL to `errors` and bumped
        # the counter; recording it again here would count one failed
        # request twice.
        return None
    if check_if_error_404(soup):
        errors.append(url)
        add_one_error()
        return None

    description = get_description(soup)
    history, referal = get_history(soup)
    properties = get_properties(soup)
    # The return order differs from the order the pieces are computed in.
    return description, referal, properties, history

In [4]:
from tqdm import tqdm

# Parallel result columns: entry k of every list belongs to the same
# successfully scraped character.
new_names = []
descriptions = []
referals = []
properties = []
histories = []

# Walk names and URL slugs in lockstep; `total` lets tqdm show a full
# progress bar even though zip() has no length.
for name, url_addition in tqdm(zip(names, URL_ADDITIONS), total=len(names)):
    full_url = BASE_URL + url_addition
    descriptions_tuple = get_descriptions(full_url)

    # get_descriptions() returns None for pages it could not scrape; those
    # characters are left out of every result list.
    if descriptions_tuple is None:
        continue

    # Unpack into `page_properties` rather than `property`, so the builtin
    # `property` is not shadowed for the rest of the kernel session.
    description, referal, page_properties, history = descriptions_tuple
    new_names.append(name)
    descriptions.append(description)
    referals.append(referal)
    properties.append(page_properties)
    histories.append(history)

100%|██████████| 964/964 [14:34<00:00,  1.10it/s]


In [8]:
# One row per successfully scraped character.
# - "history" must come from the `histories` list: the bare `history` name
#   is only the last loop iteration's value, which pandas would broadcast to
#   every row.
# - "name" is included because failed pages are skipped, so the row index
#   alone no longer identifies the character.
df = pd.DataFrame({
    "name": new_names,
    "description": descriptions,
    "referals": referals,
    "properties": properties,
    "history": histories,
})
df.to_csv("descriptions.csv", sep=";")

In [11]:
# Report the failure count and save the failed URLs, one per line.
print("Number of errors: ", number_of_errors)
with open("errors.csv", "w") as error_file:
    error_file.writelines(failed_url + "\n" for failed_url in errors)

Number of errors:  70
