In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from tqdm import tqdm
from ast import literal_eval

To get data on all the Star Wars characters from the official Star Wars Databank, we first need to know every URL to fetch the data from. We build that list by requesting the names of all characters from the `starwars-databank-server.vercel.app` API:

In [None]:
# Starting URL of the character listing served by the starwars-databank-server API.
URL_CHARACTERS = "https://starwars-databank-server.vercel.app/api/v1/characters"

def getResponse(url, timeout=30):
    """Fetch ``url`` with HTTP GET and return its decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        ValueError: If the response status code is not 200.
        requests.RequestException: On connection problems or when the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Failed to get data. Status code: {response.status_code}")
    return response.json()

def cleanString(var):
    """Return ``var`` as text with every double quote swapped for a single quote."""
    text = str(var)
    return text.replace('"', "'")

def getNextUrl(data):
    """Build the absolute URL of the next API page from a response payload.

    Args:
        data: Decoded JSON response; the relative link to the following
            page is read from ``data["info"]["next"]``.

    Returns:
        The base URL joined with that relative link, or ``None`` when the
        payload carries no usable link (key missing, value not a string,
        or ``data`` not subscriptable).
    """
    BASE_URL = "https://starwars-databank-server.vercel.app/"
    # Only the lookups are guarded, and only against the errors a missing or
    # malformed payload can raise, so Ctrl-C is no longer swallowed.
    try:
        next_path = data["info"]["next"]
    except (KeyError, TypeError, IndexError):
        return None
    # A non-string value (e.g. None) cannot be concatenated to the base URL.
    if not isinstance(next_path, str):
        return None
    return BASE_URL + next_path

In [None]:
character_df_old = None
temp_url = URL_CHARACTERS
character_name_list = []

# The API hands out the characters one page at a time, so keep following the
# "next" link (via getNextUrl) until there is none left, for at most 100 pages.
for _ in tqdm(range(100)):
    data = getResponse(temp_url)
    page_names = [cleanString(character["name"]) for character in data["data"]]
    character_name_list.extend(page_names)

    temp_url = getNextUrl(data)
    if temp_url is None:
        print("Done!")
        break

# Turn every name into a URL slug: lower-case, apostrophes dropped,
# whitespace-separated words joined with hyphens.
url_name_list = []
for name in character_name_list:
    slug_words = name.lower().replace("'", "").split()
    url_name_list.append("-".join(slug_words))

 96%|█████████▌| 96/100 [00:54<00:02,  1.75it/s]

Done!





Now that we have a list of character names, we can start collecting the character data we actually want from the official Star Wars Databank website. We do this by appending each name, in its URL form, to the base URL and extracting the desired data from the resulting HTML page:

In [None]:
# Root of the official databank pages; a character's URL slug is appended to it.
BASE_URL = "https://www.starwars.com/databank/"
# One slug per entry of character_name_list, in the same order.
URL_ADDITIONS = url_name_list

# Shared failure log and failure counter used by the scraping helpers in this notebook.
errors = []
number_of_errors = 0

def add_one_error():
    """Bump the module-level ``number_of_errors`` counter by one."""
    global number_of_errors
    number_of_errors = number_of_errors + 1

def get_page(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    A failed request is reported only through the ``None`` return value.
    It is deliberately not written to ``errors`` here: every caller in this
    notebook already logs the failure itself, so logging it here as well
    counted it twice and mixed bare URL strings into lists of
    ``(name, url)`` tuples.

    Args:
        url: Page address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang a long scraping loop.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend,
        or ``None`` when the request itself fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (connection error, timeout, invalid URL, ...).
        return None
    return BeautifulSoup(response.text, "html.parser")

def check_if_error_404(soup):
    """Tell whether a parsed page lacks the description paragraph.

    The page is treated as an error page when no ``<p class="desc">``
    element can be read from it.

    Args:
        soup: Parsed page exposing ``find_all``; ``None`` is accepted and
            counts as an error page.

    Returns:
        ``True`` if the description paragraph is missing, ``False`` otherwise.
    """
    try:
        soup.find_all("p", class_="desc")[0].text
    except (AttributeError, IndexError, TypeError):
        # Only "element not there" style failures mean "error page";
        # anything else (for example KeyboardInterrupt) must propagate.
        return True
    return False

def get_properties(soup):
    """Collect the entries of the page's stats section, grouped by heading.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        A dict mapping the text of each category heading to the list of
        property-name texts found in that category. The dict is empty when
        the page has no stats section.
    """
    stats = soup.find_all("section", class_="module stats span-full-screen content-span-full-screen secondary-theme dark")
    if len(stats) == 0:
        # A page without a stats section yields no properties instead of an
        # IndexError, mirroring how get_history treats a missing section.
        return {}
    properties = {}
    for category in stats[0].find_all("div", class_="category"):
        heading = category.find_all("div", class_="heading")[0].text
        properties[heading] = [p.text for p in category.find_all("div", class_="property-name")]
    return properties

def get_description(soup):
    """Return the text of the first ``<p class="desc">`` element, newlines removed."""
    first_desc = soup.find_all("p", class_="desc")[0]
    return first_desc.text.replace("\n", "")

def get_history(soup):
    """Extract the history text and the link texts it contains.

    Only the first ``<div class="rich-text-output">`` of the page is read.

    Returns:
        A ``(text, links)`` pair: ``text`` is every paragraph's text, each
        prefixed with a single space and concatenated; ``links`` holds the
        stripped text of every ``<a>`` found inside those paragraphs.
        ``(None, None)`` when the page has no such div.
    """
    rich_text_blocks = soup.find_all("div", class_="rich-text-output")
    if len(rich_text_blocks) == 0:
        return None, None

    paragraphs = rich_text_blocks[0].find_all("p")
    history_text = "".join(" " + paragraph.text for paragraph in paragraphs)
    link_texts = [
        anchor.text.strip()
        for paragraph in paragraphs
        for anchor in paragraph.find_all("a")
    ]
    return history_text, link_texts

def get_descriptions(url):
    """Scrape one databank page into its four data pieces.

    Returns:
        ``(description, referal, properties, history)`` for a usable page.
        ``None`` when the page could not be fetched or is recognised as an
        error page; in that case ``url`` is appended to ``errors`` and the
        error counter is incremented.
    """
    soup = get_page(url)
    page_is_usable = soup is not None and not check_if_error_404(soup)
    if not page_is_usable:
        errors.append(url)
        add_one_error()
        return None

    description = get_description(soup)
    history, referal = get_history(soup)
    properties = get_properties(soup)
    scraped = (description, referal, properties, history)
    return scraped

#### Getting all the correct URLs before proceeding

In [None]:
def check_if_working_url(url):
    """Return ``True`` when ``url`` can be fetched and is not an error page."""
    soup = get_page(url)
    return soup is not None and not check_if_error_404(soup)

In [None]:
all_urls, errors = [], []

# Probe every candidate page once and sort the (name, url) pair into the
# list of working URLs or the list of failures.
for i in tqdm(range(len(character_name_list))):
    name, url_addition = character_name_list[i], URL_ADDITIONS[i]
    full_url = f"{BASE_URL}{url_addition}"
    bucket = all_urls if check_if_working_url(full_url) else errors
    bucket.append((name, full_url))

100%|██████████| 964/964 [11:32<00:00,  1.39it/s]


### Concatenate the cleaned errors with all_urls

In [None]:
potentially_cleaned_urls = []

# Read the file inside a context manager so the handle is closed as soon as
# the lines are in memory (the bare open() call never closed it).
with open("cleaned_errors.csv", "r") as cleaned_errors_file:
    cleaned_errors = [lst.split("\n") for lst in cleaned_errors_file.readlines()]

for cleaned_error in cleaned_errors:
    # Element 0 is the line content without its trailing newline. Blank lines
    # are skipped because they cannot be parsed by literal_eval later on.
    if cleaned_error[0].strip():
        potentially_cleaned_urls.append(cleaned_error[0])

In [None]:
errors = []

for i in tqdm(range(len(potentially_cleaned_urls))):
    # literal_eval turns the stored text back into a Python value; its first
    # two items are used as the character name and the page URL.
    parsed_entry = literal_eval(potentially_cleaned_urls[i])
    name, full_url = parsed_entry[0], parsed_entry[1]
    bucket = all_urls if check_if_working_url(full_url) else errors
    bucket.append((name, full_url))

100%|██████████| 68/68 [00:41<00:00,  1.63it/s]


In [None]:
# Persist the entries that still fail, one per line in their str() form.
with open("errors.csv", "w") as f:
    f.writelines(str(line) + "\n" for line in errors)

### Making the dataframe

In [None]:
# Column buffers for the final table: position k of every list belongs to the
# same character. The error log and counter are reset for this scraping pass.
new_names = []
errors = []
descriptions = []
referals = []
properties = []
histories = []
number_of_errors = 0

# all_urls is only read here. The loop used to append every successful entry
# back onto all_urls, which duplicated the whole list on each run of the cell.
for name, full_url in tqdm(all_urls):
    descriptions_tuple = get_descriptions(full_url)

    # get_descriptions returns None for pages it could not use and logs them itself.
    if descriptions_tuple is None:
        continue

    # character_properties avoids shadowing the built-in `property`.
    description, referal, character_properties, history = descriptions_tuple
    new_names.append(name)
    descriptions.append(description)
    referals.append(referal)
    properties.append(character_properties)
    histories.append(history)

print("Number of errors: ", number_of_errors)
print(*errors, sep="\n")

cleaned_descriptions = [description.replace("\n", "").replace("\r", "") for description in descriptions]
# The "history" column must be the per-character list `histories`. Passing the
# loop variable `history` broadcast the last character's history to every row.
df = pd.DataFrame({"name": new_names, "description": cleaned_descriptions, "referals": referals, "properties": properties, "history": histories})
df.to_csv("descriptions.csv", sep=";")

100%|██████████| 958/958 [10:44<00:00,  1.49it/s]

Number of errors:  0






In [None]:
# Report the failure count of the scraping pass and save the failed URLs, one per line.
print("Number of errors: ", number_of_errors)
with open("errors.csv", "w") as f:
    f.writelines(line + "\n" for line in errors)

Number of errors:  0
