### Libraries

In [1]:
import re
import os
import ast
import json
import urllib
import requests
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.request import urlopen

#### Create the directories

In [2]:
# Make sure every output folder exists before any cell writes into it.
# `exist_ok=True` keeps the cell safe to re-run.
for folder in ("Data", "Data/Characters", "Data/Episodes"):
    Path(folder).mkdir(parents=True, exist_ok=True)

#### Extract every planet on which characters reside

Because the Star Wars wikia does not offer any source that classifies the characters into a few meaningful categories, we decided that the most interesting option was to group them by the planet on which they reside.

In [3]:
# Index page whose member links are expected to point at one category page
# per planet (href of the form "/wiki/Category:<Something>").
page_title = "https://starwars.fandom.com/wiki/Category:Individuals_by_planet"
planet_pattern = re.compile(r'/wiki/(.*)')

page = requests.get(page_title, timeout=30)
# Fail loudly on an HTTP error instead of silently producing an empty list.
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

# `planets` collects everything after "/wiki/" for each member link.
planets = []
for link in soup.find_all("a", {"class": "category-page__member-link"}):
    # A link without an href, or one not starting with "/wiki/", is skipped
    # rather than raising AttributeError on a failed match.
    match = planet_pattern.match(link.get("href") or "")
    if match is None:
        continue
    planets.append(match.group(1))

#### Extract these characters along with the information on where they reside

In [4]:
# Patterns are loop-invariant, so they are compiled once up front.
character_pattern = re.compile(r'/wiki/(.*)')
category_pattern = re.compile(r'Category:')
legend_pattern = re.compile(r'(.*)/Legends')

# Entries of `planets` are expected to look like "Category:<Residence>";
# dropping that many leading characters yields the residence label.
category_prefix_len = len("Category:")

# Collect plain records and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop is quadratic anyway.
records = []
for planet in planets:
    page_title = "https://starwars.fandom.com/wiki/" + planet
    page = requests.get(page_title, timeout=30)
    # Surface HTTP errors instead of silently dropping a planet's characters.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    residence = planet[category_prefix_len:]
    for link in soup.find_all("a", {"class": "category-page__member-link"}):
        match = character_pattern.match(link.get("href") or "")
        if match is None:
            continue
        name = match.group(1)
        # Skip nested categories and any page whose name contains "/Legends".
        if category_pattern.match(name) or legend_pattern.match(name):
            continue
        records.append({'Name': name, 'Residence': residence})

char = pd.DataFrame(records, columns=['Name', 'Residence'])

#### Add characters from planets with more than one page

Some planets' category listings span more than one page, and the method above only collects the characters shown on the first page. The code below fetches the remaining pages so that all of their characters are added as well.

In [5]:
# Follow-up listing pages, keyed by the residence label they belong to.
# These URLs are hard-coded pagination anchors (`?from=...`).
rest_pages = {
    'Alderaanians': [
        'https://starwars.fandom.com/wiki/Category:Alderaanians?from=Othona%2C+Dalus%0ADalus+Othona',
    ],
    'Corellians': [
        'https://starwars.fandom.com/wiki/Category:Corellians?from=Forte%2C+Crale%0ACrale+Forte',
        'https://starwars.fandom.com/wiki/Category:Corellians?from=Mullawny%2C+Stax%0AStax+Mullawny',
        'https://starwars.fandom.com/wiki/Category:Corellians?from=Vergesso%2C+Ecile%0AEcile+Vergesso',
    ],
    'Coruscanti': [
        'https://starwars.fandom.com/wiki/Category:Coruscanti?from=Jool',
        'https://starwars.fandom.com/wiki/Category:Coruscanti?from=Sel%27Sabagno%2C+Elan%0AElan+Sel%27Sabagno%2FLegends',
    ],
    'Naboo': [
        'https://starwars.fandom.com/wiki/Category:Naboo?from=Unidentified+Naboo+girl',
    ],
    'Tatooinians': [
        'https://starwars.fandom.com/wiki/Category:Tatooinians?from=Drow%2C+Vek%0AVek+Drow',
        'https://starwars.fandom.com/wiki/Category:Tatooinians?from=Kithaba%2FLegends',
        'https://starwars.fandom.com/wiki/Category:Tatooinians?from=Rile%2C+Chastina%0AChastina+Rile',
        'https://starwars.fandom.com/wiki/Category:Tatooinians?from=Venn%2C+Tanis%0ATanis+Venn',
    ],
}

# Patterns are loop-invariant, so they are compiled once up front.
character_pattern = re.compile(r'/wiki/(.*)')
category_pattern = re.compile(r'Category:')
legend_pattern = re.compile(r'(.*)/Legends')

# Gather the extra rows first and concatenate once. DataFrame.append was
# removed in pandas 2.0; pd.concat is its replacement.
extra_records = []
for residence, page_urls in rest_pages.items():
    for page_title in page_urls:
        page = requests.get(page_title, timeout=30)
        # Surface HTTP errors instead of silently skipping a listing page.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        for link in soup.find_all("a", {"class": "category-page__member-link"}):
            match = character_pattern.match(link.get("href") or "")
            if match is None:
                continue
            name = match.group(1)
            # Skip nested categories and any page whose name contains "/Legends".
            if category_pattern.match(name) or legend_pattern.match(name):
                continue
            extra_records.append({'Name': name, 'Residence': residence})

char = pd.concat(
    [char, pd.DataFrame(extra_records, columns=['Name', 'Residence'])],
    ignore_index=True,
)

#### Create the DataFrame for the characters

In [6]:
# The names come from percent-encoded hrefs. Decode every escape sequence with
# urllib.parse.unquote instead of a hand-picked subset, so that no name keeps a
# stray "%XX" that would later be double-encoded when building API queries.
char = (
    char
    .assign(
        Name=char['Name'].map(urllib.parse.unquote),
        Residence=char['Residence'].map(urllib.parse.unquote),
    )
    # Drop a stray user-space page that shows up in the category listings.
    .loc[lambda df: df['Name'] != 'User:Hk_47/WiP2']
    .drop_duplicates(subset='Name')
    .sort_values(by=['Name'])
    # Any remaining name with a slash is discarded.
    .loc[lambda df: ~df['Name'].str.contains('/')]
    .reset_index(drop=True)
    # Page names use underscores; the table uses plain spaces.
    .assign(
        Name=lambda df: df['Name'].str.replace('_', ' ', regex=False),
        Residence=lambda df: df['Residence'].str.replace('_', ' ', regex=False),
    )
)

#### Create the characters directory with every character's page

In [7]:
# Page names use underscores where the table uses spaces.
temp_name = char['Name'].str.replace(' ', '_', regex=False)

# Loop-invariant part of the api.php query; only the title changes per request.
api_prefix = (
    "https://starwars.fandom.com/api.php?"
    "action=query&prop=revisions&rvprop=content&rvslots=*"
)

# Iterate over the values directly instead of `temp_name[i]`, which is a
# label lookup and only works when the index is exactly 0..n-1.
for page_name in temp_name:
    query = "{}&titles={}&format=json".format(
        api_prefix, urllib.parse.quote_plus(page_name)
    )

    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(query) as response:
        query_json = json.loads(response.read())

    # One JSON file per character, named after the page.
    out_path = './Data/Characters/' + page_name + '.txt'
    with open(out_path, 'w', encoding='utf-8') as outfile:
        json.dump(query_json, outfile)

#### Add attributes to every character

In [8]:
# Infobox patterns, compiled once as raw strings. The regexes themselves are
# unchanged, but raw strings avoid invalid-escape warnings.
homeworld_slash_pattern = re.compile(r'homeworld=\[\[(.*?)\/(?:.*?)?\]\]')
homeworld_plain_pattern = re.compile(r'homeworld=\[\[(.*?)\]\]')
species_pattern = re.compile(r'species=\[\[(.*?)(?:\/.*?)?(?:\|.*?)?(?:\(.*?)?\]\]')
gender_pattern = re.compile(r'gender=\[\[Gender(?:\/.*?)?\|(.*?)\]\]')

characters_dir = os.path.join(os.getcwd(), 'Data/Characters')

for filename in os.listdir(characters_dir):
    # Only the '<page name>.txt' files are expected here; skip anything else
    # (e.g. checkpoint folders) instead of failing on it.
    if not filename.endswith('.txt'):
        continue

    with open(os.path.join(characters_dir, filename), 'r') as f:
        # The files were written with json.dump, so they must be read as JSON.
        # ast.literal_eval fails on true/false/null.
        query_json = json.load(f)

    # The response is keyed by page id; only the first page entry is used.
    page = next(iter(query_json['query']['pages'].values()))
    # A page entry without 'revisions' (presumably a title the API could not
    # resolve) has no wikitext, so every attribute falls back to 'Unknown'.
    revisions = page.get('revisions')
    string = revisions[0]['slots']['main']['*'] if revisions else ''

    name = filename[:-4].replace('_', ' ')
    is_character = char['Name'] == name

    # Add homeworld attribute: prefer the "[[Name/...]]" form and keep only
    # the part before the slash, otherwise take the plain link target.
    # Use the first match only, consistent with species and gender. Assigning
    # the whole match list breaks when there is more than one match.
    homeworld_match = (
        homeworld_slash_pattern.search(string)
        or homeworld_plain_pattern.search(string)
    )
    homeworld = homeworld_match.group(1) if homeworld_match else 'Unknown'
    char.loc[is_character, 'Homeworld'] = homeworld

    # Add species attribute
    species_match = species_pattern.search(string)
    species = species_match.group(1) if species_match else 'Unknown'
    char.loc[is_character, 'Species'] = species

    # Add gender. The replacement chain collapses case and plural variants
    # such as 'male', 'Males', 'female' and 'Females' into 'Male' / 'Female'.
    # The order of the replacements matters.
    gender_match = gender_pattern.search(string)
    if gender_match:
        gender = (
            gender_match.group(1)
            .replace('Masculine Programming', 'Masculine programming')
            .replace('Males', 'Male')
            .replace('male', 'Male')
            .replace('feMale', 'Female')
            .replace('FeMale', 'Female')
            .replace('Females', 'Female')
        )
    else:
        gender = 'Unknown'
    char.loc[is_character, 'Gender'] = gender

#### Download the episodes pages

In [9]:
# Short labels (used as file names) and the matching wiki page titles.
# The two lists are parallel.
episodes = ['Episode I', 'Episode II', 'Episode III', 'Episode IV', 'Episode V', 'Episode VI']
episodes_pages = ['Star_Wars:_Episode_I_The_Phantom_Menace',
                  'Star_Wars:_Episode_II_Attack_of_the_Clones',
                  'Star_Wars:_Episode_III_Revenge_of_the_Sith',
                  'Star_Wars:_Episode_IV_A_New_Hope',
                  'Star_Wars:_Episode_V_The_Empire_Strikes_Back',
                  'Star_Wars:_Episode_VI_Return_of_the_Jedi']

# Loop-invariant part of the api.php query; only the title changes per request.
api_prefix = (
    "https://starwars.fandom.com/api.php?"
    "action=query&prop=revisions&rvprop=content&rvslots=*"
)

for episode, episode_page in zip(episodes, episodes_pages):
    query = "{}&titles={}&format=json".format(
        api_prefix, urllib.parse.quote_plus(episode_page)
    )

    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(query) as response:
        query_json = json.loads(response.read())

    # One JSON file per episode, named after its short label.
    out_path = './Data/Episodes/' + episode + '.txt'
    with open(out_path, 'w', encoding='utf-8') as outfile:
        json.dump(query_json, outfile)

#### Find the characters from the six episodes 

In [10]:
# Captures the target of every "[[Target]]" / "[[Target|label]]" wiki link.
link_pattern = re.compile(r'\[\[(.*?)(?:\|.*?)?\]\]')

episodes_dir = os.path.join(os.getcwd(), 'Data/Episodes')

for filename in os.listdir(episodes_dir):
    # Only the '<episode>.txt' files are expected here; skip anything else.
    if not filename.endswith('.txt'):
        continue

    with open(os.path.join(episodes_dir, filename), 'r') as f:
        # The files were written with json.dump, so they must be read as JSON.
        # ast.literal_eval fails on true/false/null.
        query_json = json.load(f)

    page_id = list(query_json['query']['pages'].keys())[0]
    string = query_json['query']['pages'][page_id]['revisions'][0]['slots']['main']['*']

    # A character is flagged with 1 when the episode page links to a target
    # equal to the character's name, and with 0 otherwise.
    linked_targets = link_pattern.findall(string)
    episode_column = filename[:-4]
    char[episode_column] = char['Name'].isin(linked_targets).astype('int64')

#### Rearrange columns and create trilogy columns

In [11]:
# Episode flags grouped by trilogy; the final column order is attributes
# first, then the six episode flags.
first_trilogy_episodes = ['Episode I', 'Episode II', 'Episode III']
second_trilogy_episodes = ['Episode IV', 'Episode V', 'Episode VI']
attribute_columns = ['Name', 'Homeworld', 'Residence', 'Species', 'Gender']

char = char[attribute_columns + first_trilogy_episodes + second_trilogy_episodes]

# A trilogy flag is 1 when the character is flagged in at least one of its
# three episodes, and 0 otherwise.
in_first = (char[first_trilogy_episodes] == 1).any(axis=1)
in_second = (char[second_trilogy_episodes] == 1).any(axis=1)
char.loc[:, 'First Trilogy'] = in_first.astype('int64')
char.loc[:, 'Second Trilogy'] = in_second.astype('int64')

#### Save dataframe

In [12]:
# Write the assembled character table to CSV; index=False leaves the row
# index out of the file.
char.to_csv('./Data/Characters.csv', index=False)