In [224]:
%load_ext autoreload
%autoreload 2
import os
import time
import requests
import urllib.request
from bs4 import BeautifulSoup
from string import ascii_lowercase as alc
from PIL import Image

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
url = "https://characterprofile.fandom.com"
list_query = "/wiki/Category:Video_Game_Characters?from="
page_query = "A"

In [23]:
response = requests.get(url + list_query + page_query)
soup = BeautifulSoup(response.text, 'html.parser')

In [24]:
char_blocks = soup.find_all(class_="category-page__member")

In [27]:
char_dir = char_blocks[0].find('a').attrs['href']
char_dir

'/wiki/Abobo'

In [28]:
char_name = char_blocks[0].find('a').attrs['title']
char_name

'Abobo'

In [31]:
char_response = requests.get(url + char_dir)
char_soup = BeautifulSoup(char_response.text, 'html.parser')

In [46]:
img_url = char_soup.find(class_="wikia-infobox").find(class_="image").attrs['href']
img_url

'https://static.wikia.nocookie.net/characterprofile/images/a/a8/Abobo_-_04.png/revision/latest?cb=20200809184914'

In [106]:
img_ext = char_soup.find(class_="wikia-infobox").find('img').get('data-image-name').split('.')[-1]
img_ext

'png'

In [109]:
urllib.request.urlretrieve(img_url, f'../data/images/{char_name}.{img_ext}')

('../data/images/Abobo.png', <http.client.HTTPMessage at 0x7fe3c7bf2cb0>)

In [56]:
for i,th in enumerate(char_soup.find(class_="wikia-infobox").find_all('th')):
    if th.find('a'):
        idx = i
i

8

In [89]:
char_alignment = char_soup.find(class_="wikia-infobox").find_all('td')[idx].contents[0].strip('\n')
char_alignment

'Neutral Evil'

In [212]:
### HELPER FUNCTIONS ###
def get_chars_from_alpha(url, list_query, c):
    response = requests.get(url + list_query + c)
    soup = BeautifulSoup(response.text, 'html.parser')
    char_blocks = soup.find_all(class_="category-page__member")
    return char_blocks

def get_char(char_block):
    char_dir = char_block.find('a').attrs['href']
    char_name = char_block.find('a').attrs['title'].strip('Category')
    invalid = '<>:"/\|?*'
    for char in invalid:
        char_name = char_name.replace(char, '')
    return char_name, char_dir

def get_char_img(char_soup):
    try:
        img_url = char_soup.find(class_="wikia-infobox").find(class_="image").attrs['href']
        img_ext = char_soup.find(class_="wikia-infobox").find('img').get('data-image-name').split('.')[-1]
        image_fname = f'{char_name}.{img_ext}'
        if not os.path.isfile(f'../data/images/{image_fname}'):
            urllib.request.urlretrieve(img_url, f'../data/images/{image_fname}')   
        img_w, img_h = Image.open(f'../data/images/{image_fname}').size
    except:
        img_url, image_fname, img_h, img_w = np.nan, np.nan, np.nan, np.nan     
    return image_fname, img_h, img_w, img_url

def get_char_alignment(char_soup):
    try:
        for i,th in enumerate(char_soup.find(class_="wikia-infobox").find_all('th')):
            if th.find('a'):
                idx = i
                char_alignment = char_soup.find(class_="wikia-infobox").find_all('td')[idx].contents[0].strip('\n')
    except:
        char_alignment = np.nan
    return char_alignment

In [267]:

### MAIN LOOP ###

columns = ['Name', 'URL', 'Image fName', 'Image Height', 'Image Width', 'Image URL', 'Alignment']

url = "https://characterprofile.fandom.com"
list_query = "/wiki/Category:Video_Game_Characters?from="

try:
    df = pd.read_csv('../data/game_chars.csv')
    df = df[columns]
    l = df['Name'].iloc[-1][0].lower()
    alc = alc[alc.find(l):]
except:
    df = pd.DataFrame(columns=columns)



for c in alc[:2]:
    accumulator = []
    
    char_blocks = get_chars_from_alpha(url, list_query, c)
    
    for char_block in char_blocks:
        char_dict = dict.fromkeys(columns)
        char_name, char_dir = get_char(char_block)  
        char_dict['Name'] = char_name
        char_dict['URL'] = url+char_dir
        
        if char_name in df['Name'].tolist():
            print(f'Skipping {char_name}, already downloaded.')
        else:
            char_response = requests.get(url + char_dir)
            char_soup = BeautifulSoup(char_response.text, 'html.parser')
            print(f'Parsing {char_name}...')
        
            char_dict['Image fName'], char_dict['Image Height'], \
            char_dict['Image Width'], char_dict['Image URL'] = get_char_img(char_soup)   
             
            char_dict['Alignment'] = get_char_alignment(char_soup)
            
            accumulator.append(char_dict)
            time.sleep(1)

    df_acc = pd.DataFrame(accumulator)   
    df = pd.concat((df,df_acc))
    df.reset_index(drop=True, inplace = True)
    df.to_csv('../data/game_chars.csv', index=False)


Skipping Balrog (Street Fighter), already downloaded.
Skipping Banj, already downloaded.
Skipping Banjo Kazooi, already downloaded.
Skipping Barak, already downloaded.
Skipping Barret Wallac, already downloaded.
Skipping Bash, already downloaded.
Skipping Bass.EXE, already downloaded.
Skipping Bastion, already downloaded.
Skipping Bayonetta (character), already downloaded.
Skipping Beck (Mighty No. 9), already downloaded.
Skipping Bend, already downloaded.
Skipping Bethesda Softworks, already downloaded.
Skipping Big Boss, already downloaded.
Skipping Big Smok, already downloaded.
Skipping Bill Riz, already downloaded.
Skipping Billy and Jimmy L, already downloaded.
Skipping Bird, already downloaded.
Skipping Black Orchid, already downloaded.
Skipping Blank, already downloaded.
Skipping Blastois, already downloaded.
Skipping Blaze The , already downloaded.
Skipping Blaziken, already downloaded.
Skipping Bloody R, already downloaded.
Skipping Bol, already downloaded.
Skipping Bomberman,

KeyboardInterrupt: 

In [278]:
df = pd.read_csv('../data/game_chars.csv')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 832 entries, 0 to 831
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          832 non-null    object 
 1   URL           832 non-null    object 
 2   Image fName   565 non-null    object 
 3   Image Height  565 non-null    float64
 4   Image Width   565 non-null    float64
 5   Image URL     565 non-null    object 
 6   Alignment     597 non-null    object 
dtypes: float64(2), object(5)
memory usage: 45.6+ KB


In [279]:
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563 entries, 0 to 562
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          563 non-null    object 
 1   URL           563 non-null    object 
 2   Image fName   563 non-null    object 
 3   Image Height  563 non-null    float64
 4   Image Width   563 non-null    float64
 5   Image URL     563 non-null    object 
 6   Alignment     563 non-null    object 
dtypes: float64(2), object(5)
memory usage: 30.9+ KB


In [291]:
df.groupby('Alignment')[['Name']].nunique().sort_values('Name', ascending=False).head(10)

Unnamed: 0_level_0,Name
Alignment,Unnamed: 1_level_1
Neutral Good,145
True Neutral,71
Lawful Good,69
Chaotic Good,61
Neutral Evil,51
Chaotic Evil,47
Chaotic Neutral,29
Lawful Evil,14
Lawful Neutral,13
Unknown,12


In [295]:
df.loc[df['Alignment'] == "True Neutral"]['Name'].tolist()

['Abomasnow',
 'Amoonguss',
 'Arceus',
 'Augus',
 'Bastion',
 'Blastoise',
 'Blaziken',
 'Cody Travers',
 'Darmanitan',
 'Deoxys',
 'Ditto',
 'Dunsparce',
 'Enderman',
 'Escavalier',
 'Exeggutor',
 'Fearow',
 'Flygon',
 'Frisk',
 'Gliscor',
 'Golurk',
 'Gray Fox',
 'Greninja',
 'Hanzo Shimada',
 'Haxorus',
 'Heracross',
 'Huitzil',
 'Infernape',
 'Jumpluff',
 'Lara Croft',
 'Lucario',
 'Luvdisc',
 'Machamp',
 'Max Brass',
 'Morrigan Aensland',
 'Napstablook',
 'Noctowl',
 'Porygon-Z',
 'Quagsire',
 'Rayquaza',
 'Reuniclus',
 'Roxas',
 'Sakuya Izayoi',
 'Sceptile',
 'Shigure Rangetsu',
 'Shiki (Samurai Shodown)',
 'Smeargle',
 'Strider Hiryu',
 'Taka',
 'Terry (Banjo-Kazooie)',
 'The Demoman (Classic)',
 'The Engineer',
 'The Engineer (Classic)',
 'The Medic (Classic)',
 'The Pyro',
 'The Pyro (Classic)',
 'The Scout',
 'The Scout (Classic)',
 'The Sniper (Classic)',
 'The Soldier',
 'The Soldier (Classic)',
 'The Spy',
 'The Spy (Classic)',
 'Vespiquen',
 'Viola',
 'Whitney',
 'Wiggler