In [2]:
from bs4 import BeautifulSoup

In [3]:
# '../data/prova_1500/article_0.html'
def get_soup(fname):
    f = open(fname, 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()
    return soup

In [5]:
def get_title(soup, ret):
    title = soup.find('title')
    ret['title'] = title.contents[0].strip()

In the following cell I'll iterate over the divs with class 'spaceit_pad' that contain some spam containing the value of interest for as that are:

* anime_type: Type
* number of episodes: Episodes
* release and end: Aired
* number of members: Members
* score: Score
* users: (in the same spam of Score)
* rank: Ranked
* popularity: Popularity

In [104]:
from datetime import datetime

def get_left_attributes(soup, ret):
    divs = soup.find_all('div', {"class": "spaceit_pad"})
    from_interest = ['Episodes:','Aired:','Members:','Ranked:','Popularity:']
    for div in divs:
        content = div.contents
        tag = content[1].contents[0]
        if tag=='Score:':
            attr= {'itemprop':'ratingValue'}
            score = div.find('span', attr)
            ret['score'] = float(score.contents[0]) if score is not None else None
            attr = {'itemprop':'ratingCount'}
            users = div.find('span', attr)
            ret['users'] = int(users.contents[0]) if users is not None else None
        elif tag == 'Type:':
            ret['type'] = content[3].contents[0]
        elif tag in from_interest:
            val = content[2].strip()
            if val.startswith('#'):
                val=val[1:]
                ret[tag[:-1].lower()] = int(val)
            elif tag == 'Aired:':
                if 'to' in val:
                    start, end = val.split('to')
                    start = start.strip()
                    start = datetime.strptime(start, '%b %d, %Y')
                    end = datetime.strptime(end.strip(), '%b %d, %Y') if '?' not in end.strip() else None
                    val = f'start: {start}, end: {end}'
                    ret['start_date'] = start
                    ret['end_date'] = end
                else:
                    start = datetime.strptime(val.strip(), '%b %d, %Y')
                    ret['start_date'] = start
                    ret['end_date'] = None
            else:
                val = val.replace(',','')
                
                ret[tag[:-1].lower()] = int(val) if val.isnumeric() else None #Sometimes val can be 'unknown' (i.e. see OnePiece)
        #print(div.contents)

The remaining fields are:
* **Synopsis** (to save as animeDescription): *String*
* **Related Anime** (to save as animeRelated): Extract all the related animes, but only keep unique values and those that have a hyperlink associated to them. *List of strings*.
* **Characters** (to save as animeCharacters): *List of strings*.
* **Voices** (to save as animeVoices): *List of strings*
* **Staff** 

They are contained in different structures of the right side of the page and will be retrieved in the following cells

In this cell I'll retrieve the description of the anime (synopsis)

In [7]:
def get_synopsis(soup, ret):
    synopsis = soup.find('p', {'itemprop':'description'}).contents[0]
    ret['synopsis'] = synopsis

In this cell I'll retrieve the **distinct** related animes in the page that can be found in the table with class 'anime_detail_related_anime'

In [64]:
def get_related_animes(soup, ret):
    related_animes_table = soup.find('table', {'class': 'anime_detail_related_anime'})
    if related_animes_table is None:
        print('no related animes')
        return
    animes = set()
    for a in related_animes_table.find_all('a'):
        a_cont = a.contents
        if len(a_cont) > 0:
            animes.add(a_cont[0])

    ret['related_anime'] = list(animes)


In [19]:
def get_staff(div, ret):
    i = 1
    staff = []
    for td in div.find_all('td', {'class':'borderClass'}):
        if (i)==0:
            #print(td)
            a, small = td.find_all(['small', 'a'])
            staff.append([a.contents[0], small.contents[0].split(',')])

        i = (i+1)%2
    ret['staff'] = staff

Now the only features of interest are voices and characters and they can be found in the other div of the class "detail-characters-list clearfix" and they belong to the same table but in two different columns

In [94]:
def get_characters_voices(div, ret):
    ret['characters'] = []
    ret['voices'] = []
    ch_vc = [[], []]
    links = list(map(lambda x: x.contents[0], div.find_all('a')))
    for tr in div.find_all('tr'):
        tds = tr.find_all('td', {'class':'borderClass'})
        if len(tds) != 3:
            continue

        ch = tds[1]
        vc = tds[2]
        ch_a = ch.find('a')
        vc_a = vc.find('a')
        if ch_a is not None:
            ret['characters'].append(ch_a.contents[0])
        if vc_a is not None:
            ret['voices'].append(vc_a.contents[0])


In [81]:
def get_total_info(fname):
    ret = dict()
    soup = get_soup(fname)
    get_title(soup, ret)
    get_left_attributes(soup, ret)
    get_synopsis(soup, ret)
    get_related_animes(soup, ret)
    divs = soup.find_all('div', {'class':"detail-characters-list clearfix"})
    if len(divs) == 0:
        ret['characters'] = []
        ret['voices'] = []
    elif len(divs) != 2:
        if divs[0].find('h3', {'class':"h3_characters_voice_actors"}) is not None:
            print('only ch_voices')
            get_characters_voices(divs[0], ret)
            ret['staff'] = []
        else:
            print('only staff')
            get_staff(divs[0], ret)
            ret['characters'] = []
            ret['voices'] = []
    else:
        get_characters_voices(divs[0], ret)
        get_staff(divs[1], ret)
    return ret
    

In [98]:
prova = get_total_info('../data/to8760/article_0.html')
print(list(zip(prova['characters'], prova['voices'])))

[('Elric, Edward', 'Park, Romi'), ('Elric, Alphonse', 'Kugimiya, Rie'), ('Mustang, Roy', 'Miki, Shinichiro'), ('Hughes, Maes', 'Fujiwara, Keiji'), ('Greed', 'Nakamura, Yuuichi'), ('Hawkeye, Riza', 'Orikasa, Fumiko'), ('Yao, Ling', 'Miyano, Mamoru'), ('Armstrong, Alex Louis', 'Utsumi, Kenji'), ('Rockbell, Winry', 'Takamoto, Megumi'), ('Armstrong, Olivier Mira', 'Soumi, Youko')]


In [46]:
base_dir = os.path.join('..', 'data', 'to8760')
fnames = sorted(os.listdir(base_dir), key= lambda x: int(x.split('.')[0].split('_')[1]))

In [47]:
print(get_total_info(os.path.join(base_dir, fnames[-1])))

Score:
{'title': 'Nurse Witch Komugi-chan R - MyAnimeList.net', 'type': 'TV', 'episodes': 12, 'start_date': datetime.datetime(2016, 1, 10, 0, 0), 'end_date': datetime.datetime(2016, 3, 27, 0, 0), 'score': 5.97, 'users': 3269, 'ranked': 8764, 'popularity': 5052, 'members': 11364, 'synopsis': 'The new "slapstick" story will depict Komugi-chan and her rivals as they juggle their daily lives as students, idols, and magical girls "with laughter and tears."', 'related_anime': ['Nurse Witch Komugi-chan Magikarte'], 'characters': ['Yoshida, Komugi', 'Saionji, Kokona', 'Kisaragi, Tsukasa', 'Rei', 'Tamako', 'Misuzu', 'Miki', 'Maki', 'Hime-P', 'Lilia'], 'voices': ['Tomoe, Kei', 'Yamazaki, Erii', 'Koichi, Makoto', 'Matsui, Eriko', 'Kohinata, Akane', 'Maeda, Rena', 'Tachibana, Meemu', 'Asahina, Madoka', 'Momoi, Haruko', 'Satake, Uki'], 'staff': [['Kawaguchi, Keiichirou', ['Director', ' Storyboard']], ['Imaizumi, Yuuichi', ['Sound Director']], ['Fudeyasu, Kazuyuki', ['Script']], ['Murakami, Momoko',

In [48]:
f_15k = '../data/article_15000.html'
print(get_total_info(f_15k))

Score:
{'title': 'Bikkuriman Kids: Theme Fighter Nyander - MyAnimeList.net', 'type': 'ONA', 'episodes': 4, 'start_date': datetime.datetime(2008, 8, 6, 0, 0), 'end_date': datetime.datetime(2008, 11, 27, 0, 0), 'score': None, 'users': None, 'ranked': 15007, 'popularity': 16663, 'members': 164, 'synopsis': 'A ', 'related_anime': ['Bikkuriman'], 'characters': [], 'voices': []}


In [105]:
for f in fnames[68:]:
    get_total_info(os.path.join(base_dir, f))
    print(f"file: {f} OK!")

file: article_68.html OK!
file: article_69.html OK!


ValueError: Errore nel convertire il value: Unknown del tag Episodes: in intero

In [70]:
def get_total_info_from_idx(idx, base_dir=os.path.join('..', 'data', 'to8760')):
    fname = f"article_{idx}.html"
    return get_total_info(os.path.join(base_dir, fname))

In [99]:
print(get_total_info_from_idx(22))

only ch_voices
{'title': 'Shiguang Dailiren (Link Click) - MyAnimeList.net', 'type': 'ONA', 'episodes': 11, 'start_date': datetime.datetime(2021, 4, 30, 0, 0), 'end_date': datetime.datetime(2021, 7, 9, 0, 0), 'score': 8.86, 'users': 33258, 'ranked': 23, 'popularity': 1361, 'members': 131580, 'synopsis': 'It is said that a picture is worth a thousand words. In this case, it holds an infinite amount of secrets. These are secrets that only Cheng Xiaoshi and Lu Guang are able to find. In a small shop called "Time Photo Studio," the two friends provide a special service: using their extraordinary powers that let them enter photographs, they jump into pictures brought to them by clients in order to grant their wishes. Through the eyes of the photographer, they live through the events surrounding the picture and try to decipher how to solve their client\'s request.', 'related_anime': ['Shiguang Dailiren Specials', 'Shiguang Dailiren 2nd Season', 'Shiguang Dailiren Fan Wai Pian: Biwu Zhaoqin']