In [129]:
from bs4 import BeautifulSoup
import bs4

In [3]:
# '../data/prova_1500/article_0.html'
def get_soup(fname):
    f = open(fname, 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()
    return soup

In [190]:
def get_title(soup, ret):
    title = soup.find('title').contents[0].strip()
    if(title.endswith(" - MyAnimeList.net")):
        title = title[:-18]
    ret['title'] = title

In the following cell I'll iterate over the divs with class 'spaceit_pad' that contain some spam containing the value of interest for as that are:

* anime_type: Type
* number of episodes: Episodes
* release and end: Aired
* number of members: Members
* score: Score
* users: (in the same spam of Score)
* rank: Ranked
* popularity: Popularity

In [185]:
from datetime import datetime

def get_date(str_date):
    if str_date == 'Not available':
        return None
    try:
        end = datetime.strptime(str_date, '%b %d, %Y')
    except ValueError:
        try:
            end = datetime.strptime(str_date, '%b %Y')
        except ValueError:
            end = datetime.strptime(str_date, '%Y')
    return end



def get_left_attributes(soup, ret):
    divs = soup.find_all('div', {"class": "spaceit_pad"})
    from_interest = ['Episodes:','Aired:','Members:','Ranked:','Popularity:']
    for div in divs:
        content = div.contents
        tag = content[1].contents[0]
        if tag=='Score:':
            attr= {'itemprop':'ratingValue'}
            score = div.find('span', attr)
            ret['score'] = float(score.contents[0]) if score is not None else None
            attr = {'itemprop':'ratingCount'}
            users = div.find('span', attr)
            ret['users'] = int(users.contents[0]) if users is not None else None
        elif tag == 'Type:':
            content[1:] = [el for el in content[1:] if el!='\n']
            if type(content[-1])==bs4.element.NavigableString:
                ret['type'] = content[-1].strip()
            else:
                ret['type'] = content[-1].contents[0]
            
        elif tag in from_interest:
            val = content[2].strip()
            if val.startswith('#'):
                val=val[1:]
                ret[tag[:-1].lower()] = int(val)
            elif tag == 'Aired:':
                if 'to' in val:
                    start, end = val.split('to')
                    start = start.strip()
                    start = get_date(start)
                    
                    if end is not None:    
                        if '?' not in end:
                            end = get_date(end.strip())
                        else:
                            end=None
                    val = f'start: {start}, end: {end}'
                    ret['start_date'] = start
                    ret['end_date'] = end
                else:
                    start=val.strip()
                    start = get_date(start)
                    ret['start_date'] = start
                    ret['end_date'] = None
            else:
                val = val.replace(',','')
                
                ret[tag[:-1].lower()] = int(val) if val.isnumeric() else None #Sometimes val can be 'unknown' (i.e. see OnePiece)
        #print(div.contents)

The remaining fields are:
* **Synopsis** (to save as animeDescription): *String*
* **Related Anime** (to save as animeRelated): Extract all the related animes, but only keep unique values and those that have a hyperlink associated to them. *List of strings*.
* **Characters** (to save as animeCharacters): *List of strings*.
* **Voices** (to save as animeVoices): *List of strings*
* **Staff** 

They are contained in different structures of the right side of the page and will be retrieved in the following cells

In this cell I'll retrieve the description of the anime (synopsis)

In [7]:
def get_synopsis(soup, ret):
    synopsis = soup.find('p', {'itemprop':'description'}).contents[0]
    ret['synopsis'] = synopsis

In this cell I'll retrieve the **distinct** related animes in the page that can be found in the table with class 'anime_detail_related_anime'

In [186]:
def get_related_animes(soup, ret):
    related_animes_table = soup.find('table', {'class': 'anime_detail_related_anime'})
    if related_animes_table is None:
        #print('no related animes')
        ret['related_anime'] = None
        return
    animes = set()
    for a in related_animes_table.find_all('a'):
        a_cont = a.contents
        if len(a_cont) > 0:
            animes.add(a_cont[0])

    ret['related_anime'] = list(animes)


In [19]:
def get_staff(div, ret):
    i = 1
    staff = []
    for td in div.find_all('td', {'class':'borderClass'}):
        if (i)==0:
            #print(td)
            a, small = td.find_all(['small', 'a'])
            staff.append([a.contents[0], small.contents[0].split(',')])

        i = (i+1)%2
    ret['staff'] = staff

Now the only features of interest are voices and characters and they can be found in the other div of the class "detail-characters-list clearfix" and they belong to the same table but in two different columns

In [94]:
def get_characters_voices(div, ret):
    ret['characters'] = []
    ret['voices'] = []
    ch_vc = [[], []]
    links = list(map(lambda x: x.contents[0], div.find_all('a')))
    for tr in div.find_all('tr'):
        tds = tr.find_all('td', {'class':'borderClass'})
        if len(tds) != 3:
            continue

        ch = tds[1]
        vc = tds[2]
        ch_a = ch.find('a')
        vc_a = vc.find('a')
        if ch_a is not None:
            ret['characters'].append(ch_a.contents[0])
        if vc_a is not None:
            ret['voices'].append(vc_a.contents[0])


In [201]:
def get_total_info(fname):
    ret = dict()
    soup = get_soup(fname)
    get_title(soup, ret)
    get_left_attributes(soup, ret)
    get_synopsis(soup, ret)
    get_related_animes(soup, ret)
    divs = soup.find_all('div', {'class':"detail-characters-list clearfix"})
    if len(divs) == 0:
        ret['characters'] = []
        ret['voices'] = []
        ret['staff'] = []
    elif len(divs) != 2:
        if divs[0].find('h3', {'class':"h3_characters_voice_actors"}) is not None:
            #print('only ch_voices')
            get_characters_voices(divs[0], ret)
            ret['staff'] = []
        else:
            #print('only staff')
            get_staff(divs[0], ret)
            ret['characters'] = []
            ret['voices'] = []
    else:
        get_characters_voices(divs[0], ret)
        get_staff(divs[1], ret)
    return ret
    

In [174]:
base_dir = os.path.join('..', 'data', 'html_pages')
tot_ret = []
for f in os.listdir(base_dir)[:10]:
    tot_ret.append(get_total_info(os.path.join(base_dir, f)))
    print(f"file: {f} OK!")

no related animes
only staff
file: article_14986.html OK!
only ch_voices
file: article_15694.html OK!
no related animes
file: article_12151.html OK!
no related animes
file: article_06542.html OK!
file: article_01287.html OK!
file: article_03740.html OK!
file: article_04085.html OK!
file: article_02852.html OK!
file: article_10496.html OK!
file: article_17353.html OK!


In [175]:
print(tot_ret[0]['related_anime'])


None


In [176]:
def get_total_info_from_idx(idx, base_dir=os.path.join('..', 'data', 'html_pages')):
    fname = f"article_{str(idx).zfill(5)}.html"
    return get_total_info(os.path.join(base_dir, fname))
print(list(get_total_info_from_idx(0).keys()))

['title', 'type', 'episodes', 'start_date', 'end_date', 'score', 'users', 'ranked', 'popularity', 'members', 'synopsis', 'related_anime', 'characters', 'voices', 'staff']


In [204]:
def get_tsv_from_idx(idx, base_dir=os.path.join('..', 'data', 'html_pages')):
    fields = ['title', 'type', 'episodes', 'start_date', 'end_date', 'score', 'users', 'ranked', 'popularity', 'members', 'synopsis', 'related_anime', 'characters', 'voices', 'staff']
    head = '\t'.join(fields)
    ret = '' 
    info_dict = get_total_info_from_idx(idx, base_dir)
    for f in fields:
        val = str(info_dict[f])
        if val is None:
            val = ''
        elif type(val)==list and len(val)==0:
            val= ''
        ret+= val+'\t'
    return head, ret[:-1]

In [194]:
header, content = get_tsv_from_idx(0).split('\n')
print(f"header:\n\t'{header}'")
print('\n\n')
print(f"content:\n\t'{content}'")

header:
	'title	type	episodes	start_date	end_date	score	users	ranked	popularity	members	synopsis	related_anime	characters	voices	staff'



content:
	'Fullmetal Alchemist: Brotherhood	TV	64	2009-04-05 00:00:00	2010-07-04 00:00:00	9.16	1622384	1	3	2675906	After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse's body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse's soul in the physical realm by binding it to a hulking suit of armor.	['Fullmetal Alchemist: Brotherhood - 4-Koma Theater', 'Fullmetal Alchemist: The Sacred Star of Milos', 'Fullmetal Alchemist', 'Fullmetal Alchemist: Brotherhood Specials']	['Elric, Edward', 'Elric, Alphonse', 'Mustang, Roy', 'Hughes, Maes', 'Greed', 'Hawkey

In [188]:
for i in range(19122):
    ret = get_total_info_from_idx(i)
    title = ret['title']
    if(not title.endswith(" - MyAnimeList.net")):
        print(f"il file {i} ha il titolo: {title}")

In [202]:
def save_tsv_info(start, end, src_dir='../data/html_pages', dst_dir='../data/tsv_files'):
    if not os.path.exists(dst_dir):
        os.mkdir(dst_dir)
    
    fields = ['title', 'type', 'episodes', 'start_date', 'end_date', 'score', 'users', 'ranked', 'popularity', 'members', 'synopsis', 'related_anime', 'characters', 'voices', 'staff']
    total_tsv = os.path.join(dst_dir, 'total_pages.tsv')
    if not os.path.exists(total_tsv):
        with open(total_tsv, 'x') as out:
            out.write('\t'.join(fields)+'\n')
    for idx in range(start, end):
        tsv_h, tsv_c = get_tsv_from_idx(idx, src_dir)
        out_name = f"article_{str(idx).zfill(5)}.tsv"
        
        with open(os.path.join(dst_dir, out_name), 'w') as f:
            f.write(tsv_h + '\n' + tsv_c)
        
        with open(os.path.join(total_tsv), 'a') as f:
            f.write('\n'+tsv_c)     
        print(f"idx: {idx} DONE!")   

In [203]:
save_tsv_info(1592,19122)

idx: 351 DONE!
idx: 352 DONE!
idx: 353 DONE!
idx: 354 DONE!
idx: 355 DONE!
idx: 356 DONE!
idx: 357 DONE!
idx: 358 DONE!
idx: 359 DONE!
idx: 360 DONE!
idx: 361 DONE!
idx: 362 DONE!
idx: 363 DONE!
idx: 364 DONE!
idx: 365 DONE!
idx: 366 DONE!
idx: 367 DONE!
idx: 368 DONE!
idx: 369 DONE!
idx: 370 DONE!
idx: 371 DONE!
idx: 372 DONE!
idx: 373 DONE!
idx: 374 DONE!
idx: 375 DONE!
idx: 376 DONE!
idx: 377 DONE!
idx: 378 DONE!
idx: 379 DONE!
idx: 380 DONE!
idx: 381 DONE!
idx: 382 DONE!
idx: 383 DONE!
idx: 384 DONE!
idx: 385 DONE!
idx: 386 DONE!
idx: 387 DONE!
idx: 388 DONE!
idx: 389 DONE!
idx: 390 DONE!
idx: 391 DONE!
idx: 392 DONE!
idx: 393 DONE!
idx: 394 DONE!
idx: 395 DONE!
idx: 396 DONE!
idx: 397 DONE!
idx: 398 DONE!
idx: 399 DONE!
idx: 400 DONE!
idx: 401 DONE!
idx: 402 DONE!
idx: 403 DONE!
idx: 404 DONE!
idx: 405 DONE!
idx: 406 DONE!
idx: 407 DONE!
idx: 408 DONE!
idx: 409 DONE!
idx: 410 DONE!
idx: 411 DONE!
idx: 412 DONE!
idx: 413 DONE!
idx: 414 DONE!
idx: 415 DONE!
idx: 416 DONE!
idx: 417 D

ValueError: too many values to unpack (expected 2)