In [340]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import csv


In [341]:
# Download the IMDb list page (the stakeholder's "Top 100 Anime of all time")
# and parse its HTML into a BeautifulSoup tree.
url = "https://www.imdb.com/list/ls057577566/"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail fast on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would likely contain none of the list entries scraped below.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [342]:
# Accumulators that the scraped data is collected into, one entry per anime:
#   name            - name of the anime
#   rating          - overall rating of the anime
#   aired_duration  - released year - end year
#   average_runtime - average duration of each anime episode
#   genre           - genre of the anime
#   MPAA_rating     - MPAA rating
#   votes           - votes of the anime
(name, rating, aired_duration, average_runtime,
 genre, MPAA_rating, votes) = ([] for _ in range(7))

# Stakeholder's ranking: positions 1 through 100
personal_ranking = [position for position in range(1, 101)]

In [343]:
# Collect the <div> container of every list entry (one per anime tv-show)
anime_data = soup.find_all('div', class_='lister-item mode-detail')

In [344]:
def _text_or_none(tag):
    """Return ``tag.text``, or the string 'None' when the lookup found nothing."""
    return tag.text if tag is not None else 'None'


# Walk every list entry and pull one value per field.  Each list receives
# exactly one item per entry so the lists stay aligned by position; a field
# that is absent from the entry is stored as the string 'None'.
for data in anime_data:
    # name: text of the link inside the entry's <h3>
    name.append(data.h3.a.text)

    # aired duration: first text node containing a parenthesised part;
    # find() returns None when there is no such node, so guard it like the
    # other optional fields
    aired_text = data.find(string=re.compile(r'\(.*\)'))
    aired_duration.append(aired_text if aired_text is not None else 'None')

    # genre, certificate and runtime are looked up inside the entry's first <p>
    details = data.p

    # genre (previously unguarded: an entry without a genre span raised
    # AttributeError on `.text`)
    genre.append(_text_or_none(details.find('span', attrs={'class': 'genre'})))

    # MPAA certificate
    MPAA_rating.append(_text_or_none(details.find('span', class_='certificate')))

    # average run time
    average_runtime.append(_text_or_none(details.find('span', class_='runtime')))

    # rating
    rating.append(_text_or_none(data.find('span', class_='ipl-rating-star__rating')))

    # votes
    votes.append(_text_or_none(data.find('span', attrs={'name': 'nv'})))


In [345]:
# Trim the surrounding whitespace and newlines from every scraped genre string
genre = [genre_text.strip() for genre_text in genre]

In [346]:
# Turn each comma-separated genre string into a list of genre names with the
# whitespace around each name stripped.
# "Animation" is dropped because anime is animation by default.  It is removed
# by name rather than by position, so an entry whose first genre is something
# else no longer loses a real genre.  Empty names are dropped as well.
genre = [
    [
        label
        for label in (part.strip() for part in genre_text.split(","))
        if label and label != "Animation"
    ]
    for genre_text in genre
]

In [347]:
# Remove all parentheses from each aired-duration string.  An element that is
# None (nothing was found for that entry) becomes the "None" placeholder
# instead of raising AttributeError on `.replace`.
aired_duration = [
    s.replace('(', '').replace(')', '') if s is not None else "None"
    for s in aired_duration
]

# Split each aired duration into a starting year and an optional ending year.
year_started = []
year_ended = []
for years in aired_duration:
    # re.search finds the first "YYYY" or "YYYY–YYYY" anywhere in the text.
    # For text that begins with the year it matches exactly what re.match did,
    # but it also copes with text that has something in front of the year.
    result = re.search(r"(\d{4})(?:–(\d{4}))?", years)

    if result is None:
        # No four-digit year at all: store placeholders rather than crashing
        # on `result.group`.
        year_started.append("None")
        year_ended.append("None")
        continue

    year_started.append(result.group(1))
    # group(2) is None when no end year follows the dash (or there is no dash)
    end_year = result.group(2)
    year_ended.append(end_year if end_year is not None else "None")

# Convert the year strings to integers, leaving "None" placeholders untouched
year_started = [int(x) if x != "None" else "None" for x in year_started]
year_ended = [int(x) if x != "None" else "None" for x in year_ended]

In [348]:
# Convert each runtime string to an integer by removing its ' min' part;
# the "None" placeholder is passed through unchanged.
runtime_minutes = []
for runtime_text in average_runtime:
    if runtime_text == "None":
        runtime_minutes.append("None")
    else:
        runtime_minutes.append(int(runtime_text.replace(' min', '')))
average_runtime = runtime_minutes

In [349]:
# Write all the columns into the csv file: a header row, then one row per anime.
# encoding="utf-8" is set explicitly so titles containing non-ASCII characters
# are written the same way on every platform, instead of depending on the
# locale's default encoding (which may fail with UnicodeEncodeError).
columns = ["title", "rating", "starting_year", "ending_year", "average_runtime",
           "genre", "MPAA_rating", "votes", "personal_ranking"]
with open("scraped_top_100_anime.csv", "w", newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(columns)
    # zip stops at the shortest list, so at most len(personal_ranking) rows are
    # written.  Non-string values (e.g. each genre list) are stringified by the
    # csv writer.
    writer.writerows(zip(name, rating, year_started, year_ended, average_runtime,
                         genre, MPAA_rating, votes, personal_ranking))
