# Use requests library to download webpages

In [25]:
#!pip install requests --upgrade --quiet

In [26]:
import requests

In [27]:
topics_url = 'https://myanimelist.net/topanime.php'

In [28]:
response = requests.get(topics_url)

In [29]:
response.status_code
#200 means that the response was successful

200

In [30]:
len(response.text)

158057

In [31]:
page_contents = response.text

In [32]:
page_contents[:1000]

'\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n\n<html lang="en">\n<head>\n    \n<link rel="preconnect" href="//fonts.gstatic.com/" crossorigin="anonymous" />\n<link rel="preconnect" href="//fonts.googleapis.com/" crossorigin="anonymous" />\n<link rel="preconnect" href="//tags-cdn.deployads.com/" crossorigin="anonymous" />\n<link rel="preconnect" href="//www.googletagservices.com/" crossorigin="anonymous" />\n<link rel="preconnect" href="//www.googletagmanager.com/" crossorigin="anonymous"/>\n<link rel="preconnect" href="//apis.google.com/" crossorigin="anonymous"/>\n<link rel="preconnect" href="//pixel-sync.sitescout.com/" crossorigin="anonymous"/>\n<link rel="preconnect" href="//pixel.tapad.com/" crossorigin="anonymous"/>\n<link rel="preconnect" href="//c.deployads.com/" crossorigin="anonymous"/>\n<link rel="preconnect" href="//tpc.googlesyndication.com/" crossorigin="anonymous"/>\n<link rel="preconne

In [33]:
# Keep a local copy of the downloaded HTML.
# The encoding is pinned to UTF-8 because the page contains non-ASCII
# characters (e.g. the title "Gintama°"); without it open() falls back to the
# platform's default encoding, which may be unable to represent them.
with open('webpage.html', 'w', encoding='utf-8') as f:
    f.write(page_contents)

# Use Beautiful Soup to extract information

In [34]:
#!pip install beautifulsoup4 --quiet

In [35]:
from bs4 import BeautifulSoup

In [36]:
doc = BeautifulSoup(page_contents, 'html.parser')

In [37]:
h3_tags = doc.find_all("div", {'class' : 'di-ib clearfix'})

In [38]:
len(h3_tags)

50

In [39]:
h3_tags[0].contents[0]

<h3 class="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"><a href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen" id="#area51535" rel="#info51535">Shingeki no Kyojin: The Final Season - Kanketsu-hen</a></h3>

In [40]:
h3_tags[0].contents[0].string

'Shingeki no Kyojin: The Final Season - Kanketsu-hen'

In [41]:
h3_tags[0].contents[0].a['href']

'https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen'

In [42]:
# The first child of each ranking <div> is the <h3> heading; its string is the title.
titles = [entry.contents[0].string for entry in h3_tags]

In [43]:
#titles

In [44]:
# The heading's <a> element carries the URL of the anime's detail page.
links = [entry.contents[0].a['href'] for entry in h3_tags]

In [45]:
#links

In [46]:
score_tags = doc.find_all('td', {'class':'score ac fs14'})

In [47]:
score_tags[0].contents[0].span.string

'9.15'

In [48]:
# The score text sits in a <span> inside the first child of each score cell.
scores = [cell.contents[0].span.string for cell in score_tags]

In [49]:
#scores

# Create CSV Files with extracted information

In [50]:
import pandas as pd

In [51]:
df = pd.DataFrame()

In [52]:
# Column name -> list of values for the table built in the next cell.
# NOTE(review): the variable name `dict` shadows Python's built-in `dict` type
# for the rest of the kernel session; consider renaming it together with the
# cell that consumes it.
dict = {'Name':titles,
        'Score':scores,
        'Link':links}

In [53]:
df = pd.DataFrame(dict)

In [54]:
df

Unnamed: 0,Name,Score,Link
0,Shingeki no Kyojin: The Final Season - Kankets...,9.15,https://myanimelist.net/anime/51535/Shingeki_n...
1,Fullmetal Alchemist: Brotherhood,9.11,https://myanimelist.net/anime/5114/Fullmetal_A...
2,Bleach: Sennen Kessen-hen,9.09,https://myanimelist.net/anime/41467/Bleach__Se...
3,Steins;Gate,9.08,https://myanimelist.net/anime/9253/Steins_Gate
4,Gintama°,9.07,https://myanimelist.net/anime/28977/Gintama°
5,Kaguya-sama wa Kokurasetai: Ultra Romantic,9.06,https://myanimelist.net/anime/43608/Kaguya-sam...
6,Shingeki no Kyojin Season 3 Part 2,9.06,https://myanimelist.net/anime/38524/Shingeki_n...
7,Gintama: The Final,9.05,https://myanimelist.net/anime/39486/Gintama__T...
8,Gintama',9.04,https://myanimelist.net/anime/9969/Gintama
9,Hunter x Hunter (2011),9.04,https://myanimelist.net/anime/11061/Hunter_x_H...


In [55]:
df.to_csv('mal.csv',index_label='Rank')

## Getting Themes/Genres out of an Anime page

In [56]:
response = requests.get(links[0])

In [57]:
response.status_code

200

In [58]:
an_page = response.text

In [59]:
doc = BeautifulSoup(an_page, 'html.parser')

In [60]:
genreTags = doc.find_all('span', {'itemprop':'genre'})

In [61]:
genreTags[0].text

'Action'

In [62]:
# Gather the text of every genre tag found on the page, then display the list.
tempGenres = [tag.text for tag in genreTags]
tempGenres

['Action', 'Drama', 'Suspense', 'Gore', 'Military', 'Survival', 'Shounen']

In [63]:
# One list of genre names per anime, in the same order as `links`.
genres = []
for anime_url in links:
    anime_doc = BeautifulSoup(requests.get(anime_url).text, 'html.parser')
    genre_spans = anime_doc.find_all('span', {'itemprop':'genre'})
    genres.append([span.text for span in genre_spans])


In [64]:
#genres

## Getting the Synopsis

In [65]:
# Try the extraction on the first anime page only: the synopsis is held in
# <p itemprop="description"> elements.
first_page = requests.get(links[0])
first_doc = BeautifulSoup(first_page.text, 'html.parser')
synopsis = first_doc.find_all('p', {'itemprop':'description'})

In [66]:
synopsis[0].text

'The conclusion to Shingeki no Kyojin.'

In [67]:
# One synopsis string per anime, in the same order as `links`.
synopsis = []
for anime_url in links:
    anime_doc = BeautifulSoup(requests.get(anime_url).text, 'html.parser')
    description_tags = anime_doc.find_all('p', {'itemprop':'description'})
    synopsis.append(description_tags[0].text)

In [68]:
len(synopsis)

50

# Create Functions to Scrape the Top 1000 Anime (the code below currently fetches only the top 100)

In [69]:
#https://myanimelist.net/topanime.php?limit=0


In [84]:
def getAnimeTitleLink(max_rank=100):
    """Collect titles and detail-page links from the MyAnimeList top-anime ranking.

    The ranking is paginated through the ``limit`` query parameter, which is
    used here as the offset of the first entry on a page. Pages are requested
    in steps of 50, starting at offset 0.

    Parameters
    ----------
    max_rank : int, optional
        Number of top-ranked entries to collect. The default of 100 requests
        the same two pages (offsets 0 and 50) as the previously hard-coded
        loop; pass 1000 to cover the top 1000. Values of 0 or less request
        nothing.

    Returns
    -------
    tuple of (list, list)
        ``(titles, links)`` in page order, each holding at most ``max_rank``
        items.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out, fails, or returns an HTTP error status.
    """
    page_step = 50  # offset increment between consecutive ranking pages
    titles = []
    links = []
    for offset in range(0, max_rank, page_step):
        topUrl = f"https://myanimelist.net/topanime.php?limit={offset}"
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(topUrl, timeout=30)
        # Fail loudly instead of parsing an error page into zero entries,
        # which would leave this column shorter than the others.
        response.raise_for_status()
        doc = BeautifulSoup(response.text, 'html.parser')
        for div in doc.find_all("div", {'class' : 'di-ib clearfix'}):
            heading = div.contents[0]  # first child is the <h3> wrapping the link
            titles.append(heading.string)
            links.append(heading.a['href'])
    # Whole pages are fetched, so trim when max_rank is not a multiple of the step.
    return titles[:max_rank], links[:max_rank]
           

In [85]:
titles, links = getAnimeTitleLink()

In [103]:
def getScores(max_rank=100):
    """Collect the score text of each entry in the MyAnimeList top-anime ranking.

    Requests the same paginated ranking pages as the title/link scraper
    (``limit`` offsets 0, 50, ...) so that the returned scores can be paired
    with those titles by position.

    Parameters
    ----------
    max_rank : int, optional
        Number of top-ranked entries to collect. The default of 100 requests
        the same two pages (offsets 0 and 50) as the previously hard-coded
        loop. Values of 0 or less request nothing.

    Returns
    -------
    list
        Score values as they appear in the page markup (text, e.g. '9.15'),
        in page order, at most ``max_rank`` items.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out, fails, or returns an HTTP error status.
    """
    page_step = 50  # offset increment between consecutive ranking pages
    scores = []
    for offset in range(0, max_rank, page_step):
        topUrl = f"https://myanimelist.net/topanime.php?limit={offset}"
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(topUrl, timeout=30)
        # Fail loudly instead of parsing an error page into zero scores,
        # which would leave this column shorter than the others.
        response.raise_for_status()
        doc = BeautifulSoup(response.text, 'html.parser')
        for cell in doc.find_all('td', {'class':'score ac fs14'}):
            # The score text sits in a <span> inside the cell's first child.
            scores.append(cell.contents[0].span.string)
    # Whole pages are fetched, so trim when max_rank is not a multiple of the step.
    return scores[:max_rank]


In [105]:
scores = getScores()

In [114]:
def getGenres(links):
    """Fetch the genre/theme names listed on each anime detail page.

    Parameters
    ----------
    links : iterable of str
        URLs of anime detail pages.

    Returns
    -------
    list of list of str
        One inner list per URL, in input order, holding the text of every
        ``<span itemprop="genre">`` element on that page. The inner list is
        empty when a page has no such element.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out, fails, or returns an HTTP error status.
    """
    # Must be passed as `headers=`: the second positional argument of
    # requests.get is `params`, so passing the dict positionally only appended
    # "User-Agent=..." to the query string and never sent it as a header.
    hdr = {'User-Agent': 'Mozilla/5.0'}
    genres = []
    for url in links:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=hdr, timeout=30)
        # An error page would otherwise look like an anime with no genres.
        response.raise_for_status()
        doc = BeautifulSoup(response.text, 'html.parser')
        genreTags = doc.find_all('span', {'itemprop':'genre'})
        genres.append([genre.text for genre in genreTags])
    return genres

In [119]:
genres = getGenres(links)

In [90]:
def getSynopsis(links):
    """Fetch the synopsis text shown on each anime detail page.

    Parameters
    ----------
    links : iterable of str
        URLs of anime detail pages.

    Returns
    -------
    list
        One item per URL, in input order: the text of the first
        ``<p itemprop="description">`` element on the page, or ``None`` when
        the page has no such element. The placeholder keeps the result the
        same length as ``links``.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out, fails, or returns an HTTP error status.
    """
    synopsis = []
    for url in links:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        # Surface HTTP failures directly rather than as a missing-tag problem.
        response.raise_for_status()
        doc = BeautifulSoup(response.text, 'html.parser')
        tempSyn = doc.find_all('p', {'itemprop':'description'})
        # Indexing [0] unconditionally raised IndexError on a page without a
        # description, discarding everything collected so far.
        synopsis.append(tempSyn[0].text if tempSyn else None)
    return synopsis

In [95]:
synopsis = getSynopsis(links)

In [121]:
synopsis[:2]

['The conclusion to Shingeki no Kyojin.',
 'After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse\'s body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse\'s soul in the physical realm by binding it to a hulking suit of armor.\n\r\nThe brothers are rescued by their neighbor Pinako Rockbell and her granddaughter Winry. Known as a bio-mechanical engineering prodigy, Winry creates prosthetic limbs for Edward by utilizing "automail," a tough, versatile metal used in robots and combat armor. After years of training, the Elric brothers set off on a quest to restore their bodies by locating the Philosopher\'s Stone—a powerful gem that allows an alchemist to defy the traditional laws of Equ

In [108]:
# Column name -> list of values for the final table; every list comes from one
# of the scraping functions above and pandas needs them to be the same length.
# NOTE(review): the variable name `dict` shadows Python's built-in `dict` type
# for the rest of the kernel session; consider renaming it together with the
# cell that consumes it.
dict = {'Name':titles,
        'Score':scores,
        'Genres':genres,
        'Synopsis':synopsis,
        'Link':links
        }

In [111]:
df = pd.DataFrame(dict)

In [138]:
import numpy as np

# Number the rows from 1 instead of 0 so that each index value matches the
# anime's rank on the website.
df.index = np.arange(len(df)) + 1

In [140]:
import csv  # NOTE(review): `csv` is not referenced in this cell; to_csv below is a DataFrame method
# Write the table to mal.csv (the same file name as the earlier export) and
# label the index column "Rank".
df.to_csv('mal.csv',index_label='Rank')