### Packages
Uncomment and install if you don't have these yet

In [1]:
# The install commands are wrapped in a string literal so they do not run.
# Remove the two ''' lines to execute them.
'''
!pip install pandas
!pip install beautifulsoup4
!pip install selenium
!pip install requests
'''

'\n!pip install pandas\n!pip install beautifulsoup4\n!pip install selenium\n!pip install requests\n'

In [2]:
import pandas as pd

import requests

from bs4 import BeautifulSoup

import re
import time
import numpy as np

from enum import Enum
from abc import ABC, abstractmethod

# Some useful classes to prepare.

## The scraper
<li>Help correctly build URLS</li>
<li>Pull raw HTML and generate soups</li>

In [3]:
class Scraper:
    class SEASONS(Enum):
        FALL = 'fall'
        SPRING = 'spring'
        SUMMER = 'summer'
        WINTER = 'winter'
    
    def build_URL_Seasons(year : str | int, 
                          season : SEASONS) -> str:

        return f"https://myanimelist.net/anime/season/{year}/{season.value}"

    
    def delay(minimum = 5):
        time.sleep(np.random.rand() * 10 + minimum)

    def scrape(url : str) -> BeautifulSoup:
        #include a delay per get() to avoid DDOS 
        Scraper.delay()
        
        response = requests.get(url)
        if response.status_code != requests.status_codes.codes.all_ok:
            raise ConnectionError
        
        return BeautifulSoup(response.text, 'html.parser')

## Pages
Abstract template where Pages must
<li>Parse their soups accordingly</li>


### What pages are we expecting?
<li>Seasonal page listing all anime titles links.</li>
<li>Details page containing an entry's info.</li>
<li>Stats page listing all reviews.</li>

In [4]:
class ScrapedPage(ABC):
    """Abstract template for a scraped page.

    Subclasses implement ``parse`` to pull their data out of a soup; the
    helpers are static so they are called on the class itself.
    """

    @staticmethod
    @abstractmethod
    def parse(soup = None) -> dict:
        """Extract the page's data from ``soup``.

        The base implementation ignores ``soup`` and returns an empty dict.
        ``soup`` defaults to ``None`` so the old zero-argument call still works.
        """
        return {}

    @staticmethod
    def cleanString(item : str) -> str :
        """Collapse each run of whitespace in ``item`` to one space and trim both ends."""
        return re.sub(r'\s+', ' ', item).strip()

In [5]:
# Guard flag: while it is False the assert below raises AssertionError,
# which stops execution of this cell at this point.
CONTINUE = False

assert CONTINUE

AssertionError: 

# WebScraping MAL(myanimelist.net)

### About MAL
One of the most popular anime forums containing rankings, discussions, and information on anime media (movies, shows, manga).

### To do: 
<ol>
    <li>List all entries under a specified season and category.</li>
    <li>Find each entry's article page and gather all relevant data.</li>
</ol>

## Step 0: Inspect the Websites

### What we Know

<ol>
    <li>Base url: <a href="https://myanimelist.net/">https://myanimelist.net/</a></li>
    <img src="Images/base_url.png"/>
    <li>Url format for seasonal listings: <b>{BASE URL}/anime/season/{YEAR}/{SEASON}</b></li>
    <img src="Images/seasonal_url.png"/>
    <li>Url format for article pages: <b>{BASE URL}/anime/{ID}/{NAME}</b></li>
    <img src="Images/article_url.png"/>
    <li>All webpages are <b>static</b></li>
    <li>Default requests.get() functions without restrictions.</li>
    <li>The site doesn't require login to access data.</li>
</ol>


# Season Page

## Inspecting the seasonal list page

### The headers
>We can navigate by clicking the headers to select a specified {YEAR} and {SEASON}.
<br>However, we already know how the URL formatting works, so we don't need the added complication of Selenium.

><b>{BASE URL}/anime/season/{YEAR}/{SEASON}</b>

>Try these:
>><a href="https://myanimelist.net/anime/season/2024/fall">2024 and Fall</a>
<br><a href="https://myanimelist.net/anime/season/2018/summer">2018 and Summer</a>

<img src="Images/seasonal_headers.png"/>

Let's try building the url.

> YEAR : String
> <br>SEASON : String {'fall', 'summer', 'spring', 'winter'}

```python
SEASONS_URL = "https://myanimelist.net/anime/season/"
YEAR = "2024"
SEASON = "spring"

FULL_SEASONS_URL = f'{SEASONS_URL}{YEAR}/{SEASON}'

```

In [None]:
# Year and season of the listing to scrape; later cells reuse both for file names.
YEAR = 2024
SEASON = Scraper.SEASONS.SPRING

# Build the listing URL with the helper instead of formatting it by hand.
FULL_SEASONS_URL = Scraper.build_URL_Seasons(YEAR, SEASON)
print(FULL_SEASONS_URL)

### The entries

>The entries are all listed even without scrolling thus proving it's a static page.
><br>There's some information provided but it's still nothing compared to the article page.

>What we'll need from this page is just
><li>The title</li>
><li>The link to the article</li>

<img src="Images/seasonal_entries.png"/>


## Step 1: Scrape the HTML

Using requests.get(), pull the HTML data.

Using BeautifulSoup, convert the HTML into a Soup wrapper.

```python
response = requests.get(FULL_SEASONS_URL)

#Check if there was an issue encountered while getting the HTML
assert(response.status_code == requests.status_codes.codes.all_ok)

#Convert the blob into the soup wrapper
BeautifulSoup(response.text, "html.parser")
```

In [None]:
# Download the seasonal listing; Scraper.scrape sleeps before the request and raises on a non-200 status.
seasonalSoup = Scraper.scrape(FULL_SEASONS_URL)

In [None]:
# Print the indented HTML to inspect the page structure.
print(seasonalSoup.prettify())

## Step 2: Parse the Needed Data

> Return to the entry...
> <br>The only actual information we need is the title and its link.
> <br>We don't need the other data or the parent div container.

<img src='Images/link_title.png'/>

> To extract that there's the following attributes...
> <li>element_type : a</li>
> <li>class_name : link-title</li>

<img src='Images/link_title_source.png'/>

In [None]:
# Collect every element carrying the 'link-title' class (any tag name matches).
link_titles = seasonalSoup.find_all(class_ ='link-title')
link_titles

### Extract data
Get the text and the href.

In [None]:
# One row per matched element: its text is the title, its href the article link.
df = pd.DataFrame({
    'Title':[a.text for a in link_titles], 
    'Link':[a.get('href') for a in link_titles]
})

df

## Wrap
> Let's extend the `ScrapedPage` to cover all the steps for the `SeasonPage`.

In [None]:
class SeasonPage(ScrapedPage):
    """Parser for a seasonal listing page."""

    @staticmethod
    def parse(soup : BeautifulSoup) -> dict:
        """Return the titles and article links listed on a seasonal page.

        Returns:
            A dict with two equally long lists: ``'Title'`` holds the text of
            each element with class ``link-title`` and ``'Link'`` its ``href``
            (``None`` when the element has no ``href``).
        """
        link_titles = soup.find_all(class_ ='link-title')

        return {
            'Title':[a.text for a in link_titles], 
            'Link':[a.get('href') for a in link_titles]
        }

In [None]:
# Same table as before, now produced through the SeasonPage wrapper.
df_Season = pd.DataFrame(SeasonPage.parse(seasonalSoup))

### Checkpoint
Let's save it as a csv 

In [None]:
from pathlib import Path

filename = f'Season/{YEAR}_{SEASON.value}.csv'
# to_csv does not create missing folders, so make sure 'Season/' exists first.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
df_Season.to_csv(filename)
print(filename)

### End of Scraping Season Page

We now have a complete list of anime titles and their article pages.

We will be repeating the same methodology onwards in the individual article pages where we Inspect, Scrape, then Parse.

# Scraping Articles Page
<img src='Images/article.png'>

# Details Page

> We will primarily be scraping the details page.
> This is because there are different pages/headers depending on media type.

### Episode headers
<img src='Images/article_episode_headers.png'/>

### Movie headers
<img src='Images/article_movie_headers.png'/>

> Information we need are in the left panel.
> <br>The rest are redundant.
> <br>Note that there's also a js script for displaying more/less titles, but the title name still exists within the static context. 

<img src='Images/article_details_panel.png'/>

> Let's first get the HTML and read it

In [None]:
# Use the first listed title as the sample details page.
link = df.iloc[0].Link
print(link)

# Download it and print the indented HTML for inspection.
detailsSoup = Scraper.scrape(link)
print(detailsSoup.prettify())

### Extract Panel Data
> There are 2 ways to extract it.

> The first is to find the parent div and extract its <b>children</b>.
> <br><img src='Images/div_spaceit_pad.png'/>
> <br>But this has issues where other unwanted divs also use this class name.

> The other alternative is to find a unique class.<br>
> Which is the one used in the label.<br>
<img src='Images/span_dark_text.png'/>

In [None]:
# Show the last element with class 'spaceit_pad' as an example of an unwanted match.
print('Unwanted/Redundant:')
print(detailsSoup.find_all(class_='spaceit_pad')[-1])

In [None]:
# List every element with class 'dark_text'; these are the labels used below.
detailsSoup.find_all(class_='dark_text')

> To access the content values, ask the siblings.<br>
> <img src='Images/dark_text_siblings.png'/>

> Adjust accordingly as some siblings are just text,
> but others can be a hyperlink or even lists of them.<br>
> <img src='Images/siblings_hyperlinks.png'/>

In [None]:
# Maps each label (colon kept) to its value: a string, or a list of link texts.
output = {}

for d in detailsSoup.find_all(class_='dark_text'):
    label = d.text.strip()
    # First following sibling that is plain text.
    # NOTE(review): find_next_sibling returns None when no text sibling exists,
    # in which case .strip() raises AttributeError.
    content = d.find_next_sibling(string=True).strip()
    
    # An empty text sibling means the value is held in <a> siblings instead.
    if content == '':
        content = [a.text.strip() for a in d.find_next_siblings('a')]

    output[label] = content

output

### Fixing score and Additional Info
> Score is still missing. This is because it's composed of more complex siblings.

> Luckily it has a unique `itemprop` attribute, so use that instead.

> In addition, let's get more data
> <li>Scorers</li>
> <li>Synopsis</li>
> <li>Related_Entries</li>

In [None]:
# These elements are located by their itemprop attribute rather than by label.
# The 'Score:' key keeps the trailing colon, like the labels stored by the loop above.
output['Score:'] = detailsSoup.find('span', attrs={'itemprop':'ratingValue'}).text.strip()
output['Scorers'] = detailsSoup.find('span', attrs={'itemprop':'ratingCount'}).text.strip()
# Collapse whitespace runs so the synopsis becomes a single line.
output['Synopsis'] = re.sub(r'\s+', ' ', detailsSoup.find('p', attrs={'itemprop':'description'}).text).strip()

In [None]:
# One whitespace-normalised string per div whose class attribute is 'entry borderClass'.
output['Related_Entries'] = [re.sub(r'\s+', ' ', entry.text).strip() for entry in detailsSoup.find_all('div', attrs={'class':'entry borderClass'})]
output

In [None]:
# Wrap the dict in a list so it becomes a single-row DataFrame.
df_Details = pd.DataFrame([output])
df_Details

## Wrap for Iteration

In [None]:
class DetailsPage(ScrapedPage):
    """Parser for an anime details page.

    ``parse`` runs four independent section parsers that each write their keys
    into a shared dict. A section that fails is reported and skipped, so the
    other sections are still collected.
    """

    @staticmethod
    def parse(soup : BeautifulSoup) -> dict:
        """Return everything that could be extracted from a details page.

        Never raises for a missing section: the failure is printed together
        with its cause and the keys of that section are left out of the result.
        """
        output = {}

        sections = (
            ('Left Panel', DetailsPage.parseLeftPanel),
            ('Score', DetailsPage.parseScore),
            ('Synopsis', DetailsPage.parseSynopsis),
            ('Related Entries', DetailsPage.parseRelatedEntries),
        )

        for name, parser in sections:
            try:
                parser(soup, output)
            except Exception as e:
                # Best effort by design: report the cause and keep the other sections.
                print(f'{name} Failed: {e!r}')

        return output

    @staticmethod
    def parseLeftPanel(soup : BeautifulSoup, output : dict) :
        """Add one entry per ``dark_text`` label, with colons removed from the label.

        The value is the text sibling that follows the label, or, when that text
        is empty or absent, the list of texts of the following ``<a>`` siblings.
        """
        for d in soup.find_all(class_='dark_text'):
            label = d.text.replace(':','').strip()

            # A label without any text sibling must not abort the remaining labels.
            sibling = d.find_next_sibling(string=True)
            content = '' if sibling is None else sibling.strip()

            if content == '':
                content = [a.text.strip() for a in d.find_next_siblings('a')]

            output[label] = content

    @staticmethod
    def parseScore(soup : BeautifulSoup, output : dict) :
        """Add ``'Score'`` and ``'Scorers'`` from the ``itemprop`` rating spans.

        Raises:
            ValueError: if either span is missing. ``'Score'`` is already stored
                when only the rating count is missing.
        """
        score = soup.find('span', attrs={'itemprop':'ratingValue'})
        if score is None:
            raise ValueError('No score')
        output['Score'] = score.text.strip()

        scorers = soup.find('span', attrs={'itemprop':'ratingCount'})
        if scorers is None:
            raise ValueError('No scorer count')
        output['Scorers'] = scorers.text.strip()

    @staticmethod
    def parseSynopsis(soup : BeautifulSoup, output : dict) :
        """Add ``'Synopsis'`` as whitespace-normalised text.

        Raises:
            ValueError: if the description paragraph is missing.
        """
        synopsis = soup.find('p', attrs={'itemprop':'description'})
        if synopsis is None:
            raise ValueError('No synopsis')

        output['Synopsis'] = DetailsPage.cleanString(synopsis.text)

    @staticmethod
    def parseRelatedEntries(soup : BeautifulSoup, output : dict) :
        """Add ``'Related_Entries'``: one cleaned string per ``entry borderClass`` div.

        The list is empty when the page has no such div.
        """
        output['Related_Entries'] = [DetailsPage.cleanString(entry.text) for entry in soup.find_all('div', attrs={'class':'entry borderClass'})]
    

## Testing
>Run these with known missing values and see how effective error handling is.

In [None]:
# Sample of four rows: two picked at random plus the 15th-from-last and the last row.
# NOTE(review): iloc[-15] raises IndexError when df_Season has fewer than 15 rows.
BATCH_TEST = df_Season.iloc[[np.random.randint(df_Season.shape[0]),np.random.randint(df_Season.shape[0]),-15,-1]]
BATCH_TEST

In [None]:
# Collect one dict per successfully scraped page and build the DataFrame once
# at the end; concatenating inside the loop re-copies every earlier row.
detail_rows = []

for _, row in BATCH_TEST.iterrows():
    try:
        soup = Scraper.scrape(row.Link)
        output = DetailsPage.parse(soup)
        output['Title'] = row.Title
    except Exception as e:
        # A failed page is reported and skipped so the rest of the batch still runs.
        print(f'ERROR: {row.Link} {e}')
    else:
        detail_rows.append(output)
        print(row.Title)

# An empty batch yields an empty DataFrame instead of None, so the cells
# that follow can still call DataFrame methods on it.
df_Details = pd.DataFrame(detail_rows)

In [None]:
from pathlib import Path

details_path = Path(f'Detail/{YEAR}_{SEASON.value}.csv')
# to_csv does not create missing folders, so make sure 'Detail/' exists first.
details_path.parent.mkdir(parents=True, exist_ok=True)
df_Details.to_csv(details_path)
df_Details

### End of Details Page
Run the full for loop in the other notebook to observe final results.

# Statistics Page

In [None]:
# Raises AssertionError unless CONTINUE has been set to a truthy value.
assert CONTINUE

> The stats page is composed of 3 parts.
> <li>A summary of viewing status.</li>
> <li>A histogram of user scores.</li>
> <li>All score postings and viewing status</li>

<img src='Images/stats_page.png'/>

### URL
> The `show` offset increments in steps of `75` but cannot be >= `7500`. <br>
> In short, only the latest 7499 posts are provided.

#### Stats page landing<br>
> https://myanimelist.net/anime/9617/K-On_Movie/stats

or

> https://myanimelist.net/anime/9617/K-On_Movie/stats?show=0#members
#### Next page
> https://myanimelist.net/anime/9617/K-On_Movie/stats?show=75#members

#### Exceeding last page generates an error response
<img src='Images/stats_page_404.png'>

In [None]:
# Show the HTTP status code returned for an offset far beyond the last page.
requests.get('https://myanimelist.net/anime/9617/K-On_Movie/stats?show=5000000000#members').status_code

## Part 1. User viewership submissions
> We will only be able to get partial data since it caps at 7499, but its data can be used in providing latest aggregates.

<img src='Images/stats_viewershipsubmission.png'>

In [None]:
class StatisticsSubmissionPage(ScrapedPage):
    """Parser for the recently-updated members table of a stats page."""

    @staticmethod
    def parse(soup : BeautifulSoup) -> list :
        """Return the table as a list of rows, with the header row first.

        The header row holds the text of every ``td`` in the first ``tr``.
        Each following row starts with the text of the ``word-break`` link,
        followed by the text of every ``borderClass ac`` cell.

        Raises:
            ValueError: if the table is missing or has no rows.
        """
        table = soup.find('table', attrs={'class':'table-recently-updated'})
        if table is None:
            raise ValueError("No 'table-recently-updated' table found")

        trs = table.find_all('tr')
        if not trs:
            raise ValueError("'table-recently-updated' table has no rows")

        rows = [[header.text for header in trs[0].find_all('td')]]

        for tr in trs[1:]:
            user = tr.find('a', attrs={'class':'word-break'}).text.strip()
            cells = [td.text.strip() for td in tr.find_all('td', attrs={'class':'borderClass ac'})]
            rows.append([user, *cells])

        return rows

    @staticmethod
    def parseAsDf(soup : BeautifulSoup) -> pd.DataFrame :
        """Return the parsed table as a DataFrame whose columns are the header row."""
        rows = StatisticsSubmissionPage.parse(soup)
        return pd.DataFrame(rows[1:], columns=rows[0])

### Table data for plain text

<img src='Images/stats_submission_text.png'>

### Table data for user name

<img src='Images/stats_user.png'>

### Try parsing Frieren last page

In [None]:
LINK = 'https://myanimelist.net/anime/52991/Sousou_no_Frieren/stats?show=7425#members'
# Fetch through the shared helper so this request gets the same delay and
# status check as every other page, and reuse LINK instead of repeating the URL.
statsSoup = Scraper.scrape(LINK)

In [None]:
table = statsSoup.find('table', attrs={'class':'table-recently-updated'})

# The first <tr> supplies the column headers.
rows = [[header.text for header in table.find('tr').find_all('td')]]

for tr in table.find_all('tr')[1:]:
    # Each data row starts with the user name, followed by the centred cells.
    row = [tr.find('a', attrs={'class':'word-break'}).text.strip()]
    # extend() appends the cells directly; a list comprehension used only for
    # its side effect would build and discard a list of None values.
    row.extend(td.text.strip() for td in tr.find_all('td', attrs={'class':'borderClass ac'}))

    rows.append(row)

rows

In [None]:
# Same table, now produced through the wrapper as a DataFrame.
df_Statistics = StatisticsSubmissionPage.parseAsDf(statsSoup)
df_Statistics

### Parse the Entire KON User Submissions

In [None]:
from pathlib import Path

LINK = 'https://myanimelist.net/anime/9617/K-On_Movie/'

# The 'show' offset advances one page (75 rows) at a time, for at most 100 pages.
PAGE_SIZE = 75
MAX_PAGES = 100

# Collect one DataFrame per page and concatenate once at the end;
# concatenating inside the loop re-copies every earlier page.
frames = []

for i in range(0, PAGE_SIZE * MAX_PAGES, PAGE_SIZE):
    try:
        URL = f'{LINK}stats?show={i}#members'
        print(URL)

        # Assign to a temporary first, so that after a failed request statsSoup
        # still holds the last page that was downloaded (later cells reuse it).
        pageSoup = Scraper.scrape(URL)
        statsSoup = pageSoup

        frames.append(StatisticsSubmissionPage.parseAsDf(statsSoup))

    except Exception as e:
        # The first page that cannot be fetched or parsed ends the run.
        print(f'END {e}')
        break

if frames:
    df_Statistics = pd.concat(frames, ignore_index=True)

    # Take the slug out of the f-string: reusing the same quote character
    # inside an f-string is a SyntaxError before Python 3.12.
    title_slug = LINK.split('/')[-2]
    filename = f'Statistics/{title_slug}_Submissions.csv'

    # to_csv does not create missing folders, so make sure 'Statistics/' exists first.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    df_Statistics.to_csv(filename)
    print(filename)
else:
    # Nothing was scraped: keep the original None value and skip the save.
    df_Statistics = None
    print('No statistics pages were scraped; nothing saved.')

## Part 2. Historical Summary
> Let's also get the full picture based on the summary stats provided.

<img src='Images/stats_summary.png'>

Get the header, then get the text elements that follow it, with their labels and values.

<img src='Images/stats_darktext.png'>

In [None]:
# Header element that precedes the summary statistics; used as the search anchor below.
H2_SUMMARY = statsSoup.find('h2', attrs={'id':'summary_stats'})

In [None]:
# Maps each label (leading and trailing colons stripped) to the text that follows it.
output = {}

# find_all_next looks at every dark_text span after the header in document order,
# not only those inside one section.
for s in H2_SUMMARY.find_all_next('span', attrs={'class':'dark_text'}):
    output[s.text.strip(':')] = s.find_next_sibling(string=True).strip()

output

<img src='Images/stats_scoretable.png'>

In [None]:
# Locate the score table and list its rows for inspection.
TABLE_SCORE = statsSoup.find('table', attrs={'class':'score-stats'})
TABLE_SCORE.find_all('tr')

In [None]:
# Per row: key is the text of the score-label element, value is the first run
# of digits found in the row's <small> element (kept as a string).
for tr in TABLE_SCORE.find_all('tr'):
    output[tr.select('.score-label')[0].text] = re.search(r'(\d+)', tr.find('small').text).group(1)

# Wrap the dict in a list so it displays as a single-row DataFrame.
pd.DataFrame([output])

In [None]:
class StatisticsSummaryPage(ScrapedPage):
    """Parser for the summary block and score table of a stats page."""

    @staticmethod
    def parse(soup : BeautifulSoup) -> dict :
        """Return the summary values and per-score counts found in ``soup``.

        The two sections are parsed independently: a failure in one is printed
        and does not prevent the other from being collected.
        """
        output = {}

        # A return placed inside ``finally`` would also discard KeyboardInterrupt
        # and other BaseExceptions, so errors are handled per section instead.
        for parser in (StatisticsSummaryPage.parseSummary,
                       StatisticsSummaryPage.parseScores):
            try:
                parser(soup, output)
            except Exception as e:
                print(e)

        return output

    @staticmethod
    def parseSummary(soup : BeautifulSoup, output : dict):
        """Add one entry per ``dark_text`` span that follows the summary header.

        Keys are the span texts with leading and trailing colons stripped.
        Values are the stripped text sibling after the span, or ``''`` when
        there is none.

        Raises:
            ValueError: if the ``summary_stats`` header is missing.
        """
        H2_SUMMARY = soup.find('h2', attrs={'id':'summary_stats'})
        if H2_SUMMARY is None:
            raise ValueError("No 'summary_stats' header found")

        for s in H2_SUMMARY.find_all_next('span', attrs={'class':'dark_text'}):
            value = s.find_next_sibling(string=True)
            output[s.text.strip(':')] = '' if value is None else value.strip()

    @staticmethod
    def parseScores(soup : BeautifulSoup, output : dict):
        """Add one entry per score row: score label -> first number in its ``<small>``.

        Rows that lack a label, a ``<small>`` element or a number are skipped.

        Raises:
            ValueError: if the ``score-stats`` table is missing.
        """
        TABLE_SCORE = soup.find('table', attrs={'class':'score-stats'})
        if TABLE_SCORE is None:
            raise ValueError("No 'score-stats' table found")

        for tr in TABLE_SCORE.find_all('tr'):
            label = tr.select_one('.score-label')
            small = tr.find('small')
            if label is None or small is None:
                continue

            votes = re.search(r'(\d+)', small.text)
            if votes is None:
                continue

            output[label.text] = votes.group(1)

In [None]:
output = StatisticsSummaryPage.parse(statsSoup)
# LINK ends with '/', so the second-to-last path segment is the title slug.
output['Title'] = LINK.split('/')[-2]
[output]