# COGS 108 - Final Project Validation 

# Data Analysis

The data we scrape in this notebook serves as a new dataset on which we can validate our hypothesis.

In [79]:
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from bs4 import BeautifulSoup
from time import sleep
from IPython.display import clear_output

In [2]:
# Configure libraries
# The seaborn library makes plots look nicer
# sns.set() is called first and the plotting context is switched to 'talk' afterwards;
# NOTE(review): 'talk' presumably enlarges plot elements for presentations - confirm in the seaborn docs.
sns.set()
sns.set_context('talk')

# Don't display too many rows/cols of DataFrames
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10

# Round decimals when displaying DataFrames.
# The fully qualified option name is used on purpose: the short pattern 'precision'
# can match more than one registered option in newer pandas releases and is then
# rejected as ambiguous.
pd.set_option('display.precision', 2)

We will scrape MyAnimeList (MAL) for its recent users and collect the information they have chosen to list publicly on their profiles and rating lists.

In [32]:
# Useful functions for getting user data

def get_recent_users():
    """Scrape the usernames listed on MyAnimeList's recent-users page.

    Returns:
        list: the text of the first <div> inside every matching table cell.
        An empty list is returned when the page does not answer with
        HTTP 200, so callers can always iterate over the result.
    """
    r = requests.get('https://myanimelist.net/users.php', timeout=10)

    # Previously a non-200 response fell through and returned None, which
    # crashed any caller looping over the result.
    if r.status_code != 200:
        return []

    soup = BeautifulSoup(r.text, 'lxml')

    # Finds recent users td
    cells = soup.find_all('td', attrs={'align': 'center', 'class': 'borderClass'})

    users = []
    for cell in cells:
        div = cell.find('div')
        # Skip cells without a <div> instead of failing on `None.text`
        if div is not None:
            users.append(div.text)
    return users
    
def getUserInformation(user):
    """Fetch the raw JSON profile of a user from the Jikan API.

    Args:
        user: username to look up.

    Returns:
        str or None: the response body on HTTP 200; None when `user` is
        empty/None or when the API answers with any other status code.
    """
    # Truthiness check replaces `user is not ""`, which compared object
    # identity against a literal instead of comparing values.
    if not user:
        return None

    # Timeout matches get_recent_users so a stalled request cannot hang the scrape
    r = requests.get('https://api.jikan.moe/v3/user/{}/profile'.format(user), timeout=10)

    # Same status handling as getAnimeList: only a successful body is returned
    if r.status_code == 200:
        return r.text
    return None

def insertInfoIntoDataFrame(df, info):
    """Merge one user's profile statistics into the user DataFrame.

    Args:
        df: DataFrame of users collected so far (indexed by username).
        info: JSON text of a user profile, or None when no profile could
            be fetched.

    Returns:
        DataFrame: `df` combined with a one-row frame for this user; `df`
        itself, unchanged, when `info` is None.
    """
    # getUserInformation may return None; skip such users instead of
    # failing inside json.loads (mirrors insertAnimeUser / insertAnimeInfo).
    if info is None:
        return df

    info = json.loads(info)
    local = pd.DataFrame({'username': [info['username']],
                          'location': [info['location']],
                          'Anime-Watching': [info['anime_stats']['watching']],
                          'Anime-Completed': [info['anime_stats']['completed']],
                          'Anime-Total': [info['anime_stats']['total_entries']],
                          'Anime-Mean-Score': [info['anime_stats']['mean_score']],
                          'Manga-Reading': [info['manga_stats']['reading']],
                          'Manga-Completed': [info['manga_stats']['completed']],
                          'Manga-Total': [info['manga_stats']['total_entries']],
                          'Manga-Mean-Score': [info['manga_stats']['mean_score']]})
    local = local.set_index('username')

    # combine_first keeps values already present in df and fills the rest from local
    df = df.combine_first(local)
    return df

In [4]:
# Start from an empty frame and register every column the scraper fills in
userInfo = pd.DataFrame()

# Text-like columns
userInfo['username'] = ""
userInfo['location'] = None

# Numeric statistics columns, all initialised the same way
for stat_column in ('Anime-Watching', 'Anime-Completed', 'Anime-Total', 'Anime-Mean-Score',
                    'Manga-Reading', 'Manga-Completed', 'Manga-Total', 'Manga-Mean-Score'):
    userInfo[stat_column] = 0

# Users are looked up by name, so the username becomes the index
userInfo = userInfo.set_index('username')

In [145]:
# Scraping takes a very long time, so it is switched off by default;
# set skip = False to download fresh data
skip = True

# Ten rounds: each round fetches the current recent-users page and stores every profile
for i in range(10):
    if not skip:
        for user in get_recent_users():
            profile_json = getUserInformation(user)
            userInfo = insertInfoIntoDataFrame(userInfo, profile_json)
            # pause 2 seconds between profile requests
            sleep(2)
        # pause 4 more seconds before asking for the next batch of recent users
        sleep(4)

In [44]:
# When scraping was skipped, load the previously saved profiles;
# otherwise save the freshly scraped ones
if skip:
    userInfo = pd.read_csv('userInfo.csv', index_col='username')
else:
    userInfo.to_csv('userInfo.csv')

In [45]:
# functions for parsing anime list
def getAnimeList(user):
    """Fetch the raw JSON anime list of a user from the Jikan API.

    Args:
        user: username whose list is requested.

    Returns:
        str or None: the response body on HTTP 200; None when `user` is
        empty/None or when the API answers with any other status code.
    """
    # Truthiness check replaces `user is not ""`, which compared object
    # identity against a literal instead of comparing values.
    if not user:
        return None

    # Timeout matches get_recent_users so a stalled request cannot hang the scrape
    r = requests.get('https://api.jikan.moe/v3/user/{}/animelist'.format(user), timeout=10)
    if r.status_code == 200:
        return r.text
    return None

def insertAnimeUser(df, info, name, animeInfo):
    """Add one user's ratings to the ratings table and the title -> id table.

    Args:
        df: ratings DataFrame (rows are anime titles, one column per user).
        info: JSON text of the user's anime list, or None.
        name: user name; used (as a string) for the new ratings column.
        animeInfo: DataFrame mapping anime titles to their mal_id.

    Returns:
        tuple: (df, animeInfo), both combined with this user's entries.
        Both inputs are returned untouched when `info` is None.
    """
    if info is None:
        return df, animeInfo

    entries = json.loads(info)['anime']

    # One list per field, aligned by position
    titles = [str(entry['title']) for entry in entries]
    ratings = [entry['score'] for entry in entries]
    mal_id = [entry['mal_id'] for entry in entries]

    # Ratings column for this user, keyed by title
    user_ratings = pd.DataFrame({'anime': titles, str(name): ratings}).set_index('anime')

    # Title -> mal_id lookup, keyed by title
    id_lookup = pd.DataFrame({'anime': titles, 'mal_id': mal_id}).set_index('anime')

    return df.combine_first(user_ratings), animeInfo.combine_first(id_lookup)

In [46]:
# Ratings table (one column per user) and title -> mal_id table, both empty to begin with
animeUser = pd.DataFrame()
animeInfo = pd.DataFrame()

# Walk over every known user and merge their anime list into both tables
for index, row in userInfo.iterrows():
    username = row.name

    # Downloading is only done when scraping is enabled
    if not skip:
        list_json = getAnimeList(username)
        animeUser, animeInfo = insertAnimeUser(animeUser, list_json, username, animeInfo)
        sleep(1)

In [49]:
# When scraping was skipped, load both tables from disk;
# otherwise save what was just downloaded
if skip:
    animeUser = pd.read_csv('animeUser.csv', index_col='anime')
    animeInfo = pd.read_csv('animeInfo.csv', index_col='anime')
else:
    animeUser.to_csv('animeUser.csv')
    animeInfo.to_csv('animeInfo.csv')

In [119]:
# functions for getting anime info
def getAnimeInfo(anime):
    """Fetch the raw JSON details of one anime from the Jikan API.

    Args:
        anime: value placed in the request URL (the caller passes a mal_id).

    Returns:
        str or None: the response body on HTTP 200, otherwise None.
    """
    # This is called once per anime in a long loop; without a timeout a
    # single stalled connection would block the whole run. The value
    # matches the one used by get_recent_users.
    r = requests.get('https://api.jikan.moe/v3/anime/{}'.format(anime), timeout=10)
    if r.status_code == 200:
        return r.text
    return None

def insertAnimeInfo(df, info):
    """Merge the details of one anime into the anime DataFrame.

    Args:
        df: anime DataFrame indexed by title.
        info: JSON text describing one anime, or None.

    Returns:
        DataFrame: `df` combined with a one-row frame for this anime;
        `df` itself, unchanged, when `info` is None.
    """
    if info is None:
        return df

    payload = json.loads(info)

    # Fields copied verbatim from the payload, in output column order
    copied_fields = ('title_japanese', 'type', 'episodes', 'synopsis', 'status',
                     'score', 'url', 'popularity', 'members', 'scored_by', 'rank')

    record = {'anime': payload['title'],
              'mal_id': int(payload['mal_id'])}
    for field in copied_fields:
        record[field] = payload[field]

    # All values are scalars, so a one-element index is needed to build the single row;
    # the title then replaces that placeholder index
    row_frame = pd.DataFrame(record, index=['anime']).set_index('anime')
    return df.combine_first(row_frame)

In [127]:
# Download the details of every anime listed in animeInfo
for index, row in animeInfo.iterrows():

    # Nothing is fetched unless scraping is enabled
    if not skip:
        details_json = getAnimeInfo(int(row.mal_id))
        animeInfo = insertAnimeInfo(animeInfo, details_json)

        # Show only the title currently being processed as a progress indicator
        clear_output(wait=True)
        print(index)
        sleep(0.5)

◯


In [148]:
# A leftover 'anime' column would duplicate the index, so remove it when present
if 'anime' in animeInfo.columns:
    animeInfo = animeInfo.drop(columns='anime')

# Make sure the index carries the name used when the CSV is read back
animeInfo = animeInfo.rename_axis('anime')

# When scraping was skipped, load the saved table; otherwise save the fresh one
if skip:
    animeInfo = pd.read_csv('animeInfo.csv', index_col='anime')
else:
    animeInfo.to_csv('animeInfo.csv')

Now that we have scraped the data, we need to clean it up before analysis. We will need to replace null values as well as make sure there are values in each cell of the data frame.

In [1]:
# clean up users data frame

# Privacy / Ethics Considerations

With regard to this project, we are using publicly available data. However, the data needs to be cleaned because it contains location data and usernames. We will need to inspect it to make sure that no other potentially identifying pieces of information remain. Once this is done, the data we use and display will not reveal any confidential information. Beyond this, we comply with the Terms of Service provided by MyAnimeList, so our data usage is ethically sound. Furthermore, users on MyAnimeList can restrict who may view their ratings lists, so anyone who does not want their ratings used by a third-party application can simply restrict access to their lists.

The results of this project do not contain sensitive user information, as everything is anonymized. The purpose of the project is simply to see whether meaningful correlations between anime or manga can be drawn and used to make predictions. However, the results may be biased because we select a random sample from the total population of users on MAL. This may introduce bias, since users may prefer certain kinds of anime over others, which would skew the data analysis and predictions.

# Conclusion & Discussion

