### ETL Project: Anime Ratings

DESCRIPTION OF THE PROJECT ####

##### 0 - Libraries

In [185]:
import pandas as pd
import numpy as np

##### 1 - Anime Information Dataset

In [186]:
#Loading data
anime = pd.read_csv('data/anime.csv', encoding= 'utf-8')
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [187]:
#Describe amount of ratings per anime.
round(anime['members'].describe(),0)

count      12294.0
mean       18071.0
std        54821.0
min            5.0
25%          225.0
50%         1550.0
75%         9437.0
max      1013917.0
Name: members, dtype: float64

In [188]:
#Describe ratings per anime.
round(anime['rating'].describe(),0)

count    12064.0
mean         6.0
std          1.0
min          2.0
25%          6.0
50%          7.0
75%          7.0
max         10.0
Name: rating, dtype: float64

In [189]:
#Top 10 anime by amount of ratings
anime.sort_values('members', ascending= False).head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
40,1535,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917
86,16498,Shingeki no Kyojin,"Action, Drama, Fantasy, Shounen, Super Power",TV,25,8.54,896229
804,11757,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
159,6547,Angel Beats!,"Action, Comedy, Drama, School, Supernatural",TV,13,8.39,717796
19,1575,Code Geass: Hangyaku no Lelouch,"Action, Mecha, Military, School, Sci-Fi, Super...",TV,25,8.83,715151
841,20,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
445,10620,Mirai Nikki (TV),"Action, Mystery, Psychological, Shounen, Super...",TV,26,8.07,657190
131,4224,Toradora!,"Comedy, Romance, School, Slice of Life",TV,25,8.45,633817


In [190]:
#Variable 'Type' unique values
anime['type'].unique()

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA', nan], dtype=object)

In [191]:
#Top 10 anime by global rating = TV (with at least 100 member ratings)
anime.loc[(anime['members'] > 100)&(anime['type'] == 'TV'),:].sort_values('rating', ascending= False).head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11,81109
10,4181,Clannad: After Story,"Drama, Fantasy, Romance, Slice of Life, Supern...",TV,24,9.06,456749
12,918,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,201,9.04,336376
13,2904,Code Geass: Hangyaku no Lelouch R2,"Action, Drama, Mecha, Military, Sci-Fi, Super ...",TV,25,8.98,572888


In [192]:
#Top 10 anime by global rating = Movie (with at least 100 member ratings)
anime.loc[(anime['members'] > 100)&(anime['type'] == 'Movie'),:].sort_values('rating', ascending= False).head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1,72534
11,28851,Koe no Katachi,"Drama, School, Shounen",Movie,1,9.05,102733
15,199,Sen to Chihiro no Kamikakushi,"Adventure, Drama, Supernatural",Movie,1,8.93,466254
18,12355,Ookami Kodomo no Ame to Yuki,"Fantasy, Slice of Life",Movie,1,8.84,226193
24,164,Mononoke Hime,"Action, Adventure, Fantasy",Movie,1,8.81,339556
25,7311,Suzumiya Haruhi no Shoushitsu,"Comedy, Mystery, Romance, School, Sci-Fi, Supe...",Movie,1,8.81,240297
33,28957,Mushishi Zoku Shou: Suzu no Shizuku,"Adventure, Fantasy, Historical, Mystery, Seine...",Movie,1,8.75,32266
35,431,Howl no Ugoku Shiro,"Adventure, Drama, Fantasy, Romance",Movie,1,8.74,333186
37,31757,Kizumonogatari II: Nekketsu-hen,"Action, Mystery, Supernatural, Vampire",Movie,1,8.73,34347


In [193]:
#Identifying unique genres
genres_list = []
for i in range(0, len(anime)):
    if isinstance(anime['genre'][i], str):
        genre = anime['genre'][i].split(', ')
        for j in range(0, len(genre)):
            if genre[j] not in genres_list:
                genres_list.append(genre[j])

#Creating columns in df with all unique genres
for genre in genres_list:
    anime['genre_'+genre] = np.nan
    
#Assigning values '1' or '0' to genre columns
for i in range(0, len(anime)):
    if isinstance(anime['genre'][i], str):
        genre = anime['genre'][i].split(', ')
    else: 
        genre= ['NONE']
    
    for j in range(7, len(anime.columns)):
        if anime.columns[j] in genre:
            anime.loc[i, anime.columns[j]] = 1
        else:
            anime.loc[i, anime.columns[j]] = 0

anime.drop(columns= ['genre'])
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,genre_Drama,genre_Romance,genre_School,...,genre_Shounen Ai,genre_Game,genre_Dementia,genre_Harem,genre_Cars,genre_Kids,genre_Shoujo Ai,genre_Hentai,genre_Yaoi,genre_Yuri
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [196]:
print(len(anime.columns))
anime.columns

50


Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members',
       'genre_Drama', 'genre_Romance', 'genre_School', 'genre_Supernatural',
       'genre_Action', 'genre_Adventure', 'genre_Fantasy', 'genre_Magic',
       'genre_Military', 'genre_Shounen', 'genre_Comedy', 'genre_Historical',
       'genre_Parody', 'genre_Samurai', 'genre_Sci-Fi', 'genre_Thriller',
       'genre_Sports', 'genre_Super Power', 'genre_Space',
       'genre_Slice of Life', 'genre_Mecha', 'genre_Music', 'genre_Mystery',
       'genre_Seinen', 'genre_Martial Arts', 'genre_Vampire', 'genre_Shoujo',
       'genre_Horror', 'genre_Police', 'genre_Psychological', 'genre_Demons',
       'genre_Ecchi', 'genre_Josei', 'genre_Shounen Ai', 'genre_Game',
       'genre_Dementia', 'genre_Harem', 'genre_Cars', 'genre_Kids',
       'genre_Shoujo Ai', 'genre_Hentai', 'genre_Yaoi', 'genre_Yuri'],
      dtype='object')

In [198]:
#Cleaning anime 'name' variable   ######## --?????

anime.loc[anime['name'] == 'Gintama',:]
anime.loc[anime['name'].str.contains('Shingeki no', regex=False),['anime_id', 'name', 'type']]

Unnamed: 0,anime_id,name,type
86,16498,Shingeki no Kyojin,TV
125,25781,Shingeki no Kyojin: Kuinaki Sentaku,OVA
717,18397,Shingeki no Kyojin OVA,OVA
920,21843,Shingeki no Bahamut: Genesis,TV
961,23777,Shingeki no Kyojin Movie 2: Jiyuu no Tsubasa,Movie
1069,23775,Shingeki no Kyojin Movie 1: Guren no Yumiya,Movie
3533,19285,Shingeki no Kyojin: Ano Hi Kara,Special
4090,19391,Shingeki no Kyojin Picture Drama,Special
6221,28447,Shingeki no Bahamut: Genesis Recap,Special
10899,25777,Shingeki no Kyojin Season 2,TV


##### 1 - Anime Ratings Dataset

In [199]:
ratings = pd.read_csv('data/rating.csv', encoding= 'utf-8')
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [200]:
#Remove ratings = '-1'  (meaning that the anime was not viewed yet)
ratings = ratings.loc[ratings['rating'] != -1,:]
ratings.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10


In [201]:
#Exploring Dataset

len(ratings) #6.33 millones (without 'rating: -1')
len(ratings['user_id'].unique()) #69,600 active users 
ratings.groupby('user_id')['rating'].mean().mean() #Average rating for anime is 8.22
ratings.groupby('user_id')['rating'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4,10.000000,0.000000
2,1,10.000000,
3,92,7.565217,1.549933
5,459,4.355120,2.381293
7,343,7.387755,1.104786
...,...,...,...
73512,12,8.583333,1.564279
73513,33,7.515152,1.227834
73514,1,10.000000,
73515,179,8.547486,1.137577
