In [2]:
import pandas as pd
import numpy as np
import html

In [56]:
# Load the anime catalogue and preview its first rows.
anime_csv_path = './anime-recommendations-database/anime.csv'
anime = pd.read_csv(anime_csv_path)
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


### Genres

In [64]:
# Fill genre with NA (Source: https://www.kaggle.com/deepakputhraya/exploratory-analysis)
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: with pandas Copy-on-Write the in-place call only touches
# a temporary object and leaves the frame unchanged.
anime['genre'] = anime['genre'].fillna('NA')

# Get the set of all distinct genres; each row stores them as one ', '-separated string
genres = {genre for entry in anime['genre'] for genre in entry.split(', ')}

print('Number of genres', len(genres))
genres

Number of genres 44


{'Action',
 'Adventure',
 'Cars',
 'Comedy',
 'Dementia',
 'Demons',
 'Drama',
 'Ecchi',
 'Fantasy',
 'Game',
 'Harem',
 'Hentai',
 'Historical',
 'Horror',
 'Josei',
 'Kids',
 'Magic',
 'Martial Arts',
 'Mecha',
 'Military',
 'Music',
 'Mystery',
 'NA',
 'Parody',
 'Police',
 'Psychological',
 'Romance',
 'Samurai',
 'School',
 'Sci-Fi',
 'Seinen',
 'Shoujo',
 'Shoujo Ai',
 'Shounen',
 'Shounen Ai',
 'Slice of Life',
 'Space',
 'Sports',
 'Super Power',
 'Supernatural',
 'Thriller',
 'Vampire',
 'Yaoi',
 'Yuri'}

In [66]:
# Normalize genres (Source: https://www.kaggle.com/xthunder94/category-visualization)
# Collapse closely related genres into a single label.
# The replacement is applied to the `genre` column only: a frame-wide regex
# replace would also rewrite any title that happens to contain one of these
# words (e.g. a name containing "Yuri" or "Demons").
genre_aliases = {
    'Harem': 'Hentai',
    'Ecchi': 'Hentai',
    'Shoujo Ai': 'Hentai',
    'Yaoi': 'Hentai',
    'Yuri': 'Hentai',
    'Shounen Ai': 'Hentai',

    'Demons': 'Vampire',

    'Supernatural': 'Magic',
    'Super Power': 'Magic',
    'Sci-Fi': 'Magic',
}

# regex=True keeps the substring semantics of the original calls, so a genre
# is replaced wherever it occurs inside the comma-separated genre string.
anime['genre'] = anime['genre'].replace(genre_aliases, regex=True)

### Title

In [42]:
# Decode HTML entities left in the titles, e.g. &#039; => '
anime['name'] = anime['name'].map(html.unescape)
anime.sort_values(['name']).head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
7749,20707,"""0""",Music,Music,1,5.06,1170
8059,25627,"""Aesop"" no Ohanashi yori: Ushi to Kaeru, Yokub...",Kids,Movie,1,5.0,113
3156,7669,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi","Comedy, Fantasy, School",OVA,1,7.06,14351
1436,8481,"""Bungaku Shoujo"" Memoire","Drama, Romance, School",OVA,3,7.54,18013
1199,6408,"""Bungaku Shoujo"" Movie","Drama, Mystery, Romance, School",Movie,1,7.63,40984


### Simple stats

#### Filling in missing episode data

In [43]:
# source: https://www.kaggle.com/xenocide/content-based-anime-recommender

# Rows whose episode count is still the 'Unknown' placeholder.
episodes_unknown = anime['episodes'] == 'Unknown'

# Rows whose genre is exactly 'Hentai', and all movies, get a single episode
# when their count is unknown.
for single_episode_rows in (anime['genre'] == 'Hentai', anime['type'] == 'Movie'):
    anime.loc[single_episode_rows & episodes_unknown, 'episodes'] = "1"

In [44]:
# Manually supplied episode counts, keyed by title.
# NOTE(review): the keys must match the `name` column exactly to have any
# effect - confirm the spelling/punctuation against the dataset.
known_animes = dict([
    ("Naruto Shippuuden", 500),
    ("Fairy Tail", 277),
    ("One Piece", 807),
    ("Detective Conan", 874),
    ("Dragon Ball Super", 86),
    ("Crayon Shin chan", 942),
    ("Yu Gi Oh Arc V", 148),
    ("Shingeki no Kyojin Season 2", 12),
    ("Boku no Hero Academia 2nd Season", 25),
    ("Little Witch Academia TV", 25),
])

In [45]:
# Overwrite the episode count of every title listed in known_animes.
for title in known_animes:
    title_rows = anime["name"] == title
    anime.loc[title_rows, "episodes"] = known_animes[title]

In [46]:
# Turn the remaining 'Unknown' placeholders into NaN
is_unknown = anime["episodes"] == "Unknown"
anime["episodes"] = anime["episodes"].mask(is_unknown, np.nan)

In [47]:
# Cast the numeric columns to float
for numeric_column in ("episodes", "rating"):
    anime[numeric_column] = anime[numeric_column].astype(float)

In [48]:
# movies already filled in, only use median of TV series
tv_episodes = anime.loc[anime['type'] == "TV", 'episodes']
ep_median = tv_episodes.median()
ep_mean = tv_episodes.mean()
print('Median episode number', ep_median)
print('Mean episode number', ep_mean)

# Fill the remaining gaps with the TV median. The result is assigned back
# instead of using fillna(inplace=True) on a column selection, which does not
# update the frame under pandas Copy-on-Write.
anime["episodes"] = anime["episodes"].fillna(ep_median)
anime.head()

Median episode number 24.0
Mean episode number 36.3974323193


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64.0,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24.0,9.17,673572
4,9969,Gintama',"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.16,151266


In [49]:
# Episode statistics (mean, median, row count) for each anime type
episodes_by_type = anime.groupby('type')['episodes']
episodes_by_type.agg(['mean', 'median', 'count'])

Unnamed: 0_level_0,mean,median,count
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Movie,1.102215,1.0,2348
Music,1.178279,1.0,488
ONA,8.072838,3.0,659
OVA,2.493506,2.0,3311
Special,2.625298,1.0,1676
TV,35.729601,24.0,3787


In [50]:
# Remove unnecessary types: keep only movies and TV series.
# .copy() makes the result an independent frame, so later column updates do
# not operate on a slice of the unfiltered data (SettingWithCopyWarning).
anime = anime[anime['type'].isin(('Movie', 'TV'))].copy()


## Ratings

In [52]:
# Fill missing ratings with the median rating.
# assign() builds a new frame with the filled column; calling
# fillna(inplace=True) on a column selection of a filtered frame triggers
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
anime = anime.assign(rating=anime['rating'].fillna(anime['rating'].median()))

# Users

## Ratings

In [3]:
# Load the per-user ratings and preview the first rows.
rating_csv_path = './anime-recommendations-database/rating.csv'
rating = pd.read_csv(rating_csv_path)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [9]:
# Size of the ratings table: distinct users, total rows, and rows carrying
# the -1 placeholder rating.
user_count = rating['user_id'].nunique()
unrated_count = int((rating['rating'] == -1).sum())

print('Users', user_count)
print('Ratings', len(rating))
print('Watching without rating', unrated_count)

Users 73515
Ratings 7813737
Watching without rating 1476496


In [8]:
# NOTE: this mean is taken over every row, including the -1 placeholder
# ratings counted above as "watching without rating".
mean_rating = rating['rating'].mean()
print('Mean rating: ', mean_rating)

Mean rating:  6.14402954694


In [None]:
# TODO: replace the -1 placeholder ratings (watched without rating) with the mean rating

In [80]:
# TODO: compute and plot the mean ratings for TV series and movies