In [1]:
import pandas as pd
import numpy as np


In [2]:
pd.options.display.float_format = '{:,.3f}'.format

In [4]:
# load anime dataset
anime_df = pd.read_csv("anime.csv")

anime_df.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1,72534
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11,81109


In [5]:
# number of rows and columns in the dataframe
anime_df.shape

(12294, 7)

In [6]:
# are the columns using suitable datatypes
anime_df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [7]:
# replace animes where the number of episodes are unknown into nan and then convert everything to float
anime_df["episodes"].replace({"Unknown": "nan", "unknown": "nan"}, inplace=True)
anime_df["episodes"] = anime_df["episodes"].astype("float")
anime_df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes    float64
rating      float64
members       int64
dtype: object

In [8]:
# Check which rows have missing values
anime_df.isnull().any()

anime_id    False
name        False
genre        True
type         True
episodes     True
rating       True
members     False
dtype: bool

In [9]:
anime_df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes    340
rating      230
members       0
dtype: int64

In [10]:
# remove rows where the film is not classified as "TV"
anime_df = anime_df[anime_df["type"] == "TV"]
anime_df.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64.0,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24.0,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10.0,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148.0,9.13,425855
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13.0,9.11,81109
10,4181,Clannad: After Story,"Drama, Fantasy, Romance, Slice of Life, Supern...",TV,24.0,9.06,456749
12,918,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,201.0,9.04,336376
13,2904,Code Geass: Hangyaku no Lelouch R2,"Action, Drama, Mecha, Military, Sci-Fi, Super ...",TV,25.0,8.98,572888


In [11]:
anime_df.isnull().sum()

anime_id      0
name          0
genre        10
type          0
episodes    209
rating      116
members       0
dtype: int64

Ratings Dataset

In [12]:
# load ratings dataset
rating_df = pd.read_csv("rating.csv")

rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [13]:
rating_df.tail()

Unnamed: 0,user_id,anime_id,rating
1249987,11801,11757,10
1249988,11801,13357,8
1249989,11801,15437,9
1249990,11802,330,10
1249991,11802,523,10


In [14]:
rating_df.shape

(1249992, 3)

In [15]:
rating_df.dtypes

user_id     int64
anime_id    int64
rating      int64
dtype: object

In [16]:
rating_df.isnull().any()

user_id     False
anime_id    False
rating      False
dtype: bool

In [17]:
rating_df["rating"].replace({-1: np.nan}, inplace=True)
values = rating_df["rating"].unique()
values.sort
print(values)

[nan 10.  8.  6.  9.  7.  3.  5.  4.  1.  2.]


In [19]:
anime_df.to_csv("cleaned_anime.csv", index=False)
rating_df.to_csv("cleaned_rating.csv", index=False)