In [16]:
import pandas as pd

In [17]:
# Column groups by intended dtype. Only category_cols is applied in this cell;
# int_cols and float_cols are just declared here.
category_cols = ['Rating', 'Type', 'Status', 'Source']
int_cols = ['Rank', 'Episodes', 'Scored By', 'Popularity', 'Favorites', 'Members']
float_cols = ['Score']

# Map every categorical column name to the pandas 'category' dtype.
types = dict.fromkeys(category_cols, 'category')

# Load the raw CSV and cast the categorical columns in one step.
df = pd.read_csv('anime-dataset-2023.csv').astype(types)
df.dtypes

anime_id           int64
Name              object
English name      object
Other name        object
Score             object
Genres            object
Synopsis          object
Type            category
Episodes          object
Aired             object
Premiered         object
Status          category
Producers         object
Licensors         object
Studios           object
Source          category
Duration          object
Rating          category
Rank              object
Popularity         int64
Favorites          int64
Scored By         object
Members            int64
Image URL         object
dtype: object

In [18]:
# Print each categorical column's name followed by the category labels
# pandas holds for it.
for name in category_cols:
    print(name, df[name].cat.categories, sep='\n')

Rating
Index(['G - All Ages', 'PG - Children', 'PG-13 - Teens 13 or older',
       'R - 17+ (violence & profanity)', 'R+ - Mild Nudity', 'Rx - Hentai',
       'UNKNOWN'],
      dtype='object')
Type
Index(['Movie', 'Music', 'ONA', 'OVA', 'Special', 'TV', 'UNKNOWN'], dtype='object')
Status
Index(['Currently Airing', 'Finished Airing', 'Not yet aired'], dtype='object')
Source
Index(['4-koma manga', 'Book', 'Card game', 'Game', 'Light novel', 'Manga',
       'Mixed media', 'Music', 'Novel', 'Original', 'Other', 'Picture book',
       'Radio', 'Unknown', 'Visual novel', 'Web manga', 'Web novel'],
      dtype='object')


## Clean Numeric Columns

In [19]:
# Pair every numeric column with the dtype it should end up as.
numeric_targets = (
    [(name, 'int64') for name in int_cols]
    + [(name, 'float64') for name in float_cols]
)

# Values that cannot be parsed as numbers become NaN (errors='coerce') and are
# then replaced by the sentinel -1 so the column can be cast to a plain
# numeric dtype.
for col, target in numeric_targets:
    parsed = pd.to_numeric(df[col], errors='coerce')
    df[col] = parsed.fillna(-1).astype(target)

print(len(df.index))
df.dtypes

24905


anime_id           int64
Name              object
English name      object
Other name        object
Score            float64
Genres            object
Synopsis          object
Type            category
Episodes           int64
Aired             object
Premiered         object
Status          category
Producers         object
Licensors         object
Studios           object
Source          category
Duration          object
Rating          category
Rank               int64
Popularity         int64
Favorites          int64
Scored By          int64
Members            int64
Image URL         object
dtype: object

In [20]:
# Keep only the rows in which EVERY numeric column (int_cols and float_cols)
# is strictly positive. Note this removes rows holding a negative value and
# also rows whose value is exactly 0 in any of these columns.
numeric_cols = int_cols + float_cols
all_positive = (df[numeric_cols] > 0).all(axis=1)
df = df[all_positive]
print(len(df.index))

df.head()

10619


Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41,43,78525,914193,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1,"Sep 1, 2001",...,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189,602,1448,206248,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26,"Apr 1, 1998 to Sep 30, 1998",...,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328,246,15035,356739,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26,"Jul 3, 2002 to Dec 25, 2002",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764,1795,613,42829,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52,"Sep 30, 2004 to Sep 29, 2005",...,Toei Animation,Manga,23 min per ep,PG - Children,4240,5126,14,6413,15001,https://cdn.myanimelist.net/images/anime/7/215...


### Clean Premiered column

In [21]:
# Work on an explicit copy: `df` may be a boolean-filtered view of an earlier
# frame, and adding columns to such a frame can raise SettingWithCopyWarning.
df = df.copy()

# Split 'Premiered' on the first space into a season part and a year part.
# NOTE(review): values are expected to look like "<season> <year>" or the
# placeholder "UNKNOWN" (no space, so its year part is missing) - confirm.
df['Premiered'] = df['Premiered'].astype('string')
df[['season','year']] = df['Premiered'].str.split(' ',n=1,expand=True)

# create categorical data based on the order of seasons
dtype = pd.CategoricalDtype(['winter', 'spring', 'summer', 'fall', 'UNKNOWN'], ordered=True)

# replace categories with their integer codes:
# winter=0, spring=1, summer=2, fall=3, UNKNOWN=4, anything else/missing=-1
df['season'] = df['season'].astype(dtype).cat.codes

# Keep only real seasons (codes 0-3). A plain `< 4` test would remove UNKNOWN
# but still let code -1 (missing or unrecognised season text) through, which
# would then break the integer cast of 'year' or yield a wrong Premier_Month.
df = df[df['season'].between(0, 3)].copy()
print(len(df.index))

# create new column: months elapsed since January 1900, using the first month
# of the season (winter -> +0, spring -> +3, summer -> +6, fall -> +9)
df['year'] = df['year'].astype(int)
df['Premier_Month'] = (df['year']-1900)*12 + df['season']*3

4281


In [22]:
# Remove the 'Premiered' and 'year' columns, then preview the result.
df = df.drop(['Premiered', 'year'], axis=1)
df.head()

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL,season,Premier_Month
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26,"Apr 3, 1998 to Apr 24, 1999",...,24 min per ep,R - 17+ (violence & profanity),41,43,78525,914193,1771505,https://cdn.myanimelist.net/images/anime/4/196...,1,1179
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26,"Apr 1, 1998 to Sep 30, 1998",...,24 min per ep,PG-13 - Teens 13 or older,328,246,15035,356739,727252,https://cdn.myanimelist.net/images/anime/7/203...,1,1179
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26,"Jul 3, 2002 to Dec 25, 2002",...,25 min per ep,PG-13 - Teens 13 or older,2764,1795,613,42829,111931,https://cdn.myanimelist.net/images/anime/10/19...,2,1230
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52,"Sep 30, 2004 to Sep 29, 2005",...,23 min per ep,PG - Children,4240,5126,14,6413,15001,https://cdn.myanimelist.net/images/anime/7/215...,3,1257
5,15,Eyeshield 21,UNKNOWN,アイシールド21,7.92,Sports,"Shy, reserved, and small-statured, Deimon High...",TV,145,"Apr 6, 2005 to Mar 19, 2008",...,23 min per ep,PG-13 - Teens 13 or older,688,1252,1997,86524,177688,https://cdn.myanimelist.net/images/anime/1079/...,1,1263


## Clean Genres Column

In [23]:
# Genre labels to turn into one boolean flag column each.
genres = ['Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love', 'Comedy', 'Drama', 'Fantasy', 'Girls Love', 'Gourmet', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports', 'Supernatural', 'Suspense', 'Ecchi']

df['Genres'] = df['Genres'].astype('string')
for genre in genres:
    # regex=False: treat the genre name as a literal substring rather than a
    # regular expression, so a name containing a regex metacharacter can never
    # be misinterpreted. This is still a substring test, so it relies on no
    # genre name being contained in another.
    df[genre] = df['Genres'].str.contains(genre, regex=False)
df.head()

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Gourmet,Horror,Mystery,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Suspense,Ecchi
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26,"Apr 3, 1998 to Apr 24, 1999",...,False,False,False,False,True,False,False,False,False,False
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26,"Apr 1, 1998 to Sep 30, 1998",...,False,False,False,False,True,False,False,False,False,False
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26,"Jul 3, 2002 to Dec 25, 2002",...,False,False,True,False,False,False,False,True,False,False
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52,"Sep 30, 2004 to Sep 29, 2005",...,False,False,False,False,False,False,False,True,False,False
5,15,Eyeshield 21,UNKNOWN,アイシールド21,7.92,Sports,"Shy, reserved, and small-statured, Deimon High...",TV,145,"Apr 6, 2005 to Mar 19, 2008",...,False,False,False,False,False,False,True,False,False,False


In [24]:
# Persist the cleaned frame. The row index is written too (index=True is the
# pandas default), so it appears as the first, unnamed column of the CSV.
df.to_csv('cleaned_data.csv', index=True)