# Anime Dataset Classification (ratings 1 to 10)

In [40]:
import pandas as pd

In [41]:
df = pd.read_csv('mal_anime.csv')
df.shape

(19931, 25)

# Preprocessing 
- Removing extra appearance of url.*: and id.*: --> pandas has done this task by removing 11 rows while loading
- Covert duration column to minutes 
- Remove # from two columns
- Genre, themes and producers :: remove comma and do OHE

In [42]:
df.head(1895).tail(5)

Unnamed: 0,myanimelist_id,title,description,image,Type,Episodes,Status,Premiered,Released_Season,Released_Year,...,Demographic,Duration,Rating,Score,Ranked,Popularity,Members,Favorites,characters,source_url
1890,2083,Natsuki Crisis,Natsuki Kisumi has a passion for wrestling and...,https://cdn.myanimelist.net/images/anime/1008/...,OVA,2,Finished Airing,,,,...,,29 min. per ep.,PG-13 - Teens 13 or older,6.19,#9168,#9860,4173,1,"[{""id"": 12057, ""name"": ""Kandori, Akira"", ""url""...",https://myanimelist.net/anime/2083/Natsuki_Crisis
1891,2084,NG Knight Ramune & 40,Lamune is an ordinary 4th grade boy who loves ...,https://cdn.myanimelist.net/images/anime/2/467...,TV,38,Finished Airing,Spring 1990,Spring,1990.0,...,,22 min. per ep.,PG-13 - Teens 13 or older,7.02,#4458,#9242,5080,24,"[{""id"": 70087, ""name"": ""Arara, Cocoa"", ""url"": ...",https://myanimelist.net/anime/2084/NG_Knight_R...
1892,2085,NG Knight Ramune & 40 EX: Biku Biku Triangle A...,Lamune has returned to Earth with no memory of...,https://cdn.myanimelist.net/images/anime/7/227...,OVA,3,Finished Airing,,,,...,,30 min. per ep.,PG-13 - Teens 13 or older,6.39,#8012,#12600,1862,2,"[{""id"": 70087, ""name"": ""Arara, Cocoa"", ""url"": ...",https://myanimelist.net/anime/2085/NG_Knight_R...
1893,2086,NG Knight Ramune & 40 DX: Wakuwaku Jikuu - Hon...,Lamune and Da Cider head off to Puff-Puff Pala...,https://cdn.myanimelist.net/images/anime/8/215...,OVA,3,Finished Airing,,,,...,,30 min. per ep.,PG-13 - Teens 13 or older,6.41,#7915,#12626,1842,3,"[{""id"": 70087, ""name"": ""Arara, Cocoa"", ""url"": ...",https://myanimelist.net/anime/2086/NG_Knight_R...
1894,2087,VS Knight Lamune & 40 Fire,Baba Lamunade is followed home by two strange ...,https://cdn.myanimelist.net/images/anime/10/43...,TV,26,Finished Airing,Spring 1996,Spring,1996.0,...,,23 min. per ep.,PG-13 - Teens 13 or older,6.91,#4955,#9862,4171,16,"[{""id"": 76822, ""name"": ""Baba, Lemonade"", ""url""...",https://myanimelist.net/anime/2087/VS_Knight_L...


In [43]:
import numpy as np
def hrminsec(str_val):
    '''handle patterns like :: 
    1hr. 43min. 33 sec. per ep.
    Anything can be absent / present '''

    if pd.isna(str_val):
        return np.nan
    if str_val == " ":
        return np.nan
    l_val = str_val.split('.')
    sec_val = 0
    for e in l_val:
        if "hr" in e:
            v = e.split(" ")
            val = int(v[0])*60*60
        elif "min" in e:
            v = e.strip().split(" ")
            val = int(v[0])*60
        elif "sec" in e:
            v = e.strip().split(" ")
            val = int(v[0])
        else:
            val = 0
        sec_val += val
    return sec_val

In [44]:
df['duration_sec'] = df['Duration'].apply(hrminsec)

In [45]:
df['duration_sec'].describe()

count    19549.000000
mean      1574.167323
std       1587.085827
min          0.000000
25%        420.000000
50%       1380.000000
75%       1620.000000
max      10080.000000
Name: duration_sec, dtype: float64

In [46]:
(df['duration_sec'] == 0).sum()

np.int64(624)

# Remove '#'

In [47]:
df['Ranked'] = df['Ranked'].str.replace("#","" )

In [48]:
df['Ranked'].iloc[:5]

0      48
1     232
2     385
3    3344
4    4887
Name: Ranked, dtype: object

In [49]:
df['Popularity'] = df['Popularity'].str.replace("#", "")

In [50]:
df['Popularity'].iloc[:5]

0      42
1     649
2     265
3    1979
4    5765
Name: Popularity, dtype: object

# Genre Themes and Studios --> remove, and do OHE

In [51]:
df.shape

(19931, 26)

In [52]:
ndf = df['Genres'].str.get_dummies(sep=",")
df = df.join(ndf)
df.shape

(19931, 67)

In [53]:
ndf = df['Themes'].str.get_dummies(sep=",")
df = df.join(ndf)
df.shape

(19931, 170)

In [54]:
ndf = df['Producers'].str.strip().str.get_dummies(sep=",")
df = df.join(ndf, lsuffix='l')
df.shape

(19931, 2370)

In [55]:
df['Score'].isna().sum()

np.int64(4692)

In [57]:
df = df[df['Score'].isna() == False]
df.shape

(15239, 2370)

## Convert target column to categotical 

In [60]:
df['Score'] = np.round(df['Score'])

In [61]:
df['Score'].value_counts(normalize=True)

Score
6.0    0.382965
7.0    0.373384
8.0    0.126583
5.0    0.098300
9.0    0.009843
4.0    0.006956
3.0    0.001706
2.0    0.000262
Name: proportion, dtype: float64

## Save the clean file

In [62]:
df.to_csv("clean.csv", index=False)