In [1]:
# Import libraries
import pandas as pd

# Import ML libraries
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the data
raw_anime_df = pd.read_csv('raw_resources/anime.csv', encoding='utf-8')
raw_anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## General Data Cleaning (Shared for both Machine Learning and Tableau)

In [3]:
# Check the dataframe 
print(raw_anime_df.shape)
raw_anime_df.info()

(12294, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
# Observations: 
# 1. Null values in genre, type, and rating columns
# 2. name column is object type, should be string
# 3. genre column is object type, should be list of strings
# 4. genre column has Null values
# 5. type column is object type, should be string
# 6. genre column has Null values
# 7. episodes column is object type, should be int

#### Look at null values

In [5]:
# Look at null values in the dataframe
raw_anime_df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [6]:
# Convert null values to percentages to understand the impact of dropping them
(raw_anime_df.isnull().sum() / 12294 * 100).apply(lambda x: f'{x:.3f}%')

anime_id    0.000%
name        0.000%
genre       0.504%
type        0.203%
episodes    0.000%
rating      1.871%
members     0.000%
dtype: object

In [7]:
# Since the impact of dropping null values is less than 1% in genre and type columns, we can drop all rows with null values
anime_df = raw_anime_df.dropna(subset=['genre', 'type'])

In [8]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12210 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12210 non-null  int64  
 1   name      12210 non-null  object 
 2   genre     12210 non-null  object 
 3   type      12210 non-null  object 
 4   episodes  12210 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12210 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 763.1+ KB


In [9]:
# Since the impact of dropping null valuesin rating column is greater than 1%, we will fill the null values with the mean of the column
# Also, rating column is an important feature for our analysis, so we should not drop the column
anime_df.loc[anime_df['rating'].isnull(), 'rating'] = anime_df['rating'].mean()

In [10]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [11]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12210 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12210 non-null  int64  
 1   name      12210 non-null  object 
 2   genre     12210 non-null  object 
 3   type      12210 non-null  object 
 4   episodes  12210 non-null  object 
 5   rating    12210 non-null  float64
 6   members   12210 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 763.1+ KB


#### Check all columns individually to look for data cleaning

In [12]:
# Check name column
anime_df['name'].value_counts()

name
Shi Wan Ge Leng Xiaohua                     2
Saru Kani Gassen                            2
Kimi no Na wa.                              1
Arigatou Kumanofu                           1
Araiguma Rascal Specials                    1
                                           ..
Mahou Shoujo Lyrical Nanoha ViVid           1
Mini Van Special                            1
Mobile Suit Gundam ZZ                       1
Mokei Senshi Gunpla Builders Beginning G    1
Yasuji no Pornorama: Yacchimae!!            1
Name: count, Length: 12208, dtype: int64

In [13]:
# Check for special characters in name column
anime_df.loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
41,32366,Gintama°: Aizome Kaori-hen,"Comedy, Parody",OVA,2,8.69,16947
59,11577,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,"Sci-Fi, Thriller",Movie,1,8.61,192424
96,9756,Mahou Shoujo Madoka★Magica,"Drama, Magic, Psychological, Thriller",TV,12,8.51,462974
102,11981,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,"Drama, Magic, Psychological, Thriller",Movie,1,8.50,135735
...,...,...,...,...,...,...,...
12106,4034,Sailor Senshi Venus♥Five,"Hentai, Parody, Super Power",OVA,2,5.53,909
12170,4818,Houkago Renai Club: Koi no Étude,Hentai,OVA,2,5.39,605
12179,13917,Star☆Jewel Gaiden: Natsumi Oblivion,"Hentai, Yuri",OVA,1,5.35,883
12232,3541,Kouin Tenshi: Haitoku no Lycéenne,Hentai,OVA,1,4.99,652


In [14]:
# Replace hearts and stars with a space
anime_df.loc[anime_df['name'].str.contains('♥'), 'name'] = anime_df['name'].str.replace('♥', ' ')
anime_df.loc[anime_df['name'].str.contains('★'), 'name'] = anime_df['name'].str.replace('★', ' ')
anime_df.loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
41,32366,Gintama°: Aizome Kaori-hen,"Comedy, Parody",OVA,2,8.69,16947
59,11577,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,"Sci-Fi, Thriller",Movie,1,8.61,192424
117,572,Kaze no Tani no Nausicaä,"Adventure, Fantasy",Movie,1,8.47,143273
120,392,Yuu☆Yuu☆Hakusho,"Action, Comedy, Demons, Fantasy, Martial Arts,...",TV,112,8.47,195017
...,...,...,...,...,...,...,...
12044,3383,Orchid☆Emblem,Hentai,OVA,1,5.68,646
12170,4818,Houkago Renai Club: Koi no Étude,Hentai,OVA,2,5.39,605
12179,13917,Star☆Jewel Gaiden: Natsumi Oblivion,"Hentai, Yuri",OVA,1,5.35,883
12232,3541,Kouin Tenshi: Haitoku no Lycéenne,Hentai,OVA,1,4.99,652


In [15]:
# Replace other special characters with a space
anime_df.loc[anime_df['name'].str.contains('☆'), 'name'] = anime_df['name'].str.replace('☆', ' ')
anime_df.loc[anime_df['name'].str.contains('°'), 'name'] = anime_df['name'].str.replace('°', ' ')
anime_df.loc[anime_df['name'].str.contains('²'), 'name'] = anime_df['name'].str.replace('²', ' ')
anime_df.loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
59,11577,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,"Sci-Fi, Thriller",Movie,1,8.61,192424
117,572,Kaze no Tani no Nausicaä,"Adventure, Fantasy",Movie,1,8.47,143273
232,33255,Saiki Kusuo no Ψ-nan (TV),"Comedy, School, Shounen, Supernatural",TV,120,8.29,47092
235,25681,Kamisama Hajimemashita◎,"Comedy, Demons, Fantasy, Romance, Shoujo, Supe...",TV,12,8.28,91796
451,6586,Yume-iro Pâtissière,"Kids, School, Shoujo",TV,50,8.07,36921
...,...,...,...,...,...,...,...
11305,2438,Hitozuma♪Kasumi-san,Hentai,OVA,2,6.84,3782
11500,21059,Oppai Infinity∞! The Animation,Hentai,OVA,1,6.51,1885
11756,4011,Binetsukko ♭37℃ The Animation,"Harem, Hentai, School",OVA,2,6.14,1925
12170,4818,Houkago Renai Club: Koi no Étude,Hentai,OVA,2,5.39,605


In [16]:
# Replace other special characters with a space
anime_df.loc[anime_df['name'].str.contains('Ψ'), 'name'] = anime_df['name'].str.replace('Ψ', ' ')
anime_df.loc[anime_df['name'].str.contains('◎'), 'name'] = anime_df['name'].str.replace('◎', ' ')
anime_df.loc[anime_df['name'].str.contains('♪'), 'name'] = anime_df['name'].str.replace('♪', ' ')
anime_df.loc[anime_df['name'].str.contains(';'), 'name'] = anime_df['name'].str.replace(';', ' ')
anime_df.loc[anime_df['name'].str.contains('∞'), 'name'] = anime_df['name'].str.replace('∞', ' ')
anime_df.loc[anime_df['name'].str.contains('♭'), 'name'] = anime_df['name'].str.replace('♭', ' ')
anime_df.loc[anime_df['name'].str.contains('∞'), 'name'] = anime_df['name'].str.replace('∞', ' ')
anime_df.loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
59,11577,Steins Gate Movie: Fuka Ryouiki no Déjà vu,"Sci-Fi, Thriller",Movie,1,8.610000,192424
117,572,Kaze no Tani no Nausicaä,"Adventure, Fantasy",Movie,1,8.470000,143273
451,6586,Yume-iro Pâtissière,"Kids, School, Shoujo",TV,50,8.070000,36921
539,1695,Les Misérables: Shoujo Cosette,"Drama, Historical, Shoujo, Slice of Life",TV,52,7.990000,9605
733,210,Ranma ½,"Comedy, Fantasy, Martial Arts, Slice of Life",TV,161,7.870000,105212
...,...,...,...,...,...,...,...
10922,30485,ChäoS Child,"Harem, Mystery, Psychological, Sci-Fi, Superna...",TV,Unknown,6.478264,19590
10944,32878,ēlDLIVE,"Action, Sci-Fi, Space",TV,Unknown,6.478264,6943
11756,4011,Binetsukko 37℃ The Animation,"Harem, Hentai, School",OVA,2,6.140000,1925
12170,4818,Houkago Renai Club: Koi no Étude,Hentai,OVA,2,5.390000,605


In [17]:
# Replace other special characters with a space
anime_df.loc[anime_df['name'].str.contains('é'), 'name'] = anime_df['name'].str.replace('é', 'e')
anime_df.loc[anime_df['name'].str.contains('à'), 'name'] = anime_df['name'].str.replace('à', 'a')
anime_df.loc[anime_df['name'].str.contains('ä'), 'name'] = anime_df['name'].str.replace('ä', 'a')
anime_df.loc[anime_df['name'].str.contains('â'), 'name'] = anime_df['name'].str.replace('â', 'a')
anime_df.loc[anime_df['name'].str.contains('è'), 'name'] = anime_df['name'].str.replace('è', 'e')
anime_df.loc[anime_df['name'].str.contains('ē'), 'name'] = anime_df['name'].str.replace('ē', 'e')
anime_df.loc[anime_df['name'].str.contains('℃'), 'name'] = anime_df['name'].str.replace('℃', 'C')
anime_df.loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
733,210,Ranma ½,"Comedy, Fantasy, Martial Arts, Slice of Life",TV,161,7.87,105212
734,1007,Ranma ½ OVA,"Comedy, Martial Arts, Romance, Shounen",OVA,6,7.87,16804
801,1719,Rozen Maiden: Ouvertüre,"Action, Comedy, Drama, Magic, Seinen",Special,2,7.83,44388
876,1011,Ranma ½ Super,"Adventure, Comedy, Martial Arts, Romance, Shou...",OVA,3,7.79,9936
901,1008,Ranma ½ Specials,"Comedy, Drama, Romance, Shounen",OVA,2,7.78,11084
...,...,...,...,...,...,...,...
8135,20237,Anime Document: München e no Michi,Sports,TV,16,6.50,83
8185,29708,Üks Uks,Dementia,Movie,1,6.17,66
9124,31605,Kana Kana Kazoku: Kakusan Mare Bo ! 1-Wa-5-wa ...,Comedy,ONA,1,5.11,44
10033,32148,PS3® no Tsukai Kata: feat.Peeping Life,"Comedy, Slice of Life",OVA,6,5.08,93


In [18]:
# Replace other special characters with a space
anime_df.loc[anime_df['name'].str.contains('ü'), 'name'] = anime_df['name'].str.replace('ü', 'u')
anime_df.loc[anime_df['name'].str.contains('Ü'), 'name'] = anime_df['name'].str.replace('Ü', 'U')
anime_df.loc[anime_df['name'].str.contains('½'), 'name'] = anime_df['name'].str.replace('½', '1/2')
anime_df.loc[anime_df['name'].str.contains('®'), 'name'] = anime_df['name'].str.replace('®', ' ')
anime_df.loc[anime_df['name'].str.contains('É'), 'name'] = anime_df['name'].str.replace('É', 'E')
anime_df['name'].loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

1518                                       Tokyo Ghoul √A
1636                                         Re:␣Hamatora
1868                                                  MÄR
2155                                            Macross Δ
2717                                          Maria†Holic
2856                     Shin Koihime†Musou: Otome Tairan
2955    Love Live! School Idol Project: μ&#039 s →NEXT...
3175                                    Maria†Holic Alive
3357                                       Weiß Kreuz OVA
3507                                   Shin Koihime†Musou
3699                            Maria†Holic Alive Special
3708                 Shin Koihime†Musou: Otome Tairan OVA
3721                                           C³ Special
3788                                                   C³
3804                                        Koihime†Musou
3834    Love Live! School Idol Project: μ&#039 s →NEXT...
3880    Puchimas!!: Petit Petit iDOLM@STER - Fuyu→Kota...
3957          

In [19]:
# Replace other special characters with a space
anime_df.loc[anime_df['name'].str.contains(' √A'), 'name'] = anime_df['name'].str.replace(' √A', '')
anime_df.loc[anime_df['name'].str.contains('␣'), 'name'] = anime_df['name'].str.replace('␣', ' ')
anime_df.loc[anime_df['name'].str.contains('Ä'), 'name'] = anime_df['name'].str.replace('Ä', 'A')
anime_df.loc[anime_df['name'].str.contains('Δ'), 'name'] = anime_df['name'].str.replace('Δ', 'Delta')
anime_df.loc[anime_df['name'].str.contains('†'), 'name'] = anime_df['name'].str.replace('†', ' ')
anime_df.loc[anime_df['name'].str.contains('ß'), 'name'] = anime_df['name'].str.replace('ß', 'B')
anime_df.loc[anime_df['name'].str.contains('³'), 'name'] = anime_df['name'].str.replace('³', '3')
anime_df.loc[anime_df['name'].str.contains('ö'), 'name'] = anime_df['name'].str.replace('ö', 'o')
anime_df.loc[anime_df['name'].str.contains('＊'), 'name'] = anime_df['name'].str.replace('＊', ' ')
anime_df.loc[anime_df['name'].str.contains('♡'), 'name'] = anime_df['name'].str.replace('♡', ' ')
anime_df.loc[anime_df['name'].str.contains('š'), 'name'] = anime_df['name'].str.replace('š', 's')
anime_df.loc[anime_df['name'].str.contains('“'), 'name'] = anime_df['name'].str.replace('“', '')
anime_df.loc[anime_df['name'].str.contains('”'), 'name'] = anime_df['name'].str.replace('”', '')
anime_df.loc[anime_df['name'].str.contains('ă'), 'name'] = anime_df['name'].str.replace('ă', 'a')
anime_df['name'].loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

2955    Love Live! School Idol Project: μ&#039 s →NEXT...
3834    Love Live! School Idol Project: μ&#039 s →NEXT...
3880    Puchimas!!: Petit Petit iDOLM@STER - Fuyu→Kota...
4954    Monster Musume no Iru Nichijou: Hobo Mainichi ...
7993                                                    ◯
9124    Kana Kana Kazoku: Kakusan Mare Bo ! 1-Wa-5-wa ...
Name: name, dtype: object

In [20]:
anime_df.loc[anime_df['name'].str.contains('→'), 'name'] = anime_df['name'].str.replace('→', '->')
anime_df.loc[anime_df['name'].str.contains('@'), 'name'] = anime_df['name'].str.replace('@', 'A')
anime_df.loc[anime_df['name'].str.contains('μ'), 'name'] = anime_df['name'].str.replace('μ', 'u')
anime_df.loc[anime_df['name'].str.contains('&#039 '), 'name'] = anime_df['name'].str.replace('&#039 ', "'")
anime_df['name'].loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

3880    Puchimas!!: Petit Petit iDOLMASTER - Fuyu->Kot...
4954    Monster Musume no Iru Nichijou: Hobo Mainichi ...
7993                                                    ◯
9124    Kana Kana Kazoku: Kakusan Mare Bo ! 1-Wa-5-wa ...
Name: name, dtype: object

In [21]:
# Temporarily set the maximum column width to 200 to view the entire name
# Ref: https://stackoverflow.com/questions/69570667/temporarily-set-max-col-width-and-keep-pandas-styling-jupyter
with pd.option_context('display.max_colwidth', 400):
    print(anime_df.loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False), 'name'])

3880                   Puchimas!!: Petit Petit iDOLMASTER - Fuyu->Kotatsu←Haru
4954          Monster Musume no Iru Nichijou: Hobo Mainichi ◯◯! Namappoi Douga
7993                                                                         ◯
9124    Kana Kana Kazoku: Kakusan Mare Bo ! 1-Wa-5-wa oo Matome Koukai… Ka na?
Name: name, dtype: object


In [22]:
anime_df.loc[anime_df['name'].str.contains('←'), 'name'] = anime_df['name'].str.replace('←', '<-')
anime_df.loc[anime_df['name'].str.contains('◯'), 'name'] = anime_df['name'].str.replace('◯', 'O')
anime_df.loc[anime_df['name'].str.contains('…'), 'name'] = anime_df['name'].str.replace('…', '...')
anime_df['name'].loc[anime_df['name'].str.contains(r'[^\x00-\x7F]', na=False)]

Series([], Name: name, dtype: object)

In [23]:
# Check genre column
anime_df['genre'].value_counts()

genre
Hentai                                                            823
Comedy                                                            523
Music                                                             300
Kids                                                              199
Comedy, Slice of Life                                             179
                                                                 ... 
Action, Comedy, Drama, Mecha, Music, Sci-Fi, Shounen                1
Action, Comedy, Fantasy, Mecha, Sci-Fi, Shounen                     1
Adventure, Fantasy, Martial Arts, Sci-Fi, Shounen, Super Power      1
Comedy, Ecchi, Martial Arts, Romance                                1
Hentai, Slice of Life                                               1
Name: count, Length: 3260, dtype: int64

In [24]:
# For Tableau dashboard, we will leave genre column alone for now

In [25]:
# Check type column
anime_df['type'].value_counts()

type
TV         3777
OVA        3310
Movie      2306
Special    1674
ONA         655
Music       488
Name: count, dtype: int64

In [26]:
# Type column seem fine

In [27]:
# Check episodes column
anime_df['episodes'].value_counts()

episodes
1      5631
2      1076
12      814
13      572
26      514
       ... 
358       1
366       1
201       1
172       1
125       1
Name: count, Length: 187, dtype: int64

In [28]:
# Check the number of unique values in episodes column
anime_df['episodes'].nunique()

187

In [29]:
# Since there are only 187 unique values in episodes column and the datatype is object, check for non-numeric values
anime_df.loc[~anime_df['episodes'].str.isnumeric(), 'episodes']

74       Unknown
252      Unknown
615      Unknown
991      Unknown
1021     Unknown
          ...   
12265    Unknown
12274    Unknown
12280    Unknown
12282    Unknown
12285    Unknown
Name: episodes, Length: 307, dtype: object

In [30]:
# Count the number of non-numeric values in episodes column
non_numeric_episodes = anime_df.loc[~anime_df['episodes'].str.isnumeric(), 'episodes'].count()
print(f'The number of non-numeric values in episodes column is {non_numeric_episodes/12294*100:.3f}%')

The number of non-numeric values in episodes column is 2.497%


In [31]:
# Since only 2.5% of the episodes column has non-numeric values, we can drop them
anime_df = anime_df[anime_df['episodes'].str.isnumeric()]

In [32]:
# Convert episodes column to integer
anime_df['episodes'] = anime_df['episodes'].astype(int)
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11903 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  11903 non-null  int64  
 1   name      11903 non-null  object 
 2   genre     11903 non-null  object 
 3   type      11903 non-null  object 
 4   episodes  11903 non-null  int32  
 5   rating    11903 non-null  float64
 6   members   11903 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 697.4+ KB


In [33]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11903 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  11903 non-null  int64  
 1   name      11903 non-null  object 
 2   genre     11903 non-null  object 
 3   type      11903 non-null  object 
 4   episodes  11903 non-null  int32  
 5   rating    11903 non-null  float64
 6   members   11903 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 697.4+ KB


In [34]:
# Check rating column
anime_df['rating'].value_counts()

rating
6.000000    136
7.000000     96
6.500000     88
6.250000     83
6.478264     73
           ... 
3.710000      1
3.870000      1
3.910000      1
4.190000      1
3.140000      1
Name: count, Length: 595, dtype: int64

In [35]:
# Check for non-numeric values in rating column
anime_df.loc[~anime_df['rating'].apply(lambda x: str(x).replace('.', '').isnumeric()), 'rating']

Series([], Name: rating, dtype: float64)

In [36]:
# Since rating column does not have any non-numeric values, we can leave them as is

In [37]:
# Check members column
anime_df['members'].value_counts()

members
60       36
72       33
74       32
71       31
62       31
         ..
2578      1
2088      1
2102      1
23132     1
5771      1
Name: count, Length: 6601, dtype: int64

In [38]:
# Since members column has a type of int, it does not have any non-numeric values. We can leave it as is

In [39]:
# Check the dataframe
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11903 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  11903 non-null  int64  
 1   name      11903 non-null  object 
 2   genre     11903 non-null  object 
 3   type      11903 non-null  object 
 4   episodes  11903 non-null  int32  
 5   rating    11903 non-null  float64
 6   members   11903 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 697.4+ KB


In [40]:
# NOTE: By removing the 'Unknown' values from the episodes column, all rows without ratings were also removed

In [41]:
# Check the percentage of rows we have after cleaning in comparison to the original dataframe
print(f"The percentage of rows we have after cleaning is {anime_df.shape[0]/12294*100:.3f}%")
print(f"The percentage of rows lost after cleaning is {100-(anime_df.shape[0]/12294*100):.3f}%")

The percentage of rows we have after cleaning is 96.820%
The percentage of rows lost after cleaning is 3.180%


In [42]:
# Since we only lost 3.18% of the rows after cleaning, we can proceed with the cleaned dataframe

#### Save data for Tableau

In [43]:
# Save the cleaned dataframe to a csv file for Tableau dashboard
anime_df.to_csv('Resources/anime_tableau.csv', index=False, encoding='utf-8')

#### Prepare data for Machine Learning 

In [44]:
# Copy the cleaned dataframe to a new dataframe for Machine Learning model
anime_ml_df = anime_df.copy()
print(anime_ml_df.info())
anime_ml_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 11903 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  11903 non-null  int64  
 1   name      11903 non-null  object 
 2   genre     11903 non-null  object 
 3   type      11903 non-null  object 
 4   episodes  11903 non-null  int32  
 5   rating    11903 non-null  float64
 6   members   11903 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 697.4+ KB
None


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama',"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [45]:
# Drop anime_id and name columns
anime_ml_df.drop(columns=['anime_id', 'name'], inplace=True)
anime_ml_df.head()

Unnamed: 0,genre,type,episodes,rating,members
0,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,"Sci-Fi, Thriller",TV,24,9.17,673572
4,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [46]:
# One-Hot Encoding genre column
# Remove all spaces from the genre column
anime_ml_df['genre'] = anime_ml_df['genre'].apply(lambda x: x.replace(' ', ''))
gen_split_df = anime_ml_df['genre'].str.get_dummies(sep=',')
gen_split_df.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,ShounenAi,SliceofLife,Space,Sports,SuperPower,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Check the columns in the genre split dataframe
gen_split_df.columns

Index(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
       'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror',
       'Josei', 'Kids', 'Magic', 'MartialArts', 'Mecha', 'Military', 'Music',
       'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai',
       'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'ShoujoAi', 'Shounen',
       'ShounenAi', 'SliceofLife', 'Space', 'Sports', 'SuperPower',
       'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri'],
      dtype='object')

In [48]:
# One-Hot Encoding type column
type_split_df = pd.get_dummies(anime_ml_df['type'], dtype=int)
type_split_df.head()

Unnamed: 0,Movie,Music,ONA,OVA,Special,TV
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1


In [49]:
# Concatenate the one-hot encoded genre and type columns with the original dataframe
anime_ml_df = pd.concat([anime_ml_df, gen_split_df, type_split_df], axis=1)
anime_ml_df.head()

Unnamed: 0,genre,type,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,...,Thriller,Vampire,Yaoi,Yuri,Movie,Music,ONA,OVA,Special,TV
0,"Drama,Romance,School,Supernatural",Movie,1,9.37,200630,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,"Action,Adventure,Drama,Fantasy,Magic,Military,...",TV,64,9.26,793665,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,"Action,Comedy,Historical,Parody,Samurai,Sci-Fi...",TV,51,9.25,114262,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,"Sci-Fi,Thriller",TV,24,9.17,673572,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,"Action,Comedy,Historical,Parody,Samurai,Sci-Fi...",TV,51,9.16,151266,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [50]:
# Drop genre and type columns
anime_ml_df.drop(columns=['genre', 'type'], inplace=True)
anime_ml_df.head()

Unnamed: 0,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Thriller,Vampire,Yaoi,Yuri,Movie,Music,ONA,OVA,Special,TV
0,1,9.37,200630,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,64,9.26,793665,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,51,9.25,114262,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,24,9.17,673572,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,51,9.16,151266,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [51]:
# Separate the columns that need scaling
numerical_cols = ['episodes', 'rating', 'members']
numerical_cols

['episodes', 'rating', 'members']

In [52]:
# Apply StandardScaler to the numerical columns
scaler = StandardScaler()
anime_ml_df[numerical_cols] = scaler.fit_transform(anime_ml_df[numerical_cols])
anime_ml_df.head()

Unnamed: 0,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Thriller,Vampire,Yaoi,Yuri,Movie,Music,ONA,OVA,Special,TV
0,-0.24333,2.840061,3.299624,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1.098297,2.731791,14.040307,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0.821453,2.721948,1.73538,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.24647,2.643206,11.865257,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0.821453,2.633363,2.405574,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [53]:
# Save the cleaned Machine Learning dataframe to a csv file
anime_ml_df.to_csv('Resources/anime_ml.csv', index=False, encoding='utf-8')

In [54]:
anime_ml_df.head()

Unnamed: 0,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Thriller,Vampire,Yaoi,Yuri,Movie,Music,ONA,OVA,Special,TV
0,-0.24333,2.840061,3.299624,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1.098297,2.731791,14.040307,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0.821453,2.721948,1.73538,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.24647,2.643206,11.865257,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0.821453,2.633363,2.405574,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
