In [1]:
# Import libraries
import pandas as pd

# Import ML libraries
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned anime dataset from disk and preview its first rows
raw_anime_df = pd.read_csv(
    'Resources/anime_clean.csv',
    encoding='utf-8',
)
raw_anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama',"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


#### Prepare data for Machine Learning 

In [3]:
# Work on a copy so the raw dataframe stays untouched by the ML preparation steps
anime_ml_df = raw_anime_df.copy()
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
anime_ml_df.info()
anime_ml_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11903 entries, 0 to 11902
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  11903 non-null  int64  
 1   name      11903 non-null  object 
 2   genre     11903 non-null  object 
 3   type      11903 non-null  object 
 4   episodes  11903 non-null  int64  
 5   rating    11903 non-null  float64
 6   members   11903 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 651.1+ KB
None


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama',"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# The anime_id drop below is disabled (commented out), so anime_id stays in the dataframe.
# NOTE(review): presumably kept as an identifier for the saved files — confirm, then delete the dead line.
#anime_ml_df.drop(columns=['anime_id'], inplace=True)
anime_ml_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama',"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
# One-hot encode the genre column.
# Strip every space first so the comma-separated labels split cleanly on ','
# and multi-word labels collapse into single tokens (the resulting columns
# include e.g. 'SliceofLife' and 'MartialArts').
# The vectorized .str accessor replaces the per-row apply/lambda and passes
# missing values through instead of raising on non-string entries.
anime_ml_df['genre'] = anime_ml_df['genre'].str.replace(' ', '', regex=False)
# str.get_dummies(sep=',') yields one 0/1 indicator column per distinct genre label
gen_split_df = anime_ml_df['genre'].str.get_dummies(sep=',')
gen_split_df.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,ShounenAi,SliceofLife,Space,Sports,SuperPower,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# List every genre indicator column currently present in gen_split_df
gen_split_df.columns

Index(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
       'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror',
       'Josei', 'Kids', 'Magic', 'MartialArts', 'Mecha', 'Military', 'Music',
       'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai',
       'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'ShoujoAi', 'Shounen',
       'ShounenAi', 'SliceofLife', 'Space', 'Sports', 'SuperPower',
       'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri'],
      dtype='object')

In [7]:
# Remove the following genre columns: 'Ecchi', 'Harem', 'Hentai', 'Yaoi', 'Yuri'
# (in-place drop: re-running this cell raises KeyError because the columns are already gone)
gen_split_df.drop(columns=['Ecchi', 'Harem', 'Hentai', 'Yaoi', 'Yuri'], inplace=True)
# Print the remaining column labels, then preview the first rows
print(gen_split_df.columns)
gen_split_df.head()

Index(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
       'Fantasy', 'Game', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic',
       'MartialArts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody',
       'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi',
       'Seinen', 'Shoujo', 'ShoujoAi', 'Shounen', 'ShounenAi', 'SliceofLife',
       'Space', 'Sports', 'SuperPower', 'Supernatural', 'Thriller', 'Vampire'],
      dtype='object')


Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Fantasy,Game,Historical,...,ShoujoAi,Shounen,ShounenAi,SliceofLife,Space,Sports,SuperPower,Supernatural,Thriller,Vampire
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [8]:
# Remove the genre indicator 'Music' so it cannot collide with the 'Music' category of the type column
gen_split_df = gen_split_df.drop(columns=['Music'])
print(gen_split_df.columns)

Index(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
       'Fantasy', 'Game', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic',
       'MartialArts', 'Mecha', 'Military', 'Mystery', 'Parody', 'Police',
       'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen',
       'Shoujo', 'ShoujoAi', 'Shounen', 'ShounenAi', 'SliceofLife', 'Space',
       'Sports', 'SuperPower', 'Supernatural', 'Thriller', 'Vampire'],
      dtype='object')


In [9]:
# Normalize the hyphenated 'Sci-Fi' column label to 'SciFi'
gen_split_df = gen_split_df.rename(columns={'Sci-Fi': 'SciFi'})
gen_split_df.columns

Index(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
       'Fantasy', 'Game', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic',
       'MartialArts', 'Mecha', 'Military', 'Mystery', 'Parody', 'Police',
       'Psychological', 'Romance', 'Samurai', 'School', 'SciFi', 'Seinen',
       'Shoujo', 'ShoujoAi', 'Shounen', 'ShounenAi', 'SliceofLife', 'Space',
       'Sports', 'SuperPower', 'Supernatural', 'Thriller', 'Vampire'],
      dtype='object')

In [10]:
# Expand the categorical type column into one integer 0/1 indicator column per category
type_split_df = pd.get_dummies(
    data=anime_ml_df['type'],
    dtype=int,
)
type_split_df.head()

Unnamed: 0,Movie,Music,ONA,OVA,Special,TV
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1


In [11]:
# Append the genre and type indicator columns to the right of the existing columns
encoded_frames = [anime_ml_df, gen_split_df, type_split_df]
anime_ml_df = pd.concat(encoded_frames, axis='columns')
anime_ml_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Action,Adventure,Cars,...,SuperPower,Supernatural,Thriller,Vampire,Movie,Music,ONA,OVA,Special,TV
0,32281,Kimi no Na wa.,"Drama,Romance,School,Supernatural",Movie,1,9.37,200630,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action,Adventure,Drama,Fantasy,Magic,Military,...",TV,64,9.26,793665,1,1,0,...,0,0,0,0,0,0,0,0,0,1
2,28977,Gintama,"Action,Comedy,Historical,Parody,Samurai,Sci-Fi...",TV,51,9.25,114262,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,9253,Steins Gate,"Sci-Fi,Thriller",TV,24,9.17,673572,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,9969,Gintama',"Action,Comedy,Historical,Parody,Samurai,Sci-Fi...",TV,51,9.16,151266,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# The raw genre and type text columns are now represented by indicator columns, so remove them
anime_ml_df = anime_ml_df.drop(columns=['genre', 'type'])
anime_ml_df.head()

Unnamed: 0,anime_id,name,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,...,SuperPower,Supernatural,Thriller,Vampire,Movie,Music,ONA,OVA,Special,TV
0,32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,64,9.26,793665,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,28977,Gintama,51,9.25,114262,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,9253,Steins Gate,24,9.17,673572,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,9969,Gintama',51,9.16,151266,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Persist the encoded, not-yet-scaled dataframe as a CSV file (row index omitted)
anime_ml_df.to_csv(
    'Resources/anime_b4_scale_ml.csv',
    index=False,
    encoding='utf-8',
)

In [14]:
# Names of the numeric feature columns that will be standardized
numerical_cols = [
    'episodes',
    'rating',
    'members',
]
numerical_cols

['episodes', 'rating', 'members']

In [15]:
# Standardize the numeric feature columns with a freshly fitted StandardScaler
scaler = StandardScaler()
scaled_values = scaler.fit_transform(anime_ml_df[numerical_cols])
# Write the scaled values back over the original numeric columns
anime_ml_df[numerical_cols] = scaled_values
anime_ml_df.head()

Unnamed: 0,anime_id,name,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,...,SuperPower,Supernatural,Thriller,Vampire,Movie,Music,ONA,OVA,Special,TV
0,32281,Kimi no Na wa.,-0.24333,2.840061,3.299624,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,1.098297,2.731791,14.040307,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,28977,Gintama,0.821453,2.721948,1.73538,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,9253,Steins Gate,0.24647,2.643206,11.865257,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,9969,Gintama',0.821453,2.633363,2.405574,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Persist the final encoded and scaled dataframe as a CSV file (row index omitted)
anime_ml_df.to_csv(
    'Resources/anime_mlb.csv',
    index=False,
    encoding='utf-8',
)