In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split


In [2]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv('Train_Staff_data.csv', index_col=0)

# NOTE(review): if Release_year is stored as bare integer years, pd.to_datetime
# interprets integers as nanoseconds since the Unix epoch -- confirm the CSV holds
# parseable date strings, or pass an explicit format=.
df['Release_year'] = pd.to_datetime(df['Release_year'])

# Work on an explicit deep copy so edits to either frame cannot leak into the
# other (df.loc[:] only produces a shallow copy that shares df's data).
df_transform = df.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13254 entries, 0 to 15488
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Name          13254 non-null  object        
 1   Rank          13254 non-null  float64       
 2   Rating        13254 non-null  float64       
 3   Release_year  13254 non-null  datetime64[ns]
 4   Episodes      13254 non-null  float64       
 5   Type          13254 non-null  object        
 6   Studio        13254 non-null  object        
 7   Tags          13254 non-null  object        
 8   staff         13254 non-null  object        
 9   Target        13254 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(1), object(5)
memory usage: 1.1+ MB


# Drop unneeded columns

In [3]:
# Columns this notebook treats as unneeded for the transformed data set.
columns_to_drop = ['Name', 'Episodes', 'Release_year', 'Studio', 'staff']

# Derive from the untouched `df` rather than from `df_transform` itself, so that
# re-running this cell does not raise KeyError on columns that are already gone.
df_transform = df.drop(columns=columns_to_drop)
df_transform

Unnamed: 0,Rank,Rating,Type,Tags,Target
0,6655.0,6.82,Other,"Promotional, Shorts",0
1,6839.0,6.80,OVA,"Romance, Library, School Life, Based on a Ligh...",0
2,2834.0,7.38,OVA,"Drama, Romance, Episodic, School Life, Based o...",0
3,2353.0,7.48,Movie,"Drama, Mystery, Romance, School Club, School L...",0
4,13116.0,5.96,Movie,"Shounen, Sports, Boxing, Hand to Hand Combat, ...",0
...,...,...,...,...,...
15477,514.0,8.24,OVA,"Comedy, Seinen, Crossover, Japanese Mythology,...",0
15483,11.0,9.02,Movie,"Drama, Romance, Body Swapping, Gender Bender, ...",0
15485,11970.0,6.18,Other,"Comedy, Animal Protagonists, Dogs, Food Protag...",0
15486,13584.0,5.82,TV,"Action, Comedy, Sci Fi, Shounen, Aliens, Body ...",0


# Transform Rating and Rank Columns

In [4]:
# Rating and Rank are continuous, so each one is rescaled:
#   Rating -> MinMaxScaler, Rank -> StandardScaler.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Both scalers are fitted on the original values kept in `df`.
scaler = MinMaxScaler()
df_transform['Rating'] = scaler.fit_transform(df[['Rating']])

# `scaler` is rebound here, so after this cell it refers to the fitted StandardScaler.
scaler = StandardScaler()
df_transform['Rank'] = scaler.fit_transform(df[['Rank']])

df_transform

Unnamed: 0,Rank,Rating,Type,Tags,Target
0,-0.150170,0.673077,Other,"Promotional, Shorts",0
1,-0.109314,0.670330,OVA,"Romance, Library, School Life, Based on a Ligh...",0
2,-0.998596,0.750000,OVA,"Drama, Romance, Episodic, School Life, Based o...",0
3,-1.105398,0.763736,Movie,"Drama, Mystery, Romance, School Club, School L...",0
4,1.284450,0.554945,Movie,"Shounen, Sports, Boxing, Hand to Hand Combat, ...",0
...,...,...,...,...,...
15477,-1.513735,0.868132,OVA,"Comedy, Seinen, Crossover, Japanese Mythology,...",0
15483,-1.625423,0.975275,Movie,"Drama, Romance, Body Swapping, Gender Bender, ...",0
15485,1.029988,0.585165,Other,"Comedy, Animal Protagonists, Dogs, Food Protag...",0
15486,1.388366,0.535714,TV,"Action, Comedy, Sci Fi, Shounen, Aliens, Body ...",0


# Transform Type Column

In [5]:
# Type is a nominal attribute, so it is one-hot encoded.
from sklearn.preprocessing import OneHotEncoder

encoder_for_type = OneHotEncoder().fit(df[['Type']])
encoder_for_type_values = encoder_for_type.transform(df[['Type']])

# Take the column labels from the fitted encoder instead of a hand-typed list:
# the encoder's output columns follow `categories_`, so this keeps every label
# attached to the right column even if the set of Type values changes.
# NOTE(review): the labels were previously hard-coded as
# 'DVD S', 'Movie', 'OVA', 'Other', 'TV', 'TV Sp', 'Web' -- confirm these match
# the categories in the data if later code refers to the columns by name.
type_columns = [str(category) for category in encoder_for_type.categories_[0]]

# The dense array is assigned positionally; df and df_transform hold the same
# rows in the same order.
df_transform[type_columns] = encoder_for_type_values.toarray()

# The raw Type column is replaced by its indicator columns.
df_transform = df_transform.drop(columns='Type')

df_transform

Unnamed: 0,Rank,Rating,Tags,Target,DVD S,Movie,OVA,Other,TV,TV Sp,Web
0,-0.150170,0.673077,"Promotional, Shorts",0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.109314,0.670330,"Romance, Library, School Life, Based on a Ligh...",0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.998596,0.750000,"Drama, Romance, Episodic, School Life, Based o...",0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-1.105398,0.763736,"Drama, Mystery, Romance, School Club, School L...",0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.284450,0.554945,"Shounen, Sports, Boxing, Hand to Hand Combat, ...",0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
15477,-1.513735,0.868132,"Comedy, Seinen, Crossover, Japanese Mythology,...",0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15483,-1.625423,0.975275,"Drama, Romance, Body Swapping, Gender Bender, ...",0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
15485,1.029988,0.585165,"Comedy, Animal Protagonists, Dogs, Food Protag...",0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15486,1.388366,0.535714,"Action, Comedy, Sci Fi, Shounen, Aliens, Body ...",0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Transform Tags Column

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Ensure all entries in the 'Tags' column are lists of cleaned tags
def ensure_list(x):
    """Normalise one ``Tags`` cell into a list of clean tag strings.

    Parameters
    ----------
    x : str, list, or any other value
        A comma-separated string of tags, an already-split list of tag
        strings, or a missing/unsupported value.

    Returns
    -------
    list of str
        The tags with surrounding whitespace removed. Blank entries (from an
        empty string, a trailing comma or a doubled comma) are dropped so they
        cannot turn into a tag named ''. A value that is neither a str nor a
        list yields an empty list.
    """
    if isinstance(x, str):
        raw_tags = x.split(',')
    elif isinstance(x, list):
        raw_tags = x
    else:
        return []

    stripped_tags = (tag.strip() for tag in raw_tags)
    return [tag for tag in stripped_tags if tag]

# Normalise every Tags cell to a list of tag strings (this overwrites df['Tags']).
df['Tags'] = df['Tags'].apply(ensure_list)

mlb = MultiLabelBinarizer()
encode_tags = mlb.fit_transform(df['Tags'])

# One indicator column per tag. The frame must carry df's index: assigning a
# DataFrame aligns on row labels, and df's labels are not a contiguous 0..n-1
# range, so a default RangeIndex would attach tags to the wrong rows and leave
# NaN for labels beyond n-1. Cast to float to match the Type indicator columns.
encoded_df = pd.DataFrame(encode_tags, columns=mlb.classes_, index=df.index).astype(float)

# Both frames must describe exactly the same rows for the aligned assignment.
assert df_transform.index.equals(encoded_df.index), \
    "df_transform and df no longer share the same row index"

# Add the encoded tag columns to the existing frame. With matching indexes no
# NaN can be introduced, so no fillna step is needed afterwards.
# NOTE(review): a tag whose name equals an existing column would overwrite that
# column here -- confirm no tag collides with the current column names.
df_transform.loc[:, encoded_df.columns] = encoded_df

# Drop the raw Tags field now that it is encoded.
df_transform = df_transform.drop(columns='Tags')

df_transform.head(20)


Unnamed: 0,Rank,Rating,Target,DVD S,Movie,OVA,Other,TV,TV Sp,Web,...,Wuxia,Xianxia,Yakuza,Yaoi,Yaoi Hand Syndrome,Youkai,Yuri,Zombies,Zoo,noitaminA
0,-0.15017,0.673077,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.109314,0.67033,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.998596,0.75,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.105398,0.763736,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.28445,0.554945,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.026658,0.585165,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.619068,0.472527,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.766201,0.60989,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-0.637554,0.711538,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,1.128131,0.574176,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Export data

In [7]:
# Write the transformed frame to disk; the row index is written as the first column.
output_path = 'Transform_data.csv'
df_transform.to_csv(output_path, index=True)