In [31]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("anime.csv")
print(df.shape)
df.head()

(12294, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
df_cleaned = df.dropna()
df_cleaned.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 751.1+ KB


In [6]:
df_cleaned = df_cleaned.drop(columns='anime_id')

In [7]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      12017 non-null  object 
 1   genre     12017 non-null  object 
 2   type      12017 non-null  object 
 3   episodes  12017 non-null  object 
 4   rating    12017 non-null  float64
 5   members   12017 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 657.2+ KB


In [8]:
# Select string columns
string_cols = df_cleaned.select_dtypes(include=['object'])

# Count unique categories for each string column
unique_counts = string_cols.nunique()
unique_counts

name        12015
genre        3229
type            6
episodes      187
dtype: int64

In [9]:
genre_dummies = df_cleaned['genre'].str.get_dummies(sep=',')

In [10]:
df_combined = pd.concat([df_cleaned, genre_dummies], axis=1)

In [11]:
df_combined.head()

Unnamed: 0,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,Action,Adventure.1,Cars.1,Comedy.1,Dementia.1,Demons.1,Drama.1,Ecchi.1,Fantasy.1,Game.1,Harem.1,Hentai.1,Historical.1,Horror.1,Josei.1,Kids.1,Magic.1,Martial Arts.1,Mecha.1,Military.1,Music.1,Mystery.1,Parody.1,Police.1,Psychological.1,Romance.1,Samurai.1,School.1,Sci-Fi.1,Seinen.1,Shoujo.1,Shounen.1,Slice of Life.1,Space.1,Sports.1,Super Power.1,Supernatural.1,Thriller.1,Vampire.1,Yaoi.1
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
# Create boolean columns for each unique value in the 'type' column
type_dummies = df_combined['type'].str.get_dummies()

# Combine with the original DataFrame
df_combined_2 = pd.concat([df_combined, type_dummies], axis=1)

# Drop the original 'type' column
df_combined_2 = df_combined_2.drop('type', axis=1)

In [24]:
df_combined_2.head()

Unnamed: 0,name,genre,episodes,rating,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,Action,Adventure.1,Cars.1,Comedy.1,Dementia.1,Demons.1,Drama.1,Ecchi.1,Fantasy.1,Game.1,Harem.1,Hentai.1,Historical.1,Horror.1,Josei.1,Kids.1,Magic.1,Martial Arts.1,Mecha.1,Military.1,Music.1,Mystery.1,Parody.1,Police.1,Psychological.1,Romance.1,Samurai.1,School.1,Sci-Fi.1,Seinen.1,Shoujo.1,Shounen.1,Slice of Life.1,Space.1,Sports.1,Super Power.1,Supernatural.1,Thriller.1,Vampire.1,Yaoi.1,Movie,Music.2,ONA,OVA,Special,TV
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",1,9.37,200630,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",64,9.26,793665,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",51,9.25,114262,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,Steins;Gate,"Sci-Fi, Thriller",24,9.17,673572,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",51,9.16,151266,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [25]:
df2 = df_combined_2.drop(columns='genre')
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 92 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            12017 non-null  object 
 1   episodes        12017 non-null  object 
 2   rating          12017 non-null  float64
 3   members         12017 non-null  int64  
 4    Adventure      12017 non-null  int64  
 5    Cars           12017 non-null  int64  
 6    Comedy         12017 non-null  int64  
 7    Dementia       12017 non-null  int64  
 8    Demons         12017 non-null  int64  
 9    Drama          12017 non-null  int64  
 10   Ecchi          12017 non-null  int64  
 11   Fantasy        12017 non-null  int64  
 12   Game           12017 non-null  int64  
 13   Harem          12017 non-null  int64  
 14   Hentai         12017 non-null  int64  
 15   Historical     12017 non-null  int64  
 16   Horror         12017 non-null  int64  
 17   Josei          12017 non-null  int6

In [35]:
df2.columns

Index([' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama',
       ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical',
       ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha',
       ' Military', ' Music', ' Mystery', ' Parody', ' Police',
       ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi',
       ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai',
       ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural',
       ' Thriller', ' Vampire', ' Yaoi', ' Yuri', 'Action', 'Adventure',
       'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy',
       'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids',
       'Magic', 'Martial Arts', 'Mecha', 'Military', 'Movie', 'Music',
       'Mystery', 'ONA', 'OVA', 'Parody', 'Police', 'Psychological', 'Romance',
       'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen',
       'Slice of Life', 'Space', 'Special', 'Spor

In [40]:
 # set columns
meta_cols = ['name']

feature_cols = ['episodes', 'rating', 'members', ' Adventure', ' Cars',
       ' Comedy', ' Dementia', ' Demons', ' Drama', ' Ecchi', ' Fantasy',
       ' Game', ' Harem', ' Hentai', ' Historical', ' Horror', ' Josei',
       ' Kids', ' Magic', ' Martial Arts', ' Mecha', ' Military', ' Music',
       ' Mystery', ' Parody', ' Police', ' Psychological', ' Romance',
       ' Samurai', ' School', ' Sci-Fi', ' Seinen', ' Shoujo', ' Shoujo Ai',
       ' Shounen', ' Shounen Ai', ' Slice of Life', ' Space', ' Sports',
       ' Super Power', ' Supernatural', ' Thriller', ' Vampire', ' Yaoi',
       ' Yuri', 'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons',
       'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical',
       'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military',
       'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance',
       'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen',
       'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural',
       'Thriller', 'Vampire', 'Yaoi', 'Movie', 'Music', 'ONA', 'OVA',
       'Special', 'TV']

In [41]:
# Define Preprocessing Pipelines

# Define preprocessing for numeric features
numeric_features = ['rating', 'members']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Define preprocessing for the binary features
binary_features = []
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),
    ('label', OrdinalEncoder())])

# Define preprocessing for categorical features
categorical_features = [' Adventure', ' Cars',
       ' Comedy', ' Dementia', ' Demons', ' Drama', ' Ecchi', ' Fantasy',
       ' Game', ' Harem', ' Hentai', ' Historical', ' Horror', ' Josei',
       ' Kids', ' Magic', ' Martial Arts', ' Mecha', ' Military', ' Music',
       ' Mystery', ' Parody', ' Police', ' Psychological', ' Romance',
       ' Samurai', ' School', ' Sci-Fi', ' Seinen', ' Shoujo', ' Shoujo Ai',
       ' Shounen', ' Shounen Ai', ' Slice of Life', ' Space', ' Sports',
       ' Super Power', ' Supernatural', ' Thriller', ' Vampire', ' Yaoi',
       ' Yuri', 'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons',
       'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical',
       'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military',
       'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance',
       'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen',
       'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural',
       'Thriller', 'Vampire', 'Yaoi', 'Movie', 'Music', 'ONA', 'OVA',
       'Special', 'TV']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('binary', binary_transformer, binary_features),
        ('cat', categorical_transformer, categorical_features)])

In [39]:
# Correlation Analysis (this is optional and not needed for the ML Experiment)
# It just shows what the data looks like after transformation before training
# We will still declare a full pipeline of preprocessing + training

# Use only preprocessing pipeline to transform the data
preprocessed_X_train = preprocessor.fit_transform(df2)

# Convert preprocessed data to a DataFrame
# Get the feature names after one-hot encoding
encoded_feature_names = (numeric_features + binary_features +
                         list(preprocessor.transformers_[2][1]['onehot'].get_feature_names_out(categorical_features)))

df_final = pd.DataFrame(preprocessed_X_train, columns=encoded_feature_names)
df_final[meta_cols] = df2.loc[:, meta_cols]
df_final.head()

ValueError: Selected columns, [' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama', ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical', ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha', ' Military', ' Music', ' Mystery', ' Parody', ' Police', ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi', ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai', ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural', ' Thriller', ' Vampire', ' Yaoi', ' Yuri', 'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Movie', 'Music', 'ONA', 'OVA', 'Special', 'TV'], are not unique in dataframe