In [632]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Loading the data

In [633]:
anime_data = pd.read_csv("../Data/anime-list.csv")
anime_data.head()

Unnamed: 0.1,Unnamed: 0,name,studio,theme,tags,source,rating,year,synopsis,demographic,status,eps,eps_avg_duration_in_min,rated_by
0,0,Shingeki no Kyojin,Wit Studio,"Gore, Military, Survival","Action, Drama",Manga,8.53,2013.0,"Centuries ago, mankind was slaughtered to near...",Shounen,Finished,25.0,24.0,3.4M
1,1,Death Note,Madhouse,Psychological,"Supernatural, Suspense",Manga,8.63,2006.0,"Brutal murders, petty thefts, and senseless vi...",Shounen,Finished,37.0,23.0,3.4M
2,2,Fullmetal Alchemist: Brotherhood,Bones,Military,"Action, Adventure, Drama, Fantasy",Manga,9.14,2009.0,After a horrific alchemy experiment goes wrong...,Shounen,Finished,64.0,24.0,2.9M
3,3,Boku no Hero Academia,Bones,"School, Super Power",Action,Manga,7.95,2016.0,"The appearance of ""quirks,"" newly discovered s...",Shounen,Finished,13.0,24.0,2.6M
4,4,Naruto,Studio Pierrot,Martial Arts,"Action, Adventure, Fantasy",Manga,7.97,2002.0,"Moments prior to Naruto Uzumaki's birth, a hug...",Shounen,Finished,220.0,23.0,2.5M


In [634]:
anime_data.drop('Unnamed: 0',axis=1, inplace=True)

# General overview of the dataset

In [635]:
anime_data.describe()

Unnamed: 0,rating,year,eps,eps_avg_duration_in_min
count,1812.0,2949.0,2890.0,2950.0
mean,7.043311,2007.264157,43.062284,18.194576
std,0.776097,12.151285,75.149153,8.019819
min,2.93,1963.0,2.0,0.0
25%,6.49,2002.0,13.0,12.0
50%,7.05,2011.0,26.0,23.0
75%,7.55,2016.0,51.0,24.0
max,9.14,2023.0,1787.0,50.0


In [636]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     3005 non-null   object 
 1   studio                   3005 non-null   object 
 2   theme                    3005 non-null   object 
 3   tags                     2470 non-null   object 
 4   source                   2510 non-null   object 
 5   rating                   1812 non-null   float64
 6   year                     2949 non-null   float64
 7   synopsis                 2315 non-null   object 
 8   demographic              3005 non-null   object 
 9   status                   2981 non-null   object 
 10  eps                      2890 non-null   float64
 11  eps_avg_duration_in_min  2950 non-null   float64
 12  rated_by                 3005 non-null   object 
dtypes: float64(4), object(9)
memory usage: 305.3+ KB


# Evaluating NaN values

### Counting them

In [637]:
missing_count = anime_data.isna().sum()
total_missing = missing_count.sum()

print(f"The dataframe has {total_missing} cells with NaN value")
missing_count

The dataframe has 3163 cells with NaN value


name                          0
studio                        0
theme                         0
tags                        535
source                      495
rating                     1193
year                         56
synopsis                    690
demographic                   0
status                       24
eps                         115
eps_avg_duration_in_min      55
rated_by                      0
dtype: int64

### Calculating the percentage

In [638]:
total_cells = np.product(anime_data.shape)

missing_percentage = (total_missing / total_cells) * 100
print(f"There is an amount of {missing_percentage}% missing values")

There is an amount of 8.096761807244336% missing values


## Filling NaN cells
1. Handling numerical values

In [639]:
numerical_cols = ["rating","year","eps","eps_avg_duration_in_min"]

# For the year, the mean will be used
for column in numerical_cols:
    column_mean = anime_data[column].mean()
    
    if column not in ["year","eps"]:
        anime_data[column].fillna(column_mean, inplace=True)
        
    else:
        anime_data[column].fillna(int(column_mean), inplace=True)

2. Handling non numerical values
- 2.1 Retrieving the number of rates and converting them to float type

In [640]:
rated_by_vector = np.array(anime_data.rated_by.str.replace("[A-Z]",""))
rated_by_vector = rated_by_vector.astype(float)

# Elevating the values to the exact values
rated_by_vector = rated_by_vector * pow(10,6)

anime_data.drop("rated_by",axis=1,inplace=True)
anime_data["rated_by"] = pd.Series(rated_by_vector)

  rated_by_vector = np.array(anime_data.rated_by.str.replace("[A-Z]",""))


In [641]:
anime_data[anime_data.rated_by == 0.0] = float(anime_data.rated_by.mean())
anime_data.rated_by.value_counts()

1.564454e+08    277
3.700000e+07     32
2.900000e+07     27
3.900000e+07     26
4.000000e+07     25
               ... 
9.480000e+08      1
7.660000e+08      1
9.210000e+08      1
7.630000e+08      1
8.820000e+08      1
Name: rated_by, Length: 721, dtype: int64

- 2.2 Filling non numerical columns NaN cells

In [642]:
for column in anime_data.columns:
    if column not in numerical_cols:
        anime_data[column].fillna("unknown",inplace=True)

In [643]:
anime_data.isna().sum()

name                       0
studio                     0
theme                      0
tags                       0
source                     0
rating                     0
year                       0
synopsis                   0
demographic                0
status                     0
eps                        0
eps_avg_duration_in_min    0
rated_by                   0
dtype: int64

# DataFrame normalization
---
1. Theme and tags columns text normalization<br>
Each value from the `name`, `tags` and `theme` columns will be lowercased.<br>By doing so, when each of these values will be retrieved in a given `set`, it will be easier to create their respectif DataFrame and then do some `feature encoding`.
After feature encoding, the `anime_data` DataFrame will be linked to tags and theme DataFrames.

In [644]:
def lower_case_value(*columns):
    for column in columns:
        anime_data[column] = anime_data[column].map(lambda value: str(value).lower())        

In [645]:
lower_case_value("tags","theme","name")

In [646]:
anime_data[["name","tags","theme"]].head()

Unnamed: 0,name,tags,theme
0,shingeki no kyojin,"action, drama","gore, military, survival"
1,death note,"supernatural, suspense",psychological
2,fullmetal alchemist: brotherhood,"action, adventure, drama, fantasy",military
3,boku no hero academia,action,"school, super power"
4,naruto,"action, adventure, fantasy",martial arts


2. Extracting values from `tags` and `theme` columns

In [647]:
def extract_cell_value(column_name:str) -> set:
    """
    For a given column with string values separated by a comma, 
    each cell value(s) will be retrieved and then returned as a set.
    """
    result = set()
    column = anime_data[column_name].astype(str)
    
    for cell in column:
        for value in cell.split(","):
            result.add(str(value).strip())
            
    return result

In [648]:
tags_set = extract_cell_value("tags")
themes_set = extract_cell_value("theme")

3. Creating a DataFrame for tags and theme

In [649]:
tags_data = pd.DataFrame(index=anime_data.name, columns=tags_set)
theme_data = pd.DataFrame(index=anime_data.name, columns=themes_set)

In [650]:
anime_data.head()

Unnamed: 0,name,studio,theme,tags,source,rating,year,synopsis,demographic,status,eps,eps_avg_duration_in_min,rated_by
0,shingeki no kyojin,Wit Studio,"gore, military, survival","action, drama",Manga,8.53,2013.0,"Centuries ago, mankind was slaughtered to near...",Shounen,Finished,25.0,24.0,3400000.0
1,death note,Madhouse,psychological,"supernatural, suspense",Manga,8.63,2006.0,"Brutal murders, petty thefts, and senseless vi...",Shounen,Finished,37.0,23.0,3400000.0
2,fullmetal alchemist: brotherhood,Bones,military,"action, adventure, drama, fantasy",Manga,9.14,2009.0,After a horrific alchemy experiment goes wrong...,Shounen,Finished,64.0,24.0,2900000.0
3,boku no hero academia,Bones,"school, super power",action,Manga,7.95,2016.0,"The appearance of ""quirks,"" newly discovered s...",Shounen,Finished,13.0,24.0,2600000.0
4,naruto,Studio Pierrot,martial arts,"action, adventure, fantasy",Manga,7.97,2002.0,"Moments prior to Naruto Uzumaki's birth, a hug...",Shounen,Finished,220.0,23.0,2500000.0


In [651]:
tags_data.head()

Unnamed: 0_level_0,supernatural,love,action,horror,adventure,sports,gourmet,romance,drama,ecchi,...,unknown,suspense,fantasy,sci-fi,garde,girls,of,slice,boys,mystery
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
shingeki no kyojin,,,,,,,,,,,...,,,,,,,,,,
death note,,,,,,,,,,,...,,,,,,,,,,
fullmetal alchemist: brotherhood,,,,,,,,,,,...,,,,,,,,,,
boku no hero academia,,,,,,,,,,,...,,,,,,,,,,
naruto,,,,,,,,,,,...,,,,,,,,,,


4. Feature encoding
---
For a given anime, if it has a given `tag` or `theme` then the cell's value will be set to `1`. Otherwise it'll be `0`.

In [652]:
def encode_dataframe(column_name:str,frame:pd.DataFrame) -> None:
    """For a given DataFrame which represent the """
    # for each anime's data
    for line in anime_data[["name",column_name]].values:
        anime_tags = [str(value).strip() for value in line[1].split(",")]

        # for each tag
        for column in frame.columns:
            if column in anime_tags:
                frame.loc[line[0],column] = 1
            else:
                 frame.loc[line[0],column] = 0

In [653]:
encode_dataframe("tags",tags_data)
tags_data.head()

Unnamed: 0_level_0,supernatural,love,action,horror,adventure,sports,gourmet,romance,drama,ecchi,...,unknown,suspense,fantasy,sci-fi,garde,girls,of,slice,boys,mystery
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
shingeki no kyojin,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
death note,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
fullmetal alchemist: brotherhood,0,0,1,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
boku no hero academia,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
naruto,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [654]:
encode_dataframe("theme",theme_data)
theme_data.head()

Unnamed: 0_level_0,parody,combat sports,strategy game,crossdressing,harem,kids,reincarnation,time travel,cgdct,josei,...,performing arts,otaku culture,workplace,high stakes game,childcare,space,medical,shoujo,historical,racing
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
shingeki no kyojin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
death note,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fullmetal alchemist: brotherhood,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
boku no hero academia,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
naruto,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [655]:
theme_data.loc["shingeki no kyojin"].value_counts()

0    53
1     3
Name: shingeki no kyojin, dtype: int64

In [656]:
test = pd.DataFrame(index=[anime_data.name, np.arange(len(anime_data))], columns=tags_set)

In [657]:
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,supernatural,love,action,horror,adventure,sports,gourmet,romance,drama,ecchi,...,unknown,suspense,fantasy,sci-fi,garde,girls,of,slice,boys,mystery
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
shingeki no kyojin,0,,,,,,,,,,,...,,,,,,,,,,
death note,1,,,,,,,,,,,...,,,,,,,,,,
fullmetal alchemist: brotherhood,2,,,,,,,,,,,...,,,,,,,,,,
boku no hero academia,3,,,,,,,,,,,...,,,,,,,,,,
naruto,4,,,,,,,,,,,...,,,,,,,,,,


5. Renaming DataFrames axis

In [658]:
anime_data.rename_axis("anime_id",axis="rows", inplace=True)
anime_data.rename_axis("features", axis="columns", inplace=True)

From `tags_data` or `theme_data` DataFrame there is a link to `anime_data` DataFrame which is the `name` index.
Thus the `theme` and `tags` columns can be dropped from the `anime_data` DataFrame for creating a normalized data set.