## Imports

In [None]:
import os
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer


## Data Loader

In [2]:
anime_file_path = os.path.join("..", "data", "raw", "raw_scraped_anime_data.csv") 
manga_file_path = os.path.join("..", "data", "raw", "raw_scraped_manga_data.csv") 

df_anime  = pd.read_csv(anime_file_path)
df_manga  = pd.read_csv(manga_file_path)

backup_anime_raw = df_anime.copy()
backup_manga_raw = df_manga.copy()

In [3]:
# Run to fetch backup
df_anime = backup_anime_raw.copy() 
df_manga = backup_manga_raw.copy() 

## Data Cleaning


In [4]:
df_anime.head()

Unnamed: 0,Title,Subtitle,URL,Genres,Synopsis,Type
0,Shingeki no Kyojin,Attack on Titan,https://myanimelist.net/anime/16498/Shingeki_n...,"Action, Award Winning, Drama, Suspense","Centuries ago, mankind was slaughtered to near...",Anime
1,Fullmetal Alchemist: Brotherhood,,https://myanimelist.net/anime/5114/Fullmetal_A...,"Action, Adventure, Drama, Fantasy",After a horrific alchemy experiment goes wrong...,Anime
2,One Punch Man,,https://myanimelist.net/anime/30276/One_Punch_Man,"Action, Comedy",The seemingly unimpressive Saitama has a rathe...,Anime
3,Kimetsu no Yaiba,Demon Slayer: Kimetsu no Yaiba,https://myanimelist.net/anime/38000/Kimetsu_no...,"Action, Award Winning, Supernatural","Ever since the death of his father, the burden...",Anime
4,Sword Art Online,,https://myanimelist.net/anime/11757/Sword_Art_...,"Action, Adventure, Fantasy, Romance",Ever since the release of the innovative Nerve...,Anime


In [5]:
df_anime.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35630 entries, 0 to 35629
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     35630 non-null  object
 1   Subtitle  17601 non-null  object
 2   URL       35630 non-null  object
 3   Genres    35630 non-null  object
 4   Synopsis  35626 non-null  object
 5   Type      35630 non-null  object
dtypes: object(6)
memory usage: 1.6+ MB


In [6]:
df_manga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57514 entries, 0 to 57513
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     57514 non-null  object
 1   Subtitle  16983 non-null  object
 2   URL       57514 non-null  object
 3   Genres    57514 non-null  object
 4   Synopsis  57506 non-null  object
 5   Type      57514 non-null  object
dtypes: object(6)
memory usage: 2.6+ MB


In [7]:
#df_manga['Title'].drop_duplicates().shape[0]
#df_anime['Title'].drop_duplicates().shape[0]


In [8]:
# Merge the anime and manga dataframes
df_combined = pd.concat([df_anime, df_manga], ignore_index=True)
print("Combined DataFrame shape:", df_combined.shape)
df_combined.head()

Combined DataFrame shape: (93144, 6)


Unnamed: 0,Title,Subtitle,URL,Genres,Synopsis,Type
0,Shingeki no Kyojin,Attack on Titan,https://myanimelist.net/anime/16498/Shingeki_n...,"Action, Award Winning, Drama, Suspense","Centuries ago, mankind was slaughtered to near...",Anime
1,Fullmetal Alchemist: Brotherhood,,https://myanimelist.net/anime/5114/Fullmetal_A...,"Action, Adventure, Drama, Fantasy",After a horrific alchemy experiment goes wrong...,Anime
2,One Punch Man,,https://myanimelist.net/anime/30276/One_Punch_Man,"Action, Comedy",The seemingly unimpressive Saitama has a rathe...,Anime
3,Kimetsu no Yaiba,Demon Slayer: Kimetsu no Yaiba,https://myanimelist.net/anime/38000/Kimetsu_no...,"Action, Award Winning, Supernatural","Ever since the death of his father, the burden...",Anime
4,Sword Art Online,,https://myanimelist.net/anime/11757/Sword_Art_...,"Action, Adventure, Fantasy, Romance",Ever since the release of the innovative Nerve...,Anime


In [None]:
all_unique_genres = df_combined["Genres"].str.split(", ", expand=True).stack().unique()
#all_unique_genres = df_combined["Genres"].str.split(", ").explode().unique()

print("Available genres in the dataset: ", all_unique_genres)

"""
~ Code explanantion : 
str.split() splits the column values based on the delimiter passed.
expand=True makes the split values expand into columns.
stack() makes the columns merge into a heirarchical index of one column.
unique() returns the unique values in the column.

.explode() does what we can achive combining  expand=true and stack() but in a more direct way.

"""

Available genres in the dataset:  ['Action' 'Award Winning' 'Drama' 'Suspense' 'Adventure' 'Fantasy'
 'Comedy' 'Supernatural' 'Romance' 'Horror' 'Sci-Fi' 'Avant Garde' 'Ecchi'
 'Mystery' 'Erotica' 'Gourmet' 'Girls Love' '' 'Boys Love' 'Hentai'
 'Slice of Life' 'Sports']


'\nCode explanantion : \nstr.split() splits the column values based on the delimiter passed.\nexpand=True makes the split values expand into columns.\nstack() makes the columns merge into a heirarchical index of one column.\nunique() returns the unique values in the column.\n\n'

In [11]:
all_unique_genres = df_combined["Genres"].str.split(", ").explode().unique()
print("Available genres in the dataset: ", all_unique_genres)
genre_counts = df_combined['Genres'].str.split(', ').explode().value_counts()
print(genre_counts)

Available genres in the dataset:  ['Action' 'Award Winning' 'Drama' 'Suspense' 'Adventure' 'Fantasy'
 'Comedy' 'Supernatural' 'Romance' 'Horror' 'Sci-Fi' 'Avant Garde' 'Ecchi'
 'Mystery' 'Erotica' 'Gourmet' 'Girls Love' '' 'Boys Love' 'Hentai'
 'Slice of Life' 'Sports']
Genres
Comedy           30013
Action           29969
Fantasy          28323
Drama            22220
Romance          22060
Adventure        20804
Supernatural     15962
Sci-Fi           15337
Slice of Life    10868
Boys Love         8779
Mystery           8704
Erotica           7605
Horror            5870
Ecchi             5594
Girls Love        3204
Sports            2936
Suspense          2829
Award Winning     2125
Avant Garde       1548
Gourmet           1410
Hentai            1349
                   513
Name: count, dtype: int64


In [None]:
test = df_combined["Genres"].str.split(',').apply(lambda x: pd.Series(x).value_counts()).sum().sort_values(ascending=False)
print(test[:])

"""
~ Code explanation:
str.split(',') splits the column values based on the comma delimiter.
apply(lambda x: pd.Series(x).value_counts()) applies a lambda function to each row, converting the list of genres into a Pandas Series and then counting the occurrences of each genre within that row.
sum() aggregates the counts of each genre across all rows.
sort_values(ascending=False) sorts the aggregated counts in descending order.

This approach allows us to count the occurrences of each genre in the entire dataset and sort them by frequency.
"""

Action            29969.0
 Fantasy          24130.0
 Romance          19759.0
Comedy            17597.0
 Supernatural     15218.0
 Sci-Fi           14230.0
 Drama            12932.0
 Comedy           12416.0
 Adventure        12211.0
Drama              9288.0
 Slice of Life     8858.0
Adventure          8593.0
Boys Love          8209.0
 Erotica           7605.0
 Mystery           7306.0
 Ecchi             5594.0
Fantasy            4193.0
 Horror            3922.0
 Suspense          2637.0
Romance            2301.0
Slice of Life      2010.0
Horror             1948.0
 Sports            1907.0
 Girls Love        1898.0
Avant Garde        1410.0
Mystery            1398.0
 Hentai            1349.0
Award Winning      1338.0
Girls Love         1306.0
Sci-Fi             1107.0
Sports             1029.0
 Gourmet            898.0
 Award Winning      787.0
Supernatural        744.0
 Boys Love          570.0
                    513.0
Gourmet             512.0
Suspense            192.0
 Avant Garde

In [None]:
# Remove tags with less instances 

genre_counts = df_combined['Genres'].str.split(', ').explode().value_counts()

# Set threshold as 8% of the most frequent genre
threshold = int(0.08 * genre_counts.max())

# Identify genres to keep
genres_to_keep = genre_counts[genre_counts >= threshold].index

print(genres_to_keep)
# Remove rare genres from the dataset
df_combined['Genres'] = df_combined['Genres'].apply(
    lambda x: ', '.join([g for g in x.split(', ') if g in genres_to_keep])
)
""" 
~ Code Explanantion:

1. [`df_combined['Genres']`]: This part of the code selects the "Genres" column from the DataFrame. The multiple genres are separated by commas.

2. `.apply(lambda x:  The [`apply`]method is called on the "Genres" column. 

3. `lambda x: ', '.join([g for g in x.split(', ') if g in genres_to_keep])`: 
    The lambda function takes a string `x` (representing a genre string) and processes it as follows:
   - `x.split(', ')`: The series.py method is called on the string `x`. This method splits the string at every occurrence of the comma followed by a space (", "). This results in a list of individual genres.
   - `[g for g in x.split(', ') if g in genres_to_keep]`: A list comprehension is used to filter the list of genres. 
        The comprehension iterates over each genre `g` in the list and includes it in the
        new list only if it is present in the predefined list `genres_to_keep`.
   - `', '.join(...)`: The [`join`] method is called on the filtered list of genres.
        This method concatenates the genres into a single string, with each genre separated by a comma followed by a space (", ").

The overall effect of this line of code is to take a DataFrame column containing comma-separated genre strings, 
split those strings into individual genres, filter the genres based on a predefined list of genres to keep,
and then reassemble the filtered genres back into a single string. This is useful for tasks such as data cleaning,
where you might want to remove unwanted genres from your dataset while preserving the format of the genre strings.
"""


# Remove rows where all genres were removed (empty values)
df_combined = df_combined[df_combined['Genres'] != '']

print("Updated genre distribution:\n", df_combined['Genres'].str.split(', ').explode().value_counts())

In [58]:
# Backup
df_combined_backup = df_combined.copy()

In [59]:
# Load Backup 
df_combined = df_combined_backup.copy()

## Data Pre-processing

**Advantages of MultiLabelBinarizer over manually doing One Hot Encoding:**	

	•	No manual column splitting: We don’t need to manually split genres or create new columns for each genre. MultiLabelBinarizer automatically handles this for us and directly produces a binary matrix for each label.
	

	•	Handles multi-labels easily: It allows each sample to have multiple 1s in different columns (genres in your case) without needing to manually handle splits or duplicates.

In [None]:

# Split the genres into lists
df_combined['Genres'] = df_combined['Genres'].str.split(', ')

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Transform the genres into one-hot encoding
encoded_genres = mlb.fit_transform(df_combined['Genres'])

# Convert back to a DataFrame
genre_df = pd.DataFrame(encoded_genres, columns=mlb.classes_)

# Concatenate with the original dataset
df_combined = pd.concat([df_combined, genre_df], axis=1)

# Drop the original "Genres" column (optional, as it’s now encoded)
df_combined.drop(columns=['Genres'], inplace=True)

print(df_combined.head())  # Check encoded dataset

In [64]:
df_combined.head(5).T

Unnamed: 0,0,1,2,3,4
Title,Shingeki no Kyojin,Fullmetal Alchemist: Brotherhood,One Punch Man,Kimetsu no Yaiba,Sword Art Online
Subtitle,Attack on Titan,,,Demon Slayer: Kimetsu no Yaiba,
URL,https://myanimelist.net/anime/16498/Shingeki_n...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://myanimelist.net/anime/30276/One_Punch_Man,https://myanimelist.net/anime/38000/Kimetsu_no...,https://myanimelist.net/anime/11757/Sword_Art_...
Synopsis,"Centuries ago, mankind was slaughtered to near...",After a horrific alchemy experiment goes wrong...,The seemingly unimpressive Saitama has a rathe...,"Ever since the death of his father, the burden...",Ever since the release of the innovative Nerve...
Type,Anime,Anime,Anime,Anime,Anime
Action,1.0,1.0,1.0,1.0,1.0
Adventure,0.0,1.0,0.0,0.0,1.0
Boys Love,0.0,0.0,0.0,0.0,0.0
Comedy,0.0,0.0,1.0,0.0,0.0
Drama,1.0,1.0,0.0,0.0,0.0


In [65]:
df_combined.drop(columns=["Subtitle","URL","Type"], inplace=True)
df_combined.head().T

Unnamed: 0,0,1,2,3,4
Title,Shingeki no Kyojin,Fullmetal Alchemist: Brotherhood,One Punch Man,Kimetsu no Yaiba,Sword Art Online
Synopsis,"Centuries ago, mankind was slaughtered to near...",After a horrific alchemy experiment goes wrong...,The seemingly unimpressive Saitama has a rathe...,"Ever since the death of his father, the burden...",Ever since the release of the innovative Nerve...
Action,1.0,1.0,1.0,1.0,1.0
Adventure,0.0,1.0,0.0,0.0,1.0
Boys Love,0.0,0.0,0.0,0.0,0.0
Comedy,0.0,0.0,1.0,0.0,0.0
Drama,1.0,1.0,0.0,0.0,0.0
Ecchi,0.0,0.0,0.0,0.0,0.0
Erotica,0.0,0.0,0.0,0.0,0.0
Fantasy,0.0,1.0,0.0,0.0,1.0


In [69]:
backup = df_combined.copy()

In [73]:
df_combined = backup.copy()

In [72]:
# Remove Duplicate Titles 
#duplicated_titles = df_combined[df_combined["Title"].duplicated(keep=False)]["Title"].unique()
#print(duplicated_titles)



In [None]:
df_combined.to_csv('../data/cleaned/clean_combined_data.csv', index=False)