In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sentence_transformers import SentenceTransformer
from difflib import get_close_matches
from datetime import datetime

In [2]:
anime_list_df = pd.read_csv("/content/anime.csv")
anime_descriptions_df = pd.read_csv("/content/anime_with_synopsis.csv")

In [3]:
anime_list_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [4]:
anime_descriptions_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [5]:
new_anime_list_df = anime_list_df.drop(anime_list_df.iloc[:, 16:36], axis=1)

In [6]:
new_anime_list_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity)
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity)
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,PG - Children


In [7]:
columns_to_drop = ["English name", "Japanese name", "Premiered", "Licensors"]

In [8]:
anime_stats = new_anime_list_df.drop(columns=columns_to_drop, axis=1)

In [9]:
anime_stats.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Aired,Producers,Studios,Source,Duration,Rating
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,"Apr 3, 1998 to Apr 24, 1999",Bandai Visual,Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity)
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sep 1, 2001","Sunrise, Bandai Visual",Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity)
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,"Apr 1, 1998 to Sep 30, 1998",Victor Entertainment,Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"Jul 2, 2002 to Dec 24, 2002","TV Tokyo, Bandai Visual, Dentsu, Victor Entert...",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"Sep 30, 2004 to Sep 29, 2005","TV Tokyo, Dentsu",Toei Animation,Manga,23 min. per ep.,PG - Children


In [10]:
anime_stats.isna().sum()

Unnamed: 0,0
MAL_ID,0
Name,0
Score,0
Genres,0
Type,0
Episodes,0
Aired,0
Producers,0
Studios,0
Source,0


In [11]:
# Fill rows with "Unknown" in "Episodes" volumn to -1
anime_stats["Episodes"] = anime_stats["Episodes"].replace(["Unknown"], -1)

In [12]:
# Drop rows that contain "Unknown"
anime_stats_cleaned = anime_stats[~(anime_stats == 'Unknown').any(axis=1)]

In [13]:
anime_stats_cleaned.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Aired,Producers,Studios,Source,Duration,Rating
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,"Apr 3, 1998 to Apr 24, 1999",Bandai Visual,Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity)
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sep 1, 2001","Sunrise, Bandai Visual",Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity)
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,"Apr 1, 1998 to Sep 30, 1998",Victor Entertainment,Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"Jul 2, 2002 to Dec 24, 2002","TV Tokyo, Bandai Visual, Dentsu, Victor Entert...",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"Sep 30, 2004 to Sep 29, 2005","TV Tokyo, Dentsu",Toei Animation,Manga,23 min. per ep.,PG - Children


In [14]:
# Select only the 'sypnopsis' column from anime_descriptions_df, along with the 'MAL_ID' for merging
anime_descriptions_df_subset = anime_descriptions_df[['MAL_ID', 'sypnopsis']]

# Merge anime_stats_cleaned with the subset of anime_descriptions_df, keeping only 'sypnopsis' from anime_descriptions_df
anime_df = pd.merge(anime_stats_cleaned, anime_descriptions_df_subset, on='MAL_ID', how='left')

In [15]:
anime_df

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Aired,Producers,Studios,Source,Duration,Rating,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,"Apr 3, 1998 to Apr 24, 1999",Bandai Visual,Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),"In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sep 1, 2001","Sunrise, Bandai Visual",Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),"other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,"Apr 1, 1998 to Sep 30, 1998",Victor Entertainment,Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"Jul 2, 2002 to Dec 24, 2002","TV Tokyo, Bandai Visual, Dentsu, Victor Entert...",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"Sep 30, 2004 to Sep 29, 2005","TV Tokyo, Dentsu",Toei Animation,Manga,23 min. per ep.,PG - Children,It is the dark century and the people are suff...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254,44086,Prayer X,6.75,"Music, Dementia",Music,1,"Aug 6, 2018",Sony Music Entertainment,PERIMETRON,Original,3 min.,PG-13 - Teens 13 or older,usic video directed and animated by Ryoji Yama...
5255,44208,Yami Shibai 8,6.09,"Dementia, Horror, Supernatural",TV,-1,"Jan 11, 2021 to ?",TV Tokyo,ILCA,Original,4 min.,PG-13 - Teens 13 or older,Eighth season of Yami Shibai .
5256,44236,Senyoku no Sigrdrifa: Dai-909 Senjutsu Hime-ta...,5.87,"Action, Military",Special,1,"Nov 28, 2020",Aniplex,A-1 Pictures,Original,24 min.,PG-13 - Teens 13 or older,cap of the first eight episodes of Senyoku no ...
5257,46118,Wave!!: Surfing Yappe!! (TV),6.05,"Slice of Life, Sports",TV,12,"Jan 12, 2021 to ?",Delfi Sound,Asahi Production,Other,23 min. per ep.,PG-13 - Teens 13 or older,"asaki Hinaoka, who grew up near the coast of O..."


In [16]:
# Check for duplicated data
display(anime_df.duplicated(subset=['Name']).sum())

np.int64(0)

In [17]:
"""Temporal Processing for Aired"""
# Split on "to" to get start and end dates
anime_df[['start_str', 'end_str']] = anime_df['Aired'].str.split(' to ', expand=True)

# Get start and end dates in datetime format and leaves invalid strings like " ", or "?" into NaT
anime_df['start_date'] = pd.to_datetime(anime_df['start_str'], errors='coerce')
anime_df['end_date'] = pd.to_datetime(anime_df['end_str'], errors='coerce')

# Turns NaT for single time events (movies, music, etc) into same date as start date

# Add a column for ongoing shows
anime_df['ongoing'] = anime_df['Aired'].str.contains('to \?')

# Fill end_date for rows that are not ongoing
anime_df.loc[anime_df['end_date'].isna() & ~anime_df['ongoing'], 'end_date'] = anime_df['start_date']




  anime_df['ongoing'] = anime_df['Aired'].str.contains('to \?')


In [18]:
anime_df

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Aired,Producers,Studios,Source,Duration,Rating,sypnopsis,start_str,end_str,start_date,end_date,ongoing
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,"Apr 3, 1998 to Apr 24, 1999",Bandai Visual,Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),"In the year 2071, humanity has colonized sever...","Apr 3, 1998","Apr 24, 1999",1998-04-03,1999-04-24,False
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sep 1, 2001","Sunrise, Bandai Visual",Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),"other day, another bounty—such is the life of ...","Sep 1, 2001",,2001-09-01,2001-09-01,False
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,"Apr 1, 1998 to Sep 30, 1998",Victor Entertainment,Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,"Vash the Stampede is the man with a $$60,000,0...","Apr 1, 1998","Sep 30, 1998",1998-04-01,1998-09-30,False
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"Jul 2, 2002 to Dec 24, 2002","TV Tokyo, Bandai Visual, Dentsu, Victor Entert...",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,ches are individuals with special powers like ...,"Jul 2, 2002","Dec 24, 2002",2002-07-02,2002-12-24,False
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"Sep 30, 2004 to Sep 29, 2005","TV Tokyo, Dentsu",Toei Animation,Manga,23 min. per ep.,PG - Children,It is the dark century and the people are suff...,"Sep 30, 2004","Sep 29, 2005",2004-09-30,2005-09-29,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254,44086,Prayer X,6.75,"Music, Dementia",Music,1,"Aug 6, 2018",Sony Music Entertainment,PERIMETRON,Original,3 min.,PG-13 - Teens 13 or older,usic video directed and animated by Ryoji Yama...,"Aug 6, 2018",,2018-08-06,2018-08-06,False
5255,44208,Yami Shibai 8,6.09,"Dementia, Horror, Supernatural",TV,-1,"Jan 11, 2021 to ?",TV Tokyo,ILCA,Original,4 min.,PG-13 - Teens 13 or older,Eighth season of Yami Shibai .,"Jan 11, 2021",?,2021-01-11,NaT,True
5256,44236,Senyoku no Sigrdrifa: Dai-909 Senjutsu Hime-ta...,5.87,"Action, Military",Special,1,"Nov 28, 2020",Aniplex,A-1 Pictures,Original,24 min.,PG-13 - Teens 13 or older,cap of the first eight episodes of Senyoku no ...,"Nov 28, 2020",,2020-11-28,2020-11-28,False
5257,46118,Wave!!: Surfing Yappe!! (TV),6.05,"Slice of Life, Sports",TV,12,"Jan 12, 2021 to ?",Delfi Sound,Asahi Production,Other,23 min. per ep.,PG-13 - Teens 13 or older,"asaki Hinaoka, who grew up near the coast of O...","Jan 12, 2021",?,2021-01-12,NaT,True


In [19]:
columns_to_drop = ["start_str", "end_str", "Aired"]

In [20]:
anime_df = anime_df.drop(columns=columns_to_drop, axis=1)

In [21]:
anime_metadata = anime_df

In [22]:
anime_metadata.to_csv("Anime Metadata", index=False)

In [23]:
anime_df

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Producers,Studios,Source,Duration,Rating,sypnopsis,start_date,end_date,ongoing
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,Bandai Visual,Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),"In the year 2071, humanity has colonized sever...",1998-04-03,1999-04-24,False
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sunrise, Bandai Visual",Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),"other day, another bounty—such is the life of ...",2001-09-01,2001-09-01,False
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,Victor Entertainment,Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,"Vash the Stampede is the man with a $$60,000,0...",1998-04-01,1998-09-30,False
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,ches are individuals with special powers like ...,2002-07-02,2002-12-24,False
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"TV Tokyo, Dentsu",Toei Animation,Manga,23 min. per ep.,PG - Children,It is the dark century and the people are suff...,2004-09-30,2005-09-29,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254,44086,Prayer X,6.75,"Music, Dementia",Music,1,Sony Music Entertainment,PERIMETRON,Original,3 min.,PG-13 - Teens 13 or older,usic video directed and animated by Ryoji Yama...,2018-08-06,2018-08-06,False
5255,44208,Yami Shibai 8,6.09,"Dementia, Horror, Supernatural",TV,-1,TV Tokyo,ILCA,Original,4 min.,PG-13 - Teens 13 or older,Eighth season of Yami Shibai .,2021-01-11,NaT,True
5256,44236,Senyoku no Sigrdrifa: Dai-909 Senjutsu Hime-ta...,5.87,"Action, Military",Special,1,Aniplex,A-1 Pictures,Original,24 min.,PG-13 - Teens 13 or older,cap of the first eight episodes of Senyoku no ...,2020-11-28,2020-11-28,False
5257,46118,Wave!!: Surfing Yappe!! (TV),6.05,"Slice of Life, Sports",TV,12,Delfi Sound,Asahi Production,Other,23 min. per ep.,PG-13 - Teens 13 or older,"asaki Hinaoka, who grew up near the coast of O...",2021-01-12,NaT,True


In [24]:
anime_df['end_date'] = anime_df['end_date'].fillna(pd.to_datetime('today'))

In [25]:
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Producers,Studios,Source,Duration,Rating,sypnopsis,start_date,end_date,ongoing
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,Bandai Visual,Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),"In the year 2071, humanity has colonized sever...",1998-04-03,1999-04-24,False
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sunrise, Bandai Visual",Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),"other day, another bounty—such is the life of ...",2001-09-01,2001-09-01,False
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,Victor Entertainment,Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,"Vash the Stampede is the man with a $$60,000,0...",1998-04-01,1998-09-30,False
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,ches are individuals with special powers like ...,2002-07-02,2002-12-24,False
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"TV Tokyo, Dentsu",Toei Animation,Manga,23 min. per ep.,PG - Children,It is the dark century and the people are suff...,2004-09-30,2005-09-29,False


In [26]:
def preprocess(text):
    if pd.isna(text): return ""
    text = re.sub(r'[^\w\s-]', ' ', text)  # keep words, spaces, hyphens
    return re.sub(r'\s+', ' ', text).strip().lower()

In [27]:
# Preprocess text in textual columns
def preprocess(text):
    if pd.isna(text):
      return ""
    text = re.sub(r'[^\w\s-]', ' ', text)  # keep words, spaces, hyphens
    return re.sub(r'\s+', ' ', text).strip().lower()

In [28]:
textual_columns = ["Genres", "Type", "Producers", "Studios", "Source", "Duration", "Rating", "sypnopsis"]

for text_col in textual_columns:
  anime_df[text_col] = anime_df[text_col].apply(preprocess)


In [29]:
anime_df

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Producers,Studios,Source,Duration,Rating,sypnopsis,start_date,end_date,ongoing
0,1,Cowboy Bebop,8.78,action adventure comedy drama sci-fi space,tv,26,bandai visual,sunrise,original,24 min per ep,r - 17 violence profanity,in the year 2071 humanity has colonized severa...,1998-04-03,1999-04-24 00:00:00.000000,False
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,action drama mystery sci-fi space,movie,1,sunrise bandai visual,bones,original,1 hr 55 min,r - 17 violence profanity,other day another bounty such is the life of t...,2001-09-01,2001-09-01 00:00:00.000000,False
2,6,Trigun,8.24,action sci-fi adventure comedy drama shounen,tv,26,victor entertainment,madhouse,manga,24 min per ep,pg-13 - teens 13 or older,vash the stampede is the man with a 60 000 000...,1998-04-01,1998-09-30 00:00:00.000000,False
3,7,Witch Hunter Robin,7.27,action mystery police supernatural drama magic,tv,26,tv tokyo bandai visual dentsu victor entertain...,sunrise,original,25 min per ep,pg-13 - teens 13 or older,ches are individuals with special powers like ...,2002-07-02,2002-12-24 00:00:00.000000,False
4,8,Bouken Ou Beet,6.98,adventure fantasy shounen supernatural,tv,52,tv tokyo dentsu,toei animation,manga,23 min per ep,pg - children,it is the dark century and the people are suff...,2004-09-30,2005-09-29 00:00:00.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254,44086,Prayer X,6.75,music dementia,music,1,sony music entertainment,perimetron,original,3 min,pg-13 - teens 13 or older,usic video directed and animated by ryoji yama...,2018-08-06,2018-08-06 00:00:00.000000,False
5255,44208,Yami Shibai 8,6.09,dementia horror supernatural,tv,-1,tv tokyo,ilca,original,4 min,pg-13 - teens 13 or older,eighth season of yami shibai,2021-01-11,2025-08-28 01:26:17.587144,True
5256,44236,Senyoku no Sigrdrifa: Dai-909 Senjutsu Hime-ta...,5.87,action military,special,1,aniplex,a-1 pictures,original,24 min,pg-13 - teens 13 or older,cap of the first eight episodes of senyoku no ...,2020-11-28,2020-11-28 00:00:00.000000,False
5257,46118,Wave!!: Surfing Yappe!! (TV),6.05,slice of life sports,tv,12,delfi sound,asahi production,other,23 min per ep,pg-13 - teens 13 or older,asaki hinaoka who grew up near the coast of oo...,2021-01-12,2025-08-28 01:26:17.587144,True


In [31]:
"""One Hot Encoding for Type & Source"""
# Identify categorical columns to encode
one_hot_columns = ["Type", "Source"]

# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the selected categorical columns
one_hot_encoded_data = encoder.fit_transform(anime_df[one_hot_columns])

# Create a Dataframe from the encoded data
one_hot_df = pd.DataFrame(one_hot_encoded_data, columns=encoder.get_feature_names_out(one_hot_columns))


one_hot_encoded_df = pd.concat([anime_df.drop(columns=one_hot_columns), one_hot_df], axis=1)

In [32]:
"""TF-IDF for Genres, Rating, Producers, Studios"""
tfidf_columns = ["Genres", "Producers", "Studios", "Rating"]
tfidf_column_vectorizers = {}
tfidf_column_matrices = {}
tfidf_dfs = []

for tfidf_column in tfidf_columns:
    # Ensure the column contains strings & not lists
    text_series = one_hot_encoded_df[tfidf_column].astype(str)

    # Handle any NaN values
    text_series = text_series.fillna('')

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(text_series)
    tfidf_column_vectorizers[tfidf_column] = vectorizer
    tfidf_column_matrices[tfidf_column] = matrix

    # Create DataFrame with proper column names
    feature_names = [f"{tfidf_column}_{feature}" for feature in vectorizer.get_feature_names_out()]
    tfidf_df = pd.DataFrame(matrix.toarray(), columns=feature_names, index=one_hot_encoded_df.index)
    tfidf_dfs.append(tfidf_df)

# Concatenate all TF-IDF features with original DataFrame
tfidf_combined_df = pd.concat([one_hot_encoded_df] + tfidf_dfs, axis=1)

In [33]:
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the synopsis into embeddings
synopsis_embeddings = model.encode(
    tfidf_combined_df["sypnopsis"].tolist(),
    convert_to_numpy=True
)

# create dataframe for embeddings
synposis_embeddings_df = pd.DataFrame(synopsis_embeddings, columns=[f"emb_{i}" for i in range(synopsis_embeddings.shape[1])])

# Combine with TF-IDF dataframe
text_vectorized_df = pd.concat([tfidf_combined_df.reset_index(drop=True), synposis_embeddings_df], axis=1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [34]:
text_vectorized_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Episodes,Producers,Studios,Duration,Rating,sypnopsis,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,1,Cowboy Bebop,8.78,action adventure comedy drama sci-fi space,26,bandai visual,sunrise,24 min per ep,r - 17 violence profanity,in the year 2071 humanity has colonized severa...,...,0.041378,-0.008807,-0.051468,0.12027,-0.005928,-0.009226,0.028198,-0.04346,-0.016529,-0.058174
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,action drama mystery sci-fi space,1,sunrise bandai visual,bones,1 hr 55 min,r - 17 violence profanity,other day another bounty such is the life of t...,...,0.073409,-0.03158,0.000875,0.026888,-0.017871,0.012852,0.039618,-0.15776,-0.005686,0.008987
2,6,Trigun,8.24,action sci-fi adventure comedy drama shounen,26,victor entertainment,madhouse,24 min per ep,pg-13 - teens 13 or older,vash the stampede is the man with a 60 000 000...,...,0.023642,-0.000594,-0.080299,-0.000698,-0.032473,-0.00947,0.029656,-0.087012,0.050944,-0.024974
3,7,Witch Hunter Robin,7.27,action mystery police supernatural drama magic,26,tv tokyo bandai visual dentsu victor entertain...,sunrise,25 min per ep,pg-13 - teens 13 or older,ches are individuals with special powers like ...,...,-0.047775,0.064541,0.046725,-0.029462,-0.025879,-0.040388,0.01804,-0.053823,0.057935,-0.021244
4,8,Bouken Ou Beet,6.98,adventure fantasy shounen supernatural,52,tv tokyo dentsu,toei animation,23 min per ep,pg - children,it is the dark century and the people are suff...,...,-0.033185,-0.002945,-0.058816,0.033064,-0.045447,0.008317,0.054319,-0.091631,0.057574,0.011912


In [35]:
def extract_minutes(duration):
    match = re.search(r'(\d+)\s*min', duration)
    return int(match.group(1)) if match else 0
anime_df['Duration'] = anime_df['Duration'].apply(extract_minutes)

numeric_cols = ['Episodes','Score','Duration']
scaler = StandardScaler()
numeric_matrix = scaler.fit_transform(anime_df[numeric_cols])

In [36]:
numeric_df = pd.DataFrame(numeric_matrix, columns=scaler.get_feature_names_out(numeric_cols))

final_df = pd.concat([anime_df, text_vectorized_df], axis=1)

In [37]:
final_df

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Producers,Studios,Source,Duration,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,1,Cowboy Bebop,8.78,action adventure comedy drama sci-fi space,tv,26,bandai visual,sunrise,original,24,...,0.041378,-0.008807,-0.051468,0.120270,-0.005928,-0.009226,0.028198,-0.043460,-0.016529,-0.058174
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,action drama mystery sci-fi space,movie,1,sunrise bandai visual,bones,original,55,...,0.073409,-0.031580,0.000875,0.026888,-0.017871,0.012852,0.039618,-0.157760,-0.005686,0.008987
2,6,Trigun,8.24,action sci-fi adventure comedy drama shounen,tv,26,victor entertainment,madhouse,manga,24,...,0.023642,-0.000594,-0.080299,-0.000698,-0.032473,-0.009470,0.029656,-0.087012,0.050944,-0.024974
3,7,Witch Hunter Robin,7.27,action mystery police supernatural drama magic,tv,26,tv tokyo bandai visual dentsu victor entertain...,sunrise,original,25,...,-0.047775,0.064541,0.046725,-0.029462,-0.025879,-0.040388,0.018040,-0.053823,0.057935,-0.021244
4,8,Bouken Ou Beet,6.98,adventure fantasy shounen supernatural,tv,52,tv tokyo dentsu,toei animation,manga,23,...,-0.033185,-0.002945,-0.058816,0.033064,-0.045447,0.008317,0.054319,-0.091631,0.057574,0.011912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254,44086,Prayer X,6.75,music dementia,music,1,sony music entertainment,perimetron,original,3,...,0.090700,0.107044,0.018753,0.032493,-0.080742,-0.051087,0.026453,0.029584,0.029692,-0.042751
5255,44208,Yami Shibai 8,6.09,dementia horror supernatural,tv,-1,tv tokyo,ilca,original,4,...,-0.038754,0.044209,-0.009307,-0.062289,-0.030723,0.008113,0.105199,0.028377,-0.034312,0.024800
5256,44236,Senyoku no Sigrdrifa: Dai-909 Senjutsu Hime-ta...,5.87,action military,special,1,aniplex,a-1 pictures,original,24,...,-0.019292,0.031462,-0.017796,0.056616,-0.062369,-0.038634,0.092818,-0.008020,-0.033852,-0.005082
5257,46118,Wave!!: Surfing Yappe!! (TV),6.05,slice of life sports,tv,12,delfi sound,asahi production,other,23,...,0.011168,-0.010487,-0.063942,0.010269,-0.042241,0.042111,0.104969,0.026636,-0.065326,0.028177


In [38]:
anime_metadata

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Producers,Studios,Source,Duration,Rating,sypnopsis,start_date,end_date,ongoing
0,1,Cowboy Bebop,8.78,action adventure comedy drama sci-fi space,tv,26,bandai visual,sunrise,original,24,r - 17 violence profanity,in the year 2071 humanity has colonized severa...,1998-04-03,1999-04-24 00:00:00.000000,False
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,action drama mystery sci-fi space,movie,1,sunrise bandai visual,bones,original,55,r - 17 violence profanity,other day another bounty such is the life of t...,2001-09-01,2001-09-01 00:00:00.000000,False
2,6,Trigun,8.24,action sci-fi adventure comedy drama shounen,tv,26,victor entertainment,madhouse,manga,24,pg-13 - teens 13 or older,vash the stampede is the man with a 60 000 000...,1998-04-01,1998-09-30 00:00:00.000000,False
3,7,Witch Hunter Robin,7.27,action mystery police supernatural drama magic,tv,26,tv tokyo bandai visual dentsu victor entertain...,sunrise,original,25,pg-13 - teens 13 or older,ches are individuals with special powers like ...,2002-07-02,2002-12-24 00:00:00.000000,False
4,8,Bouken Ou Beet,6.98,adventure fantasy shounen supernatural,tv,52,tv tokyo dentsu,toei animation,manga,23,pg - children,it is the dark century and the people are suff...,2004-09-30,2005-09-29 00:00:00.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254,44086,Prayer X,6.75,music dementia,music,1,sony music entertainment,perimetron,original,3,pg-13 - teens 13 or older,usic video directed and animated by ryoji yama...,2018-08-06,2018-08-06 00:00:00.000000,False
5255,44208,Yami Shibai 8,6.09,dementia horror supernatural,tv,-1,tv tokyo,ilca,original,4,pg-13 - teens 13 or older,eighth season of yami shibai,2021-01-11,2025-08-28 01:26:17.587144,True
5256,44236,Senyoku no Sigrdrifa: Dai-909 Senjutsu Hime-ta...,5.87,action military,special,1,aniplex,a-1 pictures,original,24,pg-13 - teens 13 or older,cap of the first eight episodes of senyoku no ...,2020-11-28,2020-11-28 00:00:00.000000,False
5257,46118,Wave!!: Surfing Yappe!! (TV),6.05,slice of life sports,tv,12,delfi sound,asahi production,other,23,pg-13 - teens 13 or older,asaki hinaoka who grew up near the coast of oo...,2021-01-12,2025-08-28 01:26:17.587144,True


In [40]:
def diagnose_duplicate_columns(df, df_name="DataFrame"):
    """
    Diagnose and identify duplicate columns in a DataFrame
    """
    print(f"DUPLICATE COLUMN ANALYSIS FOR {df_name}")
    print("="*60)

    # Check for duplicate column names
    duplicate_cols = df.columns[df.columns.duplicated()].tolist()

    if duplicate_cols:
        print(f"DUPLICATE COLUMNS FOUND: {len(duplicate_cols)}")
        print(f"Duplicate column names: {duplicate_cols}")

        # Show details for each duplicate
        for col_name in set(duplicate_cols):
            indices = [i for i, x in enumerate(df.columns) if x == col_name]
            print(f"\nColumn '{col_name}' appears at positions: {indices}")

            if len(indices) > 1:
                try:
                    # Check if the duplicate columns have identical data
                    first_col = df.iloc[:, indices[0]]
                    identical = True
                    for i in range(1, len(indices)): # Iterate through indices list
                        idx = indices[i] # Get the actual index from the list
                        if idx < df.shape[1]:  # Safety check
                            other_col = df.iloc[:, idx]
                            if not first_col.equals(other_col):
                                identical = False
                                break

                    print(f"  Data identical across duplicates: {identical}")

                    # Show sample data from each duplicate
                    for i, idx in enumerate(indices):
                        if idx < df.shape[1]:  # Safety check
                            sample_data = df.iloc[:3, idx].tolist()
                            print(f"  Position {idx} sample: {sample_data}")
                except Exception as e:
                    print(f"  Error comparing duplicates: {e}")
                    print(f"  Skipping detailed analysis for '{col_name}'")
    else:
        print("NO DUPLICATE COLUMNS FOUND")

    return duplicate_cols

def fix_duplicate_columns(df, strategy='keep_first'):
    """
    Fix duplicate columns in DataFrame

    Parameters:
    - strategy: 'keep_first', 'keep_last', or 'suffix'
    """
    print(f"\n FIXING DUPLICATE COLUMNS (strategy: {strategy})")

    original_shape = df.shape

    if strategy == 'keep_first':
        # Keep only the first occurrence of each column name
        df_fixed = df.loc[:, ~df.columns.duplicated()]

    elif strategy == 'keep_last':
        # Keep only the last occurrence of each column name
        df_fixed = df.loc[:, ~df.columns.duplicated(keep='last')]

    elif strategy == 'suffix':
        # Add suffixes to make column names unique
        df_fixed = df.copy()

        # Get duplicate column names
        duplicate_mask = df_fixed.columns.duplicated(keep=False)
        duplicate_cols = df_fixed.columns[duplicate_mask]

        # Create new column names with suffixes
        new_columns = df_fixed.columns.tolist()
        seen = {}

        for i, col in enumerate(new_columns):
            if col in duplicate_cols:
                if col not in seen:
                    seen[col] = 0
                else:
                    seen[col] += 1
                    new_columns[i] = f"{col}_{seen[col]}"

        df_fixed.columns = new_columns

    new_shape = df_fixed.shape
    print(f"Shape change: {original_shape} → {new_shape}")
    print(f"Columns removed: {original_shape[1] - new_shape[1]}")

    return df_fixed

def check_specific_date_columns(df, df_name="DataFrame"):
    """
    Specifically check start_date and end_date columns
    """
    print(f"\n DATE COLUMN ANALYSIS FOR {df_name}")
    print("-"*40)

    for col_name in ['start_date', 'end_date']:
        if col_name in df.columns:
            # Check if selecting this column returns DataFrame or Series
            selected = df[col_name]

            print(f"\n'{col_name}' column:")
            print(f"  Type when selected: {type(selected)}")

            if isinstance(selected, pd.DataFrame):
                print(f"  Returns DataFrame with shape: {selected.shape}")
                print(f"  This means there are {selected.shape[1]} columns named '{col_name}'")

                # Show sample data from each duplicate
                for i in range(selected.shape[1]):
                    sample = selected.iloc[:3, i].tolist()
                    print(f"    Column {i} sample: {sample}")

            else:  # Series
                print(f"  Returns Series with shape: {selected.shape}")
                print(f"  Data type: {selected.dtype}")
                sample = selected.head(3).tolist()
                print(f"  Sample data: {sample}")

# Usage functions
def full_diagnosis_and_fix(df, df_name="DataFrame"):
    """
    Complete diagnosis and fix for duplicate columns
    """
    print(f"COMPLETE DUPLICATE COLUMN DIAGNOSIS FOR {df_name}")
    print("="*70)

    # General duplicate check
    duplicate_cols = diagnose_duplicate_columns(df, df_name)

    # Specific date column check
    check_specific_date_columns(df, df_name)

    # Recommend fix
    if duplicate_cols:
        print(f"\n RECOMMENDED FIXES:")
        print(f"1. Keep first occurrence: df_fixed = df.loc[:, ~df.columns.duplicated()]")
        print(f"2. Keep last occurrence: df_fixed = df.loc[:, ~df.columns.duplicated(keep='last')]")
        print(f"3. Add suffixes: Use fix_duplicate_columns(df, 'suffix')")

        print(f"\n WARNING: After fixing, recheck feature columns count!")
        print(f"   Current total columns: {df.shape[1]}")
        print(f"   Unique column names: {len(df.columns.unique())}")

    return duplicate_cols

In [41]:
duplicate_issues = full_diagnosis_and_fix(final_df, "final_df")

if duplicate_issues:
  final_df_fixed = fix_duplicate_columns(final_df, 'keep_first')

print("\n FIX VERIFICATION:")
check_specific_date_columns(final_df_fixed, "final_df_fixed")

COMPLETE DUPLICATE COLUMN DIAGNOSIS FOR final_df
DUPLICATE COLUMN ANALYSIS FOR final_df
DUPLICATE COLUMNS FOUND: 13
Duplicate column names: ['MAL_ID', 'Name', 'Score', 'Genres', 'Episodes', 'Producers', 'Studios', 'Duration', 'Rating', 'sypnopsis', 'start_date', 'end_date', 'ongoing']

Column 'start_date' appears at positions: [12, 25]
  Data identical across duplicates: True
  Position 12 sample: [Timestamp('1998-04-03 00:00:00'), Timestamp('2001-09-01 00:00:00'), Timestamp('1998-04-01 00:00:00')]
  Position 25 sample: [Timestamp('1998-04-03 00:00:00'), Timestamp('2001-09-01 00:00:00'), Timestamp('1998-04-01 00:00:00')]

Column 'MAL_ID' appears at positions: [0, 15]
  Data identical across duplicates: True
  Position 0 sample: [1, 5, 6]
  Position 15 sample: [1, 5, 6]

Column 'Score' appears at positions: [2, 17]
  Data identical across duplicates: True
  Position 2 sample: ['8.78', '8.39', '8.24']
  Position 17 sample: ['8.78', '8.39', '8.24']

Column 'Producers' appears at positions

In [42]:
final_df_fixed

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Producers,Studios,Source,Duration,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,1,Cowboy Bebop,8.78,action adventure comedy drama sci-fi space,tv,26,bandai visual,sunrise,original,24,...,0.041378,-0.008807,-0.051468,0.120270,-0.005928,-0.009226,0.028198,-0.043460,-0.016529,-0.058174
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,action drama mystery sci-fi space,movie,1,sunrise bandai visual,bones,original,55,...,0.073409,-0.031580,0.000875,0.026888,-0.017871,0.012852,0.039618,-0.157760,-0.005686,0.008987
2,6,Trigun,8.24,action sci-fi adventure comedy drama shounen,tv,26,victor entertainment,madhouse,manga,24,...,0.023642,-0.000594,-0.080299,-0.000698,-0.032473,-0.009470,0.029656,-0.087012,0.050944,-0.024974
3,7,Witch Hunter Robin,7.27,action mystery police supernatural drama magic,tv,26,tv tokyo bandai visual dentsu victor entertain...,sunrise,original,25,...,-0.047775,0.064541,0.046725,-0.029462,-0.025879,-0.040388,0.018040,-0.053823,0.057935,-0.021244
4,8,Bouken Ou Beet,6.98,adventure fantasy shounen supernatural,tv,52,tv tokyo dentsu,toei animation,manga,23,...,-0.033185,-0.002945,-0.058816,0.033064,-0.045447,0.008317,0.054319,-0.091631,0.057574,0.011912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254,44086,Prayer X,6.75,music dementia,music,1,sony music entertainment,perimetron,original,3,...,0.090700,0.107044,0.018753,0.032493,-0.080742,-0.051087,0.026453,0.029584,0.029692,-0.042751
5255,44208,Yami Shibai 8,6.09,dementia horror supernatural,tv,-1,tv tokyo,ilca,original,4,...,-0.038754,0.044209,-0.009307,-0.062289,-0.030723,0.008113,0.105199,0.028377,-0.034312,0.024800
5256,44236,Senyoku no Sigrdrifa: Dai-909 Senjutsu Hime-ta...,5.87,action military,special,1,aniplex,a-1 pictures,original,24,...,-0.019292,0.031462,-0.017796,0.056616,-0.062369,-0.038634,0.092818,-0.008020,-0.033852,-0.005082
5257,46118,Wave!!: Surfing Yappe!! (TV),6.05,slice of life sports,tv,12,delfi sound,asahi production,other,23,...,0.011168,-0.010487,-0.063942,0.010269,-0.042241,0.042111,0.104969,0.026636,-0.065326,0.028177


In [43]:
def check_dataframe_compatibility(anime_df, final_df):
    """
    Comprehensive compatibility check for AniRecs
    """
    print(" DATAFRAME COMPATIBILITY CHECK")
    print("="*60)

    # Basic df info
    print("\n1. BASIC DATAFRAME INFORMATION:")
    print(f"anime_df shape: {anime_df.shape}")
    print(f"final_df shape: {final_df.shape}")
    print(f"Row count match: {anime_df.shape[0] == final_df.shape[0]}")

    # Check required columns
    print("\n2. REQUIRED COLUMNS CHECK:")

    # Required columns for anime_df
    anime_required_columns = [
        'MAL_ID', 'Name', 'Score', 'Genres', 'Type', 'Episodes',
        'Producers', 'Studios', 'start_date', 'end_date'
    ]

    # Required columns for final_df
    final_required_columns = [
        'MAL_ID', 'Name', 'start_date', 'end_date', 'ongoing',
        'sypnopsis', 'Genres', 'Producers', 'Studios', 'Rating', 'Type', 'Source'
    ]

    print("anime_df missing columns:")
    anime_missing = [col for col in anime_required_columns if col not in anime_df.columns]
    print(f"  {anime_missing if anime_missing else 'None - All required columns present ✓'}")

    print("final_df missing columns:")
    final_missing = [col for col in final_required_columns if col not in final_df.columns]
    print(f"  {final_missing if final_missing else 'None - All required columns present ✓'}")

    # 3. Check data types
    print("\n3. CRITICAL DATA TYPES CHECK:")

    # Check date columns - handle both DataFrame and Series
    date_columns = ['start_date', 'end_date']
    for col in date_columns:
        if col in anime_df.columns:
            try:
                dtype_info = anime_df[col].dtype
                print(f"anime_df['{col}'] dtype: {dtype_info}")
                if not pd.api.types.is_datetime64_any_dtype(anime_df[col]):
                    print(f"  ⚠️  WARNING: {col} should be datetime, currently {dtype_info}")
                    print(f"  Fix with: anime_df['{col}'] = pd.to_datetime(anime_df['{col}'], errors='coerce')")
            except AttributeError as e:
                print(f"anime_df['{col}'] - Error checking dtype: {e}")

        if col in final_df.columns:
            try:
                dtype_info = final_df[col].dtype
                print(f"final_df['{col}'] dtype: {dtype_info}")
                if not pd.api.types.is_datetime64_any_dtype(final_df[col]):
                    print(f"  WARNING: {col} should be datetime, currently {dtype_info}")
                    print(f"  Fix with: final_df['{col}'] = pd.to_datetime(final_df['{col}'], errors='coerce')")
            except AttributeError as e:
                print(f"final_df['{col}'] - Error checking dtype: {e}")

    # Check Score column
    if 'Score' in anime_df.columns:
        print(f"anime_df['Score'] dtype: {anime_df['Score'].dtype}")
        if anime_df['Score'].dtype == 'object':
            print("  ℹ️  Score is object type - system handles this with pd.to_numeric()")
            unique_scores = anime_df['Score'].unique()
            non_numeric = [s for s in unique_scores if pd.isna(pd.to_numeric(s, errors='coerce'))]
            if non_numeric:
                print(f"  Non-numeric values found: {non_numeric[:10]}...")  # Show first 10

    # 4. Check alignment of index
    print("\n4. INDEX ALIGNMENT CHECK:")
    if not anime_df.index.equals(final_df.index):
        print("  WARNING: DataFrame indices don't match!")
        print("  This could cause issues with similarity matrix alignment")
        print("  Consider: anime_df.reset_index(drop=True, inplace=True)")
        print("           final_df.reset_index(drop=True, inplace=True)")
    else:
        print("  Indices are aligned")

    # Check feature columns
    print("\n5. FEATURE COLUMNS FOR SIMILARITY MATRIX:")
    exclude_columns = ['MAL_ID', 'Name', 'start_date', 'end_date', 'ongoing',
                      'sypnopsis', 'Genres', 'Producers', 'Studios', 'Rating', 'Type', 'Source']

    feature_columns = [col for col in final_df.columns if col not in exclude_columns]
    print(f"Feature columns count: {len(feature_columns)}")
    print(f"First 10 feature columns: {feature_columns[:10]}")

    if len(feature_columns) == 0:
        print("  CRITICAL: No feature columns found for similarity computation!")
        print("  The system needs numerical features to compute similarity")

    # Check for non-numeric feature columns
    non_numeric_features = []
    for col in feature_columns:
        if not pd.api.types.is_numeric_dtype(final_df[col]):
            non_numeric_features.append((col, final_df[col].dtype))

    if non_numeric_features:
        print("  WARNING: Non-numeric feature columns found:")
        for col, dtype in non_numeric_features[:5]:  # Show first 5
            print(f"    {col}: {dtype}")
        print("  These should be converted to numeric or excluded")

    # Check for null values
    print("\n6. NULL VALUES IN CRITICAL COLUMNS:")

    critical_cols = ['Name', 'MAL_ID']
    for col in critical_cols:
        if col in anime_df.columns:
            null_count = anime_df[col].isnull().sum()
            print(f"anime_df['{col}'] nulls: {null_count}")
            if null_count > 0:
                print(f"  WARNING: {null_count} null values in critical column '{col}'")

    # Check nulls in feature columns
    if feature_columns:
        feature_nulls = final_df[feature_columns].isnull().sum().sum()
        print(f"Total nulls in feature columns: {feature_nulls}")
        print("  Note: System fills these with 0 in _fill_nan_in_features()")

    # Check string/categorical columns
    print("\n7. STRING COLUMNS PROCESSING CHECK:")
    string_cols = ['Genres', 'Producers', 'Studios']
    for col in string_cols:
        if col in anime_df.columns:
            null_count = anime_df[col].isnull().sum()
            total_count = len(anime_df)
            print(f"anime_df['{col}'] - Nulls: {null_count}/{total_count} ({null_count/total_count*100:.1f}%)")

            # Check data types in the column
            sample_values = anime_df[col].dropna().head(3).tolist()
            print(f"  Sample values: {sample_values}")

    # Summary of above + recs on next steps
    print("\n" + "="*60)
    print("8. SUMMARY AND RECOMMENDATIONS:")

    issues_found = []
    if anime_missing or final_missing:
        issues_found.append("Missing required columns")
    if not anime_df.index.equals(final_df.index):
        issues_found.append("Index alignment issues")
    if len(feature_columns) == 0:
        issues_found.append("No feature columns for similarity")
    if non_numeric_features:
        issues_found.append("Non-numeric feature columns")

    if issues_found:
        print("ISSUES FOUND:")
        for issue in issues_found:
            print(f"  - {issue}")
    else:
        print("✓ No critical issues found - DataFrames should work with the system")

    return {
        'anime_missing_cols': anime_missing,
        'final_missing_cols': final_missing,
        'feature_columns': feature_columns,
        'non_numeric_features': non_numeric_features,
        'indices_aligned': anime_df.index.equals(final_df.index),
        'issues_found': issues_found
    }

In [44]:
check_dataframe_compatibility(anime_df, final_df_fixed)

 DATAFRAME COMPATIBILITY CHECK

1. BASIC DATAFRAME INFORMATION:
anime_df shape: (5259, 15)
final_df shape: (5259, 2222)
Row count match: True

2. REQUIRED COLUMNS CHECK:
anime_df missing columns:
  None - All required columns present ✓
final_df missing columns:
  None - All required columns present ✓

3. CRITICAL DATA TYPES CHECK:
anime_df['start_date'] dtype: datetime64[ns]
final_df['start_date'] dtype: datetime64[ns]
anime_df['end_date'] dtype: datetime64[ns]
final_df['end_date'] dtype: datetime64[ns]
anime_df['Score'] dtype: object
  ℹ️  Score is object type - system handles this with pd.to_numeric()

4. INDEX ALIGNMENT CHECK:
  Indices are aligned

5. FEATURE COLUMNS FOR SIMILARITY MATRIX:
Feature columns count: 2210
First 10 feature columns: ['Score', 'Episodes', 'Duration', 'Type_movie', 'Type_music', 'Type_ona', 'Type_ova', 'Type_special', 'Type_tv', 'Source_4-koma manga']
    Score: object
    Episodes: object
  These should be converted to numeric or excluded

6. NULL VALUES I

{'anime_missing_cols': [],
 'final_missing_cols': [],
 'feature_columns': ['Score',
  'Episodes',
  'Duration',
  'Type_movie',
  'Type_music',
  'Type_ona',
  'Type_ova',
  'Type_special',
  'Type_tv',
  'Source_4-koma manga',
  'Source_book',
  'Source_card game',
  'Source_digital manga',
  'Source_game',
  'Source_light novel',
  'Source_manga',
  'Source_music',
  'Source_novel',
  'Source_original',
  'Source_other',
  'Source_picture book',
  'Source_radio',
  'Source_visual novel',
  'Source_web manga',
  'Genres_action',
  'Genres_adventure',
  'Genres_ai',
  'Genres_arts',
  'Genres_cars',
  'Genres_comedy',
  'Genres_dementia',
  'Genres_demons',
  'Genres_drama',
  'Genres_ecchi',
  'Genres_fantasy',
  'Genres_fi',
  'Genres_game',
  'Genres_harem',
  'Genres_hentai',
  'Genres_historical',
  'Genres_horror',
  'Genres_josei',
  'Genres_kids',
  'Genres_life',
  'Genres_magic',
  'Genres_martial',
  'Genres_mecha',
  'Genres_military',
  'Genres_music',
  'Genres_mystery',


In [45]:
# Convert non-numeric feature columns to numeric
final_df_fixed['Score'] = pd.to_numeric(final_df_fixed['Score'], errors='coerce')
final_df_fixed['Episodes'] = pd.to_numeric(final_df_fixed['Episodes'], errors='coerce')

# Fill any resulting NaN values with 0
final_df_fixed['Score'] = final_df_fixed['Score'].fillna(0)
final_df_fixed['Episodes'] = final_df_fixed['Episodes'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_fixed['Score'] = pd.to_numeric(final_df_fixed['Score'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_fixed['Episodes'] = pd.to_numeric(final_df_fixed['Episodes'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_fixed['Score'] = final_df_

In [46]:
final_df_fixed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5259 entries, 0 to 5258
Columns: 2222 entries, MAL_ID to emb_383
dtypes: bool(1), datetime64[ns](2), float32(384), float64(1824), int64(3), object(8)
memory usage: 81.4+ MB


In [47]:
class AniRecs:
    def __init__(self, anime_df, final_df):
        self.anime_df = anime_df
        self.final_df = final_df
        self.feature_columns = None
        self.similarity_matrix = None
        self._prepare_features()
        self._fill_nan_in_features() # Added a step to fill NaN values
        self._compute_similarity_matrix()
        self._prepare_lookup_sets()

    def _prepare_features(self):
        """Prepare feature columns for similarity computation"""
        exclude_columns = ['MAL_ID', 'Name', 'start_date', 'end_date', 'ongoing',
                          'sypnopsis', 'Genres', 'Producers', 'Studios', 'Rating', 'Type', 'Source']

        self.feature_columns = [col for col in self.final_df.columns if col not in exclude_columns]

    def _fill_nan_in_features(self):
        """Fill NaN values in feature columns with 0"""
        self.final_df[self.feature_columns] = self.final_df[self.feature_columns].fillna(0)


    def _compute_similarity_matrix(self):
        """Compute cosine similarity matrix for all anime"""
        feature_matrix = self.final_df[self.feature_columns].values
        # feature_matrix = np.nan_to_num(feature_matrix) # This is no longer needed as NaNs are filled
        self.similarity_matrix = cosine_similarity(feature_matrix)

    def _prepare_lookup_sets(self):
        """Prepare lookup sets for genres, producers, studios"""
        self.genres_set = set()
        self.producers_set = set()
        self.studios_set = set()

        for _, row in self.anime_df.iterrows():
            if pd.notna(row['Genres']):
                self.genres_set.update([g.strip().lower() for g in str(row['Genres']).split()])
            if pd.notna(row['Producers']):
                self.producers_set.update([p.strip().lower() for p in str(row['Producers']).split()])
            if pd.notna(row['Studios']):
                self.studios_set.update([s.strip().lower() for s in str(row['Studios']).split()])

    def extract_year_decade(self, query):
        """Extract year or decade information from query"""
        query_lower = query.lower()

        # Patterns for decades with descriptors
        decade_patterns = [
            (r'\b(?:early|beginning of|start of)\s+(\d{4})s?\b', 'early'),
            (r'\b(?:mid|middle of)\s+(\d{4})s?\b', 'mid'),
            (r'\b(?:late|end of)\s+(\d{4})s?\b', 'late'),
            (r'\b(\d{4})s?\b', 'full'),  # Just the decade
            (r'\b(\d{4})\b', 'year'),     # Specific year
        ]

        # Special decade mappings
        decade_words = {
            'eighties': '1980', 'nineties': '1990', 'ninties': '1990',
            '80s': '1980', '90s': '1990', '2000s': '2000', '2010s': '2010'
        }

        # Check for word-based decades first
        for word, decade in decade_words.items():
            if word in query_lower:
                return self._get_decade_range(decade, 'full')

        # Check for numeric patterns
        for pattern, period_type in decade_patterns:
            match = re.search(pattern, query_lower)
            if match:
                year = match.group(1)
                if period_type == 'year':
                    return int(year), int(year)
                else:
                    return self._get_decade_range(year, period_type)

        return None, None

    def _get_decade_range(self, decade_start, period_type):
        """Get year range for decade periods"""
        start_year = int(decade_start)

        if period_type == 'early':
            return start_year, start_year + 3
        elif period_type == 'mid':
            return start_year + 3, start_year + 6
        elif period_type == 'late':
            return start_year + 7, start_year + 9
        else:  # full decade
            return start_year, start_year + 9

    def extract_genres(self, query):
        """Extract genres from query using fuzzy matching"""
        query_lower = query.lower()
        found_genres = []

        # Direct genre matching
        for genre in self.genres_set:
            if genre in query_lower:
                found_genres.append(genre)

        genre_synonyms = {
            'action': ['action'],
            'adventure': ['adventure'],
            'ai': ['ai'],
            'arts': ['arts'],
            'cars': ['cars'],
            'comedy': ['comedy'],
            'dementia': ['dementia'],
            'demons': ['demons'],
            'drama': ['drama'],
            'ecchi': ['ecchi'],
            'fantasy': ['fantasy', 'magic'],
            'game': ['game'],
            'harem': ['harem'],
            'hentai': ['hentai'],
            'historical': ['historical'],
            'horror': ['horror'],
            'josei': ['josei'],
            'kids': ['kids'],
            'slice of life': ['slice', 'life'],
            'martial arts': ['martial', 'arts'],
            'mecha': ['mecha'],
            'military': ['military'],
            'music': ['music'],
            'mystery': ['mystery'],
            'parody': ['parody'],
            'police': ['police'],
            'power': ['power'],
            'psychological': ['psychological'],
            'romance': ['romance'],
            'samurai': ['samurai'],
            'school': ['school'],
            'sci-fi': ['sci-fi'],
            'seinen': ['seinen'],
            'shoujo': ['shoujo'],
            'shounen': ['shounen'],
            'space': ['space'],
            'sports': ['sports'],
            'super power': ['super', 'power'],
            'supernatural': ['supernatural'],
            'thriller': ['thriller'],
            'vampire': ['vampire'],
            'yaoi': ['yaoi'],
            'yuri': ['yuri']
        }


        for genre, synonyms in genre_synonyms.items():
            for synonym in synonyms:
                if synonym in query_lower and genre not in found_genres:
                    found_genres.append(genre)
                    break

        return found_genres

    def extract_anime_name(self, query):
        """Extract anime name from similarity queries"""
        similarity_patterns = [
            r'similar to (.+?)(?:\?|$|,)',
            r'like (.+?)(?:\?|$|,)',
            r'recommend.*?(?:similar to|like) (.+?)(?:\?|$|,)',
            r'find.*?(?:similar to|like) (.+?)(?:\?|$|,)',
            r'show.*?(?:similar to|like) (.+?)(?:\?|$|,)',
        ]

        query_lower = query.lower()
        for pattern in similarity_patterns:
            match = re.search(pattern, query_lower)
            if match:
                return match.group(1).strip()

        return None

    def filter_by_time_period(self, df, start_year, end_year):
        """Filter anime by time period"""
        if start_year is None or end_year is None:
            return df

        # Filter based on start_date year
        filtered_df = df[
            (df['start_date'].dt.year >= start_year) &
            (df['start_date'].dt.year <= end_year)
        ].copy()

        return filtered_df

    def filter_by_genres(self, df, genres):
        """Filter anime by genres"""
        if not genres:
            return df

        mask = pd.Series([False] * len(df))

        for genre in genres:
            genre_mask = df['Genres'].str.contains(genre, case=False, na=False)
            mask = mask | genre_mask

        return df[mask].copy()

    def find_anime_by_name(self, query_name, threshold=0.8):
        """Find anime by name using fuzzy matching"""
        anime_names = self.anime_df['Name'].tolist()

        # Try exact match first (case insensitive)
        exact_matches = self.anime_df[self.anime_df['Name'].str.lower() == query_name.lower()]
        if not exact_matches.empty:
            return exact_matches.iloc[0]

        # Use fuzzy matching
        close_matches = get_close_matches(query_name, anime_names, n=5, cutoff=0.6)

        if close_matches:
            print(f"Found similar anime names: {close_matches}")
            best_match = close_matches[0]
            return self.anime_df[self.anime_df['Name'] == best_match].iloc[0]

        return None

    def get_similar_anime(self, anime_name, top_k=10):
        """Get similar anime recommendations based on anime name"""
        target_anime = self.find_anime_by_name(anime_name)

        if target_anime is None:
            return f"Sorry, I couldn't find an anime named '{anime_name}'. Please check the spelling or try a different name."

        target_idx = target_anime.name
        similarity_scores = self.similarity_matrix[target_idx]
        similar_indices = np.argsort(similarity_scores)[::-1][1:top_k+1]

        recommendations = []
        for idx in similar_indices:
            anime_info = self.anime_df.iloc[idx]
            recommendations.append({
                'name': anime_info['Name'],
                'score': anime_info['Score'],
                'genres': anime_info['Genres'],
                'type': anime_info['Type'],
                'episodes': anime_info['Episodes'],
                'similarity_score': similarity_scores[idx],
                'mal_id': anime_info['MAL_ID'],
                'start_year': anime_info['start_date'].year if pd.notna(anime_info['start_date']) else 'Unknown'
            })

        return recommendations

    def get_filtered_recommendations(self, start_year=None, end_year=None, genres=None, top_k=10, min_score=6.0):
        """Get recommendations based on filters"""
        # Start with full dataset
        filtered_df = self.anime_df.copy()

        # Apply time filter
        if start_year is not None and end_year is not None:
            filtered_df = self.filter_by_time_period(filtered_df, start_year, end_year)

        # Apply genre filter
        if genres:
            filtered_df = self.filter_by_genres(filtered_df, genres)

        # Filter by minimum score and remove entries with unknown scores
        filtered_df = filtered_df[
            (pd.to_numeric(filtered_df['Score'], errors='coerce') >= min_score) &
            (filtered_df['Score'] != 'Unknown')
        ]

        if filtered_df.empty:
            return "No anime found matching your criteria. Try adjusting your filters."

        # Sort by score and return top results
        filtered_df = filtered_df.sort_values('Score', ascending=False).head(top_k)

        recommendations = []
        for _, anime_info in filtered_df.iterrows():
            recommendations.append({
                'name': anime_info['Name'],
                'score': anime_info['Score'],
                'genres': anime_info['Genres'],
                'type': anime_info['Type'],
                'episodes': anime_info['Episodes'],
                'mal_id': anime_info['MAL_ID'],
                'start_year': anime_info['start_date'].year if pd.notna(anime_info['start_date']) else 'Unknown'
            })

        return recommendations

    def process_user_query(self, user_input, top_k=10):
        """Process various types of natural language queries"""
        user_input_lower = user_input.lower().strip()

        # Check if it's a similarity-based query
        anime_name = self.extract_anime_name(user_input)
        if anime_name:
            return self.get_similar_anime(anime_name, top_k)

        # Extract time period
        start_year, end_year = self.extract_year_decade(user_input)

        # Extract genres
        genres = self.extract_genres(user_input)

        # If we have filters, use filtered recommendations
        if start_year or genres:
            return self.get_filtered_recommendations(
                start_year=start_year,
                end_year=end_year,
                genres=genres,
                top_k=top_k
            )

        # If no specific filters, try to find the most relevant recommendation type
        if any(word in user_input_lower for word in ['good', 'recommend', 'suggest', 'show']):
            # General recommendation - return top rated anime
            return self.get_filtered_recommendations(top_k=top_k, min_score=7.0)

        return "I couldn't understand your request. Please try queries like:\n" + \
               "- 'recommend anime similar to One Piece'\n" + \
               "- 'what are some good animes from the 90s'\n" + \
               "- 'recommend me some action animes from the early 2000s'\n" + \
               "- 'please recommend me some sci-fi animes'"

    def display_recommendations(self, recommendations, query_info=None):
        """Display recommendations in a nice format"""
        if isinstance(recommendations, str):
            print(recommendations)
            return

        if query_info:
            print(f"\n🎌 {query_info}:\n")
        else:
            print("\n🎌 Recommended Anime:\n")

        for i, anime in enumerate(recommendations, 1):
            print(f"{i}. {anime['name']}")
            print(f"   📊 Score: {anime['score']} | 📺 Type: {anime['type']} | 📋 Episodes: {anime['episodes']}")
            print(f"   🎭 Genres: {anime['genres']}")

            if 'similarity_score' in anime:
                print(f"   🔗 Similarity: {anime['similarity_score']:.3f}")

            if 'start_year' in anime:
                print(f"   📅 Year: {anime['start_year']}")

            print(f"   🆔 MAL ID: {anime['mal_id']}")
            print("-" * 60)

def create_recommendation_system(anime_df, final_df):
    """Create and return the recommendation system"""
    return AniRecs(anime_df, final_df)

def demonstrate_system(anime_df, final_df):
    """Demonstrate the  recommendation system"""
    recommender = create_recommendation_system(anime_df, final_df)

    # Test queries
    test_queries = [
        "what are some good animes from the 90s",
        "recommend me some action animes from the early 2000s",
        "recommend me some slice of life animes from the late 2010s",
        "please recommend me an anime similar to one piece",
        "please recommend me some adventure animes",
        "please recommend me some sci-fi animes"
    ]

    for query in test_queries:
        print(f"\n{'='*80}")
        print(f"User Query: '{query}'")
        print('='*80)

        recommendations = recommender.process_user_query(query, top_k=5)
        recommender.display_recommendations(recommendations)

def interactive_system(anime_df, final_df):
    """Interactive recommendation system"""
    recommender = create_recommendation_system(anime_df, final_df)

    print("🎌 Welcome to AniRecs!")
    print("\nSupported query types:")
    print("📅 Time-based: 'what are some good animes from the 90s'")
    print("🎭 Genre-based: 'recommend me some action animes'")
    print("📅🎭 Combined: 'recommend me some sci-fi animes from the early 2000s'")
    print("🔗 Similarity: 'recommend anime similar to One Piece'")
    print("\nType 'quit' to exit\n")

    while True:
        user_input = input("What would you like? ").strip()

        if user_input.lower() in ['quit', 'exit', 'q']:
            print("Thanks for using AniRecs!")
            break

        if not user_input:
            print("Please enter a query.")
            continue

        recommendations = recommender.process_user_query(user_input)
        recommender.display_recommendations(recommendations)
        print()

In [48]:
# Initialize the recommendation system
print("🔧 Initializing AniRecs...")
recommender = create_recommendation_system(anime_df, final_df_fixed)
print("✅ System initialized successfully!\n")

# Test different query types
queries = [
    "what are some good animes from the 90s",
    "recommend me some action animes from the early 2000s",
    "recommend me some slice of life animes from the late 2010s",
    "please recommend me an anime similar to one piece",
    "please recommend me some adventure animes",
    "please recommend me some sci-fi animes"
]

print("🧪 Testing different query types:\n")

for i, query in enumerate(queries, 1):
    print(f"\n{'='*80}")
    print(f"TEST {i}: '{query}'")
    print('='*80)

    try:
        recommendations = recommender.process_user_query(query, top_k=5)
        recommender.display_recommendations(recommendations)
    except Exception as e:
        print(f"❌ Error processing query: {e}")

print("\n🎉 All tests completed!")

🔧 Initializing AniRecs...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.final_df[self.feature_columns] = self.final_df[self.feature_columns].fillna(0)


✅ System initialized successfully!

🧪 Testing different query types:


TEST 1: 'what are some good animes from the 90s'

🎌 Recommended Anime:

1. Cowboy Bebop
   📊 Score: 8.78 | 📺 Type: tv | 📋 Episodes: 26
   🎭 Genres: action adventure comedy drama sci-fi space
   📅 Year: 1998
   🆔 MAL ID: 1
------------------------------------------------------------
2. Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen
   📊 Score: 8.73 | 📺 Type: ova | 📋 Episodes: 4
   🎭 Genres: action historical drama romance martial arts samurai shounen
   📅 Year: 1999
   🆔 MAL ID: 44
------------------------------------------------------------
3. Mononoke Hime
   📊 Score: 8.72 | 📺 Type: movie | 📋 Episodes: 1
   🎭 Genres: action adventure fantasy
   📅 Year: 1997
   🆔 MAL ID: 164
------------------------------------------------------------
4. Great Teacher Onizuka
   📊 Score: 8.7 | 📺 Type: tv | 📋 Episodes: 43
   🎭 Genres: slice of life comedy drama school shounen
   📅 Year: 1999
   🆔 MAL ID: 245
------------------

  return df[mask].copy()
  return df[mask].copy()
