In [37]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity between vectors
from scipy.spatial.distance import pdist, squareform  # For pairwise distance computations and converting to a square matrix
import pickle
import math

# pd.set_option('display.max_columns', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)


In [2]:
anime_df = pd.read_csv('AnimeNEW.csv')

Exploratory Data Analysis

In [48]:
pd.set_option('display.max_columns', None)
anime_df.head()

Unnamed: 0,ID,Title,Synonyms,Japanese,English,Synopsis,Type,Episodes,Status,Start_Aired,End_Aired,Premiered,Broadcast,Producers,Licensors,Studios,Source,Genres,Themes,Demographics,Duration_Minutes,Rating,Score,Scored_Users,Ranked,Popularity,Members,Favorites
0,16498,Shingeki no Kyojin,"AoT, SnK",進撃の巨人,Attack on Titan,"Centuries ago, mankind was slaughtered to near...",TV,25.0,Finished Airing,"Apr 7, 2013","Sep 29, 2013",Spring 2013,Sundays at 0158 (JST),"Production I.G, Dentsu, Mainichi Broadcasting ...",Funimation,Wit Studio,Manga,"Action, Drama","Gore, Military, Survival",Shounen,24.0,R - 17+ (violence & profanity),8.531,519803.0,1002.0,1,3524109,155695
1,1535,Death Note,DN,デスノート,Death Note,"Brutal murders, petty thefts, and senseless vi...",TV,37.0,Finished Airing,"Oct 4, 2006","Jun 27, 2007",Fall 2006,Wednesdays at 0056 (JST),"VAP, Konami, Ashi Productions, Nippon Televisi...",VIZ Media,Madhouse,Manga,"Supernatural, Suspense",Psychological,Shounen,23.0,R - 17+ (violence & profanity),8.621,485487.0,732.0,2,3504535,159701
2,5114,Fullmetal Alchemist: Brotherhood,"Hagane no Renkinjutsushi Fullmetal Alchemist, ...",鋼の錬金術師 FULLMETAL ALCHEMIST,Fullmetal Alchemist Brotherhood,After a horrific alchemy experiment goes wrong...,TV,64.0,Finished Airing,"Apr 5, 2009","Jul 4, 2010",Spring 2009,Sundays at 1700 (JST),"Aniplex, Square Enix, Mainichi Broadcasting Sy...","Funimation, Aniplex of America",Bones,Manga,"Action, Adventure, Drama, Fantasy",Military,Shounen,24.0,R - 17+ (violence & profanity),9.131,900398.0,12.0,3,2978455,207772
3,30276,One Punch Man,"One Punch-Man, One-Punch Man, OPM",ワンパンマン,One Punch Man,The seemingly unimpressive Saitama has a rathe...,TV,12.0,Finished Airing,"Oct 5, 2015","Dec 21, 2015",Fall 2015,Mondays at 0105 (JST),"TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba...",VIZ Media,Madhouse,Web manga,"Action, Comedy","Parody, Super Power",Seinen,24.0,R - 17+ (violence & profanity),8.511,19066.0,1112.0,4,2879907,59651
4,11757,Sword Art Online,"S.A.O, SAO",ソードアート・オンライン,Sword Art Online,Ever since the release of the innovative Nerve...,TV,25.0,Finished Airing,"Jul 8, 2012","Dec 23, 2012",Summer 2012,Sundays at 0000 (JST),"Aniplex, Genco, DAX Production, ASCII Media Wo...",Aniplex of America,A-1 Pictures,Light novel,"Action, Adventure, Fantasy, Romance","Love Polygon, Video Game",Unknown,23.0,PG-13 - Teens 13 or older,7.201,990254.0,29562.0,5,2813565,64997


In [49]:
pd.set_option('display.max_columns', 10)

In [50]:
# how many missing titles in the datatset
i = (anime_df['English'] == 'Unknown').sum()
print("Number of unknown titles:", i)

i = (anime_df['Genres'] == 'Unknown').sum()
print("Number of unknown genres:", i)

i = (anime_df['Themes'] == 'Unknown').sum()
print("Number of unknown themes:", i)

anime_df = anime_df[
    (anime_df['Genres'] != 'Unknown') &
    (anime_df['Themes'] != 'Unknown') &
    (anime_df['English'] != 'Unknown')
].reset_index(drop=True)


i = (anime_df['English'] == 'Unknown').sum()
print("Number of unknown titles after dropping:", i)

i = (anime_df['Genres'] == 'Unknown').sum()
print("Number of unknown genres after dropping:", i)

i = (anime_df['Themes'] == 'Unknown').sum()
print("Number of unknown themes after dropping:", i)



Number of unknown titles: 12378
Number of unknown genres: 3699
Number of unknown themes: 9648
Number of unknown titles after dropping: 0
Number of unknown genres after dropping: 0
Number of unknown themes after dropping: 0


In [63]:
anime_df.shape

(4591, 28)

In [64]:
anime_df['ID'].nunique()

4591

Viewing Top 50 Ranked animes

In [6]:
Top_animes = anime_df.sort_values(by="Ranked") 
Names = Top_animes["Title"].reset_index()
Names = Names.drop('index', axis=1)
Names.index = Names.index + 1
Names.head(50)

Unnamed: 0,Title
1,Fullmetal Alchemist: Brotherhood
2,Kaguya-sama wa Kokurasetai: Ultra Romantic
3,Gintama°
4,Steins;Gate
5,Shingeki no Kyojin Season 3 Part 2
...,...
46,Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...
47,86 Part 2
48,Ashita no Joe 2
49,Code Geass: Hangyaku no Lelouch


All Possible Genres

In [7]:
possible_genres = anime_df['Genres'].str.split(',').explode().str.strip().unique()
possible_genres

array(['Action', 'Drama', 'Supernatural', 'Suspense', 'Adventure',
       'Fantasy', 'Comedy', 'Romance', 'Horror', 'Sci-Fi', 'Ecchi',
       'Mystery', 'Sports', 'Award Winning', 'Avant Garde',
       'Slice of Life', 'Gourmet', 'Boys Love', 'Unknown', 'Girls Love',
       'Hentai', 'Erotica'], dtype=object)

All Possible Themes

In [8]:
possible_themes = anime_df['Themes'].str.split(',').explode().str.strip().unique()
possible_themes

array(['Gore', 'Military', 'Survival', 'Psychological', 'Parody',
       'Super Power', 'Love Polygon', 'Video Game', 'School',
       'Martial Arts', 'Historical', 'Unknown', 'Romantic Subtext',
       'Time Travel', 'Isekai', 'Strategy Game', 'Mecha', 'Music',
       'Mythology', 'High Stakes Game', 'Team Sports', 'Reincarnation',
       'Adult Cast', 'Space', 'Detective', 'Vampire', 'Harem',
       'Mahou Shoujo', 'Visual Arts', 'Samurai', 'Crossdressing',
       'Reverse Harem', 'Childcare', 'Delinquents', 'CGDCT', 'Gag Humor',
       'Organized Crime', 'Otaku Culture', 'Workplace', 'Iyashikei',
       'Anthropomorphic', 'Educational', 'Medical', 'Showbiz',
       'Combat Sports', 'Idols (Female)', 'Performing Arts', 'Racing',
       'Magical Sex Shift', 'Idols (Male)', 'Pets'], dtype=object)

Possible Age Rating

In [9]:
possible_dem = anime_df['Rating'].str.split(',').explode().str.strip().unique()
possible_dem

array(['R - 17+ (violence & profanity)', 'PG-13 - Teens 13 or older',
       'R+ - Mild Nudity', 'PG - Children', 'G - All Ages', nan,
       'Rx - Hentai'], dtype=object)

In [3]:
df_content = anime_df[["English", "Synopsis"]]
df_content.head()

Unnamed: 0,English,Synopsis
0,Attack on Titan,"Centuries ago, mankind was slaughtered to near..."
1,Death Note,"Brutal murders, petty thefts, and senseless vi..."
2,Fullmetal Alchemist Brotherhood,After a horrific alchemy experiment goes wrong...
3,One Punch Man,The seemingly unimpressive Saitama has a rathe...
4,Sword Art Online,Ever since the release of the innovative Nerve...


In [4]:
df_content = df_content[(df_content['Synopsis'] != 'Unknown') & (df_content['English'] != 'Unknown')]

In [6]:
df_content.shape

(8409, 2)

In [7]:
# Instantiate a TfidfVectorizer with parameters to remove stop words and filter out very common/rare terms
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, stop_words='english')

In [8]:
# Fit the vectorizer on the 'plot' column and transform the text into TF-IDF vectors
vectorized_data = vectorizer.fit_transform(df_content['Synopsis'])

In [9]:
# Explore the generated features (vocabulary terms)
print(vectorizer.get_feature_names_out())
#some japanese words here

['00' '000' '001' ... 'éclair' 'état' 'être']


In [10]:
# Convert the TF-IDF sparse matrix to a DataFrame with feature names as columns
tfidf_df = pd.DataFrame(
    vectorized_data.toarray(),
    columns=vectorizer.get_feature_names_out()
)

In [11]:
tfidf_df.head()
# max columns LOOOOL

Unnamed: 0,00,000,001,0079,0083,...,zutto,zwei,éclair,état,être
0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0


In [12]:
# Assign the show titles to the DataFrame's index for easier reference
tfidf_df.index = df_content['English']
# tfidf_df.head()

In [13]:
# no need because we pickled
cosine_similarity_array = cosine_similarity(tfidf_df)


In [14]:
# store locally
with open("coSim_ESdrop.pkl", "wb") as f:
    pickle.dump(cosine_similarity_array, f)

In [None]:
# to load
with open("coSim_ESdrop.pkl", "rb") as f:
    cosine_similarity_array = pickle.load(f)

print(cosine_similarity_array)

In [15]:
# Convert the cosine similarity array into a DataFrame with show titles as both index and columns
cosine_similarity_df = pd.DataFrame(
    cosine_similarity_array,
    index=tfidf_df.index,
    columns=tfidf_df.index
)


In [60]:
cosine_similarity_df.head()

English,Attack on Titan,Death Note,Fullmetal Alchemist Brotherhood,One Punch Man,Sword Art Online,...,Mumu Tribe,Cloud Bread,Cloud Bread 2,Tobot Movie Attack of Robot Force,Dino King
English,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Attack on Titan,1.0,0.036339,0.036858,0.011122,0.016195,...,0.0,0.0,0.0,0.0,0.0
Death Note,0.036339,1.0,0.028416,0.012076,0.03124,...,0.0,0.0,0.0,0.0,0.0
Fullmetal Alchemist Brotherhood,0.036858,0.028416,1.0,0.01212,0.016767,...,0.0,0.0,0.0,0.0,0.0
One Punch Man,0.011122,0.012076,0.01212,1.0,0.022714,...,0.0,0.0,0.0,0.0,0.0
Sword Art Online,0.016195,0.03124,0.016767,0.022714,1.0,...,0.0,0.0,0.0,0.0,0.0


In [16]:
cosine_similarity_series = cosine_similarity_df.loc["One Punch Man"]
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)


In [17]:
# pd.set_option('display.max_rows', None)
# print(ordered_similarities)

# for i in range(0,20):
#     if ordered_similarities.values[i]>0.1:
#         print(ordered_similarities.index[i])
#         print(ordered_similarities.values[i])
#         print("")

ordered_similarities


English
One Punch Man                                                 1.000000
One Punch Man Season 2                                        0.298783
Ninja Slayer From Animation                                   0.152687
One Outs                                                      0.126590
Yuki Yuna is a Hero                                           0.096680
                                                                ...   
Zatch Bell!                                                   0.000000
Chidori RSC Nationals Are Right Before Us                     0.000000
Hareluya II Boy                                               0.000000
Tales of Demons and Gods Season 5                             0.000000
Heaven's Lost Property the Movie The Angeloid of Clockwork    0.000000
Name: One Punch Man, Length: 8409, dtype: float64

One Punch Man                                   1.000000
Unknown                                         0.334311
Unknown                                         0.321005
One Punch Man Season 2                          0.271798
Unknown                                         0.247187

<center><h2>Jaccard</h2></center>

In [5]:
df_jaccard = anime_df[["English", "Genres"]]
df_jaccard = df_jaccard[(df_jaccard["Genres"] != 'Unknown') & (df_jaccard['English'] != 'Unknown')]


In [6]:
df_jaccard.head()

Unnamed: 0,English,Genres
0,Attack on Titan,"Action, Drama"
1,Death Note,"Supernatural, Suspense"
2,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy"
3,One Punch Man,"Action, Comedy"
4,Sword Art Online,"Action, Adventure, Fantasy, Romance"


In [7]:
# Create a cross-tabulated DataFrame (contingency table) with show titles as rows and genres as columns
animes_cross_table = pd.crosstab(df_jaccard['English'], df_jaccard['Genres'])
animes_cross_table

Genres,Action,"Action, Adventure","Action, Adventure, Avant Garde, Mystery, Supernatural","Action, Adventure, Comedy","Action, Adventure, Comedy, Drama",...,"Sports, Suspense",Supernatural,"Supernatural, Hentai","Supernatural, Suspense",Suspense
English,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"""Deji"" Meets Girl",0,0,0,0,0,...,0,0,0,0,0
"""Stop! Piracy"" Sgt. Frog x No More Movie Thief",0,0,0,0,0,...,0,0,0,0,0
.Koni-chan,0,0,0,0,0,...,0,0,0,0,0
.hack//G.U. Trilogy,0,0,0,0,0,...,0,0,0,0,0
.hack//G.U. Trilogy Parody Mode,0,0,0,0,0,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,...,0,0,0,0,0
xxxHOLiC The Movie A Midsummer Night's Dream,0,0,0,0,0,...,0,0,0,0,0
∀ Gundam,0,0,0,0,0,...,0,0,0,0,0
∀ Gundam I Earth Light,0,0,0,0,0,...,0,0,0,0,0


In [8]:
#Calculate all pairwise Jaccard distances between shows in the cross-tabulated table
# # we pickled
jaccard_distances = pdist(animes_cross_table.values, metric='jaccard')

In [9]:
# store locally
with open("dropJgenres.pkl", "wb") as f:
    pickle.dump(jaccard_distances, f)

In [74]:
# to load
with open("jaccard_distances_genres_dropped.pkl", "rb") as f:
    jaccard_distances_genres = pickle.load(f)

print(jaccard_distances_genres)

[1. 1. 1. ... 1. 1. 0.]


In [75]:
# Convert distances to a similarity measure (1 - distance) and create a square matrix
jaccard_similarity_array_genres = 1 - squareform(jaccard_distances_genres)

In [76]:
# Convert the similarity array into a DataFrame with show titles as both index and columns
jaccard_similarity_genres_df = pd.DataFrame(
    jaccard_similarity_array_genres,
    index = animes_cross_table.index,
    columns = animes_cross_table.index
)

In [77]:
jaccard_similarity_genres_df.head()

English,.hack//G.U. Trilogy,.hack//G.U. Trilogy Parody Mode,.hack//Gift,.hack//Legend Of The Twilight,.hack//Legend of the Twilight Offline Meeting Special,...,xxxHOLiC,xxxHOLiC The Movie A Midsummer Night's Dream,∀ Gundam,∀ Gundam I Earth Light,∀ Gundam II Moonlight Butterfly
English,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
.hack//G.U. Trilogy,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
.hack//G.U. Trilogy Parody Mode,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
.hack//Gift,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
.hack//Legend Of The Twilight,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0
.hack//Legend of the Twilight Offline Meeting Special,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0


In [79]:
# Sort the similarities for a given anime in descending order
jaccardPreds_genres = jaccard_similarity_genres_df['One Punch Man'].sort_values(ascending=False)
print(jaccardPreds_genres)

English
Assassination Classroom The Movie 365 Days' Time                    1.0
Full Metal Panic? Fumoffu                                           1.0
Tiger & Bunny Pilot                                                 1.0
Code Geass Akito the Exiled 2 - The Torn-Up Wyvern Picture Drama    1.0
Air Master                                                          1.0
                                                                   ... 
Hades Project Zeorymer                                              0.0
Haganai A Round-Robin Story's Ending Is Way Extreme                 0.0
Haganai I don't have many friends                                   0.0
Haganai I don't have many friends NEXT                              0.0
∀ Gundam II Moonlight Butterfly                                     0.0
Name: One Punch Man, Length: 4511, dtype: float64


CB Character Nagai Gou World                                                              1.0
Gurren Lagann The Movie Childhood's End                                                   1.0
Cells at Work!! The Return of the Strongest Enemy. A Huge Uproar in the Body’s Bowels!    1.0
One Punch Man 3                                                                           1.0
Inferno Cop Fact Files                                                                    1.0
                                                                                         ... 
Hanamaru Kindergarten                                                                     0.0
Hanako                                                                                    0.0
Hanakappa Adventure in the Butterfly Kingdom                                              0.0
Hanabi-chan The Girl Who Popped Out of the Game World                                     0.0
∀ Gundam II Moonlight Butterfly                                                           0.0
Name: One Punch Man, Length: 8897, dtype: float64

<h2>jaccard with themes</h2>

In [7]:
df_jaccard = anime_df[["English", "Themes"]]
df_jaccard = df_jaccard[(df_jaccard["Themes"] != 'Unknown') & (df_jaccard['English'] != 'Unknown')]
df_jaccard.head()
df_jaccard.shape

(5279, 2)

In [8]:
animes_cross_table = pd.crosstab(df_jaccard['English'], df_jaccard['Themes'])


In [9]:
#Calculate all pairwise Jaccard distances between shows in the cross-tabulated table
jaccard_distances_themes = pdist(animes_cross_table.values, metric='jaccard')

In [10]:
# store locally
with open("dropJthemes.pkl", "wb") as f:
    pickle.dump(jaccard_distances_themes, f)

In [None]:
with open("jaccard_distances_themes.pkl", "rb") as f:
    jaccard_distances_themes = pickle.load(f)
print(jaccard_distances_themes)

In [84]:
# Convert distances to a similarity measure (1 - distance) and create a square matrix
jaccard_similarity_array_themes = 1 - squareform(jaccard_distances_themes)

# Convert the similarity array into a DataFrame with show titles as both index and columns
jaccard_similarity_themes_df = pd.DataFrame(
    jaccard_similarity_array_themes,
    index = animes_cross_table.index,
    columns = animes_cross_table.index
)

jaccard_similarity_themes_df.head()

# Sort the similarities for a given anime in descending order
jaccardPreds_themes = jaccard_similarity_themes_df['One Punch Man'].sort_values(ascending=False)
print(jaccardPreds_themes)

English
Eagle Talon                                                1.0
GJ8man "Highlights"                                        1.0
Samurai Flamenco                                           1.0
Venus 5                                                    1.0
One Punch Man                                              1.0
                                                          ... 
Gurazeni Money Pitch                                       0.0
Gurren Lagann                                              0.0
Gurren Lagann Kirameki★Yoko Box - Pieces of Sweet Stars    0.0
Gurren Lagann The Movie Childhood's End                    0.0
∀ Gundam II Moonlight Butterfly                            0.0
Name: One Punch Man, Length: 4511, dtype: float64


English
One Punch Man Specials                                                         1.0
One Punch Man Season 2 Specials                                                1.0
Venus 5                                                                        1.0
Samurai Flamenco                                                               1.0
Eagle Talon Golden Spell

<center><h2>New User</h2></center>

In [86]:
def newUser():
    print("Welcome to the Anime Recommender!")
    print("Please enter your favorite anime:")
    user_input = input()
    print("You entered: " + user_input)
    print("Here are some recommendations based on your input:")

    # Check if the entered anime is in the DataFrame
    if user_input in cosine_similarity_df.index:
        cosine_similarity_series = cosine_similarity_df.loc[user_input]
        ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

        for i in range(0, 20):
            if ordered_similarities.values[i] > 0.15:
                print(ordered_similarities.index[i])
                print(ordered_similarities.values[i])
                print("")
    else:
        print("Anime not found in the database.")

In [87]:
newUser()

Welcome to the Anime Recommender!
Please enter your favorite anime:
You entered: Fullmetal Alchemist The Movie - Conqueror of Shamballa
Here are some recommendations based on your input:
Fullmetal Alchemist The Movie - Conqueror of Shamballa
1.0000000000000002

Fullmetal Alchemist
0.40513728737177496

Fullmetal Alchemist Brotherhood
0.332166343815961

Fullmetal Alchemist The Sacred Star of Milos
0.2386785589915926



In [92]:
# both tfidf and jaccard(genres) 
# Priority given to tfidf

cosine_similarity_series = cosine_similarity_df.loc["One Punch Man"]
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

jaccardPreds = jaccard_similarity_genres_df['One Punch Man'].sort_values(ascending=False)
top_n = 100

# Get top N from each
tfidf_top = ordered_similarities.head(top_n)
jaccard_top = jaccardPreds.head(top_n)

# Filter TF-IDF list to keep only those also in top N Jaccard
common_indices = [idx for idx in tfidf_top.index if idx in jaccard_top.index]

top_10_common = common_indices[:10]

for idx in top_10_common:
    print(f"{idx}")
    print("")


One Punch Man

One Punch Man Season 2

One Punch Man 3

Samurai Flamenco

Hyper Speed GranDoll

Outburst Dreamer Boys

Qin's Moon Hundred Schools of Thought

Problem Children Are Coming from Another World, Aren't They?

One Punch Man Specials

One Punch Man 2nd Season Commemorative Special



In [None]:
# both tfidf and jaccard(themes) 
# Priority given to tfidf

cosine_similarity_series = cosine_similarity_df.loc["One Punch Man"]
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

jaccardPreds = jaccard_similarity_themes_df['One Punch Man'].sort_values(ascending=False)
top_n = 100

# Get top N from each
tfidf_top = ordered_similarities.head(top_n)
jaccard_top = jaccardPreds.head(top_n)

# Filter TF-IDF list to keep only those also in top N Jaccard
common_indices = [idx for idx in tfidf_top.index if idx in jaccard_top.index]

top_10_common = common_indices[:10]

for idx in top_10_common:
    print(f"{idx}")
    print("")


One Punch Man

One Punch Man Season 2

One Punch Man 3

Samurai Flamenco

Qin's Moon Hundred Schools of Thought

Problem Children Are Coming from Another World, Aren't They?

One Punch Man Specials

One Punch Man 2nd Season Commemorative Special



In [None]:
# # # Simple weighted hybrid (e.g., 0.6 TF-IDF + 0.4 Jaccard)
# # hybrid_similarity = 0.6 * tfidf_similarity_matrix + 0.4 * jaccard_similarity_matrix

# import numpy as np

# # Combine Genres and Themes into one set per anime
# def get_set(row):
#     genres = row['Genres'].split(',') if row['Genres'] != 'Unknown' else []
#     themes = row['Themes'].split(',') if row['Themes'] != 'Unknown' else []
#     return set(g.strip().lower() for g in genres + themes)

# sets = anime_hybrid_df.apply(get_set, axis=1)

# # Jaccard similarity function
# def jaccard(set1, set2):
#     if not set1 or not set2:
#         return 0.0
#     return len(set1 & set2) / len(set1 | set2)

# # Build Jaccard similarity matrix
# jaccard_similarity_matrix = np.zeros((len(sets), len(sets)))

# for i in range(len(sets)):
#     for j in range(len(sets)):
#         jaccard_similarity_matrix[i][j] = jaccard(sets[i], sets[j])

# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()

# tfidf_similarity_matrix = scaler.fit_transform(tfidf_similarity_matrix)
# jaccard_similarity_matrix = scaler.fit_transform(jaccard_similarity_matrix)

# alpha = 0.6  # weight for TF-IDF
# beta = 0.4   # weight for Jaccard

# hybrid_similarity_matrix = alpha * tfidf_similarity_matrix + beta * jaccard_similarity_matrix

# def get_top_recommendations(index, sim_matrix, df, top_n=5):
#     scores = list(enumerate(sim_matrix[index]))
#     scores = sorted(scores, key=lambda x: x[1], reverse=True)
#     top_indices = [i for i, score in scores[1:top_n+1]]
#     return df.iloc[top_indices][['Title', 'English', 'Genres', 'Themes']]

# # Example: Get top 5 similar anime to the first one
# get_top_recommendations(0, hybrid_similarity_matrix, anime_hybrid_df)

# # Get top 10 recommendations from TF-IDF (on English subset)
# top_tfidf_recs = get_top_recommendations(index_tfidf, tfidf_sim_matrix, anime_tfidf_df, top_n=10)

# # Get top 10 recommendations from Jaccard (on Theme subset)
# top_jaccard_recs = get_top_recommendations(index_jaccard, jaccard_sim_matrix, anime_theme_df, top_n=10)

# # Find common recommendations (based on Title, or other ID)
# common_recs = pd.merge(top_tfidf_recs, top_jaccard_recs, on="English")


<h1>NEW JOURNEY<h1>

In [38]:
old = pd.read_csv('AnimeNEW.csv')
old.head()

Unnamed: 0,ID,Title,Synonyms,Japanese,English,...,Scored_Users,Ranked,Popularity,Members,Favorites
0,16498,Shingeki no Kyojin,"AoT, SnK",進撃の巨人,Attack on Titan,...,519803.0,1002.0,1,3524109,155695
1,1535,Death Note,DN,デスノート,Death Note,...,485487.0,732.0,2,3504535,159701
2,5114,Fullmetal Alchemist: Brotherhood,"Hagane no Renkinjutsushi Fullmetal Alchemist, ...",鋼の錬金術師 FULLMETAL ALCHEMIST,Fullmetal Alchemist Brotherhood,...,900398.0,12.0,3,2978455,207772
3,30276,One Punch Man,"One Punch-Man, One-Punch Man, OPM",ワンパンマン,One Punch Man,...,19066.0,1112.0,4,2879907,59651
4,11757,Sword Art Online,"S.A.O, SAO",ソードアート・オンライン,Sword Art Online,...,990254.0,29562.0,5,2813565,64997


In [4]:
df=pd.read_csv("anime_meta.csv")

In [5]:
dff = pd.read_csv("ratings.csv")

In [6]:
df.head()

Unnamed: 0,anime_name,anime_id,Genres,Score,Synopsis
0,Cowboy Bebop,1,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",8.78,"In the year 2071, humanity has colonized sever..."
1,Cowboy Bebop: Tengoku no Tobira,5,"Action, Drama, Mystery, Sci-Fi, Space",8.39,"other day, another bounty—such is the life of ..."
2,Trigun,6,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",8.24,"Vash the Stampede is the man with a $$60,000,0..."
3,Witch Hunter Robin,7,"Action, Mystery, Police, Supernatural, Drama, ...",7.27,ches are individuals with special powers like ...
4,Bouken Ou Beet,8,"Adventure, Fantasy, Shounen, Supernatural",6.98,It is the dark century and the people are suff...


In [54]:
df.loc[df["anime_id"] == 34185]

Unnamed: 0,anime_name,anime_id,Genres,Score,Synopsis
12288,Fushou Shita Senro to Tsuki,34185,Kids,Unknown,Based on a children's book of the same name. T...


In [13]:
dff.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
0,0,430,9,Fullmetal Alchemist: The Conqueror of Shamballa
1,0,1004,5,Kanojo to Kanojo no Neko
2,0,3010,7,Kaiketsu Zorro
3,0,570,7,Jin-Rou
4,0,2762,9,Igano Kabamaru


In [22]:
# Create a dictionary mapping each user to the list of movies they have rated
user2movie = dff.groupby('user_id')['anime_id'].apply(list).to_dict()

In [32]:
user2movie[0]

[430,
 1004,
 3010,
 570,
 2762,
 431,
 578,
 433,
 1571,
 121,
 356,
 1250,
 2913,
 1689,
 68,
 1829,
 600,
 3418,
 164,
 1894,
 415,
 2236,
 448,
 2034,
 2547,
 169,
 199,
 174,
 2543,
 466,
 459,
 1047,
 2248,
 4086,
 419]

In [None]:
# Create a dictionary mapping each movie to the list of users who have rated it
movie2user = dff.groupby('anime_id')['user_id'].apply(list).to_dict()

In [33]:
movie2user[430]

[0,
 6,
 18,
 19,
 33,
 41,
 46,
 52,
 53,
 57,
 88,
 89,
 90,
 124,
 145,
 147,
 148,
 153,
 154,
 156,
 165,
 172,
 195,
 198,
 202,
 204,
 205,
 209,
 210,
 214,
 224,
 232,
 248,
 262,
 265,
 275,
 285,
 289,
 296,
 297,
 314,
 324,
 326,
 360,
 363,
 365,
 371,
 374,
 381,
 386,
 389,
 395,
 398,
 417,
 423,
 426,
 435,
 436,
 439,
 440,
 445,
 446,
 459,
 461,
 464,
 479,
 481,
 483,
 485,
 502,
 505,
 522,
 530,
 536,
 545,
 546,
 562,
 563,
 564,
 567,
 571,
 584,
 592,
 608,
 610,
 616,
 618,
 626,
 632,
 636,
 639,
 664,
 666,
 667,
 668,
 694,
 699,
 709,
 717,
 719,
 721,
 724,
 725,
 730,
 741,
 744,
 756,
 759,
 764,
 770,
 773,
 783,
 785,
 786,
 791,
 797,
 802,
 819,
 836,
 845,
 847,
 853,
 884,
 887,
 890,
 892,
 906,
 917,
 921,
 927,
 931,
 933,
 936,
 943,
 948,
 949,
 981,
 985,
 989,
 992,
 998,
 1004,
 1010,
 1014,
 1034,
 1036,
 1046,
 1048,
 1049,
 1060,
 1066,
 1072,
 1089,
 1092,
 1094,
 1098,
 1114,
 1115,
 1117,
 1142,
 1150,
 1157,
 1161,
 1170,
 1177,
 

In [35]:
# Create a dictionary mapping each (user, movie) pair to its rating
user_movie = zip(dff['user_id'], dff['anime_id'])
user_movie_rating = zip(user_movie, dff['rating'])
user_movie2rating = dict(user_movie_rating)

In [36]:
user_movie2rating[(0,430)]

9