In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity between vectors
from scipy.spatial.distance import pdist, squareform  # For pairwise distance computations and converting to a square matrix

# pd.set_option('display.max_columns', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)


In [2]:
anime_df = pd.read_csv('AnimeNEW.csv')

Exploratory Data Analysis

In [6]:
pd.set_option('display.max_columns', None)
anime_df.head()

Unnamed: 0,ID,Title,Synonyms,Japanese,English,Synopsis,Type,Episodes,Status,Start_Aired,End_Aired,Premiered,Broadcast,Producers,Licensors,Studios,Source,Genres,Themes,Demographics,Duration_Minutes,Rating,Score,Scored_Users,Ranked,Popularity,Members,Favorites
0,16498,Shingeki no Kyojin,"AoT, SnK",進撃の巨人,Attack on Titan,"Centuries ago, mankind was slaughtered to near...",TV,25.0,Finished Airing,"Apr 7, 2013","Sep 29, 2013",Spring 2013,Sundays at 0158 (JST),"Production I.G, Dentsu, Mainichi Broadcasting ...",Funimation,Wit Studio,Manga,"Action, Drama","Gore, Military, Survival",Shounen,24.0,R - 17+ (violence & profanity),8.531,519803.0,1002.0,1,3524109,155695
1,1535,Death Note,DN,デスノート,Death Note,"Brutal murders, petty thefts, and senseless vi...",TV,37.0,Finished Airing,"Oct 4, 2006","Jun 27, 2007",Fall 2006,Wednesdays at 0056 (JST),"VAP, Konami, Ashi Productions, Nippon Televisi...",VIZ Media,Madhouse,Manga,"Supernatural, Suspense",Psychological,Shounen,23.0,R - 17+ (violence & profanity),8.621,485487.0,732.0,2,3504535,159701
2,5114,Fullmetal Alchemist: Brotherhood,"Hagane no Renkinjutsushi Fullmetal Alchemist, ...",鋼の錬金術師 FULLMETAL ALCHEMIST,Fullmetal Alchemist Brotherhood,After a horrific alchemy experiment goes wrong...,TV,64.0,Finished Airing,"Apr 5, 2009","Jul 4, 2010",Spring 2009,Sundays at 1700 (JST),"Aniplex, Square Enix, Mainichi Broadcasting Sy...","Funimation, Aniplex of America",Bones,Manga,"Action, Adventure, Drama, Fantasy",Military,Shounen,24.0,R - 17+ (violence & profanity),9.131,900398.0,12.0,3,2978455,207772
3,30276,One Punch Man,"One Punch-Man, One-Punch Man, OPM",ワンパンマン,One Punch Man,The seemingly unimpressive Saitama has a rathe...,TV,12.0,Finished Airing,"Oct 5, 2015","Dec 21, 2015",Fall 2015,Mondays at 0105 (JST),"TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba...",VIZ Media,Madhouse,Web manga,"Action, Comedy","Parody, Super Power",Seinen,24.0,R - 17+ (violence & profanity),8.511,19066.0,1112.0,4,2879907,59651
4,11757,Sword Art Online,"S.A.O, SAO",ソードアート・オンライン,Sword Art Online,Ever since the release of the innovative Nerve...,TV,25.0,Finished Airing,"Jul 8, 2012","Dec 23, 2012",Summer 2012,Sundays at 0000 (JST),"Aniplex, Genco, DAX Production, ASCII Media Wo...",Aniplex of America,A-1 Pictures,Light novel,"Action, Adventure, Fantasy, Romance","Love Polygon, Video Game",Unknown,23.0,PG-13 - Teens 13 or older,7.201,990254.0,29562.0,5,2813565,64997


In [7]:
pd.set_option('display.max_columns', 10)

In [4]:
anime_df.shape

(21460, 28)

In [5]:
anime_df['ID'].nunique()

21460

Viewing Top 50 Ranked animes

In [6]:
Top_animes = anime_df.sort_values(by="Ranked") 
Names = Top_animes["Title"].reset_index()
Names = Names.drop('index', axis=1)
Names.index = Names.index + 1
Names.head(50)

Unnamed: 0,Title
1,Fullmetal Alchemist: Brotherhood
2,Kaguya-sama wa Kokurasetai: Ultra Romantic
3,Gintama°
4,Steins;Gate
5,Shingeki no Kyojin Season 3 Part 2
...,...
46,Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...
47,86 Part 2
48,Ashita no Joe 2
49,Code Geass: Hangyaku no Lelouch


All Possible Genres

In [7]:
possible_genres = anime_df['Genres'].str.split(',').explode().str.strip().unique()
possible_genres

array(['Action', 'Drama', 'Supernatural', 'Suspense', 'Adventure',
       'Fantasy', 'Comedy', 'Romance', 'Horror', 'Sci-Fi', 'Ecchi',
       'Mystery', 'Sports', 'Award Winning', 'Avant Garde',
       'Slice of Life', 'Gourmet', 'Boys Love', 'Unknown', 'Girls Love',
       'Hentai', 'Erotica'], dtype=object)

All Possible Themes

In [8]:
possible_themes = anime_df['Themes'].str.split(',').explode().str.strip().unique()
possible_themes

array(['Gore', 'Military', 'Survival', 'Psychological', 'Parody',
       'Super Power', 'Love Polygon', 'Video Game', 'School',
       'Martial Arts', 'Historical', 'Unknown', 'Romantic Subtext',
       'Time Travel', 'Isekai', 'Strategy Game', 'Mecha', 'Music',
       'Mythology', 'High Stakes Game', 'Team Sports', 'Reincarnation',
       'Adult Cast', 'Space', 'Detective', 'Vampire', 'Harem',
       'Mahou Shoujo', 'Visual Arts', 'Samurai', 'Crossdressing',
       'Reverse Harem', 'Childcare', 'Delinquents', 'CGDCT', 'Gag Humor',
       'Organized Crime', 'Otaku Culture', 'Workplace', 'Iyashikei',
       'Anthropomorphic', 'Educational', 'Medical', 'Showbiz',
       'Combat Sports', 'Idols (Female)', 'Performing Arts', 'Racing',
       'Magical Sex Shift', 'Idols (Male)', 'Pets'], dtype=object)

Possible Age Rating

In [9]:
possible_dem = anime_df['Rating'].str.split(',').explode().str.strip().unique()
possible_dem

array(['R - 17+ (violence & profanity)', 'PG-13 - Teens 13 or older',
       'R+ - Mild Nudity', 'PG - Children', 'G - All Ages', nan,
       'Rx - Hentai'], dtype=object)

In [10]:
df_content = anime_df[["English", "Synopsis"]]
df_content.head()

Unnamed: 0,English,Synopsis
0,Attack on Titan,"Centuries ago, mankind was slaughtered to near..."
1,Death Note,"Brutal murders, petty thefts, and senseless vi..."
2,Fullmetal Alchemist Brotherhood,After a horrific alchemy experiment goes wrong...
3,One Punch Man,The seemingly unimpressive Saitama has a rathe...
4,Sword Art Online,Ever since the release of the innovative Nerve...


In [11]:
# Instantiate a TfidfVectorizer with parameters to remove stop words and filter out very common/rare terms
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, stop_words='english')

In [12]:
# Fit the vectorizer on the 'plot' column and transform the text into TF-IDF vectors
vectorized_data = vectorizer.fit_transform(df_content['Synopsis'])

In [13]:
# Explore the generated features (vocabulary terms)
print(vectorizer.get_feature_names_out())
#some japanese words here

['00' '000' '000th' ... 'アニメ' 'コミック' 'レンタマン']


In [14]:
# Convert the TF-IDF sparse matrix to a DataFrame with feature names as columns
tfidf_df = pd.DataFrame(
    vectorized_data.toarray(),
    columns=vectorizer.get_feature_names_out()
)

In [15]:
tfidf_df.head()
# max columns LOOOOL

Unnamed: 0,00,000,000th,001,007,...,źooļ,おとぎ話,アニメ,コミック,レンタマン
0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0


In [15]:
# Assign the show titles to the DataFrame's index for easier reference
tfidf_df.index = df_content['English']
# tfidf_df.head()

In [16]:
cosine_similarity_array = cosine_similarity(tfidf_df)


In [17]:
print(cosine_similarity_array)

[[1.         0.03776806 0.03859335 ... 0.         0.         0.        ]
 [0.03776806 1.         0.03100892 ... 0.         0.         0.        ]
 [0.03859335 0.03100892 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]]


In [18]:
# Convert the cosine similarity array into a DataFrame with show titles as both index and columns
cosine_similarity_df = pd.DataFrame(
    cosine_similarity_array,
    index=tfidf_df.index,
    columns=tfidf_df.index
)


In [None]:
cosine_similarity_df.head()

English,Attack on Titan,Death Note,Fullmetal Alchemist Brotherhood,One Punch Man,Sword Art Online,...,Princess and the Kingdom,Avera and the Mystical Kingdom 2,Brave City,The Juvenile of King Yu,Dino King
English,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Attack on Titan,1.0,0.037768,0.038593,0.013473,0.017222,...,0.0,0.0,0.0,0.0,0.0
Death Note,0.037768,1.0,0.031009,0.015352,0.034031,...,0.0,0.0,0.0,0.0,0.0
Fullmetal Alchemist Brotherhood,0.038593,0.031009,1.0,0.01527,0.01875,...,0.0,0.0,0.0,0.0,0.0
One Punch Man,0.013473,0.015352,0.01527,1.0,0.024467,...,0.0,0.0,0.0,0.0,0.0
Sword Art Online,0.017222,0.034031,0.01875,0.024467,1.0,...,0.0,0.0,0.0,0.0,0.0


In [56]:
cosine_similarity_series = cosine_similarity_df.loc["One Punch Man"]
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)


In [57]:
# pd.set_option('display.max_rows', None)
# print(ordered_similarities)

# for i in range(0,20):
#     if ordered_similarities.values[i]>0.1:
#         print(ordered_similarities.index[i])
#         print(ordered_similarities.values[i])
#         print("")

ordered_similarities


English
One Punch Man                                   1.000000
Unknown                                         0.334311
Unknown                                         0.321005
One Punch Man Season 2                          0.271798
Unknown                                         0.247187
                                                  ...   
Sisters of Wellber - Elegy for a Sad Warrior    0.000000
Taro's Monster Hunt                             0.000000
A Place Where There Are Moths                   0.000000
Unknown                                         0.000000
Dino King                                       0.000000
Name: One Punch Man, Length: 21460, dtype: float64

<center><h2>Jaccard</h2></center>

In [22]:
df_jaccard = anime_df[["English", "Genres"]]


In [23]:
df_jaccard.head()

Unnamed: 0,English,Genres
0,Attack on Titan,"Action, Drama"
1,Death Note,"Supernatural, Suspense"
2,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy"
3,One Punch Man,"Action, Comedy"
4,Sword Art Online,"Action, Adventure, Fantasy, Romance"


In [34]:
# Create a cross-tabulated DataFrame (contingency table) with show titles as rows and genres as columns
animes_cross_table = pd.crosstab(df_jaccard['English'], df_jaccard['Genres'])
animes_cross_table
# Drop the unkowwn genre column?

Genres,Action,"Action, Adventure","Action, Adventure, Avant Garde, Mystery, Supernatural","Action, Adventure, Boys Love, Fantasy, Mystery, Supernatural","Action, Adventure, Comedy",...,Supernatural,"Supernatural, Hentai","Supernatural, Suspense",Suspense,Unknown
English,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"""Deji"" Meets Girl",0,0,0,0,0,...,0,0,0,0,0
"""LIP×LIP"" Interview Report by Hiyori Suzumi!",0,0,0,0,0,...,0,0,0,0,1
"""Stop! Piracy"" Sgt. Frog x No More Movie Thief",0,0,0,0,0,...,0,0,0,0,0
-OutsideRRequieM-,0,0,0,0,0,...,0,0,0,0,1
.Koni-chan,0,0,0,0,0,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,...,0,0,0,0,0
xxxHOLiC The Movie A Midsummer Night's Dream,0,0,0,0,0,...,0,0,0,0,0
∀ Gundam,0,0,0,0,0,...,0,0,0,0,0
∀ Gundam I Earth Light,0,0,0,0,0,...,0,0,0,0,0


In [36]:
#Calculate all pairwise Jaccard distances between shows in the cross-tabulated table
jaccard_distances = pdist(animes_cross_table.values, metric='jaccard')

In [37]:
# Convert distances to a similarity measure (1 - distance) and create a square matrix
jaccard_similarity_array = 1 - squareform(jaccard_distances)

In [38]:
# Convert the similarity array into a DataFrame with show titles as both index and columns
jaccard_similarity_df = pd.DataFrame(
    jaccard_similarity_array,
    index = animes_cross_table.index,
    columns = animes_cross_table.index
)

In [39]:
jaccard_similarity_df.head()

English,"""Deji"" Meets Girl","""LIP×LIP"" Interview Report by Hiyori Suzumi!","""Stop! Piracy"" Sgt. Frog x No More Movie Thief",-OutsideRRequieM-,.Koni-chan,...,xxxHOLiC,xxxHOLiC The Movie A Midsummer Night's Dream,∀ Gundam,∀ Gundam I Earth Light,∀ Gundam II Moonlight Butterfly
English,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"""Deji"" Meets Girl",1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
"""LIP×LIP"" Interview Report by Hiyori Suzumi!",0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0
"""Stop! Piracy"" Sgt. Frog x No More Movie Thief",0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0
-OutsideRRequieM-,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0
.Koni-chan,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0


In [58]:
# Sort the similarities for a given anime in descending order
jaccardPreds = jaccard_similarity_df['One Punch Man'].sort_values(ascending=False)
print(jaccardPreds)

English
CB Character Nagai Gou World                                                              1.0
Gurren Lagann The Movie Childhood's End                                                   1.0
Cells at Work!! The Return of the Strongest Enemy. A Huge Uproar in the Body’s Bowels!    1.0
One Punch Man 3                                                                           1.0
Inferno Cop Fact Files                                                                    1.0
                                                                                         ... 
Hanamaru Kindergarten                                                                     0.0
Hanako                                                                                    0.0
Hanakappa Adventure in the Butterfly Kingdom                                              0.0
Hanabi-chan The Girl Who Popped Out of the Game World                                     0.0
∀ Gundam II Moonlight Butterfly                     

<center><h2>New User</h2></center>

In [None]:
def newUser():
    print("Welcome to the Anime Recommender!")
    print("Please enter your favorite anime:")
    user_input = input()
    print("You entered: " + user_input)
    print("Here are some recommendations based on your input:")

    # Check if the entered anime is in the DataFrame
    if user_input in cosine_similarity_df.index:
        cosine_similarity_series = cosine_similarity_df.loc[user_input]
        ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

        for i in range(0, 20):
            if ordered_similarities.values[i] > 0.15:
                print(ordered_similarities.index[i])
                print(ordered_similarities.values[i])
                print("")
    else:
        print("Anime not found in the database.")

In [21]:
newUser()

Welcome to the Anime Recommender!
Please enter your favorite anime:
You entered: Fullmetal Alchemist The Movie - Conqueror of Shamballa
Here are some recommendations based on your input:
Fullmetal Alchemist The Movie - Conqueror of Shamballa
0.9999999999999998

Fullmetal Alchemist
0.397226243751022

Fullmetal Alchemist Brotherhood
0.32948594319269814

Fullmetal Alchemist The Sacred Star of Milos
0.2078281883920075

For Whom the Alchemist Exists
0.18656582161913288

Unknown
0.17315753348267146

Qin's Moon The Emperor Under Heaven
0.15199229503148176



In [66]:
# both tfidf and jaccard(genres) 
# Priority given to tfidf

cosine_similarity_series = cosine_similarity_df.loc["One Punch Man"]
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

jaccardPreds = jaccard_similarity_df['One Punch Man'].sort_values(ascending=False)
top_n = 50

# Get top N from each
tfidf_top = ordered_similarities.head(top_n)
jaccard_top = jaccardPreds.head(top_n)

# Filter TF-IDF list to keep only those also in top N Jaccard
common_indices = [idx for idx in tfidf_top.index if idx in jaccard_top.index]

top_10_common = common_indices[:10]

for idx in top_10_common:
    print(f"{idx}")
    print(f"  TF-IDF similarity: {ordered_similarities[idx]}")
    print(f"  Jaccard similarity: {jaccardPreds[idx]}")
    print("")


One Punch Man
  TF-IDF similarity: 1.0000000000000002
  Jaccard similarity: 1.0

One Punch Man Season 2
  TF-IDF similarity: 0.2717981440489888
  Jaccard similarity: 1.0

One Punch Man 3
  TF-IDF similarity: 0.0761229884017375
  Jaccard similarity: 1.0

