In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity between vectors

# Keep rendered frames compact: at most 10 columns and 10 rows per display.
# (Assign None to display.max_columns instead to show every column.)
pd.options.display.max_columns = 10
pd.options.display.max_rows = 10


In [2]:
# Load the anime dataset. The path is relative, so 'AnimeNEW.csv' must be in the
# notebook's working directory.
anime_df = pd.read_csv('AnimeNEW.csv')

Exploratory Data Analysis

In [3]:
# Preview the first rows to sanity-check the load.
anime_df.head()

Unnamed: 0,ID,Title,Synonyms,Japanese,English,...,Scored_Users,Ranked,Popularity,Members,Favorites
0,16498,Shingeki no Kyojin,"AoT, SnK",進撃の巨人,Attack on Titan,...,519803.0,1002.0,1,3524109,155695
1,1535,Death Note,DN,デスノート,Death Note,...,485487.0,732.0,2,3504535,159701
2,5114,Fullmetal Alchemist: Brotherhood,"Hagane no Renkinjutsushi Fullmetal Alchemist, ...",鋼の錬金術師 FULLMETAL ALCHEMIST,Fullmetal Alchemist Brotherhood,...,900398.0,12.0,3,2978455,207772
3,30276,One Punch Man,"One Punch-Man, One-Punch Man, OPM",ワンパンマン,One Punch Man,...,19066.0,1112.0,4,2879907,59651
4,11757,Sword Art Online,"S.A.O, SAO",ソードアート・オンライン,Sword Art Online,...,990254.0,29562.0,5,2813565,64997


In [4]:
# (number of rows, number of columns) of the loaded frame.
anime_df.shape

(21460, 28)

In [5]:
# Number of distinct IDs; if it equals anime_df.shape[0], 'ID' uniquely identifies each row.
anime_df['ID'].nunique()

21460

Viewing the Top 50 Ranked Anime

In [6]:
# Order the catalogue by its 'Ranked' column (ascending, so the best rank comes first).
Top_animes = anime_df.sort_values(by="Ranked")

# Keep just the titles, discard the original row labels, and renumber from 1
# so that the row label doubles as the position in the ranking.
Names = Top_animes[["Title"]].reset_index(drop=True)
Names.index = Names.index + 1
Names.head(50)

Unnamed: 0,Title
1,Fullmetal Alchemist: Brotherhood
2,Kaguya-sama wa Kokurasetai: Ultra Romantic
3,Gintama°
4,Steins;Gate
5,Shingeki no Kyojin Season 3 Part 2
...,...
46,Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...
47,86 Part 2
48,Ashita no Joe 2
49,Code Geass: Hangyaku no Lelouch


All Possible Genres

In [7]:
# 'Genres' holds comma-separated labels per show: split each cell, flatten to one
# label per row, trim surrounding whitespace, then collect the distinct labels.
possible_genres = (
    anime_df['Genres']
    .str.split(',')
    .explode()
    .str.strip()
    .unique()
)
possible_genres

array(['Action', 'Drama', 'Supernatural', 'Suspense', 'Adventure',
       'Fantasy', 'Comedy', 'Romance', 'Horror', 'Sci-Fi', 'Ecchi',
       'Mystery', 'Sports', 'Award Winning', 'Avant Garde',
       'Slice of Life', 'Gourmet', 'Boys Love', 'Unknown', 'Girls Love',
       'Hentai', 'Erotica'], dtype=object)

All Possible Themes

In [8]:
# 'Themes' holds comma-separated labels per show: split each cell, flatten to one
# label per row, trim surrounding whitespace, then collect the distinct labels.
possible_themes = (
    anime_df['Themes']
    .str.split(',')
    .explode()
    .str.strip()
    .unique()
)
possible_themes

array(['Gore', 'Military', 'Survival', 'Psychological', 'Parody',
       'Super Power', 'Love Polygon', 'Video Game', 'School',
       'Martial Arts', 'Historical', 'Unknown', 'Romantic Subtext',
       'Time Travel', 'Isekai', 'Strategy Game', 'Mecha', 'Music',
       'Mythology', 'High Stakes Game', 'Team Sports', 'Reincarnation',
       'Adult Cast', 'Space', 'Detective', 'Vampire', 'Harem',
       'Mahou Shoujo', 'Visual Arts', 'Samurai', 'Crossdressing',
       'Reverse Harem', 'Childcare', 'Delinquents', 'CGDCT', 'Gag Humor',
       'Organized Crime', 'Otaku Culture', 'Workplace', 'Iyashikei',
       'Anthropomorphic', 'Educational', 'Medical', 'Showbiz',
       'Combat Sports', 'Idols (Female)', 'Performing Arts', 'Racing',
       'Magical Sex Shift', 'Idols (Male)', 'Pets'], dtype=object)

Possible Age Ratings

In [9]:
# Distinct values of the 'Rating' column, extracted with the same
# split / flatten / trim pipeline used for the other categorical columns.
# Missing ratings pass through the pipeline and show up as nan.
possible_dem = (
    anime_df['Rating']
    .str.split(',')
    .explode()
    .str.strip()
    .unique()
)
possible_dem

array(['R - 17+ (violence & profanity)', 'PG-13 - Teens 13 or older',
       'R+ - Mild Nudity', 'PG - Children', 'G - All Ages', nan,
       'Rx - Hentai'], dtype=object)

In [10]:
# Keep only the two columns the content-based steps below rely on:
# the English title (used as a row label) and the synopsis (the text that gets vectorized).
df_content = anime_df.loc[:, ["English", "Synopsis"]]
df_content.head()

Unnamed: 0,English,Synopsis
0,Attack on Titan,"Centuries ago, mankind was slaughtered to near..."
1,Death Note,"Brutal murders, petty thefts, and senseless vi..."
2,Fullmetal Alchemist Brotherhood,After a horrific alchemy experiment goes wrong...
3,One Punch Man,The seemingly unimpressive Saitama has a rathe...
4,Sword Art Online,Ever since the release of the innovative Nerve...


In [11]:
# Build the TF-IDF vectorizer:
#   - stop_words='english' drops common English stop words,
#   - min_df=2 ignores terms that occur in fewer than 2 documents,
#   - max_df=0.7 ignores terms that occur in more than 70% of documents.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.7,
)

In [12]:
# Learn the vocabulary from the 'Synopsis' column and encode every synopsis as a TF-IDF row vector
vectorized_data = vectorizer.fit_transform(raw_documents=df_content['Synopsis'])

In [13]:
# Inspect the learned vocabulary (one entry per TF-IDF feature).
# Besides English terms, it also contains some Japanese tokens.
vocabulary_terms = vectorizer.get_feature_names_out()
print(vocabulary_terms)

['00' '000' '000th' ... 'アニメ' 'コミック' 'レンタマン']


In [14]:
# Turn the sparse TF-IDF matrix into a labelled DataFrame, one column per vocabulary term.
# Note: .toarray() materialises every zero entry, so this frame takes far more
# memory than the sparse matrix it is built from.
dense_tfidf = vectorized_data.toarray()
term_labels = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=dense_tfidf, columns=term_labels)

In [15]:
tfidf_df.head()
# Only 10 columns are rendered because 'display.max_columns' is capped at 10 in the first cell.

Unnamed: 0,00,000,000th,001,007,...,źooļ,おとぎ話,アニメ,コミック,レンタマン
0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0


In [16]:
# Label each TF-IDF row with the show's English title so rows can be looked up by name.
# NOTE(review): the 'English' column appears to contain repeated placeholder values
# such as 'Unknown', so these labels are not guaranteed to be unique - confirm before
# relying on label-based lookups returning a single row.
tfidf_df.index = pd.Index(df_content['English'])

In [17]:
# Pairwise cosine similarity between every pair of synopses.
# The sparse TF-IDF matrix is passed directly instead of the dense tfidf_df:
# it holds the same rows in the same order (tfidf_df is built from it), but
# avoids multiplying a mostly-zero dense (n_shows x n_terms) matrix.
# The result is still a dense (n_shows x n_shows) NumPy array.
cosine_similarity_array = cosine_similarity(vectorized_data)


In [18]:
# Print the raw similarity matrix; NumPy abbreviates large arrays with '...'.
print(cosine_similarity_array)

[[1.         0.03776806 0.03859335 ... 0.         0.         0.        ]
 [0.03776806 1.         0.03100892 ... 0.         0.         0.        ]
 [0.03859335 0.03100892 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]]


In [19]:
# Convert the cosine similarity array into a DataFrame with show titles as both index and columns
cosine_similarity_df = pd.DataFrame(
    cosine_similarity_array,
    index=tfidf_df.index,
    columns=tfidf_df.index
)


In [20]:
# Preview the labelled similarity matrix (the diagonal is each show compared with itself).
cosine_similarity_df.head()


English,Attack on Titan,Death Note,Fullmetal Alchemist Brotherhood,One Punch Man,Sword Art Online,...,Princess and the Kingdom,Avera and the Mystical Kingdom 2,Brave City,The Juvenile of King Yu,Dino King
English,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Attack on Titan,1.0,0.037768,0.038593,0.013473,0.017222,...,0.0,0.0,0.0,0.0,0.0
Death Note,0.037768,1.0,0.031009,0.015352,0.034031,...,0.0,0.0,0.0,0.0,0.0
Fullmetal Alchemist Brotherhood,0.038593,0.031009,1.0,0.01527,0.01875,...,0.0,0.0,0.0,0.0,0.0
One Punch Man,0.013473,0.015352,0.01527,1.0,0.024467,...,0.0,0.0,0.0,0.0,0.0
Sword Art Online,0.017222,0.034031,0.01875,0.024467,1.0,...,0.0,0.0,0.0,0.0,0.0


In [26]:
# Pull the similarity scores of one show against every other show, best match first.
# NOTE(review): if the chosen title were duplicated in the index, .loc would return
# a DataFrame rather than a Series - confirm the title is unique before changing it.
query_title = "Fullmetal Alchemist Brotherhood"
cosine_similarity_series = cosine_similarity_df.loc[query_title]
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)


In [27]:
# Lift the row-display cap so long Series/DataFrames print in full from here on.
pd.set_option('display.max_rows', None)

# Look at the 20 highest-scoring titles and print those with similarity above 0.1.
# Slicing (instead of indexing positions 0-19 one by one) avoids an IndexError
# when fewer than 20 titles are available.
# Note: the top entry is typically the query title itself (similarity ~1.0).
top_matches = ordered_similarities.iloc[:20]
for show_title, similarity in zip(top_matches.index, top_matches.values):
    if similarity > 0.1:
        print(show_title)
        print(similarity)
        print("")


Fullmetal Alchemist Brotherhood
1.0000000000000004

Fullmetal Alchemist
0.5195248433264401

Fullmetal Alchemist The Movie - Conqueror of Shamballa
0.32948594319269814

Fullmetal Alchemist The Sacred Star of Milos
0.2920986234783643

Fullmetal Alchemist The Sacred Star of Milos PV
0.24123720318108682

Fullmetal Alchemist Brotherhood OVA Collection
0.16670494501800837

Unknown
0.1293388714004816

Armor Shop for Ladies & Gentlemen
0.10944650804141147

Ronin Warriors Legend of Kikoutei
0.10779497189717217

Donten Laughing Under the Clouds - Gaiden Chapter 3 - Conspiracy of the Military
0.10604408374321264

Diabolik Lovers II More,Blood
0.10145054545222647

Unknown
0.10035316730890212

