<a href="https://colab.research.google.com/github/AntBap23/AnimeRecommendations/blob/main/AnimeRecommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Necessary Imports
import pandas as pd
import numpy as np
import nltk
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("punkt")
import string
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack, vstack, save_npz, load_npz
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#read data in
df = pd.read_csv('anime-dataset-2023.csv')

#getting needed columns
df = df[['Name','anime_id','Genres','Synopsis','Studios','Type','Rating','Image URL']]

#google colab problems lol
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 50)

print(df.describe())
print(df.info())
print(df.head())


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


           anime_id
count  24905.000000
mean   29776.709014
std    17976.076290
min        1.000000
25%    10507.000000
50%    34628.000000
75%    45240.000000
max    55735.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       24905 non-null  object
 1   anime_id   24905 non-null  int64 
 2   Genres     24905 non-null  object
 3   Synopsis   24905 non-null  object
 4   Studios    24905 non-null  object
 5   Type       24905 non-null  object
 6   Rating     24905 non-null  object
 7   Image URL  24905 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.5+ MB
None
                              Name  anime_id                                Genres                                           Synopsis         Studios   Type                          Rating                                          Image URL
0                     Cowbo

In [2]:
import pandas as pd
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords


#subsetting types that are not similar to other types
df = df[~df['Type'].isin(['OVA', 'Music', 'UNKNOWN', 'Special'])]
print(df['Type'].unique())

new_df = df.copy()

#Synopsis Preprocess
print(new_df['Synopsis'].describe()) #Too many unknown values

drop_unknown = 'No description available for this anime.'
new_df = new_df[new_df['Synopsis'] != drop_unknown]
new_df.reset_index(inplace=True)



new_df['Synopsis'] = new_df['Synopsis'].str.lower()

stop_words = set(stopwords.words('english'))
new_df['Synopsis'] = new_df['Synopsis'].apply(lambda x: nltk.word_tokenize(x))
new_df['Synopsis'].head()
new_df['Synopsis'] = new_df['Synopsis'].apply(lambda x:' '.join([word for word in x if word not in stop_words and word not in string.punctuation and word.isalpha()]))

lem = WordNetLemmatizer()
new_df['Synopsis'] = new_df['Synopsis'].apply(lambda syn:' '.join([lem.lemmatize(word) for word in syn.split()]))

print(new_df['Synopsis'].describe())
print(new_df['Synopsis'].head())

['TV' 'Movie' 'ONA']
count                                        15511
unique                                       11353
top       No description available for this anime.
freq                                          3983
Name: Synopsis, dtype: object


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


count                  11528
unique                 11301
top       furukawa taku film
freq                      12
Name: Synopsis, dtype: object
0    crime timeless year humanity expanded across g...
1    another day another life often unlucky crew be...
2    vash stampede man bounty head reason merciless...
3    robin sena powerful craft user drafted group s...
4    dark century people suffering rule devil vande...
Name: Synopsis, dtype: object


In [3]:
# Genre Preprocessing
new_df['Genres'] = new_df['Genres'].apply(lambda x: [genre.strip() for genre in x.split(',')])
print(new_df['Genres'].head())

0           [Action, Award Winning, Sci-Fi]
1                          [Action, Sci-Fi]
2               [Action, Adventure, Sci-Fi]
3    [Action, Drama, Mystery, Supernatural]
4        [Adventure, Fantasy, Supernatural]
Name: Genres, dtype: object


In [4]:
# Studios Preprocessing
new_df['Studios'] = new_df['Studios'].apply(lambda x: [studio.strip() for studio in x.split(',')])
print(new_df['Studios'].head())

0           [Sunrise]
1             [Bones]
2          [Madhouse]
3           [Sunrise]
4    [Toei Animation]
Name: Studios, dtype: object


In [5]:
# Clean copy
clean_df = new_df.copy()
print(clean_df.head())
print(clean_df.tail())
print(clean_df.info())
print(clean_df.describe())

   index                             Name  anime_id                                  Genres                                           Synopsis           Studios   Type                          Rating                                          Image URL
0      0                     Cowboy Bebop         1         [Action, Award Winning, Sci-Fi]  crime timeless year humanity expanded across g...         [Sunrise]     TV  R - 17+ (violence & profanity)  https://cdn.myanimelist.net/images/anime/4/196...
1      1  Cowboy Bebop: Tengoku no Tobira         5                        [Action, Sci-Fi]  another day another life often unlucky crew be...           [Bones]  Movie  R - 17+ (violence & profanity)  https://cdn.myanimelist.net/images/anime/1439/...
2      2                           Trigun         6             [Action, Adventure, Sci-Fi]  vash stampede man bounty head reason merciless...        [Madhouse]     TV       PG-13 - Teens 13 or older  https://cdn.myanimelist.net/images/anime/7/203

In [6]:
# Encode Genres
# Create an instance of MultiLabelBinarizer and fit_transform the genre

mlb = MultiLabelBinarizer()
genres_encoded = csr_matrix(mlb.fit_transform(clean_df['Genres'].tolist()))
#display(genres_encoded)

# Create a DataFrame with the one-hot encoded data
genre_df = pd.DataFrame.sparse.from_spmatrix(genres_encoded, columns=[f'GENRE_{genre}' for genre in mlb.classes_])
genre_df.reset_index(drop=True, inplace=True)
display(genre_df.info())
print()
display(genre_df.head())
print()
display(genre_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11528 entries, 0 to 11527
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype           
---  ------               --------------  -----           
 0   GENRE_Action         1 non-null      Sparse[int64, 0]
 1   GENRE_Adventure      1 non-null      Sparse[int64, 0]
 2   GENRE_Avant Garde    1 non-null      Sparse[int64, 0]
 3   GENRE_Award Winning  1 non-null      Sparse[int64, 0]
 4   GENRE_Boys Love      1 non-null      Sparse[int64, 0]
 5   GENRE_Comedy         1 non-null      Sparse[int64, 0]
 6   GENRE_Drama          1 non-null      Sparse[int64, 0]
 7   GENRE_Ecchi          1 non-null      Sparse[int64, 0]
 8   GENRE_Erotica        1 non-null      Sparse[int64, 0]
 9   GENRE_Fantasy        1 non-null      Sparse[int64, 0]
 10  GENRE_Girls Love     1 non-null      Sparse[int64, 0]
 11  GENRE_Gourmet        1 non-null      Sparse[int64, 0]
 12  GENRE_Hentai         1 non-null      Sparse[int64, 0]
 13  G

None




Unnamed: 0,GENRE_Action,GENRE_Adventure,GENRE_Avant Garde,GENRE_Award Winning,GENRE_Boys Love,GENRE_Comedy,GENRE_Drama,GENRE_Ecchi,GENRE_Erotica,GENRE_Fantasy,...,GENRE_Hentai,GENRE_Horror,GENRE_Mystery,GENRE_Romance,GENRE_Sci-Fi,GENRE_Slice of Life,GENRE_Sports,GENRE_Supernatural,GENRE_Suspense,GENRE_UNKNOWN
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0





(11528, 22)

In [7]:
# Encode Studios
# Create an instance of MultiLabelBinarizer and fit_transform the studios
mlb = MultiLabelBinarizer()
studios_encoded = csr_matrix(mlb.fit_transform(clean_df['Studios'].tolist()))
studios_encoded

# Create a DataFrame with the one-hot encoded data
studios_df = pd.DataFrame.sparse.from_spmatrix(studios_encoded, columns=[f'STUDIO_{studio}' for studio in mlb.classes_])
studios_df.info()
studios_df.reset_index(drop=True, inplace=True)
display(studios_df.head())
print()
display(studios_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11528 entries, 0 to 11527
Columns: 775 entries, STUDIO_100studio to STUDIO_yell
dtypes: Sparse[int64, 0](775)
memory usage: 144.1 KB


Unnamed: 0,STUDIO_100studio,STUDIO_10Gauge,STUDIO_1IN,STUDIO_2:10 AM Animation,STUDIO_33 Collective,STUDIO_5 Inc.,STUDIO_6pucks,STUDIO_7doc,STUDIO_8bit,STUDIO_A-1 Pictures,...,STUDIO_l-a-unch・BOX,STUDIO_monofilmo,STUDIO_pH Studio,STUDIO_production doA,STUDIO_studio MOTHER,STUDIO_studio hb,STUDIO_team Yamahitsuji,STUDIO_teamKG,STUDIO_ufotable,STUDIO_yell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0





(11528, 775)

In [8]:
# Encoding synopsis
vec = TfidfVectorizer()
synopsis_encoded = csr_matrix(vec.fit_transform((clean_df['Synopsis'])))
#display(type(synopsis_encoded))

# To dataframe
synopsis_df = pd.DataFrame.sparse.from_spmatrix(synopsis_encoded, columns=[f'WORD_{synopsis}' for synopsis in vec.get_feature_names_out()])
display(synopsis_df.info())
print()
#display(synopsis_df.head())
print()
display(synopsis_df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11528 entries, 0 to 11527
Columns: 36438 entries, WORD_aa to WORD_ﬁnal
dtypes: Sparse[float64, 0](36438)
memory usage: 5.1 MB


None





Index(['WORD_aa', 'WORD_aachi', 'WORD_aah', 'WORD_aak', 'WORD_aanimationn', 'WORD_aaron', 'WORD_ab', 'WORD_aback', 'WORD_abale', 'WORD_abandon',
       ...
       'WORD_猪猪侠', 'WORD_眉村', 'WORD_鈴木伸一', 'WORD_闘龍極意書', 'WORD_青青草原', 'WORD_鳥羽和一', 'WORD_박사', 'WORD_카프', 'WORD_코믹톰', 'WORD_ﬁnal'], dtype='object', length=36438)

In [9]:
ani_idx = 11
display(new_df['Synopsis'][ani_idx])
print()
synopsis_df.loc[ani_idx].idxmax()

'gol roger known pirate king strongest infamous sailed grand line capture execution roger world government brought change throughout world last word death revealed existence greatest treasure world one piece revelation brought grand age pirate men dreamed finding one promise unlimited amount rich quite possibly pinnacle glory title pirate king enter monkey luffy boy defies standard definition pirate rather popular persona wicked hardened toothless pirate ransacking village fun luffy reason pirate one pure wonder thought exciting adventure lead intriguing people ultimately promised treasure following footstep childhood hero luffy crew travel across grand line experiencing crazy adventure unveiling dark mystery battling strong enemy order reach coveted piece'




'WORD_pirate'

In [10]:
item_profile = hstack([1.2*genres_encoded, studios_encoded, 2*synopsis_encoded])
display(item_profile)


<11528x37235 sparse matrix of type '<class 'numpy.float64'>'
	with 477781 stored elements in Compressed Sparse Row format>

In [11]:
# User profile
def create_user_profile(anime_list):
        idx_list = []
        for anime in anime_list:
            idx_list.append(clean_df.index[clean_df['Name'] == anime][0])
        #display(idx_list)
        user_profile = vstack([item_profile[idx] for idx in idx_list]).mean(axis=0)
        return csr_matrix(user_profile)
user_profile = create_user_profile(['Bleach','Haikyuu!!', 'Slam Dunk'])
display(user_profile)

<1x37235 sparse matrix of type '<class 'numpy.float64'>'
	with 219 stored elements in Compressed Sparse Row format>

In [12]:
# Cosine Similarity
sim_matrix = cosine_similarity(user_profile, item_profile)
#sim_matrix.shape
#display(sim_matrix.argsort()[0][::-1])
display(sim_matrix[0])

array([0.09970802, 0.11113079, 0.19326609, ..., 0.0049341 , 0.011826  ,
       0.103943  ])

In [13]:
def get_recommendations(anime_list, cosine_sim, topN=10):
    # Create user profile based on the anime list
    user_profile = create_user_profile(anime_list)

    # Calculate cosine similarity between user profile and item profiles
    sim_matrix = cosine_similarity(user_profile, item_profile)

    # Get similarity scores for the user
    sim_scores = sim_matrix[0]

    # Sort animes based on similarity scores
    anime_indices = sim_scores.argsort()[::-1]

    # Get the indices of the top N most similar animes
    top_anime_indices = anime_indices[1:topN+1]

    # Return the top N most similar anime names
    return clean_df['Name'].iloc[top_anime_indices]

recommendations = get_recommendations(['Naruto'], sim_matrix) # Pass the sim_matrix calculated earlier to the function
print(recommendations)

11459                                        Naruto (2023)
1025                                    Naruto: Shippuuden
6735                       Boruto: Naruto Next Generations
3818            Naruto: Shippuuden Movie 6 - Road to Ninja
5203                              Boruto: Naruto the Movie
2512     Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2947           Naruto: Shippuuden Movie 4 - The Lost Tower
2083                   Naruto: Shippuuden Movie 2 - Kizuna
626      Naruto Movie 2: Dai Gekitotsu! Maboroshi no Ch...
3428             Naruto: Shippuuden Movie 5 - Blood Prison
Name: Name, dtype: object
