In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

In [2]:
# Load the cleaned anime dataset from a path relative to the working directory.
df = pd.read_csv("datasets/anime_cleaned.csv")

In [3]:
# Inspect the column names of the loaded frame.
df.columns

Index(['anime_id', 'title', 'title_english', 'title_japanese',
       'title_synonyms', 'image_url', 'type', 'source', 'episodes', 'status',
       'airing', 'aired_string', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme', 'duration_min',
       'aired_from_year'],
      dtype='object')

In [4]:
# Remove the columns that the visible cells of this notebook never read.
# `df` is rebound instead of mutated in place, and errors="ignore" makes the
# cell idempotent: re-running it no longer raises KeyError for columns that
# were already dropped by a previous execution.
df = df.drop(
    columns=[
        "title_japanese",
        "title_synonyms",
        "airing",
        "popularity",
        "members",
        "background",
        "premiered",
        "broadcast",
        "opening_theme",
        "ending_theme",
        "related",
    ],
    errors="ignore",
)

In [5]:
def _split_genres(value):
    """Turn a comma-separated genre string into a list of stripped names.

    Lists are returned unchanged so that re-running the cell does not wipe
    genres that were already parsed; any other non-string value (e.g. NaN)
    becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return [g.strip() for g in value.split(",")]
    return []


df['genre'] = df['genre'].apply(_split_genres)

In [6]:
# Print the first row to see the remaining columns with an example value each.
print(df.iloc[0])

anime_id                                                       11013
title                                                  Inu x Boku SS
title_english                              Inu X Boku Secret Service
image_url          https://myanimelist.cdn-dena.com/images/anime/...
type                                                              TV
source                                                         Manga
episodes                                                          12
status                                               Finished Airing
aired_string                            Jan 13, 2012 to Mar 30, 2012
aired                     {'from': '2012-01-13', 'to': '2012-03-30'}
duration                                             24 min. per ep.
rating                                     PG-13 - Teens 13 or older
score                                                           7.63
scored_by                                                     139250
rank                              

In [7]:
# Preview the parsed genre lists of the first ten rows.
df['genre'].head(10)


0             [Comedy, Supernatural, Romance, Shounen]
1           [Comedy, Parody, Romance, School, Shounen]
2                      [Comedy, Magic, School, Shoujo]
3             [Comedy, Drama, Magic, Romance, Fantasy]
4                    [Comedy, Drama, Romance, Shounen]
5                               [Kids, School, Shoujo]
6             [Magic, Comedy, Romance, School, Shoujo]
7    [Action, Drama, Fantasy, Romance, School, Supe...
8       [Music, Slice of Life, Comedy, Romance, Josei]
9             [Comedy, Harem, Romance, School, Shoujo]
Name: genre, dtype: object

In [8]:
# Keep only the columns used to build features and to display suggestions.
# .copy() makes the subset an independent frame, so assigning the 'cluster'
# column to it later does not raise pandas' SettingWithCopyWarning.
df = df[['episodes', 'score', 'genre', 'title']].copy()

In [9]:
# Preview the reduced frame.
df.head(3)

Unnamed: 0,episodes,score,genre,title
0,12,7.63,"[Comedy, Supernatural, Romance, Shounen]",Inu x Boku SS
1,26,7.89,"[Comedy, Parody, Romance, School, Shounen]",Seto no Hanayome
2,51,7.55,"[Comedy, Magic, School, Shoujo]",Shugo Chara!! Doki


In [10]:

# One-hot encode the genre lists: each list is joined with '|' and the joined
# string is expanded into one 0/1 indicator column per distinct genre.
df_genre = (
    df['genre']
    .str.join('|')
    .str.get_dummies(sep='|')
)

df_genre.head()


Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Standardize the numeric features; missing values are replaced with 0 first.
scaler = StandardScaler()

numeriche = df.loc[:, ['score', 'episodes']].fillna(value=0)
numeriche_scaled = scaler.fit_transform(numeriche)

In [12]:
# Feature matrix: genre indicator columns followed by the scaled numeric columns.
X = np.hstack((df_genre.to_numpy(), numeriche_scaled))

print(X.shape)

(6668, 45)


In [13]:
from sklearn.cluster import KMeans

# Fit a 10-cluster k-means model on the feature matrix (random_state fixed to 42)
# and read the cluster assigned to each row from the fitted model.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X)
cluster_labels = kmeans.labels_

df['cluster'] = cluster_labels

# Quick check of the assigned clusters
df[['cluster']].head()

Unnamed: 0,cluster
0,6
1,6
2,5
3,6
4,5


In [14]:
# Display the frame, now including the 'cluster' column.
df 

Unnamed: 0,episodes,score,genre,title,cluster
0,12,7.63,"[Comedy, Supernatural, Romance, Shounen]",Inu x Boku SS,6
1,26,7.89,"[Comedy, Parody, Romance, School, Shounen]",Seto no Hanayome,6
2,51,7.55,"[Comedy, Magic, School, Shoujo]",Shugo Chara!! Doki,5
3,38,8.21,"[Comedy, Drama, Magic, Romance, Fantasy]",Princess Tutu,6
4,25,8.67,"[Comedy, Drama, Romance, Shounen]",Bakuman. 3rd Season,5
...,...,...,...,...,...
6663,0,9.52,[Hentai],Dokidoki Little Ooyasan,5
6664,1,0.00,"[Slice of Life, Drama, Romance]",Wo Shi Jiang Xiaobai (2018),4
6665,1,4.20,"[Music, Kids]",Genki Genki Non-tan: Obake Mura Meiro,4
6666,0,4.57,[Kids],Mr. Men Little Miss,4


In [16]:
def suggerisci_anime(nome_anime, n_suggerimenti=5):
    """Suggest the anime closest to `nome_anime` within its own cluster.

    Reads the module-level frame `df` (columns 'title', 'cluster', 'score',
    'episodes', 'genre') and the feature matrix `X`, whose rows are assumed to
    be in the same order as the rows of `df`.

    Parameters
    ----------
    nome_anime : str
        Title to look up; compared case-insensitively against df['title'].
        When several rows match, the first one is used as the query.
    n_suggerimenti : int, default 5
        Maximum number of suggestions to return.

    Returns
    -------
    pandas.DataFrame or None
        Rows from the same cluster sorted by ascending cosine distance to the
        query row, with columns 'title', 'distanza', 'score', 'episodes',
        'genre'. Rows whose title equals the query are excluded. When no
        title matches, a message is printed and None is returned.
    """
    nome_normalizzato = nome_anime.lower()
    corrispondenze = (df['title'].str.lower() == nome_normalizzato).to_numpy()

    if not corrispondenze.any():
        print("Anime non trovato")
        return

    # Work with row positions, not index labels: X is a plain NumPy array, so
    # indexing it with df's labels is only correct for a default RangeIndex.
    posizione_anime = np.flatnonzero(corrispondenze)[0]
    cluster_corrente = df['cluster'].to_numpy()[posizione_anime]

    # The same boolean mask selects matching rows from both df and X.
    nel_cluster = (df['cluster'] == cluster_corrente).to_numpy()
    cluster_anime = df[nel_cluster].copy()

    anime_feature = X[posizione_anime].reshape(1, -1)
    cluster_anime['distanza'] = cosine_distances(anime_feature, X[nel_cluster])[0]

    # Drop the query title itself before ranking.
    suggerimenti = cluster_anime[cluster_anime['title'].str.lower() != nome_normalizzato]
    suggerimenti = suggerimenti.sort_values('distanza').head(n_suggerimenti)

    return suggerimenti[['title', 'distanza', 'score', 'episodes', 'genre']]


In [None]:

# Example query: suggestions for "Hunter x Hunter" with the default number of results.
suggerisci_anime("Hunter x Hunter")

Unnamed: 0,title,distanza,score,episodes,genre
333,Hunter x Hunter: Greed Island Final,0.085846,8.38,14,"[Action, Adventure, Shounen, Super Power]"
1358,Hunter x Hunter: Yorkshin City Kanketsu-hen,0.109998,8.39,8,"[Action, Adventure, Super Power, Shounen]"
2120,Hunter x Hunter: Greed Island,0.111707,8.3,8,"[Action, Adventure, Shounen, Super Power]"
88,Rekka no Honoo,0.163049,7.42,42,"[Action, Adventure, Martial Arts, Shounen, Sup..."
53,D.Gray-man Hallow,0.180717,7.92,13,"[Action, Adventure, Super Power, Demons, Shounen]"


In [21]:

import pickle

# Persist the clustered frame as CSV (without the index column).
df.to_csv(path_or_buf='datasets/anime_suggestions.csv', index=False)

# Persist the feature matrix alongside it as a pickle file.
with open('matrice_X.pkl', mode='wb') as f:
    pickle.dump(X, f)
