In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer

In [2]:
df = pd.read_csv("../../anime.csv")
df = df.fillna('')

In [11]:
df[df==""]

Unnamed: 0,Aired,Broadcast,Characters,Demographic,Duration,Endings,English,Episodes,Favorites,Genres,...,Studios,Themes,Type,id,members,popularity,ranking,rating,summary,title
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9711,,,,,,,,,,,...,,,,,,,,,,
9712,,,,,,,,,,,...,,,,,,,,,,
9713,,,,,,,,,,,...,,,,,,,,,,
9714,,,,,,,,,,,...,,,,,,,,,,


In [4]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['summary'])
tfidf_matrix.shape

(9716, 29818)

In [5]:
def preprocess_col(column):
    set_genre = set()
    for _, row in enumerate(column):
        try:
            genres = row.split(',')
            for genre in genres:
                set_genre.add(genre)
        except Exception:
            genre = row

            set_genre.add(genre)
    return set_genre


def encode_categories(df, column_name):
    set_genres = preprocess_col(df[column_name])
    set_genres.remove('')
    genres_list = list(set_genres)
    genres_array = np.array(genres_list).reshape(1,-1)

    le = MultiLabelBinarizer()
    le.fit(genres_array)


    def apply_le(x):
        if len(x)==0:
            return [0] * len(le.classes_)
            

        res = le.transform([x.split(",")])
        return res[0]
    

    return df[column_name].apply(apply_le)




In [6]:
genres_encoded = encode_categories(df, 'Genres')
studios_encoded = encode_categories(df, 'Studios')
themes_encoded = encode_categories(df, 'Themes')


In [7]:
genres_encoded.shape

(9716,)

In [8]:
cosine_sim_genres = cosine_similarity(pd.DataFrame(list(genres_encoded)))
cosine_sim_studios = cosine_similarity(pd.DataFrame(list(studios_encoded)))
cosine_sim_encoded = cosine_similarity(pd.DataFrame(list(themes_encoded)))
cosine_sim_summary = cosine_similarity(tfidf_matrix, tfidf_matrix)


KeyboardInterrupt: 

In [12]:
cosine_sim_summary = cosine_similarity(tfidf_matrix, tfidf_matrix)
res = np.apply_along_axis(lambda x: zip(df.id, x), 1, cosine_sim_summary)
print(*zip(df.id, res))

('34895', <zip object at 0x00000162B1D1F980>) ('48035', <zip object at 0x00000162B1D1CAC0>) ('52566', <zip object at 0x00000162B1D1FF00>) ('1290', <zip object at 0x00000162B1D1F3C0>) ('1411', <zip object at 0x00000162B1D1F080>) ('53136', <zip object at 0x00000162B1D1F640>) ('50204', <zip object at 0x00000162B1D1F880>) ('44650', <zip object at 0x00000162B1D1EF00>) ('32559', <zip object at 0x00000162B1D1F540>) ('46229', <zip object at 0x00000162B1D1F400>) ('23889', <zip object at 0x00000162B1D1F240>) ('33618', <zip object at 0x00000162B1D1F300>) ('16970', <zip object at 0x00000162B1D1F500>) ('3384', <zip object at 0x00000162B1D05AC0>) ('52382', <zip object at 0x00000162E6BAF900>) ('42619', <zip object at 0x00000162E85FD600>) ('18679', <zip object at 0x00000162E85FE740>) ('36755', <zip object at 0x00000162E85FCAC0>) ('22295', <zip object at 0x00000162E85FFAC0>) ('1991', <zip object at 0x00000162E85FDFC0>) ('33531', <zip object at 0x00000162E86401C0>) ('4119', <zip object at 0x00000162E864

In [9]:
indices = pd.Series(df.index, index=df['title'])


In [14]:
indices.sample(100).index

Index(['Hitohira',
       'Fate/Grand Order: Zettai Majuu Sensen Babylonia - Initium Iter',
       'Xiha Youji', 'Cai Liu',
       'Arifureta Shokugyou de Sekai Saikyou: Arifureta Yorimachi de Sekai Saikyou',
       'Jiqiren Dou Dou: Huanle Dou Dou Ji',
       'Free! Movie 3: Road to the World - Yume', 'Angel ga Tonda Hi',
       'Yingxiong Qi Ge Ban',
       'Detective Conan Movie 09: Strategy Above the Depths',
       'Bai She II: Qing She Jie Qi', 'Oyayubi-hime', 'Kigyou Senshi Arslan',
       'Dai 13-kai Indies Anime Festa OP', 'Metal Fight Beyblade',
       'Queen's Blade: Rurou no Senshi Specials',
       'Mashin Mukashi Banashi Gekijou', '77Danui Bimil', 'Bbasha Mecard S',
       'Ryuudouji Shimon no Inbou', 'Hitozuma Cosplay Kissa',
       'Chokotto Kamen feat. Inoue Kikuko', 'Aikagi The Animation',
       'Date A Live II', 'Chinzei Hachirou Tametomo (2021)',
       'Tokubetsu Byoutou', 'Feng Ji Yun Nu', 'Futon', 'Cofun Gal no Coffy',
       'Date A Live: Date to Date',
       

In [11]:
def get_recommendations(title, cosine_sim, num_recommend = 10):
    idx = indices[title]
    print(idx[0])
# Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
# Sort the movies based on the similarity scores
    print(cosine_sim.shape)
    print(sim_scores)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Get the scores of the 10 most similar movies
    top_similar = sim_scores[1:num_recommend+1]
# Get the movie indices
    movie_indices = [i[0] for i in top_similar]
# Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]


In [19]:
indices[[10,152,2357]].index[0]

'Pyuu to Fuku! Jaguar: Return of Yaku Ichinenburi'

In [74]:
get_recommendations('Yahari Ore no Seishun Love Comedy wa Machigatteiru.', cosine_sim_summary,num_recommend = 20)


IndexError: invalid index to scalar variable.