In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity between vectors
from scipy.spatial.distance import pdist, squareform  # For pairwise distance computations and converting to a square matrix
import pickle
import os

# pd.set_option('display.max_columns', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)


In [4]:
anime_df = pd.read_csv('AnimeNEW.csv')

In [21]:

def recommend_by_jaccard(
    title,
    anime_df,
    top_n=5,
    type='Genres',  # 'Genres' or 'Themes'
    combine=False,
    precomputed=None  # None, DataFrame (for single), or dict {'genre': df, 'theme': df}
):
    """
    Recommend similar anime using Jaccard similarity on Genres, Themes, or both.

    Parameters:
    -----------
    title : str
        The anime title (English) to base recommendations on.

    anime_df : pd.DataFrame
        Dataset containing at least 'English', 'Genres', and 'Themes'.

    top_n : int
        Number of top similar results to return.

    type : str
        Either 'Genres' or 'Themes'. Only used if combine=False.

    combine : bool
        If True, calculate Jaccard on both Genres and Themes and return intersection.

    precomputed : DataFrame or dict, optional
        If combine=False, pass a Jaccard similarity DataFrame.
        If combine=True, pass a dict: {'genre': df, 'theme': df}.

    Returns:
    --------
    dict
        If combine=False:
            { "top": pd.Series }
        If combine=True:
            {
              "genre_top": pd.Series,
              "theme_top": pd.Series,
              "common": list
            }
    """

    # Check if title exists in the dataset
    if title not in anime_df['English'].values:
        raise ValueError(f"'{title}' not found in dataset.")

    def compute_jaccard(df, col):
        cross_tab = pd.crosstab(df['English'], df[col])
        distances = pdist(cross_tab.values, metric='jaccard')
        similarity = 1 - squareform(distances)
        return pd.DataFrame(similarity, index=cross_tab.index, columns=cross_tab.index)
    
    def compute_jaccard_array(df, col, arr):
        cross_tab = pd.crosstab(df['English'], df[col])
        distances = arr
        similarity = 1 - squareform(distances)
        return pd.DataFrame(similarity, index=cross_tab.index, columns=cross_tab.index)

    # === Combine mode ===
    if combine:

        # if (not isinstance(precomputed, dict)) or ('genre' not in precomputed or 'theme' not in precomputed) or (precomputed['genre'] is None or precomputed['theme'] is None):
        #     raise ValueError("precomputed must be a dict with 'genre' and 'theme' keys.")
        
        if precomputed == None: # they didnt pass anything
            df_jaccard_genres = anime_df[["English", 'Genres']]
            df_filtered_genres = df_jaccard_genres[(df_jaccard_genres['Genres'] != 'Unknown') & (df_jaccard_genres['English'] != 'Unknown')]

            genres_df = compute_jaccard(df_filtered_genres, 'Genres')

            df_jaccard_themes = anime_df[["English", 'Themes']]
            df_filtered_themes = df_jaccard_themes[(df_jaccard_themes['Themes'] != 'Unknown') & (df_jaccard_themes['English'] != 'Unknown')]

            themes_df = compute_jaccard(df_filtered_themes, 'Genres')

            genre_top = genres_df.loc[title].sort_values(ascending=False)[1:top_n+1]
            theme_top = themes_df.loc[title].sort_values(ascending=False)[1:top_n+1]
            common = list(set(genre_top.index) & set(theme_top.index))

            return {
                "genre_top": genre_top,
                "theme_top": theme_top,
                "common": common
            }
        
        else:
            df_jaccard_themes = anime_df[["English", 'Themes']]
            df_filtered_themes = df_jaccard_themes[(df_jaccard_themes['Themes'] != 'Unknown') & (df_jaccard_themes['English'] != 'Unknown')]

            df_jaccard_genres = anime_df[["English", 'Genres']]
            df_filtered_genres = df_jaccard_genres[(df_jaccard_genres['Genres'] != 'Unknown') & (df_jaccard_genres['English'] != 'Unknown')]

            genre_sim = precomputed.get('genre')
            genre_sim_df = compute_jaccard_array(df_filtered_genres, 'Genres', genre_sim)

            theme_sim = precomputed.get('theme')
            theme_sim_df = compute_jaccard_array(df_filtered_themes, 'Themes', theme_sim)

            if title not in genre_sim_df.index:
                raise ValueError(f"'{title}' not found in genre similarity data.")
            if title not in theme_sim_df.index:
                raise ValueError(f"'{title}' not found in theme similarity data.")

            genre_top = genre_sim_df.loc[title].sort_values(ascending=False)[1:top_n+1]
            theme_top = theme_sim_df.loc[title].sort_values(ascending=False)[1:top_n+1]
            common = list(set(genre_top.index) & set(theme_top.index))
   
            return {
                # "genre_top": genre_top,
                # "theme_top": theme_top,
                "common": common
            }
        

    # === Single mode ===
    else:
        if type not in ['Genres', 'Themes']:
            raise ValueError("type must be either 'Genres' or 'Themes'")
        
        df_jaccard = anime_df[["English", type]]
        df_filtered = df_jaccard[(df_jaccard[type] != 'Unknown') & (df_jaccard['English'] != 'Unknown')]




        sim = precomputed
        if sim is None:
            sim = compute_jaccard(df_filtered, type)
            if title not in sim.index:
                raise ValueError(f"'{title}' not found in similarity data.")
            top = sim.loc[title].sort_values(ascending=False)[1:top_n+1]
            return {"top": top}
        
        else:
            sim_df = compute_jaccard_array(df_filtered, type, sim)

            if title not in sim_df.index:
                raise ValueError(f"'{title}' not found in similarity data.")
            top = sim_df.loc[title].sort_values(ascending=False)[1:top_n+1]
            return {"top": top}
            
        


        
        



In [22]:
res = recommend_by_jaccard("John Wax", anime_df, type='Themes', combine=False)

ValueError: 'John Wax' not found in dataset.

In [23]:
with open("dropJthemes.pkl", "rb") as f:
    d = pickle.load(f)
with open("dropJgenres.pkl", "rb") as f:
    e = pickle.load(f)

In [24]:
ress = recommend_by_jaccard("One Punch Man", anime_df, type='Themes', combine=False, precomputed=d)


In [41]:
res

{'top': English
 Samurai Flamenco                   1.0
 Nanako SOS                         1.0
 One Punch Man Specials             1.0
 One Punch Man Season 2 Specials    1.0
 GJ8man "Highlights"                1.0
 Name: One Punch Man, dtype: float64}

In [25]:
ress

{'top': English
 Samurai Flamenco                   1.0
 Nanako SOS                         1.0
 One Punch Man Specials             1.0
 One Punch Man Season 2 Specials    1.0
 GJ8man "Highlights"                1.0
 Name: One Punch Man, dtype: float64}

In [26]:
spot = {'genre': e, 'theme': d}

In [27]:
resss = recommend_by_jaccard("One Punch Man", anime_df, top_n=30, combine=True, precomputed=spot)

In [28]:
resss

{'common': ['One Punch Man Specials',
  'One Punch Man 3',
  'One Punch Man Season 2',
  'One Punch Man',
  'One Punch Man 2nd Season Commemorative Special']}

In [19]:
def recommend_similar_anime(title, anime_df, top_n, precomputed=None):
    '''
    Recommend the most similar anime based on TF-IDF cosine similarity of the synopsis.
    title: str
        The English title of the anime for which to find similar recommendations.

    anime_df: pd.DataFrame
        The full anime dataset containing at least the 'English' and 'Synopsis' columns.

    top_n: int
        The number of top similar anime to return (excluding the anime itself).

    precomputed=None: np array
        A precomputed cosine similarity matrix. If None, the function will compute it.

    Returns:
    pd.Series
        A pandas Series of the top N most similar anime titles and their similarity scores,
        sorted in descending order.
    '''
    if title not in anime_df['English'].values:
        raise ValueError(f"'{title}' not found in dataset.")
    

    df_content = anime_df[["English", "Synopsis"]]
    
    df_content = df_content[(df_content['Synopsis'] != 'Unknown') & (df_content['English'] != 'Unknown')]

    cosine_similarity_array = precomputed
    if cosine_similarity_array is not None:
        vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, stop_words='english')
        vectorized_data = vectorizer.fit_transform(df_content['Synopsis'])

        tfidf_df = pd.DataFrame(
        vectorized_data.toarray(),
        columns=vectorizer.get_feature_names_out()
        )

        tfidf_df.index = df_content['English']

        cosine_similarity_df = pd.DataFrame(
        cosine_similarity_array,
        index=tfidf_df.index,
        columns=tfidf_df.index
        )
        cosine_similarity_series = cosine_similarity_df.loc[title]
        ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
        return {"top": ordered_similarities}

        
    else:
        vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, stop_words='english')
        vectorized_data = vectorizer.fit_transform(df_content['Synopsis'])

        tfidf_df = pd.DataFrame(
        vectorized_data.toarray(),
        columns=vectorizer.get_feature_names_out()
        )

        tfidf_df.index = df_content['English']
        cosine_similarity_array = cosine_similarity(tfidf_df)

        cosine_similarity_df = pd.DataFrame(
        cosine_similarity_array,
        index=tfidf_df.index,
        columns=tfidf_df.index
        )
        cosine_similarity_series = cosine_similarity_df.loc[title]
        ordered_similarities = cosine_similarity_series.sort_values(ascending=False)[1:top_n+1]
        return {"top": ordered_similarities}


In [12]:
# to load
with open("coSim_ESdrop.pkl", "rb") as f:
    cosine_similarity_array = pickle.load(f)



In [20]:
recommend_similar_anime("Death Note", anime_df, 5, cosine_similarity_array)

{'top': English
 Death Note                                     1.000000
 Death Note Relight                             0.324576
 gdgd men's Party                               0.130101
 Screechers Wild!                               0.128386
 Yadamon Magical Dreamer                        0.123091
                                                  ...   
 One Room 2nd Season Hanasaka Yui's Prologue    0.000000
 Evil Woman Executive                           0.000000
 Alice in Voodooland                            0.000000
 Hot Minute Gudetama                            0.000000
 Scumbag System Season 2                        0.000000
 Name: Death Note, Length: 8409, dtype: float64}