# Content-based recommendation

In this notebook, we will address the fundamental question of our research topic: 'What anime should you watch next?' This time, our approach will exclusively utilize a content-based recommendation system.

In [73]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
import html

# Load datasets
animes = pd.read_csv("data/anime.csv")
ratings = pd.read_csv("data/rating.csv")

# Clean and preprocess the data

# clean the names of animes
for i in range(len(animes["name"])):
    animes["name"][i] = html.unescape(animes["name"][i])
    
animes = animes.dropna() # Handle missing genres
animes['genre'] = animes['genre'].str.split(', ') # Split genres

# One-hot encode genres
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(animes['genre']), columns=mlb.classes_, index=animes.index)

# Merge genres back with anime data
animes = pd.concat([animes, genre_encoded], axis=1)

# Filter ratings to include only anime_id already in animes dataframe
valid_anime_ids = set(animes['anime_id'])
ratings = ratings[ratings['anime_id'].isin(valid_anime_ids)]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  animes["name"][i] = html.unescape(animes["name"][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  animes["na

In [75]:
animes

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Action,Adventure,Cars,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie,1,9.37,200630,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV,64,9.26,793665,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.25,114262,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"[Sci-Fi, Thriller]",TV,24,9.17,673572,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama',"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.16,151266,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,[Hentai],OVA,1,4.15,211,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,5543,Under World,[Hentai],OVA,1,4.28,183,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,5621,Violence Gekiga David no Hoshi,[Hentai],OVA,4,4.88,219,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,[Hentai],OVA,1,4.98,175,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute item-item similarity based on content features
# anime_features = animes.drop(['anime_id', 'name', 'genre', 'type', 'episodes'], axis=1)
similarity_matrix = cosine_similarity(anime_features)

In [83]:
def split_data(animes, test_fraction=0.2):
    test_data = animes.sample(frac=test_fraction, random_state=42)
    return test_data

In [95]:
# Recommendation function
def recommend_content_based(anime_id, top_k=5):
    if anime_id not in animes['anime_id'].values:
        return pd.DataFrame()  # Return an empty DataFrame if anime_id is invalid

    print(animes[animes['anime_id'] == anime_id])
    anime_idx = animes[animes['anime_id'] == anime_id].index[0]
    similarities = similarity_matrix[anime_idx]
    similar_anime_indices = np.argsort(similarities)[::-1][1:top_k+1]
    return animes.iloc[similar_anime_indices][['anime_id', 'name']]

In [91]:
# from sklearn.metrics import mean_squared_error, roc_auc_score

# Precision@k calculation
def precision_at_k(predicted, ground_truth, k):
    correct_predictions = 0
    for user_id, items in predicted.items():
        true_items = ground_truth.get(user_id, [])
        top_k_items = items[:k]
        correct_predictions += len(set(top_k_items) & set(true_items))
    total_predictions = len(predicted) * k
    return correct_predictions / total_predictions
    
# Diversity calculation
def calculate_diversity(recommendations, item_features):
    diversities = []
    for items in recommendations.values():
        pairwise_distances = cosine_similarity(item_features.loc[items])
        avg_distance = 1 - pairwise_distances.mean()
        diversities.append(avg_distance)
    return np.mean(diversities)

# Coverage calculation
def calculate_coverage(recommendations, total_items):
    recommended_items = set(item for user_items in recommendations.values() for item in user_items)
    return len(recommended_items) / total_items

# Personalization calculation
def calculate_personalization(recommendations, item_features):
    user_vectors = [item_features.loc[items].mean(axis=0) for items in recommendations.values()]
    pairwise_distances = cosine_similarity(user_vectors)
    avg_distance = 1 - pairwise_distances.mean()
    return avg_distance


In [97]:
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np

# Split data into train and test sets
train_data, test_data = split_data(ratings, test_fraction=0.2)

# Build recommendation system
recommendations = defaultdict(list)
for user_id in test_data['user_id'].unique():
    user_ratings = train_data[train_data['user_id'] == user_id]
    for _, row in user_ratings.iterrows():
        recs = recommend_content_based(row['anime_id'], top_k=5)
        if not recs.empty:
            recommendations[user_id].extend(recs['anime_id'].tolist())

# Calculate metrics
ground_truth = {user_id: test_data[test_data['user_id'] == user_id]['anime_id'].tolist()
                for user_id in test_data['user_id'].unique()}

precision = precision_at_k(recommendations, ground_truth, k=5)
diversity = calculate_diversity(recommendations, animes.set_index('anime_id'))
coverage = calculate_coverage(recommendations, len(animes))
personalization = calculate_personalization(recommendations, animes.set_index('anime_id'))

# Plot and interpret results
metrics = {
    'Precision@5': precision,
    'Diversity': diversity,
    'Coverage': coverage,
    'Personalization': personalization,
}

plot_metrics(metrics)

def plot_metrics(metrics):
    # Plot bar chart for evaluation metrics
    plt.figure(figsize=(12, 6))
    plt.bar(metrics.keys(), metrics.values())
    plt.title("Recommendation System Evaluation Metrics")
    plt.ylabel("Score")
    plt.xlabel("Metrics")
    plt.show()

    # Interpretation
    interpret_metrics(metrics)

def interpret_metrics(metrics):
    print("\n**Metrics Interpretation**")
    print(f"RMSE (Root Mean Square Error): {metrics['RMSE']:.2f}")
    print(" - Lower RMSE indicates better prediction accuracy.\n")
    
    print(f"Precision@5: {metrics['Precision@5']:.2f}")
    print(" - Higher Precision@5 means the system is recommending relevant items more often.\n")
    
    print(f"Diversity: {metrics['Diversity']:.2f}")
    print(" - Higher Diversity indicates recommendations are less similar to each other, which is preferable.\n")
    
    print(f"Coverage: {metrics['Coverage']:.2f}")
    print(" - Higher Coverage means more items are being recommended at least once.\n")
    
    print(f"Personalization: {metrics['Personalization']:.2f}")
    print(" - Higher Personalization indicates recommendations are more tailored to individual users.\n")

     anime_id           name                               genre type  \
464        24  School Rumble  [Comedy, Romance, School, Shounen]   TV   

    episodes  rating  members  Action  Adventure  Cars  ...  Shounen Ai  \
464       26    8.06   178553       0          0     0  ...           0   

     Slice of Life  Space  Sports  Super Power  Supernatural  Thriller  \
464              0      0       0            0             0         0   

     Vampire  Yaoi  Yuri  
464        0     0     0  

[1 rows x 50 columns]
worked for user anime_id 24
      anime_id              name  \
5438        51  Tenshi Kinryouku   

                                               genre type episodes  rating  \
5438  [Action, Drama, Romance, Shoujo, Supernatural]  OVA        3    6.46   

      members  Action  Adventure  Cars  ...  Shounen Ai  Slice of Life  Space  \
5438    36352       1          0     0  ...           0              0      0   

      Sports  Super Power  Supernatural  Thriller  Vamp

IndexError: index 12164 is out of bounds for axis 0 with size 12017