In [1]:
# Anime Recommendation System using Cosine Similarity

In [2]:
# Importing Required Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
# Load Dataset
df = pd.read_csv("C:\\Users\\saine\\Downloads\\Excelr_Assignments\\Questions\\Recommendation System\\anime.csv")

In [4]:
# Display first few rows
print("Initial Data:")
print(df.head())

Initial Data:
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          GintamaÂ°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [5]:
print("\nDataset Info:")
print(df.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [6]:
# Data Preprocessing
# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Drop rows where 'genre' or 'name' is missing
df.dropna(subset=['genre', 'name'], inplace=True)

# Fill missing values in 'rating' or 'members' with mean
df['rating'] = df['rating'].fillna(df['rating'].mean())
df['members'] = df['members'].fillna(df['members'].mean())

# Reset index
df.reset_index(drop=True, inplace=True)


Missing values per column:
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [7]:
# Feature Extraction
# Using 'genre' column for similarity
# Convert genre text data into numerical features using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genre'])

# Shape of TF-IDF matrix
print(f"\nTF-IDF Matrix Shape: {tfidf_matrix.shape}")


TF-IDF Matrix Shape: (12232, 46)


In [8]:
# Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("\nCosine similarity matrix calculated!")



Cosine similarity matrix calculated!


In [9]:
# Recommendation Function
def recommend_anime(title, top_n=5):
    """
    Recommend similar anime based on cosine similarity
    """
    # Get index of the anime that matches the title
    if title not in df['name'].values:
        return "Anime not found in the dataset."

    idx = df[df['name'] == title].index[0]

    # Get similarity scores for all anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top_n recommendations (excluding itself)
    sim_scores = sim_scores[1:top_n+1]

    # Get anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return recommended anime titles
    return df['name'].iloc[anime_indices]

In [10]:
# Example
print("\nRecommended Anime for 'Naruto':")
print(recommend_anime('Naruto', top_n=5))


Recommended Anime for 'Naruto':
615                                    Naruto: Shippuuden
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
Name: name, dtype: object


In [11]:
# Evaluation
# Since this is a content-based recommendation, evaluation is different.
# We can simulate by splitting and checking how similar unseen titles perform.

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# For demonstration, we'll assume top recommendations are relevant if same genre appears.
def evaluate_model(test_sample=20):
    precision_list, recall_list, f1_list = [], [], []

    for i in range(test_sample):
        anime = test_df.iloc[i]['name']
        true_genre = test_df.iloc[i]['genre']
        recs = recommend_anime(anime, top_n=5)

        if isinstance(recs, str):
            continue  # skip if anime not found

        # True if recommended anime share at least one genre
        true_genres = set(true_genre.split(', '))
        pred_genres = [set(df[df['name'] == r]['genre'].iloc[0].split(', ')) for r in recs]

        y_true = [1] * len(pred_genres)
        y_pred = [1 if len(true_genres & g) > 0 else 0 for g in pred_genres]

        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    print("\nModel Evaluation Results:")
    print(f"Average Precision: {np.mean(precision_list):.2f}")
    print(f"Average Recall: {np.mean(recall_list):.2f}")
    print(f"Average F1-Score: {np.mean(f1_list):.2f}")

# Evaluate
evaluate_model()



Model Evaluation Results:
Average Precision: 1.00
Average Recall: 1.00
Average F1-Score: 1.00


Interview Questions and Answers

1. Can you explain the difference between user-based and item-based collaborative filtering?
User-based collaborative filtering focuses on finding users who have similar preferences or behaviors to the target user. The system then recommends items that those similar users have liked but the target user has not yet experienced. In contrast, item-based collaborative filtering looks at the relationships between items themselves. It finds items that are similar based on user ratings or interactions and recommends those similar items to the user. In short, user-based filtering finds similar users, while item-based filtering finds similar items.

2. What is collaborative filtering, and how does it work?
Collaborative filtering is a recommendation technique that relies on the past behavior and preferences of users to predict what they might like in the future. It assumes that users who have shown similar interests in the past will continue to have similar preferences. The system uses user-item interaction data (such as ratings, clicks, or purchases) to identify patterns of similarity either between users or between items. Based on these patterns, it recommends items that similar users have enjoyed, even if the target user has never directly interacted with those items before.
