In [1]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Step 2: Load the dataset
anime_df = pd.read_csv('anime.csv')
print(anime_df.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [3]:
# Step 3: Check dataset shape and missing values
print('Shape of dataset:', anime_df.shape)
print('Missing values in each column:\n', anime_df.isna().sum())

Shape of dataset: (12294, 7)
Missing values in each column:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [7]:
# Step 4: Handle missing values
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())
anime_df['genre'] = anime_df['genre'].fillna('Unknown')
anime_df['type'] = anime_df['type'].fillna('Unknown')
print('Missing values after filling:\n', anime_df.isna().sum())

Missing values after filling:
 anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [5]:
# Step 5: Select important features
working_df = anime_df[['name', 'genre', 'type', 'episodes', 'rating']].copy()

In [9]:
# Step 6: Encode categorical values and fix numeric ones
working_df['genre'] = working_df['genre'].astype('category').cat.codes
working_df['type'] = working_df['type'].astype('category').cat.codes
working_df['episodes'] = working_df['episodes'].replace('Unknown', np.nan)
working_df['episodes'] = pd.to_numeric(working_df['episodes'], errors='coerce')
working_df['episodes'] = working_df['episodes'].fillna(working_df['episodes'].median())
print('Data after cleaning:\n', working_df.head())

Data after cleaning:
                                name  genre  type  episodes  rating
0                    Kimi no Na wa.   2686     0       1.0    9.37
1  Fullmetal Alchemist: Brotherhood    161     5      64.0    9.26
2                          Gintama°    534     5      51.0    9.25
3                       Steins;Gate   3240     5      24.0    9.17
4                     Gintama&#039;    534     5      51.0    9.16


In [10]:
  # Step 7: Normalize numeric columns
scaler = StandardScaler()
working_df[['episodes', 'rating']] = scaler.fit_transform(working_df[['episodes', 'rating']])
print('After scaling:\n', working_df.head())

After scaling:
                                name  genre  type  episodes    rating
0                    Kimi no Na wa.   2686     0 -0.239941  2.847535
1  Fullmetal Alchemist: Brotherhood    161     5  1.122451  2.739380
2                          Gintama°    534     5  0.841323  2.729547
3                       Steins;Gate   3240     5  0.257440  2.650889
4                     Gintama&#039;    534     5  0.841323  2.641057


In [11]:
# Step 8: Compute cosine similarity between anime
feature_cols = ['genre', 'type', 'episodes', 'rating']
sim_matrix = cosine_similarity(working_df[feature_cols])
print('Similarity matrix shape:', sim_matrix.shape)

Similarity matrix shape: (12294, 12294)


In [13]:
# Step 9: Function to recommend anime
def recommend_similar(title, data_df=working_df, similarity=sim_matrix, top_k=5):
  if title not in data_df['name'].values:
    return f"Title '{title}' not found in dataset."
  idx = data_df[data_df['name'] == title].index[0]
  scores = list(enumerate(similarity[idx]))
  scores_sorted = sorted(scores, key=lambda x: x[1], reverse=True)
  top_indices = [i for i, _ in scores_sorted[1:top_k+1]]
  return data_df['name'].iloc[top_indices].tolist()


# Example
print('Recommendations for Fullmetal Alchemist: Brotherhood:')
print(recommend_similar('Fullmetal Alchemist: Brotherhood'))

Recommendations for Fullmetal Alchemist: Brotherhood:
['Berserk', 'Claymore', 'Arslan Senki (TV)', 'Wolf&#039;s Rain', 'Lupin III (2015)']


In [14]:
# Step 10: Split into training and test sets for evaluation
train_df, test_df = train_test_split(working_df, test_size=0.2, random_state=42)
sim_matrix_train = cosine_similarity(train_df[feature_cols])
print('Train set size:', train_df.shape)

Train set size: (9835, 5)


In [17]:
# Step 11: Evaluate system by matching genres
def evaluate_genre_match(ref_df, similarity_matrix, top_k=5):
  y_true, y_pred = [], []
  for title in ref_df['name']:
    idx = ref_df.index.get_loc(ref_df[ref_df['name'] == title].index[0])
    scores = list(enumerate(similarity_matrix[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    rec_indices = [i for i, _ in scores[1:top_k+1]]
    target_genre = ref_df.iloc[idx]['genre']
    for rec_i in rec_indices:
      rec_genre = ref_df.iloc[rec_i]['genre']
      y_true.append(1)
      y_pred.append(1 if rec_genre == target_genre else 0)
  print('Precision:', precision_score(y_true, y_pred, zero_division=0))
  print('Recall:', recall_score(y_true, y_pred, zero_division=0))
  print('F1 Score:', f1_score(y_true, y_pred, zero_division=0))


# Evaluate
evaluate_genre_match(train_df.reset_index(drop=True), sim_matrix_train)

Precision: 1.0
Recall: 0.17826131164209455
F1 Score: 0.30258366269135845


Current system is limited because genres and types are encoded as integers, and evaluation only checks exact genre match, so precision and recall are low.

Improvement: Use one-hot or multi-label encoding for genres/types and consider weighting features or overlapping genres to boost recommendation quality.

# Interview Questions:

 1. Can you explain the difference between user-based and item-based collaborative filtering?

- User-based CF: Recommends items to a user based on what similar users liked. Focuses on user-to-user similarity.

- Item-based CF: Recommends items similar to what the user already liked. Focuses on item-to-item similarity.

 2. What is collaborative filtering, and how does it work? Give each in 2 bullet points?

- It predicts a user’s preferences by analyzing past behavior or ratings from multiple users.
- Works by finding patterns or similarities between users or items to generate recommendations.