####Data Preprocessing

In [45]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score

df = pd.read_csv("/content/sample_data/anime.csv")

df.columns = df.columns.str.strip().str.lower()

if "name" in df.columns:
    df = df.rename(columns={"name": "title"})

required_columns = {"title", "genre", "rating", "members", "episodes"}
missing_columns = required_columns - set(df.columns)
print(df.head())
print(df.info())
print(df.describe())

df = df.copy()
df["genre"] = df["genre"].fillna("")
df["rating"] = df["rating"].fillna(df["rating"].mean())
df["episodes"] = df["episodes"].replace("Unknown", np.nan)
df["episodes"] = pd.to_numeric(df["episodes"], errors="coerce")
df["episodes"] = df["episodes"].fillna(df["episodes"].median())


   anime_id                             title  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  

####Feature Extraction

In [46]:
scaler = MinMaxScaler()
df[["rating", "members", "episodes"]] = scaler.fit_transform(df[["rating", "members", "episodes"]])

df["genre"] = df["genre"].str.lower().str.replace(", ", ",")
genre_dummies = df["genre"].str.get_dummies(sep=",")

features = pd.concat([df[["rating", "members", "episodes"]], genre_dummies], axis=1)


####Recommendation System

In [47]:
similarity_matrix = cosine_similarity(features)

def recommend_anime(anime_title, num_recommendations=5):
    if anime_title not in df["title"].values:
        return f"Anime '{anime_title}' not found!"

    idx = df[df["title"] == anime_title].index[0]

    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    main_title_words = set(anime_title.lower().split())
    filtered_indices = [
        i[0]
        for i in similarity_scores[1:]
        if not any(word in df["title"].iloc[i[0]].lower() for word in main_title_words)
    ]

    top_indices = filtered_indices[:num_recommendations]
    return df["title"].iloc[top_indices].tolist()


In [52]:
anime_to_recommend = "Naruto"
recommended_anime = recommend_anime(anime_to_recommend)

print(f"Recommended Anime for '{anime_to_recommend}':")
for i, anime in enumerate(recommended_anime, 1):
    print(f"   {i}. {anime}")

Recommended Anime for 'Naruto':
   1. Katekyo Hitman Reborn!
   2. Kyutai Panic Adventure!
   3. Battle Spirits: Ryuuko no Ken
   4. Dragon Ball Z
   5. Dragon Ball Kai


####Evaluation

In [49]:
ground_truth = {
    "Naruto": ["Bleach", "Dragon Ball Z", "One Piece", "Fairy Tail", "Hunter x Hunter"],
    "Attack on Titan": ["Death Note", "Tokyo Ghoul", "Fullmetal Alchemist", "Code Geass", "Steins;Gate"],
    "Sword Art Online": ["Log Horizon", "No Game No Life", "Overlord", "Re:Zero", "Accel World"]
}

precision_list = []
recall_list = []
f1_list = []

for anime, true_recommendations in ground_truth.items():
    predicted_recommendations = recommend_anime(anime)

    y_true = [1 if rec in true_recommendations else 0 for rec in predicted_recommendations]
    y_pred = [1] * len(predicted_recommendations)

    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)

    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

if precision_list:
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)
else:
    avg_precision = avg_recall = avg_f1 = 0


In [53]:
print("Evaluation Metrics:")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")

Evaluation Metrics:
Average Precision: 0.0667
Average Recall: 1.0000
Average F1-Score: 0.1111


####Interview Questions:
1. Can you explain the difference between user-based and item-based collaborative filtering?

  User-based collaborative filtering finds users with similar preferences and recommends items that those similar users have liked, relying on user similarity. In contrast, item-based collaborative filtering analyzes the similarity between items based on user interactions and recommends items similar to those a user has already engaged with. Item-based filtering is generally more scalable and stable since item relationships change less frequently than user preferences.

2. What is collaborative filtering, and how does it work?

  Collaborative filtering is a recommendation technique that suggests items based on user interactions and the preferences of similar users, without relying on item attributes. It creates a user-item matrix and applies similarity measures like cosine similarity or Pearson correlation to identify relationships. Based on these relationships, it predicts a user's preference for an item by analyzing either similar users (user-based) or similar items (item-based). While widely used in platforms like Netflix and Amazon, it faces challenges like the cold start problem and data sparsity.