# Step 1: Import libraries & Load dataset

In [2]:
# Import libraries & Load Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv("anime.csv")

# Show basic info
print("Shape of dataset:", df.shape)
df.head()


Shape of dataset: (12294, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


# Step 2: Data Preprocessing – Missing values

In [3]:
# Step 2: Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# Drop rows with missing essential info (like title, genre, rating, members)
df = df.dropna(subset=['name', 'genre', 'type', 'episodes', 'rating', 'members'])

# Reset index after dropping
df = df.reset_index(drop=True)

print("Shape after removing missing values:", df.shape)
df.head()


Missing values in each column:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
Shape after removing missing values: (12017, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


# Step 3: Feature Extraction

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler

# Convert genre text into multi-hot encoding 
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
genre_matrix = vectorizer.fit_transform(df['genre'])

# Handle numerical features
scaler = MinMaxScaler()
num_features = scaler.fit_transform(df[['rating', 'members']])

# Combine features 
import scipy.sparse as sp
feature_matrix = sp.hstack([genre_matrix, num_features], format='csr')

print("Feature matrix shape:", feature_matrix.shape)


Feature matrix shape: (12017, 45)




# Step 4: Recommendation System Function

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

# Function to recommend anime
def recommend_anime(title, top_n=10):
    # Check if the anime exists in dataset
    if title not in df['name'].values:
        return f"Anime '{title}' not found in dataset."
    
    # Get index of the anime
    idx = df[df['name'] == title].index[0]
    
    # Get similarity scores for this anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity (descending), skip the first (itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    
    # Get anime indices
    anime_indices = [i[0] for i in sim_scores]
    
    # Return top recommended anime
    return df[['name', 'genre', 'type', 'rating']].iloc[anime_indices]

# Example test
recommend_anime("Naruto", top_n=5)


Unnamed: 0,name,genre,type,rating
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,7.94
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",Movie,7.53
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",Movie,7.5
486,Boruto: Naruto the Movie,"Action, Comedy, Martial Arts, Shounen, Super P...",Movie,8.03
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P...",OVA,7.58


# Step 5: Evaluation

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Split dataset (80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Function to get "true similar" based on genre overlap (fixed warning)
def true_similar(anime_name, top_n=10):
    if anime_name not in df['name'].values:
        return []
    target_genres = set(df[df['name'] == anime_name]['genre'].values[0].split(", "))
    candidates = df[df['name'] != anime_name].copy()  # ✅ make a copy
    candidates.loc[:, 'overlap'] = candidates['genre'].apply(
        lambda x: len(set(x.split(", ")) & target_genres)
    )
    return candidates.sort_values('overlap', ascending=False)['name'].head(top_n).tolist()


# Evaluate precision, recall, F1 on 20 random test samples
y_true = []
y_pred = []

sampled_test = test_df.sample(20, random_state=42)
for anime in sampled_test['name']:
    true_list = true_similar(anime, top_n=5)
    pred_list = recommend_anime(anime, top_n=5)['name'].tolist()
    
    # Binary match: 1 if overlap exists, else 0
    y_true.extend([1 if item in true_list else 0 for item in pred_list])
    y_pred.extend([1]*len(pred_list))

# Calculate metrics
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(" Evaluation Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


 Evaluation Metrics:
Precision: 0.18
Recall: 1.0
F1-score: 0.3050847457627119


# Interview Questions & Answers

### 1. What is lift and why is it important in Association rules?
- **Lift** measures how much more likely two items are bought together compared to being bought independently.  
- Formula:  
  \[
  \text{Lift(A→B)} = \frac{P(A \cap B)}{P(A) \times P(B)}
  \]
- **Importance:** A lift value > 1 means A and B occur together more often than expected, indicating a strong association.



### 2. What is support and confidence? How do you calculate them?
- **Support:** Proportion of transactions that contain an itemset.  
  \[
  \text{Support(A)} = \frac{\text{Transactions containing A}}{\text{Total transactions}}
  \]
- **Confidence:** Probability that B is purchased given A is purchased.  
  \[
  \text{Confidence(A→B)} = \frac{\text{Support(A ∪ B)}}{\text{Support(A)}}
  \]



### 3. What are some limitations or challenges of Association Rules Mining?
- **Scalability:** Large datasets can make computation expensive.  
- **Sparsity:** Many items may rarely co-occur, leading to weak rules.  
- **Interpretability:** Too many rules can overwhelm decision-making.  
- **Cold Start Problem:** New items with few transactions may not generate useful rules.


### 4. Can you explain the difference between user-based and item-based collaborative filtering?
- **User-based filtering:** Finds users similar to the target user and recommends items they liked.  
- **Item-based filtering:** Finds items similar to the ones a user liked and recommends those.  
- **Key difference:** User-based compares **users**, item-based compares **items**.



### 5. What is collaborative filtering, and how does it work?
- **Collaborative filtering** is a recommendation approach that uses historical interactions (ratings, clicks, purchases) to suggest new items.  
- It assumes that:
  - Users with similar past preferences will like similar items.  
  - Items liked by similar users are good recommendations.  
- Implemented using **user-user similarity** or **item-item similarity**, often with cosine similarity, Pearson correlation, or matrix factorization.
