In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
anime_data = pd.read_csv("anime.csv")

In [3]:
# Explore the dataset
print("Dataset Overview:")
print(anime_data.head())

Dataset Overview:
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [4]:
print("\nDataset Info:")
print(anime_data.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [5]:
# Check for missing values
print("\nMissing Values:")
print(anime_data.isnull().sum())


Missing Values:
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [6]:
# Drop rows or fill missing values (handle based on dataset characteristics)
anime_data['genre'] = anime_data['genre'].fillna('Unknown')
anime_data['rating'] = anime_data['rating'].fillna(anime_data['rating'].mean())
anime_data['episodes'] = anime_data['episodes'].replace('Unknown', np.nan).fillna(0).astype(int)

In [7]:
# Validate cleaning
print("\nCleaned Dataset Info:")
print(anime_data.info())


Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  int64  
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 672.5+ KB
None


***Feature Extraction***

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

In [9]:
# Process the 'genre' column
anime_data['genre_list'] = anime_data['genre'].apply(lambda x: x.split(', ') if x != 'Unknown' else [])
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(anime_data['genre_list'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)


In [10]:
# Combine genres and ratings
features = pd.concat([genre_df, anime_data['rating']], axis=1)

In [11]:
# Normalize ratings
from sklearn.preprocessing import MinMaxScaler

In [12]:
scaler = MinMaxScaler()
features['rating'] = scaler.fit_transform(features[['rating']])

In [13]:
print("\nFeature Matrix:")
print(features.head())


Feature Matrix:
   Action  Adventure  Cars  Comedy  Dementia  Demons  Drama  Ecchi  Fantasy  \
0       0          0     0       0         0       0      1      0        0   
1       1          1     0       0         0       0      1      0        1   
2       1          0     0       1         0       0      0      0        0   
3       0          0     0       0         0       0      0      0        0   
4       1          0     0       1         0       0      0      0        0   

   Game  ...  Slice of Life  Space  Sports  Super Power  Supernatural  \
0     0  ...              0      0       0            0             1   
1     0  ...              0      0       0            0             0   
2     0  ...              0      0       0            0             0   
3     0  ...              0      0       0            0             0   
4     0  ...              0      0       0            0             0   

   Thriller  Vampire  Yaoi  Yuri    rating  
0         0        0    

***Recommendation System***

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Compute cosine similarity
cosine_sim = cosine_similarity(features)

In [16]:
# Recommendation function
def recommend_anime(anime_title, top_n=10, threshold=0.5):
    # Find the index of the anime
    anime_idx = anime_data[anime_data['name'] == anime_title].index[0]

    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[anime_idx]))

    # Sort by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Filter based on threshold and exclude the target anime itself
    sim_scores = [score for score in sim_scores if score[1] >= threshold and score[0] != anime_idx]

    # Get top_n recommendations
    top_anime_idx = [score[0] for score in sim_scores[:top_n]]

    # Return the recommended anime titles
    return anime_data.iloc[top_anime_idx][['name', 'genre', 'rating']]

In [17]:
# Example usage
recommended_anime = recommend_anime('Naruto', top_n=5, threshold=0.7)
print("\nRecommended Anime:")
print(recommended_anime)


Recommended Anime:
                                                   name  \
615                                  Naruto: Shippuuden   
1103  Boruto: Naruto the Movie - Naruto ga Hokage ni...   
486                            Boruto: Naruto the Movie   
1343                                        Naruto x UT   
1472        Naruto: Shippuuden Movie 4 - The Lost Tower   

                                                  genre  rating  
615   Action, Comedy, Martial Arts, Shounen, Super P...    7.94  
1103  Action, Comedy, Martial Arts, Shounen, Super P...    7.68  
486   Action, Comedy, Martial Arts, Shounen, Super P...    8.03  
1343  Action, Comedy, Martial Arts, Shounen, Super P...    7.58  
1472  Action, Comedy, Martial Arts, Shounen, Super P...    7.53  


In [18]:
# Generate synthetic user-anime interaction data (if not available in the dataset)
# Here, we assume user preferences are given (binary relevance: 1 if liked, 0 otherwise)
user_anime_interactions = pd.DataFrame({
    'user_id': np.random.randint(1, 1000, len(anime_data)),
    'anime_id': anime_data.index,
    'liked': np.random.choice([0, 1], len(anime_data))  # Simulated ground truth
})

In [19]:
# Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split
# Split into training and testing sets
train_data, test_data = train_test_split(user_anime_interactions, test_size=0.2, random_state=42)


In [22]:
def evaluate_recommendation():
    y_true, y_pred = [], []
    # Get the anime names from the original anime_data DataFrame using anime_id
    for anime_id in test_data['anime_id']:
        title = anime_data.loc[anime_id, 'name']  # Access 'name' from anime_data
        recommendations = recommend_anime(title, top_n=3)
        if isinstance(recommendations, pd.DataFrame):
            y_true.append(1)
            y_pred.append(1)
        else:
            y_true.append(1)
            y_pred.append(0)

    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    return precision, recall, f1

precision, recall, f1 = evaluate_recommendation()
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# INTERVIEW QUESTIONS

1. Can you explain the difference between user-based and item-based collaborative filtering?

# User-Based Collaborative Filtering

Focus: Finds similar users based on their preferences or behavior.

How It Works: Recommends items that similar users have liked or purchased.

Example: If User A and User B have similar tastes, items liked by User B are recommended to User A.

# Item-Based Collaborative Filtering

Focus: Finds similar items based on user interactions.

How It Works: Recommends items that are frequently liked or purchased together by users.

Example: If Item X and Item Y are often bought together, users who like Item X are recommended Item Y.

# Key Difference

User-Based: Relies on user-to-user similarity.

Item-Based: Relies on item-to-item similarity.


2. What is collaborative filtering, and how does it work?

*Collaborative Filtering (CF) *
Collaborative Filtering predicts user preferences by leveraging the behavior of other users. It works by analyzing a user-item interaction matrix to find:

User-Based CF: Recommends items liked by similar users.

Item-Based CF: Recommends items similar to those a user liked.

*Types*

Memory-Based: Uses similarity measures (e.g., cosine similarity).

Model-Based: Uses machine learning models (e.g., SVD) for scalability.

*Advantages*

No need for item metadata.

Adapts to user behavior dynamically.

*Challenges*

Struggles with new users/items (cold start).

Handles sparse data poorly.

Computationally expensive for large datasets.

# Collaborative Filtering is widely used in recommendation systems like Netflix and Amazon.

