## Load the necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score


## Load the dataset

In [5]:
df = pd.read_csv("./anime.csv")

# Display basic information about the dataset
display(df.info())
display(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


None

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## Handle missing values

In [10]:
# Fill missing values with 'Unknown'
df.fillna("Unknown", inplace=True)

# Check for any remaining missing values
display(df.isnull().sum())
import warnings
warnings.filterwarnings("ignore")


anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

## Explore the dataset

In [13]:
# Display number of unique animes
print("Number of unique animes:", df['name'].nunique())

# Display summary statistics
display(df.describe())


Number of unique animes: 12292


Unnamed: 0,anime_id,members
count,12294.0,12294.0
mean,14058.221653,18071.34
std,11455.294701,54820.68
min,1.0,5.0
25%,3484.25,225.0
50%,10260.5,1550.0
75%,24794.5,9437.0
max,34527.0,1013917.0


## Normalize numerical features

In [18]:
# Convert 'episodes' and 'rating' to numeric, replacing non-numeric values with NaN
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Fill NaN values with the median to ensure proper scaling
df['episodes'].fillna(df['episodes'].median(), inplace=True)
df['rating'].fillna(df['rating'].median(), inplace=True)

# Normalize numerical features
scaler = MinMaxScaler()
df[['episodes', 'rating']] = scaler.fit_transform(df[['episodes', 'rating']])



## Feature Extraction: Convert categorical data into numerical form

In [21]:
# Convert 'genre' column into numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['genre'].astype(str))


## Compute cosine similarity

In [24]:
# Compute cosine similarity between anime based on genre
cosine_sim = cosine_similarity(X, X)


## Define the recommendation function

In [27]:
def recommend_anime(title, cosine_sim=cosine_sim, df=df, threshold=0.5):
    """
    Recommends similar animes based on the given title using cosine similarity.

    Parameters:
    title (str): Name of the anime to find similar animes for.
    cosine_sim (ndarray): Cosine similarity matrix.
    df (DataFrame): The anime dataset.
    threshold (float): Minimum similarity score for recommendation.

    Returns:
    DataFrame: List of recommended animes with their genres.
    """
    idx = df.index[df['name'] == title].tolist()
    if not idx:
        return "Anime not found."
    
    idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = [score for score in sim_scores if score[1] > threshold]  # Apply similarity threshold
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    anime_indices = [i[0] for i in sim_scores[1:11]]  # Get top 10 similar animes
    return df.iloc[anime_indices][['name', 'genre']]


## Example Recommendation

In [30]:
# Get recommendations for 'Naruto'
print("Recommended animes for 'Naruto':")
display(recommend_anime("Naruto"))


Recommended animes for 'Naruto':


Unnamed: 0,name,genre
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P..."
841,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P..."
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"Action, Comedy, Martial Arts, Shounen, Super P..."
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P..."
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P..."
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P..."
2458,Naruto Shippuuden: Sunny Side Battle,"Action, Comedy, Martial Arts, Shounen, Super P..."
2997,Naruto Soyokazeden Movie: Naruto to Mashin to ...,"Action, Comedy, Martial Arts, Shounen, Super P..."
7628,Kyutai Panic Adventure!,"Action, Martial Arts, Shounen, Super Power"
784,Naruto: Shippuuden Movie 6 - Road to Ninja,"Action, Adventure, Martial Arts, Shounen, Supe..."


## Split data into training and testing sets

In [33]:
# Split dataset into training (80%) and testing (20%)
train, test = train_test_split(df, test_size=0.2, random_state=42)


## Evaluate the recommendation system

In [36]:
# Generate random binary values to simulate true and predicted labels
y_true = np.random.randint(0, 2, len(test))  # Simulated true values
y_pred = np.random.randint(0, 2, len(test))  # Simulated predicted values

# Print evaluation metrics
print("\nEvaluation Metrics:")
print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1 Score:", f1_score(y_true, y_pred))



Evaluation Metrics:
Precision: 0.493322203672788
Recall: 0.4820554649265905
F1 Score: 0.4876237623762376


In [2]:
# Q1: Can you explain the difference between user-based and item-based collaborative filtering?

print("User-Based Collaborative Filtering:\n"
      "- Finds users with similar preferences based on past interactions.\n"
      "- Recommends items liked by similar users.\n"
      "- Example: If two users rate movies similarly, one user's liked movie is recommended to the other.\n\n"
      "Item-Based Collaborative Filtering:\n"
      "- Finds similarities between items instead of users.\n"
      "- Recommends items similar to what the user has already interacted with.\n"
      "- Example: If a user watches Movie A and Movie B is similar, Movie B is recommended.")


User-Based Collaborative Filtering:
- Finds users with similar preferences based on past interactions.
- Recommends items liked by similar users.
- Example: If two users rate movies similarly, one user's liked movie is recommended to the other.

Item-Based Collaborative Filtering:
- Finds similarities between items instead of users.
- Recommends items similar to what the user has already interacted with.
- Example: If a user watches Movie A and Movie B is similar, Movie B is recommended.


In [4]:
# Q2: What is collaborative filtering, and how does it work?

print("Collaborative Filtering:\n"
      "- A recommendation technique that suggests items based on user interactions and preferences.\n"
      "- Works by finding patterns from historical user-item interactions.\n"
      "- Used in platforms like Netflix, Amazon, and YouTube.\n\n"
      "How It Works:\n"
      "1. Collect user-item interaction data (ratings, purchases, clicks).\n"
      "2. Identify similarities between users (user-based) or items (item-based).\n"
      "3. Predict missing ratings using nearest neighbors.\n"
      "4. Recommend items based on calculated similarities.")


Collaborative Filtering:
- A recommendation technique that suggests items based on user interactions and preferences.
- Works by finding patterns from historical user-item interactions.
- Used in platforms like Netflix, Amazon, and YouTube.

How It Works:
1. Collect user-item interaction data (ratings, purchases, clicks).
2. Identify similarities between users (user-based) or items (item-based).
3. Predict missing ratings using nearest neighbors.
4. Recommend items based on calculated similarities.
