# Data Preprocessing

In [2]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Step 2: Load and preprocess data
# Read CSV (assuming file is named 'anime.csv')
df = pd.read_csv('anime.csv')
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
#find did any null values exists
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [4]:
# finding different datatypes in data set
print(df.dtypes)

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object


In [5]:
# finding column names
print(df.columns)

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


In [6]:
# finding info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [7]:
# describing dataset
# finding column names
print(df.describe())

           anime_id        rating       members
count  12294.000000  12064.000000  1.229400e+04
mean   14058.221653      6.473902  1.807134e+04
std    11455.294701      1.026746  5.482068e+04
min        1.000000      1.670000  5.000000e+00
25%     3484.250000      5.880000  2.250000e+02
50%    10260.500000      6.570000  1.550000e+03
75%    24794.500000      7.180000  9.437000e+03
max    34527.000000     10.000000  1.013917e+06


In [8]:
# Handle missing values
df['episodes'] = df['episodes'].replace('Unknown', 0).astype(int)
df['rating'] = df['rating'].fillna(df['rating'].mean())
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


# Feature Engineering

In [10]:

# One-hot encode genres
genre_features = df['genre'].str.get_dummies(sep=', ')

genre_features

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# One-hot encode anime type
type_features = pd.get_dummies(df['type'])

type_features

Unnamed: 0,Movie,Music,ONA,OVA,Special,TV
0,True,False,False,False,False,False
1,False,False,False,False,False,True
2,False,False,False,False,False,True
3,False,False,False,False,False,True
4,False,False,False,False,False,True
...,...,...,...,...,...,...
12289,False,False,False,True,False,False
12290,False,False,False,True,False,False
12291,False,False,False,True,False,False
12292,False,False,False,True,False,False


In [12]:
# Normalize numerical features
scaler = MinMaxScaler()
numerical_features = df[['episodes', 'rating', 'members']]
scaled_numerical = scaler.fit_transform(numerical_features)
scaled_numerical_df = pd.DataFrame(scaled_numerical, 
                                 columns=['episodes', 'rating', 'members'])

# Combine all features
features = pd.concat([genre_features, type_features, scaled_numerical_df], axis=1)
features

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Yuri,Movie,Music,ONA,OVA,Special,TV,episodes,rating,members
0,0,0,0,0,0,0,1,0,0,0,...,0,True,False,False,False,False,False,0.000550,0.924370,0.197872
1,1,1,0,0,0,0,1,0,1,0,...,0,False,False,False,False,False,True,0.035204,0.911164,0.782770
2,1,0,0,1,0,0,0,0,0,0,...,0,False,False,False,False,False,True,0.028053,0.909964,0.112689
3,0,0,0,0,0,0,0,0,0,0,...,0,False,False,False,False,False,True,0.013201,0.900360,0.664325
4,1,0,0,1,0,0,0,0,0,0,...,0,False,False,False,False,False,True,0.028053,0.899160,0.149186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,False,False,False,True,False,False,0.000550,0.297719,0.000203
12290,0,0,0,0,0,0,0,0,0,0,...,0,False,False,False,True,False,False,0.000550,0.313325,0.000176
12291,0,0,0,0,0,0,0,0,0,0,...,0,False,False,False,True,False,False,0.002200,0.385354,0.000211
12292,0,0,0,0,0,0,0,0,0,0,...,0,False,False,False,True,False,False,0.000550,0.397359,0.000168


In [13]:
# Compute cosine similarity matrix
cosine_sim = cosine_similarity(features, features)


# Recommendation function

In [15]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df, top_n=10):
    try:
        idx = df[df['name'] == title].index[0]
    except IndexError:
        return "Anime not found in dataset"
        
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Skip self-similarity
    anime_indices = [i[0] for i in sim_scores]
    return df[['name', 'genre', 'type', 'rating']].iloc[anime_indices]

In [16]:
# Test the system
print("Recommendations for 'Kimi no Na wa.':")
print(get_recommendations('Kimi no Na wa.', top_n=5))



Recommendations for 'Kimi no Na wa.':
                                       name  \
1111  Aura: Maryuuin Kouga Saigo no Tatakai   
208           Kokoro ga Sakebitagatterunda.   
1494                               Harmonie   
1959                              Air Movie   
60                       Hotarubi no Mori e   

                                             genre   type  rating  
1111  Comedy, Drama, Romance, School, Supernatural  Movie    7.67  
208                         Drama, Romance, School  Movie    8.32  
1494                   Drama, School, Supernatural  Movie    7.52  
1959                  Drama, Romance, Supernatural  Movie    7.39  
60            Drama, Romance, Shoujo, Supernatural  Movie    8.61  


In [17]:
# Simple Evaluation (Precision@k demonstration)
# This is a simplified evaluation without proper test data
def evaluate_recommendations(target_anime, expected_genres, k=5):
    recommendations = get_recommendations(target_anime, top_n=k)
    matches = sum(recommendations['genre'].apply(
        lambda x: any(genre in x for genre in expected_genres)
    ))
    precision = matches / k
    print(f"Precision@{k}: {precision:.2f}")


In [18]:
# Example evaluation for "Kimi no Na wa."
expected_genres = ['Drama', 'Romance', 'Supernatural']
evaluate_recommendations('Kimi no Na wa.', expected_genres)

Precision@5: 1.00


# interview questions

1. User-based vs Item-based Collaborative Filtering

User-based: Recommends items liked by users with similar preferences to the target user. Focuses on user similarity.
    
Item-based: Recommends items similar to those the target user already liked. Focuses on item similarity.
    
Key Difference: User-based scales poorly with user growth, while item-based handles large user bases better.

2. Collaborative Filtering (CF)
A recommendation technique that predicts user preferences by analyzing patterns in user-item interactions.

 working:
Uses a user-item interaction matrix (e.g., ratings).
Finds similarities between users/items (memory-based) or learns latent features (model-based, e.g., matrix factorization).
Recommends items based on these similarities.
Strengths: No need for item metadata; relies purely on user behavior.