<a href="https://colab.research.google.com/github/Akash-mahandargi/PCA/blob/main/Recommendation_Systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# If you're using Jupyter Notebook or Google Colab, run this cell first
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')


In [8]:
# Load dataset
df = pd.read_csv("anime.csv")

# Show first few rows
df.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [9]:
# Check basic info
df.info()

# Check for missing values
df.isnull().sum()

# Basic stats
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [10]:
# Step 1: Drop rows with missing genre or rating
df_clean = df.dropna(subset=['genre', 'rating'])

# Step 2: Convert episodes to numeric, coerce errors to NaN
df_clean['episodes'] = pd.to_numeric(df_clean['episodes'], errors='coerce')

# Step 3: Fill missing episode values with the median (safe default)
df_clean['episodes'].fillna(df_clean['episodes'].median(), inplace=True)

# Step 4: Convert genre into list (split by comma)
df_clean['genre'] = df_clean['genre'].apply(lambda x: [g.strip() for g in x.split(',')])

# Step 5: Use MultiLabelBinarizer to one-hot encode the genres
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df_clean['genre'])

# Step 6: Combine the genre matrix with other useful numeric features
features = pd.DataFrame(genre_encoded, columns=mlb.classes_)
features['episodes'] = df_clean['episodes'].values
features['rating'] = df_clean['rating'].values
features['members'] = df_clean['members'].values


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Step 3: Compute cosine similarity
similarity_matrix = cosine_similarity(features)

# Convert it to a DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=df_clean['name'], columns=df_clean['name'])

# Show a sample of similarity scores
similarity_df.iloc[:5, :5]


name,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kimi no Na wa.,1.0,1.0,1.0,1.0,1.0
Fullmetal Alchemist: Brotherhood,1.0,1.0,1.0,1.0,1.0
Gintama°,1.0,1.0,1.0,1.0,1.0
Steins;Gate,1.0,1.0,1.0,1.0,1.0
Gintama&#039;,1.0,1.0,1.0,1.0,1.0


In [12]:
# Re-run preprocessing steps to define 'features' again for this execution environment

# Drop rows with missing genre or rating
df_clean = df.dropna(subset=['genre', 'rating'])

# Convert episodes to numeric and fill missing values with median
df_clean['episodes'] = pd.to_numeric(df_clean['episodes'], errors='coerce')
df_clean['episodes'].fillna(df_clean['episodes'].median(), inplace=True)

# Convert genre string to list
df_clean['genre'] = df_clean['genre'].apply(lambda x: [g.strip() for g in x.split(',')])

# One-hot encode the genre column
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df_clean['genre'])

# Combine genre, episodes, rating, and members into a feature set
features = pd.DataFrame(genre_encoded, columns=mlb.classes_)
features['episodes'] = df_clean['episodes'].values
features['rating'] = df_clean['rating'].values
features['members'] = df_clean['members'].values

# Compute cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(features)

# Create a similarity DataFrame for easy lookup
similarity_df = pd.DataFrame(similarity_matrix, index=df_clean['name'], columns=df_clean['name'])

# Show a sample
similarity_df.iloc[:5, :5]


name,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kimi no Na wa.,1.0,1.0,1.0,1.0,1.0
Fullmetal Alchemist: Brotherhood,1.0,1.0,1.0,1.0,1.0
Gintama°,1.0,1.0,1.0,1.0,1.0
Steins;Gate,1.0,1.0,1.0,1.0,1.0
Gintama&#039;,1.0,1.0,1.0,1.0,1.0


In [13]:
# Sample 1000 anime for demonstration to avoid memory error
df_sample = df_clean.sample(n=1000, random_state=42).reset_index(drop=True)

# Recreate the genre encoding for the sample
genre_sample_encoded = mlb.transform(df_sample['genre'])

# Combine genre + numerical features
features_sample = pd.DataFrame(genre_sample_encoded, columns=mlb.classes_)
features_sample['episodes'] = df_sample['episodes'].values
features_sample['rating'] = df_sample['rating'].values
features_sample['members'] = df_sample['members'].values

# Compute cosine similarity for the sample
similarity_sample = cosine_similarity(features_sample)

# Convert to DataFrame for readability
similarity_sample_df = pd.DataFrame(similarity_sample, index=df_sample['name'], columns=df_sample['name'])

# Show a sample of similarity scores
similarity_sample_df.iloc[:5, :5]


name,Koutetsu Tenshi Kurumi Zero,Fight Ippatsu! Juuden-chan!! OVA,Examurai Sengoku Recap,Saint Beast: Seijuu Kourin-hen,Macross: Do You Remember Love?
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Koutetsu Tenshi Kurumi Zero,1.0,0.999993,0.998881,1.0,0.999999
Fight Ippatsu! Juuden-chan!! OVA,0.999993,1.0,0.999034,0.99999,0.999988
Examurai Sengoku Recap,0.998881,0.999034,1.0,0.998857,0.998822
Saint Beast: Seijuu Kourin-hen,1.0,0.99999,0.998857,1.0,0.999999
Macross: Do You Remember Love?,0.999999,0.999988,0.998822,0.999999,1.0


In [14]:
# Sample 1000 anime to avoid memory issues
df_sample = df_clean.sample(n=1000, random_state=42).reset_index(drop=True)

# Reuse the genre encoder
genre_sample_encoded = mlb.transform(df_sample['genre'])

# Create feature matrix
features_sample = pd.DataFrame(genre_sample_encoded, columns=mlb.classes_)
features_sample['episodes'] = df_sample['episodes'].values
features_sample['rating'] = df_sample['rating'].values
features_sample['members'] = df_sample['members'].values

# Compute cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity_sample = cosine_similarity(features_sample)

# Create a DataFrame for similarity matrix
similarity_df = pd.DataFrame(similarity_sample, index=df_sample['name'], columns=df_sample['name'])

# Check similarity scores
similarity_df.iloc[:5, :5]


name,Koutetsu Tenshi Kurumi Zero,Fight Ippatsu! Juuden-chan!! OVA,Examurai Sengoku Recap,Saint Beast: Seijuu Kourin-hen,Macross: Do You Remember Love?
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Koutetsu Tenshi Kurumi Zero,1.0,0.999993,0.998881,1.0,0.999999
Fight Ippatsu! Juuden-chan!! OVA,0.999993,1.0,0.999034,0.99999,0.999988
Examurai Sengoku Recap,0.998881,0.999034,1.0,0.998857,0.998822
Saint Beast: Seijuu Kourin-hen,1.0,0.99999,0.998857,1.0,0.999999
Macross: Do You Remember Love?,0.999999,0.999988,0.998822,0.999999,1.0


In [15]:
def recommend_anime(anime_name, top_n=5):
    # Check if anime is in the dataset
    if anime_name not in similarity_df.columns:
        print(f"'{anime_name}' not found in the dataset.")
        return []

    # Get similarity scores
    scores = similarity_df[anime_name]

    # Sort scores in descending order and exclude the anime itself
    similar_anime = scores.sort_values(ascending=False).drop(anime_name)

    # Return the top N similar anime
    return similar_anime.head(top_n)


In [18]:
recommend_anime("hack//Versus: The Thanatos Report", top_n=5)


'hack//Versus: The Thanatos Report' not found in the dataset.


[]

In [19]:
# Strip whitespace and fix HTML characters in names
df_sample['name'] = df_sample['name'].str.strip().str.replace("&#039;", "'")

# Rebuild the similarity_df with clean names
similarity_df = pd.DataFrame(similarity_sample, index=df_sample['name'], columns=df_sample['name'])


In [20]:
# List a few available anime names
print(df_sample['name'].sample(10).values)


["Glass no Kantai: La Legende du Vent de l'Univers"
 'Sarasoujuu no Hana no Iro' 'Lost Universe' 'Gene'
 'Peter Pan no Bouken Specials' 'Dragon Pink'
 'Kawarazaki-ke no Ichizoku The Animation' 'Mabeobsaui Adeul Koli'
 "What's Michael? (TV)" 'HenSemi (TV)']


In [21]:
recommend_anime("Sen to Chihiro no Kamikakushi", top_n=5)


'Sen to Chihiro no Kamikakushi' not found in the dataset.


[]

In [22]:
# Check if the anime is present in the full dataset
df_clean[df_clean['name'].str.contains("Sen to Chihiro", case=False, na=False)]


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
15,199,Sen to Chihiro no Kamikakushi,"[Adventure, Drama, Supernatural]",Movie,1.0,8.93,466254


In [23]:
# Filter out the target anime
df_target = df_clean[df_clean['name'].str.contains("Sen to Chihiro", case=False, na=False)]

# Sample 999 other anime (excluding target)
df_others = df_clean.drop(df_target.index).sample(n=999, random_state=42)

# Combine target + random to get 1000 rows
df_sample = pd.concat([df_target, df_others]).reset_index(drop=True)


In [24]:
# Fix anime names (remove extra spaces or special HTML chars)
df_sample['name'] = df_sample['name'].str.strip().str.replace("&#039;", "'")


In [25]:
recommend_anime("Sen to Chihiro no Kamikakushi")


'Sen to Chihiro no Kamikakushi' not found in the dataset.


[]

In [26]:
# Show the exact name(s) of 'Sen to Chihiro' from the sample
df_sample[df_sample['name'].str.contains("Sen to Chihiro", case=False, na=False)]['name'].values


array(['Sen to Chihiro no Kamikakushi'], dtype=object)

In [31]:
recommend_anime("Sen to Chihiro no Kamikakushi", top_n=5)


'Sen to Chihiro no Kamikakushi' not found in the dataset.


[]

In [32]:
# Print all names available in the similarity matrix
print(similarity_df.index.tolist()[:50])  # print first 50 for now


['Koutetsu Tenshi Kurumi Zero', 'Fight Ippatsu! Juuden-chan!! OVA', 'Examurai Sengoku Recap', 'Saint Beast: Seijuu Kourin-hen', 'Macross: Do You Remember Love?', 'Blind Night', 'Cobra The Animation', 'Shaolin Wuzang', 'California Crisis: Tsuigeki no Juuka', 'Amada Anime Series: Super Mario Brothers', 'Ginga Tetsudou 999: Glass no Clair', 'Private Eye Dol', 'Hamelin no Violin Hiki', 'Steady x Study', 'Oretachi ni Tsubasa wa Nai: Hadairo Ritsu Kyuuwari Zou!?', "Norabbits' Minutes", 'Little Charo 2', 'Hoshigari Hime no Bouken', 'Kinken Chochiku Shiobara Tasuke', 'Satsujinkyou Jidai', 'Kiriya Hakushaku Ke no Roku Shimai', 'Aru Hi Inu no Kuni kara Tegami ga Kite', 'Suisei no Gargantia: Meguru Kouro, Haruka', 'Turning Girls', 'Kimagure Robot', 'Jungle Book Shounen Mowgli', 'D.I.C.E.', 'Mobile Suit Gundam I', 'Xiao Taiji', 'Tanoshii Muumin Ikka Bouken Nikki', 'Toshi Densetsu Monogatari Hikiko', 'Dokkaebi Bangmang-I', 'Buta Himesama', 'Kuroshitsuji Recap', 'KY Kei JC Kuukichan', 'Fue', 'Gakuen

In [34]:
# Find anime names containing "Chihiro" in similarity_df
[name for name in similarity_df.index if "Chihiro" in name]


[]

In [39]:
print(similarity_df.index.tolist()[:50])


['Koutetsu Tenshi Kurumi Zero', 'Fight Ippatsu! Juuden-chan!! OVA', 'Examurai Sengoku Recap', 'Saint Beast: Seijuu Kourin-hen', 'Macross: Do You Remember Love?', 'Blind Night', 'Cobra The Animation', 'Shaolin Wuzang', 'California Crisis: Tsuigeki no Juuka', 'Amada Anime Series: Super Mario Brothers', 'Ginga Tetsudou 999: Glass no Clair', 'Private Eye Dol', 'Hamelin no Violin Hiki', 'Steady x Study', 'Oretachi ni Tsubasa wa Nai: Hadairo Ritsu Kyuuwari Zou!?', "Norabbits' Minutes", 'Little Charo 2', 'Hoshigari Hime no Bouken', 'Kinken Chochiku Shiobara Tasuke', 'Satsujinkyou Jidai', 'Kiriya Hakushaku Ke no Roku Shimai', 'Aru Hi Inu no Kuni kara Tegami ga Kite', 'Suisei no Gargantia: Meguru Kouro, Haruka', 'Turning Girls', 'Kimagure Robot', 'Jungle Book Shounen Mowgli', 'D.I.C.E.', 'Mobile Suit Gundam I', 'Xiao Taiji', 'Tanoshii Muumin Ikka Bouken Nikki', 'Toshi Densetsu Monogatari Hikiko', 'Dokkaebi Bangmang-I', 'Buta Himesama', 'Kuroshitsuji Recap', 'KY Kei JC Kuukichan', 'Fue', 'Gakuen

In [40]:
recommend_anime("Henna Omamesan", top_n=5)

Unnamed: 0_level_0,Henna Omamesan
name,Unnamed: 1_level_1
Hana,0.999977
Ski Jumping Pairs: Road to Torino 2006,0.99997
Seiyuu Deka,0.999966
Asa da yo! Kaishain,0.999961
Tezuka Osamu Monogatari: Boku wa Son Gokuu,0.999961


In [37]:
def recommend_anime(title, top_n=5):
    # Check if title exists in the similarity_df
    if title not in similarity_df.index:
        return f"'{title}' not found in the dataset."

    # Get similarity scores for that title, sort in descending order
    sim_scores = similarity_df[title].sort_values(ascending=False)

    # Remove the anime itself from the result
    sim_scores = sim_scores.drop(title)

    # Return top N most similar animes
    return sim_scores.head(top_n)


In [41]:
recommend_anime("Henna Omamesan", top_n=5)

Unnamed: 0_level_0,Henna Omamesan
name,Unnamed: 1_level_1
Hana,0.999977
Ski Jumping Pairs: Road to Torino 2006,0.99997
Seiyuu Deka,0.999966
Asa da yo! Kaishain,0.999961
Tezuka Osamu Monogatari: Boku wa Son Gokuu,0.999961


In [42]:
# Optionally combine similarity with rating (bonus idea)
df_clean['normalized_rating'] = df_clean['rating'] / df_clean['rating'].max()

# Now combine similarity with rating later in recommend_anime()
# For example: final_score = 0.7 * similarity + 0.3 * rating


In [43]:
# Replace missing genres with a placeholder
df['genre'] = df['genre'].fillna('unknown')


In [44]:
# RELOAD full dataset
df_multifeature = df_clean.copy()  # this assumes df_clean = cleaned full data

# Repeat cleaning and encoding on the full dataset as we did earlier:
df_multifeature['genre'] = df_multifeature['genre'].fillna('unknown')
df_multifeature['type'] = df_multifeature['type'].fillna('unknown')
df_multifeature['episodes'] = pd.to_numeric(df_multifeature['episodes'], errors='coerce')
df_multifeature['episodes'] = df_multifeature['episodes'].fillna(df_multifeature['episodes'].median())
df_multifeature['rating'] = df_multifeature['rating'].fillna(df_multifeature['rating'].mean())
df_multifeature['members'] = df_multifeature['members'].fillna(df_multifeature['members'].mean())


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from scipy.sparse import hstack

# Convert the list of genres back to a string for TF-IDF
df_multifeature['genre_str'] = df_multifeature['genre'].apply(lambda x: ' '.join(x))

# Genre vectorization
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(df_multifeature['genre_str'])

# One-hot encode type
ohe = OneHotEncoder()
type_matrix = ohe.fit_transform(df_multifeature[['type']])

# Normalize numeric features
scaler = MinMaxScaler()
num_matrix = scaler.fit_transform(df_multifeature[['episodes', 'rating', 'members']])

# Combine all into one final feature matrix
combined_matrix = hstack([genre_matrix, type_matrix, num_matrix])

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim_multi = cosine_similarity(combined_matrix, combined_matrix)


In [49]:
def recommend_anime_multi(title, top_n=5):
    if title not in df_multifeature['name'].values:
        return f"'{title}' not found in the dataset."

    index = df_multifeature[df_multifeature['name'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim_multi[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    recommended = df_multifeature.iloc[[i[0] for i in sim_scores]][['name', 'genre', 'type', 'rating']]
    return recommended.reset_index(drop=True)


In [54]:
recommend_anime_multi("Naruto", top_n=5)
recommend_anime_multi("Shingeki no Kyojin", top_n=10)


Unnamed: 0,name,genre,type,rating
0,One Piece,"[Action, Adventure, Comedy, Drama, Fantasy, Sh...",TV,8.58
1,Hunter x Hunter (2011),"[Action, Adventure, Shounen, Super Power]",TV,9.13
2,Bleach,"[Action, Comedy, Shounen, Super Power, Superna...",TV,7.95
3,Guilty Crown,"[Action, Drama, Sci-Fi, Super Power]",TV,7.81
4,Katekyo Hitman Reborn!,"[Action, Comedy, Shounen, Super Power]",TV,8.37
5,Kill la Kill,"[Action, Comedy, School, Super Power]",TV,8.23
6,Naruto,"[Action, Comedy, Martial Arts, Shounen, Super ...",TV,7.81
7,Boku no Hero Academia,"[Action, Comedy, School, Shounen, Super Power]",TV,8.36
8,Code Geass: Hangyaku no Lelouch R2,"[Action, Drama, Mecha, Military, Sci-Fi, Super...",TV,8.98
9,Hunter x Hunter,"[Action, Adventure, Shounen, Super Power]",TV,8.48


In [51]:
def find_similar_titles(query, top_n=10):
    # Convert query to lowercase for matching
    query = query.lower()
    matches = df_multifeature[df_multifeature['name'].str.lower().str.contains(query, na=False)]
    return matches[['name', 'genre', 'type']].head(top_n).reset_index(drop=True)


In [52]:
find_similar_titles("attack")  # try "titan", "naruto", "gate", etc.


Unnamed: 0,name,genre,type
0,Mobile Suit Gundam: Char&#039;s Counterattack,"[Drama, Mecha, Military, Sci-Fi, Space]",Movie
1,Attack No.1,"[Drama, Shoujo, Sports]",TV
2,Ashita e Attack!,"[Drama, School, Sports]",TV
3,Attacker You!,"[Action, Romance, Shoujo, Sports]",TV
4,Mobile Suit SD Gundam&#039;s Counterattack,"[Comedy, Mecha, Parody]",OVA
5,Attack No.1: Namida no Sekai Senshuken,"[Drama, Shoujo, Sports]",Movie
6,Attack No.1 (1970),"[Drama, Shoujo, Sports]",Movie
7,Attack No.1: Namida no Kaiten Receive,"[Drama, Shoujo, Sports]",Movie
8,Attack No.1: Namida no Fushichou,"[Drama, Shoujo, Sports]",Movie
9,Zoku Attacker You! Kin Medal e no Michi,"[Romance, Sports]",TV
