# Task 1: Data Preprocessing

In [16]:
import pandas as pd
# Load the dataset
anime_df = pd.read_csv(r"D:\Excelr\Assignments\Recommendation System\anime.csv")
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [18]:
# Handle missing values
anime_df['genre'].fillna('Unknown', inplace=True)
anime_df['type'].fillna('Unknown', inplace=True)
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
anime_df['episodes'].fillna(anime_df['episodes'].median(), inplace=True)

# Task 2: Feature Extraction

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
import numpy as np

# Convert genre strings to list
anime_df['genre'] = anime_df['genre'].apply(lambda x: [i.strip() for i in x.split(',')])

# Binarize genre
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(anime_df['genre'])

# Normalize numerical features
scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(anime_df[['rating', 'members', 'episodes']])

# Combine genre and numerical features
features = np.hstack((genre_matrix, numerical_features))

# Task 3: Recommendation System (Cosine Similarity)

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(features, features)

In [28]:
# Recommendation function
def recommend_anime(title, top_n=5):
    idx = anime_df[anime_df['name'].str.lower() == title.lower()].index
    if len(idx) == 0:
        return f"No anime with title '{title}' found."
    idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # exclude the anime itself
    anime_indices = [i[0] for i in sim_scores]
    return anime_df.iloc[anime_indices][['name', 'genre', 'rating']]

In [30]:
# Example usage
print(recommend_anime("Steins;Gate"))

                                                    name  \
59            Steins;Gate Movie: Fuka Ryouiki no Déjà vu   
126                Steins;Gate: Oukoubakko no Poriomania   
196    Steins;Gate: Kyoukaimenjou no Missing Link - D...   
10898                                      Steins;Gate 0   
5126                                       Under the Dog   

                            genre    rating  
59             [Sci-Fi, Thriller]  8.610000  
126            [Sci-Fi, Thriller]  8.460000  
196            [Sci-Fi, Thriller]  8.340000  
10898          [Sci-Fi, Thriller]  6.473902  
5126   [Action, Sci-Fi, Thriller]  6.550000  


# Task 4: Evaluation

**Since the dataset lacks individual user ratings or interaction history,
traditional collaborative filtering metrics (based on user preferences) were not applicable.
Therefore, we evaluated the recommendation system using a genre-overlap heuristic,
which compares the genres of the recommended anime with the genres of the test anime.**

In [47]:
# Reset index to align with features_test row positions
test_df_reset = test_df.reset_index(drop=True)

precision_list, recall_list, f1_list = [], [], []

for i, test_row in test_df_reset.iterrows():
    true_genres = set(test_row['genre'])
    if not true_genres:
        continue

    test_vector = features_test[i].reshape(1, -1)
    similarities = cosine_similarity(test_vector, features_train).flatten()
    top_indices = similarities.argsort()[::-1][1:6]  # Top 5 similar

    recommended_genres = set()
    for idx in top_indices:
        recommended_genres.update(train_df.iloc[idx]['genre'])

    if not recommended_genres:
        continue

    tp = len(true_genres & recommended_genres)
    fp = len(recommended_genres - true_genres)
    fn = len(true_genres - recommended_genres)

    if tp + fp == 0 or tp + fn == 0:
        continue

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Final results
if precision_list:
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)

    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average F1-Score: {avg_f1:.4f}")
else:
    print("No valid comparisons available.")


Average Precision: 0.8823
Average Recall: 0.9888
Average F1-Score: 0.9202
