In [11]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the dataset
file_path = '/content/anime.csv'
anime_df = pd.read_csv(file_path)

# Display the column names and the first few rows of the dataset
print(anime_df.columns)
print(anime_df.head())

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [12]:
# Handling missing values for 'genre' and 'rating'
anime_df['genre'] = anime_df['genre'].fillna(anime_df['genre'].mode()[0])
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())

# Preprocessing the 'episodes' column
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
anime_df['episodes'] = anime_df['episodes'].fillna(anime_df['episodes'].median())

# One-hot encode the genres
one_hot_encoder = OneHotEncoder()
genre_encoded = one_hot_encoder.fit_transform(anime_df[['genre']]).toarray()

# Standardize the ratings and episodes
scaler = StandardScaler()
ratings_scaled = scaler.fit_transform(anime_df[['rating']])
episodes_scaled = scaler.fit_transform(anime_df[['episodes']])

# Combine the features into a single feature set
features = np.hstack((genre_encoded, ratings_scaled, episodes_scaled))

# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(features)


In [13]:
# Function to get recommendations
def get_recommendations(name, cosine_sim=cosine_sim, num_recommendations=10, similarity_threshold=0.5):
    if name not in anime_df['name'].values:
        return f"Anime '{name}' not found in the dataset."

    # Get the index of the anime that matches the title
    idx = anime_df[anime_df['name'] == name].index[0]

    # Get the pairwise similarity scores of all anime with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Filter out anime with similarity scores below the threshold
    sim_scores = [score for score in sim_scores if score[1] >= similarity_threshold]

    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the most similar anime
    sim_scores = sim_scores[1:num_recommendations + 1]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the most similar anime
    return anime_df['name'].iloc[anime_indices]

# Test the recommendation system
print(get_recommendations('Naruto'))

206                                         Dragon Ball Z
582                                                Bleach
628                                         Keroro Gunsou
816     Kochira Katsushikaku Kameari Kouenmae Hashutsu...
1834                                              Pokemon
1960                                     Atashin&#039;chi
1371                              Yu☆Gi☆Oh! Duel Monsters
8992                           Itazura Tenshi Chippo-chan
2362                                Dr. Slump: Arale-chan
907                                        Urusei Yatsura
Name: name, dtype: object


In [22]:
# Evaluation Function
from sklearn.model_selection import train_test_split

def evaluate_recommendation_system(anime_df, cosine_sim, num_recommendations=10):
    # Split the data into training and testing sets
    train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)

    # Placeholder lists for true positives, false positives, and false negatives
    true_positives = []
    false_positives = []
    false_negatives = []

    for name in test_df['name']:
        recommendations = get_recommendations(name, cosine_sim=cosine_sim, num_recommendations=num_recommendations)
        if not isinstance(recommendations, pd.Series):
            continue

        actual_genres = set(anime_df[anime_df['name'] == name]['genre'].values[0].split(', '))
        recommended_genres = set()
        for rec in recommendations:
            recommended_genres.update(anime_df[anime_df['name'] == rec]['genre'].values[0].split(', '))

        # Calculate true positives, false positives, and false negatives
        tp = len(actual_genres & recommended_genres)
        fp = len(recommended_genres - actual_genres)
        fn = len(actual_genres - recommended_genres)

        true_positives.append(tp)
        false_positives.append(fp)
        false_negatives.append(fn)

    # Calculate precision, recall, and F1 score
    tp_sum = sum(true_positives) # This line was incorrectly indented
    fp_sum = sum(false_positives)
    fn_sum = sum(false_negatives)

    precision = tp_sum / (tp_sum + fp_sum) if (tp_sum + fp_sum) > 0 else 0
    recall = tp_sum / (tp_sum + fn_sum) if (tp_sum + fn_sum) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

# Evaluate the recommendation system
precision, recall, f1 = evaluate_recommendation_system(anime_df, cosine_sim)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

Precision: 0.41, Recall: 0.83, F1 Score: 0.55
