In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
# Kaggle input directory that holds both CSV files of the dataset.
_DATASET_DIR = '/kaggle/input/anime-recommendation-database-2020'

# Individual user ratings and per-anime metadata, respectively.
ratings_path = _DATASET_DIR + '/animelist.csv'
data_path = _DATASET_DIR + '/anime.csv'

The dataset that we are dealing with is divided into two parts, the first part which we are going to store as `anime_ratings` consists of the individual user ratings for different animes<br>
The second part, `anime_data` consists of overall information about the anime such as genre, ratings, name, etc.

# Preprocessing

In [None]:
# Read both CSV files in one pass: user ratings first, then anime metadata.
anime_ratings, anime_data = (
    pd.read_csv(csv_path) for csv_path in (ratings_path, data_path)
)

In [None]:
# Summarise the raw metadata frame: column names, dtypes and non-null counts.
anime_data.info()

1) `anime_data` contains:

* MAL_ID: MyAnimelist ID of the anime. (e.g. 1)
* Name: full name of the anime. (e.g. Cowboy Bebop)
* Score: average score of the anime given from all users in MyAnimelist database. (e.g. 8.78)
* Genres: comma separated list of genres for this anime. (e.g. Action, Adventure, Comedy, Drama, Sci-Fi, Space)
* English name: full name of the anime in English. (e.g. Cowboy Bebop)
* Japanese name: full name of the anime in Japanese. (e.g. カウボーイビバップ)
* Type: TV, movie, OVA, etc. (e.g. TV)
* Episodes: number of episodes. (e.g. 26)
* Aired: broadcast date. (e.g. Apr 3, 1998 to Apr 24, 1999)
* Premiered: season premiere. (e.g. Spring 1998)
* Producers: comma separated list of producers (e.g. Bandai Visual)
* Licensors: comma separated list of licensors (e.g. Funimation, Bandai Entertainment)
* Studios: comma separated list of studios (e.g. Sunrise)
* Source: Manga, Light novel, Book, etc. (e.g Original)
* Duration: duration of the anime per episode (e.g 24 min. per ep.)
* Rating: age rate (e.g. R - 17+ (violence & profanity))
* Ranked: position based on the score. (e.g 28)
* Popularity: position based on the number of users who have added the anime to their list. (e.g 39)
* Members: number of community members that are in this anime's "group". (e.g. 1251960)
* Favorites: number of users who have the anime as "favorites". (e.g. 61,971)
* Watching: number of users who are watching the anime. (e.g. 105808)
* Completed: number of users who have completed the anime. (e.g. 718161)
* On-Hold: number of users who have the anime on Hold. (e.g. 71513)
* Dropped: number of users who have dropped the anime. (e.g. 26678)
* Plan to Watch: number of users who plan to watch the anime. (e.g. 329800)
* Score-10: number of users who scored 10. (e.g. 229170)
* Score-9: number of users who scored 9. (e.g. 182126)
* Score-8: number of users who scored 8. (e.g. 131625)
* Score-7: number of users who scored 7. (e.g. 62330)
* Score-6: number of users who scored 6. (e.g. 20688)
* Score-5: number of users who scored 5. (e.g. 8904)
* Score-4: number of users who scored 4. (e.g. 3184)
* Score-3: number of users who scored 3. (e.g. 1357)
* Score-2: number of users who scored 2. (e.g. 741)
* Score-1: number of users who scored 1. (e.g. 1580)

In [None]:
# Preview the first rows of the metadata frame.
anime_data.head()

As we are going to build a simplistic model, we drop most of our data to allow for easier computation:

In [None]:
# Columns retained for the simplified model; every other column is discarded.
_kept_columns = ['MAL_ID', 'Name', 'Score', 'Genres', 'Type', 'Episodes', 'Members']
anime_data = anime_data.loc[:, _kept_columns]

In [None]:
# Give the id column the same name as in the ratings table so that the two
# frames can later be merged on 'anime_id'.
anime_data = anime_data.rename(columns={'MAL_ID': "anime_id"})

In [None]:
# Re-check the metadata frame after trimming it to the selected columns.
anime_data.info()

In [None]:
# Summarise the raw ratings frame: columns, dtypes and memory footprint.
anime_ratings.info()

In [None]:
# Keep only the first three columns; everything from position 3 onwards is dropped.
_surplus_columns = anime_ratings.columns[3:]
anime_ratings = anime_ratings.drop(columns=_surplus_columns)

In [None]:
# Confirm that only the first three columns of the ratings frame remain.
anime_ratings.info()

In [None]:
# Number of distinct anime present in the full ratings table (before sampling).
anime_ratings.anime_id.nunique()

In [None]:
# anime_complete = pd.merge(anime_data,anime_ratings,on='anime_id')
# anime_complete=anime_complete.rename(columns={'rating':'user_rating','Score':'total_rating'})
# anime_complete.info()

The problem being faced above is that the resultant df is too big to work with (8.1+ GB!)<br>
To fix this, we downsample the anime_ratings df and then try creating the `anime_complete` df

In [None]:
# Down-sample to 20% of the rating rows so that the merged frame fits in memory.
# A fixed random_state makes the sample, and every result derived from it,
# reproducible across kernel restarts.
anime_ratings = anime_ratings.sample(frac=0.2, random_state=42)
anime_ratings.info()

In [None]:
# Number of distinct anime that survive the down-sampling step.
anime_ratings.anime_id.nunique()

We see that some animes are completely lost; we proceed anyway, as it is not a significant number.

We combine the two separate dataframes into a single dataframe:

In [None]:
# Join metadata and ratings on the shared 'anime_id' key, then give the
# rating, score and name columns unambiguous names.
_column_renames = {'rating': 'user_rating', 'Score': 'total_rating', 'Name': 'anime_title'}
anime_complete = anime_data.merge(anime_ratings, on='anime_id').rename(columns=_column_renames)
anime_complete.info()

In [None]:
# Count missing values per column in the merged frame.
anime_complete.isna().sum()

In [None]:
import matplotlib.pyplot as plt

# Ten titles with the most rows in the merged frame.
top_10_anime = anime_complete['anime_title'].value_counts().nlargest(10)
palette = sns.color_palette('rocket', len(top_10_anime))

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
ax.bar(top_10_anime.index, top_10_anime.values, color=palette)
ax.set_title('Top 10 Anime by User Rating Count')
ax.set_xlabel('Anime Name')
ax.set_ylabel('User Rating Count')

# Slant the tick labels so that long titles stay readable.
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")

plt.show()

In [None]:
# Sort by member count (largest first), keep one row per title, take the top ten.
top_10_anime = (
    anime_complete
    .sort_values(by='Members', ascending=False)
    .drop_duplicates(subset='anime_title')
    .head(10)
)
palette = sns.color_palette('rocket', len(top_10_anime))

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
ax.bar(top_10_anime['anime_title'], top_10_anime['Members'], color=palette)
ax.set_title('Top 10 Anime by Number of Members')
ax.set_xlabel('Anime Title')
ax.set_ylabel('Number of Members')

# Slant the tick labels so that long titles stay readable.
plt.setp(ax.get_xticklabels(), rotation=40, ha='right')

plt.show()

In [None]:
# Work on a copy so that the filtering and title cleaning applied to
# anime_features leave anime_complete untouched.
anime_features = anime_complete.copy()
anime_features.head()

In [None]:
# Count missing values per column in the working copy.
anime_features.isnull().sum()

In [None]:
# Number of rows per user_id; value_counts() returns them in descending order.
user_id_counts = anime_features['user_id'].value_counts()
user_id_counts

In [None]:
# Distribution of rows-per-user, used to judge the activity threshold below.
user_id_counts.describe()

In order to consider only the reviews of "trusted" members, we treat as trustworthy those users who have reviewed at least a certain number (100 in our case) of animes; the reviews of all other users are dropped.

In [None]:
# Keep only the rows whose user_id appears at least 100 times in anime_features.
anime_features = anime_features[anime_features['user_id'].isin(user_id_counts[user_id_counts >= 100].index)]

In [None]:
# Number of distinct users left after the activity filter.
anime_features.user_id.nunique()

Since the title text was not found to be clean, we define a function to clean the title names using regex:

In [None]:
def text_cleaning(text):
    """Remove HTML-entity residue and the ".hack//" prefix from an anime title.

    The substitutions are applied in the order listed. The two specific
    apostrophe rules run before the generic ``&#039;`` rule; in the opposite
    order the generic rule would consume every ``&#039;`` first and the
    specific rules could never match.

    Args:
        text: Raw title string.

    Returns:
        The cleaned title string.
    """
    substitutions = (
        (r'&quot;', ''),
        # Escaped dot: only a literal ".hack//" is removed, not any
        # character followed by "hack//".
        (r'\.hack//', ''),
        (r'A&#039;s', ''),
        (r'I&#039;', "I'"),
        # Any remaining encoded apostrophe is dropped.
        (r'&#039;', ''),
        (r'&amp;', 'and'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [None]:
# Apply text_cleaning to every title in the ratings-derived frame.
anime_features['anime_title'] = anime_features['anime_title'].apply(text_cleaning)

In [None]:
# Build the title x user rating matrix: one row per anime title, one column per
# user. pivot_table aggregates repeated (title, user) pairs with its default
# aggregation (mean); combinations without a rating are filled with 0.
anime_pivot=anime_features.pivot_table(index='anime_title',columns='user_id',values='user_rating').fillna(0)
anime_pivot.head()

# Collaborative Filtering

Collaborative filtering is a type of recommendation algorithm that predicts a user's preference for an item by finding patterns in the preferences of similar users. It works by analyzing a large dataset of user-item interactions, such as ratings or purchase histories, and then identifies users with similar patterns of interactions. The algorithm then uses these similarities to recommend items that the target user has not interacted with, but that similar users have rated highly. Collaborative filtering is widely used in recommendation systems for online retailers, streaming services, and social media platforms, among others.


## Cosine Similarity using KNN

Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space. In the context of recommendation systems, cosine similarity is often used to determine how similar two items or users are based on their feature vectors.

In [None]:
# Clean the metadata names with the same helper used for the rating titles;
# these names later serve as lookup keys for the content-based recommender.
anime_data['Name'] = anime_data['Name'].apply(text_cleaning)

Sparse matrix was created to optimize the memory usage and computational efficiency of the model. The user-anime matrix can be very large and the majority of the entries are likely to be zeros, which means that they don't contribute to the similarity computations. By converting the matrix to a sparse format, we can represent it using less memory, and perform computations only on the non-zero entries. This can significantly speed up the model training and recommendation generation processes.


In [None]:
# Import the necessary libraries
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Convert the pivot table (rows = anime titles, columns = users) to a
# compressed sparse row matrix.
anime_matrix = csr_matrix(anime_pivot.values)

# Nearest-neighbour model using cosine distance with an exhaustive
# (brute-force) search; the number of neighbours is chosen at query time.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit the model to the sparse matrix, so every anime row becomes a candidate
# neighbour.
model_knn.fit(anime_matrix)

In [None]:
# import pickle
# with open('model.pkl', 'wb') as f:
#     pickle.dump(model_knn, f)
# with open('anime_pivot.pkl', 'wb') as f:
#     pickle.dump(anime_pivot, f)

In [None]:
# Select a random anime title, or input your own title here
anime_title = np.random.choice(anime_pivot.index)

# Print the selected anime title
print(f"Randomly selected anime title: {anime_title} \n")

# Find the row position of the selected anime title
query_index = anime_pivot.index.get_loc(anime_title)

# Ask the fitted KNN model for the 6 nearest rows. The results are stored under
# neighbor_* names so that this cell does not rebind the notebook-level name
# `indices`, which the content-based recommender uses as its title lookup.
neighbor_distances, neighbor_indices = model_knn.kneighbors(
    anime_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6
)

# Print the recommendations for the selected row
print(f"Recommendations for {anime_pivot.index[query_index]}:\n")

# Skip the first hit (expected to be the query row itself) and number the
# remaining neighbours from 1.
for rank, (distance, index) in enumerate(
    zip(neighbor_distances.flatten()[1:], neighbor_indices.flatten()[1:]), start=1
):
    print(f"{rank}: {anime_pivot.index[index]}, with distance of {distance}")

Code above is doing the following:

* It selects a random anime title from the index of the pivot table using np.random.choice() and looks up the position of its row, which is assigned to query_index.

* It uses the fitted KNN model to find the 6 nearest neighbors to the selected row by calling the kneighbors() method and passing the values of the selected row as a reshaped array. kneighbors() returns two arrays: the distances to the nearest neighbors and the row indices of those neighbors.

* It prints the recommendations for the selected row by using an f-string to format the row index.

* It iterates over the nearest neighbors (skipping the first one, which is the selected row itself) and prints their names and distances using an f-string and the zip() function to iterate over the distances and indices arrays simultaneously. The enumerate() function supplies the counter that numbers the printed recommendations from 1.

In [None]:
def give_rec_knn(anime_title=None, anime_pivot=anime_pivot, n_recommendations=5):
    """Print the nearest neighbours of an anime according to the fitted KNN model.

    Args:
        anime_title: Title to look up in ``anime_pivot.index``. When omitted
            (``None``) a title is drawn at random on every call. The previous
            default was evaluated once, when the function was defined, so
            every call without an argument reused the same title.
        anime_pivot: Title x user rating matrix whose rows are queried.
            NOTE(review): ``model_knn`` is a notebook-level model; this
            presumably has to be the same matrix the model was fitted on.
        n_recommendations: How many neighbours to print (default 5, which
            matches the previous fixed behaviour).

    Raises:
        KeyError: If ``anime_title`` is not present in ``anime_pivot.index``.
    """
    if anime_title is None:
        anime_title = np.random.choice(anime_pivot.index)
        print(f"Randomly selected anime title: {anime_title} \n")
    else:
        # Do not call a user-supplied title "randomly selected".
        print(f"Selected anime title: {anime_title} \n")

    # Row position of the selected title in the pivot table.
    query_index = anime_pivot.index.get_loc(anime_title)
    query_vector = anime_pivot.iloc[query_index, :].values.reshape(1, -1)

    # One extra neighbour is requested because the first hit is skipped below
    # (it is expected to be the query row itself).
    distances, indices = model_knn.kneighbors(query_vector, n_neighbors=n_recommendations + 1)

    print(f"Recommendations for {anime_pivot.index[query_index]}:\n")

    # Number the remaining neighbours from 1.
    for rank, (distance, index) in enumerate(
        zip(distances.flatten()[1:], indices.flatten()[1:]), start=1
    ):
        print(f"{rank}: {anime_pivot.index[index]}, with distance of {distance}")

In [None]:
# Example calls: one with an explicit title, one relying on the default title.
give_rec_knn("Steins;Gate")
give_rec_knn()

# Content Based Filtering

Content-based filtering is a recommendation system technique that recommends items based on their intrinsic features or attributes. It identifies items similar to the ones the user has shown interest in and recommends them. For example, a movie recommendation system might recommend other movies with similar genres, actors, directors, or plot themes to those previously watched by the user. Content-based filtering does not rely on the preferences of other users and can work well for new or niche items with little user data. However, it may suffer from limited diversity in recommendations and inability to capture serendipitous recommendations.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over 1- to 3-grams; terms occurring in fewer than three
# documents are ignored and English stop words are removed.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# Missing genre entries become empty strings so that they can be vectorised.
genres_filled = anime_data['Genres'].fillna('')
anime_data['Genres'] = genres_filled

# Split on commas, then cast each resulting list back to a single string,
# which is the form the vectoriser tokenises.
genres_str = genres_filled.str.split(',').astype(str)

# Learn the vocabulary and produce the sparse document-term matrix.
tfv_matrix = tfv.fit_transform(genres_str)

# Print the shape of the sparse matrix
print(tfv_matrix.shape)

TF-IDF stands for Term Frequency-Inverse Document Frequency.<br>
TF-IDF vectorizer is a specific implementation of the TF-IDF technique. It is a commonly used technique in natural language processing to convert a collection of text documents into numerical feature vectors, which can be used for machine learning tasks like text classification or clustering.
* It first counts the number of occurrences of each word (term) in each document (text) in the collection, and then calculates a weight for each term based on how frequently it appears across all documents.
* The weight is higher for terms that appear frequently in a particular document, but not so much in other documents. 
* The weight is also higher for terms that appear less frequently across all documents.

This helps to give more importance to words that are relevant to a specific document and less importance to common words that are not specific to any document.



In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel between every pair of rows of the TF-IDF matrix;
# no gamma or coef0 is passed, so scikit-learn's defaults are used.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

The sigmoid kernel is a type of kernel function used in machine learning for non-linear classification and regression. It maps the data into a higher-dimensional space and computes the dot product between two data points in that space. 
* The sigmoid_kernel is a similarity function that computes the sigmoid kernel between two input feature vectors.

* It is commonly used in machine learning for non-linear classification and regression tasks.

* The sigmoid kernel function takes two feature vectors as input and computes a value between -1 and 1; the larger the value, the more similar the two vectors are.

* As implemented in scikit-learn, the sigmoid kernel applies the hyperbolic tangent to a scaled and shifted dot product of the two feature vectors, $\tanh(\gamma \langle x, y \rangle + c_0)$, which squashes the dot product into the range $(-1, 1)$.

* The `sigmoid_kernel` is being used to compute the similarity between anime genres based on their TF-IDF feature vectors, which can be used for content-based recommendation systems.

In [None]:
# Create a Pandas Series object where the index is the anime names and the
# values are the index labels of anime_data.
# NOTE(review): give_rec_cbf uses these values as row positions into `sig`,
# which assumes anime_data still has its default 0..n-1 index.
indices = pd.Series(anime_data.index, index=anime_data['Name'])

# Keep only the first entry for each anime name. Series.drop_duplicates()
# compares the *values*, which are already unique here, so it never removed a
# duplicated name; deduplicate on the index instead so that a lookup by name
# always yields a single value.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def give_rec_cbf(title, sig=sig):
    """Return the 10 anime whose genre vectors are most similar to ``title``.

    Args:
        title: Anime name to look up in the notebook-level ``indices`` Series.
        sig: Pairwise similarity matrix; row ``i`` holds the similarity of
            anime ``i`` to every anime.

    Returns:
        DataFrame with columns 'Anime name' and 'Rating' for the 10 best
        matches, most similar first, never including the query anime itself.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Get the index corresponding to the anime title
    idx = indices[title]
    # A duplicated name makes the lookup return a Series; use its first entry.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank all anime by descending similarity. The stable sort keeps tied
    # scores in their original row order.
    ranked = np.argsort(-np.asarray(sig[idx]), kind='stable')

    # Remove the query anime explicitly rather than assuming it sorts first:
    # an anime with an identical genre string ties with it and may precede it.
    anime_indices = ranked[ranked != idx][:10]

    # Create dataframe of top 10 recommended anime
    top_anime = pd.DataFrame({
        'Anime name': anime_data['Name'].iloc[anime_indices].values,
        'Rating': anime_data['Score'].iloc[anime_indices].values
    })

    return top_anime

In [None]:
# Example: content-based recommendations for a given title.
give_rec_cbf('One Piece')

# Extra Stuff:

## Repository for Streamlit App: [here](https://github.com/5ham5h33r/Anime_Recommendation_System)
## PyPi Package: [here](https://pypi.org/project/animrec/)