In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# For the genre_vector.
genre_map = ['Comedy',
 'Action',
 'Adventure',
 'Fantasy',
 'Sci-Fi',
 'Drama',
 'Shounen',
 'Kids',
 'Romance',
 'School',
 'Slice of Life',
 'Hentai',
 'Supernatural',
 'Mecha',
 'Music',
 'Historical',
 'Magic',
 'Ecchi',
 'Shoujo',
 'Seinen',
 'Sports',
 'Mystery',
 'Super Power',
 'Military',
 'Parody',
 'Space',
 'Horror',
 'Harem',
 'Demons',
 'Martial Arts',
 'Dementia',
 'Psychological',
 'Police',
 'Game',
 'Samurai',
 'Vampire',
 'Thriller',
 'Cars',
 'Shounen Ai',
 'NaN',
 'Shoujo Ai',
 'Josei',
 'Yuri',
 'Yaoi']

In [2]:
anime = pd.read_csv('assignment_2_anime.csv')
rtrain = pd.read_csv('assignment_2_ratings_train.csv')
rtest = pd.read_csv('assignment_2_ratings_test.csv')

# preprocess rtrain as discussed in explore.ipynb
rtrain = rtrain.drop_duplicates(subset=['user_id', 'anime_id'], keep='last')
rtest = rtest.drop_duplicates(subset=['user_id', 'anime_id'], keep='last')

In [3]:
train_users = rtrain.user_id.unique()
test_users = rtest.user_id.unique()

In [4]:
print(f'There are {train_users.size} unique users in the training set.')
print(f'There are {test_users.size} unique users in the test set.')
print(f'{np.intersect1d(train_users, test_users).size} users are in both sets.')

There are 68421 unique users in the training set.
There are 64627 unique users in the test set.
63448 users are in both sets.


In [5]:
# dictionaries to store the mean rating for each user and anime
user_mean = rtrain.groupby('user_id')[['rating']].mean().to_dict()['rating']
anime_mean = rtrain.groupby('anime_id')[['rating']].mean().to_dict()['rating']
all_mean: float = rtrain.rating.mean()

In [6]:
# evaluate using user_mean
MSE = 0
for row in rtest.itertuples():
    try:
        user_mean_pred = user_mean[row.user_id]
    except KeyError:
        user_mean_pred = all_mean
    MSE += (row.rating - user_mean_pred) ** 2
MSE /= rtest.shape[0]
MSE

1.8638812628457382

In [7]:
# evaluate using anime_mean
MSE = 0
for row in rtest.itertuples():
    try:
        anime_mean_pred = anime_mean[row.anime_id]
    except KeyError:
        anime_mean_pred = all_mean
    MSE += (row.rating - anime_mean_pred) ** 2
MSE /= rtest.shape[0]
MSE

2.0626857671311813

In [8]:
# evaluate using all_mean
MSE = 0
for row in rtest.itertuples():
    MSE += (row.rating - all_mean) ** 2
MSE /= rtest.shape[0]
MSE

2.472705125578593

# Singular Value Decomposition

using https://surpriselib.com/

In [10]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load your dataframes
anime_df = anime
rtrain_df = rtrain
rtest_df = rtest

# Create a Reader object for parsing the ratings dataframes
reader = Reader(rating_scale=(1, 10))

# Load trainset and testset from your pre-split rtrain and rtest dataframes
train_data = Dataset.load_from_df(rtrain_df, reader)
trainset = train_data.build_full_trainset()

test_data = Dataset.load_from_df(rtest_df, reader)
testset = test_data.construct_testset(raw_testset=test_data.raw_ratings)

# Train the SVD algorithm on the trainset
algo = SVD()
algo.fit(trainset)

# Test the algorithm on the testset
predictions = algo.test(testset)

# Calculate the mean squared error
mse = accuracy.mse(predictions)

# Function to predict the rating
def predict_rating(user_id, anime_id):
    return algo.predict(user_id, anime_id).est

# Example usage
user_id = 1
anime_id = 21 # One Piece
predicted_rating = predict_rating(user_id, anime_id)
print(f"The predicted rating for user {user_id} and anime {anime_id} is {predicted_rating:.2f}")

MSE: 1.3164
The predicted rating for user 1 and anime 21 is 8.09


# Apriori Algorithm

using https://github.com/rasbt/mlxtend

In [19]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Convert user_id, anime_id pairs into lists of animes watched by each user
watched_animes = rtrain_df.groupby("user_id")["anime_id"].apply(list)

# Use TransactionEncoder to encode the dataset into a one-hot encoded DataFrame
te = TransactionEncoder()
te_ary = te.fit(watched_animes).transform(watched_animes)
watched_animes_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Apply the Apriori algorithm to find frequent itemsets
min_support = 0.05
frequent_itemsets = apriori(watched_animes_encoded, min_support=min_support, use_colnames=True)

# Generate association rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Function to generate recommendations based on the Apriori algorithm
def apriori_recommendations(user_id, top_n=10):
    watched = set(rtrain_df[rtrain_df["user_id"] == user_id]["anime_id"].tolist())
    candidate_rules = rules[rules["antecedents"].apply(lambda x: x.issubset(watched))]
    candidate_rules = candidate_rules.sort_values("confidence", ascending=False)
    
    recommendations = []
    for _, row in candidate_rules.iterrows():
        new_recommendations = list(row["consequents"] - watched)
        recommendations.extend(new_recommendations)
        if len(recommendations) >= top_n:
            break
    
    return recommendations[:top_n]

Recommended animes for user 1: []


In [21]:
# Example usage
user_id = 44017
recommended_animes = apriori_recommendations(user_id, top_n=10)
print(f"Recommended animes for user {user_id}: {recommended_animes}")

Recommended animes for user 44017: [1195, 1195, 2904, 2904, 2904, 2904, 2904, 2904, 2904, 1535]


Keep in mind that the generated recommendations will not have predicted ratings, as the Apriori algorithm is not designed for rating prediction.