In [47]:
import os
import pickle

import implicit
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
df = pickle.load(open("../../data/cleaned_data/user_anime_lists.pkl", "rb"))

## TODO remove this

In [3]:
extra = pickle.load(
    open("../../../anime_recommender/data/user_profiles/ExtraUserAnimeLists.pkl", "rb")
)
df = pd.concat(
    [
        df.loc[lambda x: ~x["username"].isin(extra.username)],
        extra[["username", "anime_id", "my_score"]],
    ],
    ignore_index=True,
)

# Normalization

In [4]:
average_rating = df["my_score"].mean()
user_bias = (
    pd.DataFrame(df.groupby("username")["my_score"].mean()).rename(
        {"my_score": "user_bias"}, axis=1
    )
    - average_rating
)
anime_bias = (
    pd.DataFrame(df.groupby("anime_id")["my_score"].mean()).rename(
        {"my_score": "anime_bias"}, axis=1
    )
    - average_rating
)

In [5]:
df = df.merge(anime_bias, on=["anime_id"]).merge(user_bias, on=["username"])
df["normalized_score"] = (
    df["my_score"] - df["anime_bias"] - df["user_bias"] - average_rating
)
df = df.dropna()

## Create a sparse matrix

In [6]:
user_ids = df[["username"]].drop_duplicates().reset_index(drop=True)
user_ids = user_ids.reset_index().rename({"index": "user_id"}, axis=1)

In [7]:
df = df.merge(user_ids, on="username")

In [14]:
# user-item matrix
R = scipy.sparse.coo_matrix((df["normalized_score"], (df["user_id"], df["anime_id"])))

In [18]:
for i, j, v in zip(R.row, R.col, R.data):
    print(i, j, v)
    break

0 21 0.6054736476506148


## Alternating least squares

In [53]:
# number of latent factors
K = 100

# user factors
X = np.random.default_rng().standard_normal((R.shape[0], K))

# item factors
Y = np.random.default_rng().standard_normal((K, R.shape[1]))

In [65]:
X[u, :]

array([ 0.84212542,  0.05647956,  1.38422928, -1.36568152,  1.23047597,
        0.39157219, -1.32036218,  0.74737695,  1.45640601, -0.59180446,
        0.04172253, -0.33977628,  0.63848984, -1.95739427,  0.38294226,
       -0.35605178,  0.33851752, -0.50882693,  1.36308425,  0.85092368,
        1.3924999 , -0.96214204, -0.57909811, -1.34670968, -1.26801485,
        1.49039446,  2.30033795,  0.83923011, -0.50188251, -1.10540208,
        0.3411895 , -1.78600506, -0.69228697, -0.8132054 , -0.2222082 ,
       -0.54112389, -0.83781031, -1.13588837, -0.79517271,  0.12945712,
        1.2039863 , -0.5435222 ,  0.85138593, -1.22453001,  1.45314017,
        1.63273581,  2.09341116, -0.41821475, -0.12447563,  2.31109379,
       -0.59378683,  0.53123804, -0.54914307,  0.3932507 ,  0.62264041,
        0.81384629, -0.04167501, -0.98871111, -1.21091292,  2.49631177,
       -1.71541655, -0.2511828 ,  0.45922761, -0.9633358 , -1.66269875,
       -2.23966565,  0.77250337,  0.06432058, -0.89757292,  0.66

In [66]:
for u, i, r in tqdm(zip(R.row, R.col, R.data)):
    print((X[u, :] @ Y[:, i]) ** 2)
    break

0it [00:00, ?it/s]

194.04008707580823





In [69]:
def training_loss(X, Y, R, λ):
    sse = 0
    N = 0
    for u, i, r in tqdm(zip(R.row, R.col, R.data)):
        sse += (r - X[u, :] @ Y[:, i]) ** 2
        N += 1
    return sse + λ * np.sum(X ** 2) + λ * np.sum(Y ** 2)

In [70]:
training_loss(X, Y, R, 0.1)

46358416it [02:14, 343953.73it/s]


4722489251.045748

In [48]:
user_item_mat[:, 0].asformat("coo")

<271680x1 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in COOrdinate format>

In [54]:
X.T @ user_item_mat[:, 1]

array([[ -66.00332253],
       [-483.78396434],
       [-596.32905136],
       [ 673.88067637],
       [ 281.20442432],
       [-366.30276219],
       [ 653.57415375],
       [-163.59993002],
       [-906.01998437],
       [-589.28197515]])

In [61]:
user_item_mat[:, 1].shape

(271680, 1)

In [60]:
X[:, 0].shape

(271680,)

In [58]:
X.T[0, :] + user_item_mat[:, 1]

ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (271680,)  and requested shape (271680,1)

In [53]:
user_item_mat[:, 1].nonzero()

(array([     1,      2,      4, ..., 236466, 236467, 236468], dtype=int32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int32))

In [44]:
X.T

array([[ 0.06562139,  1.42791229,  0.35396757, ..., -0.84945667,
         0.28283718,  0.2770499 ],
       [ 0.23053862,  0.22440642, -0.33240749, ...,  0.05164753,
         1.26772036, -0.66471765],
       [ 1.46876118,  0.37393615,  0.68481032, ...,  0.9879699 ,
        -0.10498704, -0.19720801],
       ...,
       [ 0.72083975,  0.44724505, -0.29310862, ...,  0.79588235,
         1.93882171,  0.19430441],
       [-0.98483782, -1.2906035 , -0.1195464 , ..., -1.32094051,
        -1.48430006,  0.1173306 ],
       [-0.2262312 , -0.72189462,  0.86276658, ..., -0.64379972,
        -1.34893883, -0.64031335]])

In [35]:
X.T @ X - user_item_mat[:, 0]

ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (10,10)  and requested shape (271680,1)

In [None]:
np.random.normal()

In [10]:
df

Unnamed: 0,username,anime_id,my_score,anime_bias,user_bias,normalized_score,user_id
0,karthiga,21,9,0.960564,-0.059898,0.605474,0
1,karthiga,59,7,0.040203,-0.059898,-0.474165,0
2,karthiga,74,7,0.316282,-0.059898,-0.750245,0
3,karthiga,120,7,0.309858,-0.059898,-0.743821,0
4,karthiga,178,7,-0.227339,-0.059898,-0.206624,0
...,...,...,...,...,...,...,...
46358411,temptemptemp,10040,6,-1.636717,-1.493860,1.636717,271675
46358412,cinnamoroller,12963,10,-0.798860,2.506140,0.798860,271676
46358413,inactiveX,5143,7,-0.652951,-0.493860,0.652951,271677
46358414,omgm,5581,5,-1.857497,-2.493860,1.857497,271678


In [9]:
os.environ["MKL_NUM_THREADS"] = "1"

In [20]:
model = implicit.als.AlternatingLeastSquares(factors=50)

In [21]:
model.fit(sparse_mat)

  0%|          | 0/15 [00:00<?, ?it/s]

In [22]:
recs = pd.DataFrame.from_records(
    model.recommend(19671, sparse_mat, 1000), columns=["anime_id", "score"]
)

In [23]:
seen_shows = df.loc[lambda x: x["username"] == "Fro116"]

In [24]:
recs.merge(seen_shows, on=["anime_id"])[["score", "my_score"]].corr()

Unnamed: 0,score,my_score
score,1.0,0.142057
my_score,0.142057,1.0


In [25]:
anime = pd.read_csv("../../data/cleaned_data/anime.csv")

In [26]:
recs.loc[lambda x: ~x.anime_id.isin(seen_shows.anime_id)].merge(
    anime, on="anime_id"
).loc[lambda x: x["type"] == "TV"][:30]

Unnamed: 0.1,anime_id,score,Unnamed: 0,title,genres,type,related_anime
2,16498,1.125039,8123,Shingeki no Kyojin,"Action, Military, Mystery, Super Power, Drama,...",TV,"[{'anime_id': 18397, 'relation': 'Side story'}..."
5,22319,1.000904,1235,Tokyo Ghoul,"Action, Mystery, Horror, Psychological, Supern...",TV,"[{'anime_id': 27899, 'relation': 'Sequel'}, {'..."
10,30276,0.909803,8863,One Punch Man,"Action, Sci-Fi, Comedy, Parody, Super Power, S...",TV,"[{'anime_id': 31704, 'relation': 'Side story'}..."
11,28223,0.853507,12967,Death Parade,"Game, Mystery, Psychological, Drama, Thriller",TV,"[{'anime_id': 14353, 'relation': 'Alternative ..."
12,7054,0.832874,11,Kaichou wa Maid-sama!,"Comedy, Romance, School, Shoujo",TV,"[{'anime_id': 9366, 'relation': 'Side story'},..."
13,22535,0.82936,9232,Kiseijuu: Sei no Kakuritsu,"Action, Sci-Fi, Horror, Psychological, Drama, ...",TV,[]
15,27899,0.817018,12935,Tokyo Ghoul √A,"Action, Mystery, Horror, Psychological, Supern...",TV,"[{'anime_id': 22319, 'relation': 'Prequel'}, {..."
16,14227,0.809003,13,Tonari no Kaibutsu-kun,"Slice of Life, Comedy, Romance, School, Shoujo",TV,"[{'anime_id': 16866, 'relation': 'Alternative ..."
17,5114,0.793178,2555,Fullmetal Alchemist: Brotherhood,"Action, Military, Adventure, Comedy, Drama, Ma...",TV,"[{'anime_id': 121, 'relation': 'Alternative ve..."
18,6045,0.785958,16,Kimi ni Todoke,"Slice of Life, Drama, Romance, School, Shoujo",TV,"[{'anime_id': 9656, 'relation': 'Sequel'}, {'a..."


In [19]:
recs.loc[lambda x: ~x.anime_id.isin(seen_shows.anime_id)].merge(
    anime, on="anime_id"
).loc[lambda x: x["type"] == "TV"][:30]

Unnamed: 0.1,anime_id,score,Unnamed: 0,title,genres,type,related_anime
0,16498,0.897526,8123,Shingeki no Kyojin,"Action, Military, Mystery, Super Power, Drama,...",TV,"[{'anime_id': 18397, 'relation': 'Side story'}..."
1,5114,0.84304,2555,Fullmetal Alchemist: Brotherhood,"Action, Military, Adventure, Comedy, Drama, Ma...",TV,"[{'anime_id': 121, 'relation': 'Alternative ve..."
2,6746,0.833527,9555,Durarara!!,"Action, Mystery, Supernatural",TV,"[{'anime_id': 8408, 'relation': 'Side story'},..."
5,4898,0.790061,13711,Kuroshitsuji,"Action, Comedy, Demons, Fantasy, Historical, S...",TV,"[{'anime_id': 6163, 'relation': 'Summary'}, {'..."
6,1,0.762213,9132,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,"[{'anime_id': 5, 'relation': 'Side story'}, {'..."
7,7054,0.761894,11,Kaichou wa Maid-sama!,"Comedy, Romance, School, Shoujo",TV,"[{'anime_id': 9366, 'relation': 'Side story'},..."
8,13601,0.759568,3226,Psycho-Pass,"Action, Police, Psychological, Sci-Fi",TV,"[{'anime_id': 17901, 'relation': 'Other'}, {'a..."
9,20,0.758924,4504,Naruto,"Action, Adventure, Comedy, Super Power, Martia...",TV,"[{'anime_id': 442, 'relation': 'Side story'}, ..."
10,9919,0.751778,5437,Ao no Exorcist,"Action, Demons, Fantasy, Shounen, Supernatural",TV,"[{'anime_id': 10647, 'relation': 'Side story'}..."
11,3588,0.750829,8453,Soul Eater,"Action, Fantasy, Comedy, Supernatural, Shounen",TV,"[{'anime_id': 21195, 'relation': 'Summary'}, {..."


In [None]:
anime

In [36]:
Y = np.random.rand(2000000, 10)

In [37]:
(Y.T @ Y).shape

(10, 10)

In [33]:
np.matmul?

[0;31mCall signature:[0m  [0mnp[0m[0;34m.[0m[0mmatmul[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m            ufunc
[0;31mString form:[0m     <ufunc 'matmul'>
[0;31mFile:[0m            ~/opt/anaconda3/lib/python3.8/site-packages/numpy/__init__.py
[0;31mDocstring:[0m      
matmul(x1, x2, /, out=None, *, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj])

Matrix product of two arrays.

Parameters
----------
x1, x2 : array_like
    Input arrays, scalars not allowed.
out : ndarray, optional
    A location into which the result is stored. If provided, it must have
    a shape that matches the signature `(n,k),(k,m)->(n,m)`. If not
    provided or None, a freshly-allocated array is returned.
**kwargs
    For other keyword-only arguments, see the
    :ref:`ufunc docs <ufuncs.kwargs>`.

    .. versionadded:: 1.16
       Now handles ufunc kwargs

Returns
-------
y : n