## Import libraries

In [None]:
from tqdm.auto import trange
from functools import lru_cache
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import TFIDFRecommender

## Import data

In [None]:
# Load the training set: one record per line of the file 'train'.
# The handle is deliberately NOT called `f`: that global name is used for the
# weighting function defined further down, and re-running this cell would
# otherwise overwrite it.
with open('train') as train_file:
    # Iterating the file keeps the final record even when the file does not
    # end with a newline (slicing off the last split element would drop it).
    train = [line.rstrip('\n') for line in train_file]
    print(len(train))

In [None]:
# Load the test set: one record per line of the file 'test'.
# The handle is deliberately NOT called `f`: that global name is used for the
# weighting function defined further down, and re-running this cell would
# otherwise overwrite it.
with open('test') as tests_file:
    # Iterating the file keeps the final record even when the file does not
    # end with a newline (slicing off the last split element would drop it).
    tests = [line.rstrip('\n') for line in tests_file]
    print(len(tests))

## Data preprocessing

In [None]:
from math import log

# Heuristic positional weights for a user's song sequence: the later a song
# appears in the sequence, the larger its weight (the author marks this
# heuristic as needing improvement).
@lru_cache(None)
def f(n):
    """Return n increasing weights for a sequence of n songs.

    The song at 1-based position ``pos`` gets ``(pos / n) ** p``, so the last
    song always weighs 1.0. The exponent ``p`` depends on the length:

    * ``n < 50``        -> ``log(n, 3)``
    * ``50 <= n < 100`` -> ``log(n, 2)``
    * ``n >= 100``      -> ``4``

    For ``n <= 0`` the result is an empty list. Results are memoised per
    ``n``, so the returned list object is shared between calls and must not
    be mutated by the caller.
    """
    positions = range(1, n + 1)
    if n >= 100:
        return [(pos / n) ** 4 for pos in positions]
    # The logarithm stays inside the comprehension so that it is never
    # evaluated for an empty sequence (log(0) would raise).
    log_base = 3 if n < 50 else 2
    return [(pos / n) ** log(n, log_base) for pos in positions]

# Number of columns of every user x song matrix. Song ids are used directly
# as column indices, so they are expected to be smaller than this value.
ALL_SONGS = 483_275


def lines2csr(lines):
    """Turn per-user song sequences into a weighted users x songs CSR matrix.

    Args:
        lines: sequence of strings, one per user, each a whitespace-separated
            list of integer song ids.

    Returns:
        A ``csr_matrix`` of shape ``(len(lines), ALL_SONGS)``. Row ``u`` holds,
        for every song of user ``u``, the positional weight produced by
        ``f(len(sequence))``. Per the SciPy docs, entries that share the same
        (row, column) pair are summed on construction.
    """
    user_idx, song_idx, weights = [], [], []
    for user, line in enumerate(lines):
        song_ids = [int(token) for token in line.split()]
        count = len(song_ids)
        user_idx.extend([user] * count)
        song_idx.extend(song_ids)
        weights.extend(f(count))
    return csr_matrix(
        (weights, (user_idx, song_idx)),
        shape=(len(lines), ALL_SONGS),
    )

## Create validation tests

In [None]:
# Hold out the last song of every test sequence: X_test keeps the remaining
# songs (re-joined with single spaces), y_test the held-out song id as an int.
X_test, y_test = [], []
for line in tests:
    tokens = line.split()
    X_test.append(' '.join(tokens[:-1]))
    y_test.append(int(tokens[-1]))

In [None]:
%%time
# Build the validation matrix: training users first, followed by the test
# users with their last song held out (X_test). Test users therefore occupy
# the rows starting at index len(train).
csr_train = lines2csr(train + X_test)

## Training

In [None]:
# Validation model.
# NOTE(review): K is presumably the number of neighbours kept per item —
# confirm against the installed `implicit` version.
model = TFIDFRecommender(K=1001)
# csr_train is users x songs, so the transpose passed here is songs x users.
# NOTE(review): which orientation fit() expects depends on the `implicit`
# version — confirm.
model.fit(csr_train.T)

In [None]:
# Validation: mean reciprocal rank at 100 over the first N test users.
mrr = 0
# Never evaluate more users than there are held-out targets.
N = min(50000, len(y_test))
# The matrix was built from `train + X_test`, so test user k lives in row
# len(train) + k. Deriving the offset from the data (instead of hard-coding
# the row number) keeps y_test aligned if the training file changes.
first_test_row = len(train)
for i in trange(first_test_row, first_test_row + N):
    pred = model.recommend(i, csr_train, N=100, filter_already_liked_items=True)
    # Keep only the first field of every recommendation.
    # NOTE(review): presumably the song id of (song_id, score) pairs —
    # confirm against the installed `implicit` version.
    pred = [x[0] for x in pred]

    target = y_test[i - first_test_row]
    if target in pred:
        # Reciprocal of the 1-based rank; a miss contributes 0.
        mrr += 1 / (pred.index(target) + 1)
print(f'MRR@100: {round(mrr / N, 5)}') # 0.06663

### Now delete unnecessary data to free up memory

In [None]:
# Drop the validation-only objects (held-out split, model and matrix) so
# their memory can be reclaimed before the submission model is built.
del X_test, y_test, model, csr_train

## Submission

In [None]:
%%time
# Rebuild the matrix for the final model, this time from the complete test
# sequences (last song included). Test users again follow the training users,
# i.e. they start at row len(train).
csr_train = lines2csr(train + tests)

In [None]:
# Submission model; uses a larger K (1501) than the validation run (1001).
# NOTE(review): K is presumably the number of neighbours kept per item —
# confirm against the installed `implicit` version.
model = TFIDFRecommender(K=1501)
# csr_train is users x songs, so the transpose passed here is songs x users.
model.fit(csr_train.T)

In [None]:
# Build the submission text: one line per test user holding the first field
# (presumably the song id — confirm against the installed `implicit` version)
# of each of the 100 recommendations, separated by spaces.
# Test users occupy the rows after the training users, so the row range is
# derived from the data rather than hard-coded, and without concatenating the
# two big lists merely to measure their combined length.
first_test_row = len(train)
submission_lines = []
for i in trange(first_test_row, first_test_row + len(tests)):
    pred = model.recommend(i, csr_train, N=100, filter_already_liked_items=True)
    submission_lines.append(' '.join(str(x[0]) for x in pred))
# Join once at the end instead of growing a string inside the loop; every
# line, including the last one, is terminated by a newline.
res = ''.join(line + '\n' for line in submission_lines)

In [None]:
# Write the submission text to disk.
# The handle is deliberately NOT called `f`: binding `f` here would replace
# the weighting function `f` used by lines2csr and break any later re-run of
# the matrix-building cells.
with open('sub_tfid', 'w') as submission_file:
    submission_file.write(res)