In [1]:
import requests
from itertools import islice 
import io
import json
import pandas as pd
import numpy as np
import zipfile as zf
from scipy import sparse as sp
from tqdm.notebook import tqdm
from urllib.parse import quote as qt
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import CosineRecommender, bm25_weight
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRanker
from gensim.models import Word2Vec, KeyedVectors
from multiprocessing import Pool
from collections import defaultdict, Counter

In [2]:
# Yandex.Disk public API: ask for a download link to one file of a shared folder,
# then fetch the archive into memory.
base_url = 'https://cloud-api.yandex.net/v1/disk/resources/download?'  # not used in this cell
folder_url = 'https://disk.yandex.ru/d/SI1aAooPn9i8TA'
file_url = 'likes_data.zip'
url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download' + '?public_key=' + qt(folder_url) + '&path=/' + qt(file_url)

r = requests.get(url, timeout=30)  # request the download link
r.raise_for_status()  # surface the HTTP error instead of a KeyError on 'href' below
h = r.json()['href']  # pull the download link out of the JSON reply

# The body is read in full right away, so streaming mode brings nothing here.
download_response = requests.get(h, timeout=60)
download_response.raise_for_status()  # never hand an error page to ZipFile
# `z` stays open at module level: SplitData.read_data opens archive members through it.
z = zf.ZipFile(io.BytesIO(download_response.content))

with z.open('track_artists.csv') as f:
    df_tracks = pd.read_csv(f)

In [3]:
class ItemEncoder:
    '''
    Two-way lookup between matrix positions and item ids.

    Attributes:
        item_idx: dict mapping position -> item id.
        item_pid: dict mapping item id -> position.
    '''

    def __init__(self, items):
        '''
        Build both lookup tables.

        items: either a DataFrame whose first column holds the item ids
            (the frame's index labels are used as positions), or any
            iterable of item ids (positions follow enumeration order).
        '''
        if isinstance(items, pd.DataFrame):
            ids = items[items.columns[0]]
            self.item_idx = ids.to_dict()
            # Built from the index directly, so a named index works as well.
            self.item_pid = dict(zip(ids.tolist(), items.index.tolist()))
        else:
            # Same orientation as the DataFrame branch: to_pid() and
            # make_csr_data() expect item_pid to map id -> position.
            self.item_idx = dict(enumerate(items))
            self.item_pid = {pid: idx for idx, pid in self.item_idx.items()}

    def to_idx(self, items):
        '''
        Get item ids by position.

        items: sequence of positions (anything int() accepts).
        Returns a list of item ids; for a single-element input the bare id
        is returned instead of a list (kept for existing callers).
        Raises KeyError for an unknown position.
        '''
        if len(items) == 1:
            return self.item_idx[int(items[0])]
        return [self.item_idx[int(x)] for x in items]

    def to_pid(self, p_list):
        '''
        Get positions by item id.

        p_list: sequence of item ids (anything int() accepts).
        Returns a list of positions, always a list.
        Raises KeyError for an unknown item id.
        '''
        return [self.item_pid[int(x)] for x in p_list]

    def make_csr_data(self, data):
        '''
        Build the user/item matrix.

        data: iterable of users, each a sequence of item ids.
        Returns a float64 CSR matrix with one row per user and a stored 1.0
        for every listed item. No shape is passed, so the number of columns
        is inferred by scipy from the largest position present.
        '''
        indptr = [0]
        indices = []
        for user in tqdm(data):
            indices.extend(self.to_pid(user))
            # Row boundary: everything appended so far belongs to earlier rows.
            indptr.append(len(indices))
        values = np.ones(len(indices), dtype='float64')
        return sp.csr_matrix((values, indices, indptr), dtype='float64')


class CooccurrenceRecommender:
    '''
    Next-item recommender built on co-occurrence counts.

    For every item seen in the second half of a user history, `fit` counts
    which item stands `dist` positions after it; `predict` returns the most
    frequent followers of one item of each user.

    Attributes:
        d: dict str(item) -> {follower item: count}.
        distance: offset between an item and the follower that is counted.
    '''

    def __init__(self, dist = 1, ):
        self.d = defaultdict(dict)
        self.distance = dist

    def fit(self, X):
        '''
        Accumulate follower counts from user histories.

        X: iterable of users, each an indexable sequence of items.
        Only positions from len(user) // 2 onwards are used as the left
        item of a pair; pairs whose follower falls outside the history
        are skipped.
        '''
        for user in tqdm(X):
            end = len(user)
            for i in range(end // 2, end):
                j = i + self.distance
                # Explicit bounds check instead of a blanket try/except,
                # so genuine errors are no longer swallowed.
                if not 0 <= j < end:
                    continue
                # Keys are stored as str because predict() looks them up
                # with str(); this keeps integer histories usable too.
                followers = self.d[str(user[i])]
                followers[user[j]] = followers.get(user[j], 0) + 1

    def predict(self, X, pos=-1, topn=100):
        '''
        Recommend followers for one item of every user.

        X: iterable of users, each an indexable sequence of items.
        pos: index of the item within each history to look followers up for.
        topn: maximum number of followers to return per user.
        Returns a list with one list of follower items per user, most
        frequent first; empty when the item was never seen or the history
        is empty.
        '''
        result = []
        for user in tqdm(X):
            if len(user) == 0:
                result.append([])
                continue
            # .get() avoids inserting an empty entry for every unseen item.
            followers = self.d.get(str(user[pos]), {})
            result.append([item for item, _ in Counter(followers).most_common(topn)])
        return result


class SplitData:
    '''
    Reads one member of the module-level archive `z` and cuts every line
    (one user per line, items separated by single spaces) into a head and
    a tail part.

    Attributes:
        path: name of the archive member to read.
        size: fraction of each user's items that goes into the head part.
    '''

    def __init__(self, path, size=0.7, ):
        self.path = path
        self.size = size

    def read_data(self):
        '''
        Return a one-shot iterator over users; each user is a list of str tokens.
        '''
        def parse(raw_line):
            return raw_line.decode('ascii').strip().split(' ')

        # All lines are read while the member is open; parsing happens lazily.
        with z.open(self.path) as member:
            raw_lines = member.readlines()
        return map(parse, raw_lines)

    def _cut(self, user):
        # Index separating the head from the tail of one user's items.
        return int(len(user) * self.size)

    def split_X1(self):
        '''
        Yield the head part (first `size` fraction) of every user.
        '''
        for user in self.read_data():
            yield user[:self._cut(user)]

    def split_X2(self):
        '''
        Yield the tail part (everything after the head) of every user.
        '''
        for user in self.read_data():
            yield user[self._cut(user):]


def score(X, y):
    '''
    Mean reciprocal rank of the true item in the recommendation lists.

    X: iterable of recommendation lists, one per user; items may be ints
        or numeric strings.
    y: sequence of true items, aligned with X by position.
    Prints the value and returns it as a float. A user whose true item is
    not in the list contributes 0; an empty `y` yields 0.0.
    '''
    total = 0.0
    for k, user in enumerate(tqdm(X)):
        target = int(y[k])
        # Single pass: stop at the first hit, its 1-based position is the rank.
        for position, item in enumerate(user, start=1):
            if int(item) == target:
                total += 1 / position
                break
    result = total / len(y) if len(y) else 0.0
    print('Scor: {}'.format(result))
    return result

def make_y(data):
    '''
    Split every sequence into its history and its last item.

    data: iterable of non-empty sequences of numeric tokens.
    Returns (X, y): X holds each sequence without its last element,
    converted to int; y holds the last elements unchanged.
    '''
    histories, targets = [], []
    for sequence in tqdm(data):
        histories.append([int(token) for token in sequence[:-1]])
        targets.append(sequence[-1])
    return histories, targets

def make_top_als(model, X_test,  X_gbm=None, n=100):
    '''
    Collect top-n recommendations for every row of X_test into one array.

    model: object whose recommend(user_ids, user_items, N=..., ...) returns
        a pair (item positions, scores), both indexable per user.
    X_test: user/item matrix; one recommendation list is made per row.
    X_gbm: optional data passed to get_target() to add a target row.
    n: number of recommendations requested per user.
    Returns a 2-D array with rows: user number, item id, score, rank and,
    when X_gbm is given, target. Columns are flattened user by user.
    Relies on the module-level `tracks` encoder (and `get_target` when
    X_gbm is given).
    '''
    user_ids = np.arange(X_test.shape[0])
    rec, scores = model.recommend(user_ids, X_test, N=n, filter_already_liked_items=True, recalculate_user=True)
    # Matrix positions -> item ids.
    result = np.array([tracks.to_idx(row) for row in rec])
    # Width is taken from what the model returned, so the rows stacked below
    # stay the same length even if fewer than n items came back.
    n_users, n_rec = rec.shape
    us_id = np.tile(np.arange(n_users, dtype='float64')[:, np.newaxis], (1, n_rec))
    rank = np.argsort(-scores, axis=1).ravel()
    # `is not None`: an array or sparse matrix has no unambiguous truth value.
    if X_gbm is not None:
        target = get_target(result,X_gbm)
        print('us_ids: {}\nres: {}\nscores: {}\ntarget: {}'.format(us_id.shape,  result.shape, scores.shape, target.shape))
        out_put = np.vstack([us_id.ravel(), result.ravel(), scores.ravel(), rank, target.ravel()])
        return out_put
    out_put = np.vstack([us_id.ravel(), result.ravel(), scores.ravel(), rank])
    return out_put

In [4]:
# One splitter per archive member; `size` is the head fraction kept for each user.
data_train = SplitData('train', size=0.8)
data_test = SplitData('test', size=0.5)

In [5]:
n = 900000  # cap on the number of train users taken by the lazy slices below
# One-shot iterator over the full train histories.
X_train_full = data_train.read_data()
# Lazy head / tail parts of the first n train users (nothing is read until iterated).
X_train_1 = islice(data_train.split_X1(), n)
X_train_2 = islice(data_train.split_X2(), n)
# First 100000 test heads; the last item of each head becomes the target y.
X_test, y = make_y(islice(data_test.split_X1(), 100000))

0it [00:00, ?it/s]

In [6]:
# Encoder built from the trackId column: the frame's row labels become matrix columns.
tracks = ItemEncoder(df_tracks[['trackId']])
# X_train_full is a one-shot iterator (read_data returns a map), so this call consumes it.
X_train_als = tracks.make_csr_data(X_train_full)
# NOTE(review): make_csr_data passes no shape, so each matrix gets its column count
# from the largest column index it contains; train and test widths may differ.
X_test_als = tracks.make_csr_data(X_test)
# Weighted copy of the train matrix; this is what the ALS model is fitted on.
user_track_bm25 = bm25_weight(X_train_als)

0it [00:00, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

In [7]:
# Fixed seed: ALS starts from randomly initialised factors, so without it the
# score changes from run to run.
# NOTE(review): num_threads=-1 is kept as written; confirm it means "all cores".
model_als = AlternatingLeastSquares(factors=300, iterations=2, num_threads=-1, random_state=42)
model_als.fit(user_track_bm25)



  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
TOP_N = 100  # recommendations requested per user
top_200_test = make_top_als(model_als, X_test_als, n=TOP_N)
# Row 1 of the stacked output holds the item ids; fold it back to one row per user.
res = top_200_test[1].reshape(-1, TOP_N)
score(res, y)

  0%|          | 0/100000 [00:00<?, ?it/s]

Scor: 0.03817243388048856


In [9]:
# Implemented this solution after the competition had ended.
n = 900000
# read_data() returns a one-shot iterator, so a fresh one is requested for this model.
X_train_full = data_train.read_data()
X_train_1 = islice(data_train.split_X1(), n)
X_train_2 = islice(data_train.split_X2(), n)
X_test, y = make_y(islice(data_test.split_X1(), 100000))

model_co = CooccurrenceRecommender()
model_co.fit(X_train_full)
pred = model_co.predict(X_test)
score(pred, y)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

Scor: 0.05327767220140505
