In [1]:
!pip install implicit
!pip install lightfm
!pip install --upgrade gensim
import requests
from itertools import islice 
import io
import json
import implicit as im
import pandas as pd
import numpy as np
import zipfile as zf
from scipy import sparse as sp
from tqdm.notebook import tqdm
from urllib.parse import quote as qt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  f"CUDA extension is built, but disabling GPU support because of '{e}'",


# Preprocessing

In [2]:
base_url = 'https://cloud-api.yandex.net/v1/disk/resources/download?'
folder_url = 'https://disk.yandex.ru/d/SI1aAooPn9i8TA'
file_url = 'likes_data.zip'
url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download' + '?public_key=' + qt(folder_url) + '&path=/' + qt(file_url)
r = requests.get(url) # запрос ссылки на скачивание
h = json.loads(r.text)['href'] # 'парсинг' ссылки на скачивание
download_response = requests.get(h, stream=True)
z = zf.ZipFile(io.BytesIO(download_response.content))

with z.open('track_artists.csv') as f:
  df_tracks = pd.read_csv(f)

In [3]:
with z.open('train') as f:
  data = map(lambda x: x.decode('ascii').strip().split(' '), f.readlines())
with z.open('test') as f:
  test = map(lambda x: x.decode('ascii').strip().split(' '), f.readlines())

In [None]:
df_tracks.astype('object').describe()

In [3]:
class ItemEncoder:
  '''
  Класс для кодировки значений в индексы и обратно.
  На вход: список значений в формате pandas.DataFrame или list
  '''
  def __init__(self, items):
    self.item_idx = {}
    self.item_pid = {}
    if type(items) == pd.DataFrame:
      self.item_idx = items[items.columns[0]].to_dict()
      self.item_pid = items.reset_index().set_index(items.columns[0])['index'].to_dict()
    else:
      for idx, pid in enumerate(items):
        self.item_idx[pid] = idx
        self.item_pid[idx] = pid
    
  def to_idx(self,items):
    '''
    Получить значения по индексу
    '''
    if len(items) == 1:
      return self.item_idx[items[0]]
    return list(map(lambda x: self.item_idx[int(x)], items))
  
  def to_pid(self,p_list):
    '''
    Получить индекс по значению
    '''
    if len(p_list) == 1:
      return [self.item_pid[int(p_list[0])]]
    return list(map(lambda x: self.item_pid[int(x)], p_list))
  
  def make_csr_data(self, data):
    '''
    Создание матрицы user/items
    '''
    rows = [0]
    cols = []
    values = []
    for user in tqdm(data):
      idx_col = self.to_pid(user)
      value = list(np.ones(len(user)))
      cols.extend(idx_col)
      values.extend(value)
      rows.append(len(cols))
    return sp.csr_matrix((values, cols,rows),dtype='float64')

# Implicit ALS

In [4]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import CosineRecommender, bm25_weight
from sklearn.model_selection import train_test_split

In [5]:
def score(X, y):
  cou = 0
  for k, user in enumerate(tqdm(X)):
    user = list(map(int, user))
    if int(y[k]) in user:
      cou += 1 / (list(user).index(int(y[k]))+1)
    else:
      cou += 0
  print('Scor: {}'.format(cou/len(y)))

def make_subm(X_test, model, n=100):
  user_ids = np.arange(X_test.shape[0])
  rec, scores = model.recommend(user_ids, X_test, N=n,filter_already_liked_items=True, recalculate_user=True)
  result = []
  for user in tqdm(rec):
    result.append(tracks.to_idx(user))
  return result,scores

In [None]:
with z.open('test') as f:
  test = map(lambda x: x.decode('ascii').strip().split(' '), f.readlines())
user_test = DataSet.make_csr_data(test, tracks)

In [None]:
user_track_bm25 = bm25_weight(user_track)
model_als = AlternatingLeastSquares(factors=900, iterations=2, use_gpu=True)
model_als.fit(user_track_bm25)

In [None]:
res = make_subm(user_test, model_als)
pd.DataFrame(res).to_csv('/content/drive/MyDrive/Colab Notebooks/YA_cup_ML/als_top200.csv', index=False, header=False, sep=' ') #save_file

# LightFM

In [6]:
from lightfm import LightFM
from lightfm.datasets import fetch_stackexchange
from lightfm.data import Dataset
from lightfm.evaluation import reciprocal_rank, precision_at_k

def pred_fm(model, user_idxs, X):
  n_items = np.arange(X.shape[1])
  result = []
  for user in tqdm(user_idxs):
    feat = n_items#list(set(n_items) - set(X[user].indices))
    scores = model.predict(int(user), feat)
    top_items = np.argsort(scores)[-100:]
    result.append(tracks.to_idx(top_items))
  return result

# ALS + lightgbm


In [7]:
import itertools as it
from lightgbm import LGBMRanker
from gensim.models import Word2Vec, KeyedVectors
from multiprocessing import Pool

In [8]:

def get_train_1(size=0.7, path='train'):
  with z.open(path) as f:
    data = map(lambda x: x.decode('ascii').strip().split(' '), f.readlines())
  for user in data:
    batch = int(len(user) * size)
    train_1 = user[:batch]
    yield train_1

def get_train_2(size=0.7, path='train'):
  with z.open(path) as f:
    data = map(lambda x: x.decode('ascii').strip().split(' '), f.readlines())
  for user in data:
    batch = int(len(user) * size)
    train_2 = user[batch:]
    yield train_2


def get_target(top_200, X_gbm):
  targ = []
  for k, i in enumerate(X_gbm):
    targ.append(np.isin(list(map(int,top_200[k])), i).astype('int'))
  return np.array(targ)

def make_top_als(model, X_test,  X_gbm=None, n=100):
  user_ids = np.arange(X_test.shape[0])
  rec, scores = model.recommend(user_ids, X_test, N=n,filter_already_liked_items=True, recalculate_user=True)
  result = np.array(list(map(lambda x: tracks.to_idx(x), rec)))
  us_id = np.ones((rec.shape[0], n)) * np.arange(rec.shape[0])[:,np.newaxis]
  rank = np.array(list(map(lambda x: np.argsort(-x), scores))).ravel()
  if X_gbm:
    target = get_target(result,X_gbm)
    print('us_ids: {}\nres: {}\nscores: {}\ntarget: {}'.format(us_id.shape,  result.shape, scores.shape, target.shape))
    out_put = np.vstack([us_id.ravel(), result.ravel(), scores.ravel(), rank, target.ravel()])
    return out_put
  out_put = np.vstack([us_id.ravel(), result.ravel(), scores.ravel(), rank])
  return out_put

def split_x_y(data):
  X=[]
  y = []
  for i in tqdm(data):
    X.append(list(map(int,i[:-1])))
    y.append(i[-1])
  return X, y

def find_track(track):
  track = str(track)
  if track in w:
    return w[track] 
  else:
    return np.zeros(100)

def predict_wv(X, X_gbm=False, n=200):
  result = []
  scores = []
  for user in tqdm(X):
    new_user = np.array(list(map(find_track,tracks.to_idx(user.indices))))
    res, scor = zip(*w.similar_by_vector(new_user.mean(axis=0), topn=n))
    #res, scor = zip(*w.similar_by_vector(new_user[-1], topn=n))
    result.append(res)
    scores.append(scor)
  result = np.array(result).astype('int')
  scores = np.array(scores)
  us_id = np.ones((result.shape[0], n)) * np.arange(result.shape[0])[:,np.newaxis]
  rank = np.array(list(map(lambda x: np.argsort(-x), scores))).ravel()
  if X_gbm:
    target = get_target(result,X_gbm)
    print('us_ids: {}\nres: {}\nscores: {}\ntarget: {}'.format(us_id.shape,  result.shape, scores.shape, target.shape))
    out_put = np.vstack([us_id.ravel(), result.ravel(), scores.ravel(), rank, target.ravel()])
    return out_put
  out_put = np.vstack([us_id.ravel(), result.ravel(), scores.ravel(), rank])
  return out_put

In [9]:
#подготовка данных train/test/y for als and gbm
train_als = get_train_1(size=0.7)
train_gbm = get_train_2(size=0.7)

#test_als = get_train2_1(size=0.7, path='test')
#test_gbm = get_train2_2(size=0.7, path='test')

tracks = ItemEncoder(df_tracks[['trackId']])

n = 150000

X_train_als = it.islice(train_als,0, n)
X_test_als = it.islice(train_als,0, n)

X_train_gbm, train_y = split_x_y(it.islice(train_gbm,0, n))
X_test_gbm, test_y = split_x_y(it.islice(train_gbm,0, n))

X_train = tracks.make_csr_data(X_train_als)
X_test = tracks.make_csr_data(X_test_als)

n_train = 20000
n_test = 5000

X_train_gbm, train_y  = X_train_gbm[:n_train], train_y[:n_train]
X_train = X_train[:n_train]

X_test_gbm, test_y  = X_test_gbm[:n_test], test_y[:n_test]
X_test = X_test[:n_test]

print('-----------\n')
print('train size - {}\ntest size - {}'.format(X_train.shape,X_test.shape))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

-----------

train size - (20000, 483275)
test size - (5000, 483275)


In [None]:
train_als = get_train_1(size=0.7)
train_gbm = get_train_2(size=0.7)

test_als = get_train2_1(size=0.7, path='test')
test_gbm = get_train2_2(size=0.7, path='test')

##ALS



In [10]:
X_train_bm25 = bm25_weight(X_train)

model_als = AlternatingLeastSquares(factors=600, iterations=3)#,,use_gpu=True

model_als.fit(X_train_bm25)

  "OpenBLAS detected. Its highly recommend to set the environment variable "


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
n = 300
top_200_train = make_top_als(model_als, X_train, X_gbm=X_train_gbm, n=n)
print('------------')
top_200_test = make_top_als(model_als, X_test, X_gbm=X_test_gbm, n=n)

us_ids: (20000, 300)
res: (20000, 300)
scores: (20000, 300)
target: (20000, 300)
------------
us_ids: (5000, 300)
res: (5000, 300)
scores: (5000, 300)
target: (5000, 300)


In [12]:
res = top_200_test[1].reshape(-1,n)[:,:100]
score(res, test_y)

  0%|          | 0/5000 [00:00<?, ?it/s]

Scor: 0.015727992431253567


In [13]:
X_test_bm25 = bm25_weight(X_test)

In [14]:
X_train_lf = sp.vstack([X_train_bm25,X_test_bm25])

In [15]:
model_lf = LightFM(no_components=500,learning_rate=0.05, loss='warp')
model_lf.fit(X_train_lf, epochs=2, num_threads=2, verbose=True)

Epoch: 100%|██████████| 2/2 [00:54<00:00, 27.35s/it]


<lightfm.lightfm.LightFM at 0x7f1fe240ee90>

In [16]:
idx_us = np.arange(25000)

In [17]:
item_b = model_lf.item_biases
user_b = model_lf.user_biases

##W2V

In [None]:
def multi(data, workers=2):
  b_size = len(data) // workers
  b_idx = np.arange(data.shape[0], step=b_size)
  batch = []
  for i in range(len(b_idx)):
    low = b_idx[i]
    hight = None
    if b_idx[i+1]:
      hight = b_idx[i+1]
    batch.append(data[b_idx[i],b_idx[i+1]])
  with Pool(workers) as p:
    result = p.map(predict_wv, batch)
  return result

In [None]:
w = KeyedVectors.load('/content/drive/MyDrive/Colab Notebooks/YA_cup_ML/features/key_vec_baseline.wv')

In [None]:
wv_train = predict_wv(X_train,X_gbm=X_train_gbm)
wv_test = predict_wv(X_test,X_gbm=X_test_gbm)

0it [00:00, ?it/s]

us_ids: (20000, 200)
res: (20000, 200)
scores: (20000, 200)
target: (20000, 200)


##LightGBM

In [18]:
idx_track_filter = np.loadtxt('/content/drive/MyDrive/Colab Notebooks/YA_cup_ML/features/idx_track_filter.csv')
idx_art_filter = np.loadtxt('/content/drive/MyDrive/Colab Notebooks/YA_cup_ML/features/idx_artist_filter.csv')
track_art_features = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/YA_cup_ML/features/pop_items.csv', index_col=0)
user_feature = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/YA_cup_ML/user_features.csv', index_col=0)
track_artist = track_art_features[['trackId', 'artistId']]
track_features = track_art_features[['trackId', 'cou_like_track']].set_index('trackId')
artist_features = track_art_features[['artistId', 'cou_like_art']].drop_duplicates().set_index('artistId')
del track_art_features

In [21]:
track_features = track_features.merge(pd.Series(item_b).to_frame(), how='left', left_index=True, right_index=True)
track_features.columns = ['cou_like', 'bias_t']

In [22]:
user_feature_train = user_feature.iloc[:20000,:-1]
user_feature_test = user_feature.iloc[20000:25000,:-1].reset_index(drop=True)

In [23]:
user_feature_train = pd.concat([user_feature_train, pd.Series(user_b)[:20000]], axis=1)
user_feature_test= pd.concat([user_feature_test, pd.Series(user_b)[20000:].reset_index(drop=True)], axis=1)

In [26]:
def make_data_gbm(top_n, user_feature, track_feature=None, artist_feature=None, artist_track=None, filter_idx=False):
  df = pd.DataFrame(top_n.T)
  df.columns = ['user_idx', 'track', 'score', 'rank', 'target']
  #user_feature
  df = df.merge(user_feature, how='left', left_on='user_idx', right_index=True)
  df = df.merge(track_feature, how='left', left_on='track', right_index=True)
  #df = df.merge(artist_feature, how='left', left_on='like_artist', right_index=True)
  df = df.merge(artist_track.set_index('trackId'), how='left', left_on='track', right_on='trackId')
  df.loc[:, 'like_artist'] = df['like_artist'].astype('category')
  df.loc[:, 'artistId'] = df['artistId'].astype('category')
  if filter_idx:
    df = df.loc[df['track'].isin(idx_track_filter)]
    #df = df.loc[df['like_artist'].isin(idx_art_filter)]
  return df

def predict_gbm(X,df, model):
  df['predict'] = model.predict(X)
  df_ = df.sort_values(by='predict', ascending=False)
  for user, group in df_[['user_idx', 'track','predict']].groupby('user_idx'):
    yield group.track[:100].to_numpy()

In [27]:
#top_400_train = np.hstack([top_200_train,wv_train])
#top_400_test = np.hstack([top_200_test,wv_test])

In [28]:
#train_df = make_data_gbm(top_200_train, user_feature_train)
#train_df = make_data_gbm(top_200_train, user_feature_train, track_feature=track_features)
#test_df = make_data_gbm(top_200, user_feature_test,track_feature=track_features)
#train_df = make_data_gbm(top_200_train, user_feature_train, track_feature=track_features, artist_feature=artist_features)
#test_df = make_data_gbm(top_200, user_feature_train, track_feature=track_features, artist_feature=artist_features)
train_df = make_data_gbm(top_200_train, user_feature_train, track_feature=track_features, artist_feature=artist_features,artist_track=track_artist, filter_idx=False)
test_df = make_data_gbm(top_200_test, user_feature_test, track_feature=track_features, artist_feature=artist_features,artist_track=track_artist)
train_df

Unnamed: 0,user_idx,track,score,rank,target,cou_tracks,like_artist,cou_artists,0,cou_like,bias_t,artistId
0,0.0,377667.0,0.080717,0.0,0.0,54,8473,23,-1.266036,15936,-0.549752,33877
1,0.0,319242.0,0.066462,1.0,0.0,54,8473,23,-1.266036,26010,0.485515,5933
2,0.0,443256.0,0.060647,2.0,0.0,54,8473,23,-1.266036,22606,-0.500000,3226
3,0.0,286594.0,0.060025,3.0,0.0,54,8473,23,-1.266036,71192,-0.396493,44875
4,0.0,64323.0,0.058830,4.0,0.0,54,8473,23,-1.266036,52461,-0.500000,55854
...,...,...,...,...,...,...,...,...,...,...,...,...
5999995,19999.0,65190.0,0.074373,295.0,0.0,206,4460,159,-1.382942,9912,-0.456696,23891
5999996,19999.0,208573.0,0.074025,296.0,0.0,206,4460,159,-1.382942,3658,-0.549752,3208
5999997,19999.0,277076.0,0.073927,297.0,0.0,206,4460,159,-1.382942,5062,0.551888,32965
5999998,19999.0,351426.0,0.073785,298.0,0.0,206,4460,159,-1.382942,10631,-0.443800,36541


In [29]:
test_df

Unnamed: 0,user_idx,track,score,rank,target,cou_tracks,like_artist,cou_artists,0,cou_like,bias_t,artistId
0,0.0,89927.0,0.040760,0.0,0.0,256,2994,167,-0.780050,9257,0.388792,35869
1,0.0,71674.0,0.029036,1.0,0.0,256,2994,167,-0.780050,4274,-0.414981,47748
2,0.0,370718.0,0.028291,2.0,0.0,256,2994,167,-0.780050,3691,0.619575,35869
3,0.0,23228.0,0.024944,3.0,0.0,256,2994,167,-0.780050,7224,-0.500000,53252
4,0.0,469830.0,0.022654,4.0,0.0,256,2994,167,-0.780050,8892,-0.500000,93
...,...,...,...,...,...,...,...,...,...,...,...,...
1499995,4999.0,69464.0,0.029741,295.0,0.0,45,1526,14,-1.129968,28634,-0.500000,8638
1499996,4999.0,247516.0,0.029691,296.0,0.0,45,1526,14,-1.129968,1720,0.687249,6691
1499997,4999.0,419412.0,0.029677,297.0,0.0,45,1526,14,-1.129968,4399,-0.549752,41084
1499998,4999.0,79304.0,0.029645,298.0,0.0,45,1526,14,-1.129968,57321,-0.585019,23444


In [31]:
train_df = pd.concat([train_df,test_df])
train_df = train_df.loc[train_df['cou_like'] <=33904]

In [33]:
idx_train, idx_val = train_test_split(train_df['user_idx'].unique(), test_size=0.2)

X_train = train_df.drop(columns=['user_idx', 'track', 'target', 'artistId', 'like_artist']).loc[train_df['user_idx'].isin(idx_train)]
y_train = train_df['target'].loc[train_df['user_idx'].isin(idx_train)]
qid = train_df.loc[train_df['user_idx'].isin(idx_train)].groupby('user_idx')['user_idx'].count().to_numpy()

val_x = train_df.drop(columns=['user_idx', 'track', 'target', 'artistId', 'like_artist']).loc[train_df['user_idx'].isin(idx_val)]
val_y = train_df['target'].loc[train_df['user_idx'].isin(idx_val)]
val_qid = train_df.loc[train_df['user_idx'].isin(idx_val)].groupby('user_idx')['user_idx'].count().to_numpy()

print(X_train.shape, y_train.shape, val_x.shape, val_y.shape, qid.shape, val_qid.shape)

X_train

(5648261, 7) (5648261,) (1397651, 7) (1397651,) (16000,) (4000,)


Unnamed: 0,score,rank,cou_tracks,cou_artists,0,cou_like,bias_t
0,0.080717,0.0,54,23,-1.266036,15936,-0.549752
1,0.066462,1.0,54,23,-1.266036,26010,0.485515
2,0.060647,2.0,54,23,-1.266036,22606,-0.500000
5,0.058585,5.0,54,23,-1.266036,17241,-0.500000
6,0.048466,6.0,54,23,-1.266036,20544,0.337343
...,...,...,...,...,...,...,...
1499994,0.029774,294.0,45,14,-1.129968,4044,0.842146
1499995,0.029741,295.0,45,14,-1.129968,28634,-0.500000
1499996,0.029691,296.0,45,14,-1.129968,1720,0.687249
1499997,0.029677,297.0,45,14,-1.129968,4399,-0.549752


In [None]:
X_train.loc[:,'like_artist'] = X_train['like_artist'].astype('category')
val_x.loc[:,'like_artist'] = val_x['like_artist'].astype('category')

In [34]:
model = LGBMRanker(objective="lambdarank", metric="ndcg")
model.fit(X_train, y=y_train, eval_set=[(val_x,val_y)], eval_group=[val_qid], eval_at=[10], verbose=20, group=qid)

[20]	valid_0's ndcg@10: 0.245583
[40]	valid_0's ndcg@10: 0.247867
[60]	valid_0's ndcg@10: 0.248743
[80]	valid_0's ndcg@10: 0.249577
[100]	valid_0's ndcg@10: 0.249483


LGBMRanker(metric='ndcg', objective='lambdarank')

In [35]:
X_test= test_df.drop(columns=['user_idx', 'track', 'target', 'artistId','like_artist'])
X_test

Unnamed: 0,score,rank,cou_tracks,cou_artists,0,cou_like,bias_t
0,0.040760,0.0,256,167,-0.780050,9257,0.388792
1,0.029036,1.0,256,167,-0.780050,4274,-0.414981
2,0.028291,2.0,256,167,-0.780050,3691,0.619575
3,0.024944,3.0,256,167,-0.780050,7224,-0.500000
4,0.022654,4.0,256,167,-0.780050,8892,-0.500000
...,...,...,...,...,...,...,...
1499995,0.029741,295.0,45,14,-1.129968,28634,-0.500000
1499996,0.029691,296.0,45,14,-1.129968,1720,0.687249
1499997,0.029677,297.0,45,14,-1.129968,4399,-0.549752
1499998,0.029645,298.0,45,14,-1.129968,57321,-0.585019


In [36]:
model.feature_importances_

array([757, 505, 176, 307, 465, 666, 124])

In [37]:
res_test = predict_gbm(X_test, test_df, model)
print('-----------')
score(res_test, test_y)

-----------


0it [00:00, ?it/s]

Scor: 0.015370290871018799
