# Домашнее задание "Рекомендации на основе содержания"

Преподаватель: Алексей Кузьмин

“Требуется построить модель рекомендаций на основе скрытых факторов (implicit) на основе dataset’а https://grouplens.org/datasets/hetrec-2011/ (Delicious Bookmarks)”

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares

# Загрузка данных

Загружаем информацию по пользователям и их закладкам

In [2]:
user_bmark_ds = pd.read_csv('hetrec2011-delicious-2k/user_taggedbookmarks.dat',  sep='\t')
print(user_bmark_ds.shape)
user_bmark_ds.head()

(437593, 9)


Unnamed: 0,userID,bookmarkID,tagID,day,month,year,hour,minute,second
0,8,1,1,8,11,2010,23,29,22
1,8,2,1,8,11,2010,23,25,59
2,8,7,1,8,11,2010,18,55,1
3,8,7,6,8,11,2010,18,55,1
4,8,7,7,8,11,2010,18,55,1


Кодируем пользователей и закладки в порядке возрастания их id

In [3]:
userID_list = np.sort(user_bmark_ds['userID'].unique())
userID_map = {k:idx for idx, k in enumerate(userID_list)}

bmarkID_list = np.sort(user_bmark_ds['bookmarkID'].unique())
bmarkID_map = {k:idx for idx, k in enumerate(bmarkID_list)}

In [4]:
user_bmark_ds['userID_normed'] = user_bmark_ds['userID'].apply(lambda x: userID_map[x])
user_bmark_ds['bookmarkID_normed'] = user_bmark_ds['bookmarkID'].apply(lambda x: bmarkID_map[x])

Загружаем дополнительную информацию по закладкам

In [5]:
bmarks_ds = pd.read_csv('hetrec2011-delicious-2k/bookmarks.dat', sep='\t',
                        encoding="ISO-8859-1",)
bmarks_ds = bmarks_ds[bmarks_ds['id'].isin(bmarkID_map.keys())]
bmarks_ds['id_normed'] = bmarks_ds['id'].apply(lambda x: bmarkID_map[x])
bmarks_ds = bmarks_ds.set_index('id_normed')
print(bmarks_ds.shape)
bmarks_ds.head()

(69223, 6)


Unnamed: 0_level_0,id,md5,title,url,md5Principal,urlPrincipal
id_normed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,ab4954b633ddaf5b5bba6e9b71aa6b70,IFLA - The official website of the Internation...,http://www.ifla.org/,7f431306c428457bc4e12b15634484f,www.ifla.org
1,2,2221e9cd106d269dd34682666f576fa3,gcdp-e.pdf (application/pdf Object),http://archive.ifla.org/VII/s14/nd1/gcdp-e.pdf,1ef8cfcfe968101fa9b4e301847503d4,archive.ifla.org
2,7,c97c571dadaddbbb493126a0d4d01ba3,EdSelect,http://www.edselect.com/,792fd7eb20143386d0c4eb193c6124d,www.edselect.com
3,8,25bfe8dca0ef263ec9c341b9f16c38b5,Cool Canada (Collections Canada),http://www.collectionscanada.gc.ca/cool/index-...,6fce4f6391516f0732531d9cfacda5b7,www.collectionscanada.gc.ca
4,9,c97284629e17b8e2861afaacd59918bc,Kidsreads.com,http://www.kidsreads.com/,5854ce8404857a45373eea01a3d98000,www.kidsreads.com


# Подготовка данных

Нет уверенности, что количество проставленных тегов коррелирует с качеством закладки, поэтому будем считать события бинарными: добавил пользователь страницу в закладки или нет

In [6]:
user_bmark_ds_short = user_bmark_ds[['userID_normed','bookmarkID_normed']].drop_duplicates()
print(user_bmark_ds_short.shape)
user_bmark_ds_short.head()

(104799, 2)


Unnamed: 0,userID_normed,bookmarkID_normed
0,0,0
1,0,1
2,0,2
5,0,3
8,0,4


In [7]:
data_sparse = sparse.csr_matrix(
    ([1]*user_bmark_ds_short.shape[0],
     (user_bmark_ds_short['userID_normed'].values, user_bmark_ds_short['bookmarkID_normed'].values)
    ),
    shape=(len(userID_list), len(bmarkID_list)))

# Обучаем модель и получаем рекомендации

In [8]:
model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [9]:
def get_recomendations(userid):
    user_items = data_sparse.T.tocsr()
    recommendations = model.recommend(userid, user_items)
    
    recs_ds = pd.DataFrame(recommendations, columns=['bookmarkID_normed', 'score']).set_index('bookmarkID_normed')
    recs_ds = pd.merge(recs_ds, bmarks_ds, left_index=True, right_index=True)
    return recs_ds

In [10]:
get_recomendations(100)

Unnamed: 0,score,id,md5,title,url,md5Principal,urlPrincipal
1458,0.102064,2034,1b7d8f294075465f030a21926ed63eb4,How to Clean Electronics - DIY Life,http://www.diylife.com/2010/10/19/how-to-clean...,bbb36c47f2c8a1105177d47e8f8ca333,www.diylife.com
4,0.079738,9,c97284629e17b8e2861afaacd59918bc,Kidsreads.com,http://www.kidsreads.com/,5854ce8404857a45373eea01a3d98000,www.kidsreads.com
3,0.077228,8,25bfe8dca0ef263ec9c341b9f16c38b5,Cool Canada (Collections Canada),http://www.collectionscanada.gc.ca/cool/index-...,6fce4f6391516f0732531d9cfacda5b7,www.collectionscanada.gc.ca
1459,0.033268,2037,ec527b8ae76159c792b223acc2e145de,21 Free Video Players For Your Website and Blo...,http://www.instantshift.com/2010/05/14/21-free...,9e4fee4c7fa185ff030589f449a6200d,www.instantshift.com
86,0.026792,135,60d31d8f48a1ea18f9d56d3a19e3b59c,MIXRIOT,http://www.mixriot.com/,274da608ac5fbe35f94b203040e22c47,www.mixriot.com
198,0.02518,310,841a49ee401c6d96b62ebe0f361d0475,Forrester Research Communities: Community: Cus...,http://community.forrester.com/community/custo...,4e904792059d1b7e4dd29196a10b8085,community.forrester.com
1550,0.022499,2138,f06bca0a44c06bf2b427c25976fa6b4c,Typo-Shark - Find eBay Listing Typos and Missp...,http://www.typo-shark.com/,59fb9aa49ea073643c1a4e05aec9a7e9,www.typo-shark.com
1676,0.021825,2305,3d36ab4dbfe93745d965a61d0c28f982,The Web Is Dead. Long Live the Internet | Maga...,http://www.wired.com/magazine/2010/08/ff_webri...,c72d08405dc30e25e7627843e8a5d3ac,www.wired.com
160,0.020976,266,e3656593310059d1feadf79d7932c4da,YouTube - WHERE GOOD IDEAS COME FROM by Steven...,http://www.youtube.com/watch?v=NugRZGDbPFU,ab3201c6103205c14f6e56b11b2fcd46,www.youtube.com
651,0.018346,889,a6245fa58c132c38ed8493a1f76309a0,LRB · John Lanchester · The Great British Econ...,http://www.lrb.co.uk/v32/n05/john-lanchester/t...,d204156859ea54839d7aedfba962c969,www.lrb.co.uk
