# Basic Recommandation Part: Collaborative Filtering

## 1. Function definition and realize


In [2]:
import json
import math
from pprint import pprint
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import sys
import operator

In [3]:
#The function that load the json file and store it in an array
def loadFile(filename):
    datas = []
    with open(filename) as f:
        for line in f:
            datas.append(json.loads(line))
    return datas

In [4]:
#Load the files and store the book id including in idSet, this fuction won't be used in the project
def loadFileDict(filename, idSet):
    datax = []
    with open(filename) as f:
        for line in f:
            new = json.loads(line)
            if new['book_id'] in idSet:
                datax.append(json.loads(line))
        f.close()
    result = {}
    for meta in datax:
        user = meta['user_id']
        if user not in result:
            if len(result) >= 70000:
                break;
            else:
                result[user] = {}
                result[user]['book_id'] = []
                result[user]['rate'] = []
                result[user]['isRead'] = []
            result[user]['book_id'].append(meta['book_id'])
            result[user]['rate'].append(meta['rating'])
            result[user]['isRead'].append(meta['isRead'])
    pprint(filename)
    pprint(len(result))
    return result

In [5]:
#Seperate the data, store the rated data in the first dict, the read but un-rated data in the second dict
#the un-read data in the third dict
def sep_data(datas):
    result = [];
    ReadYes = {}
    ReadNo = {}
    ReadUnknown = {}
    for user in datas:
        meta = datas[user];
        for i in range(len(meta['isRead'])):
            if meta['isRead'][i] == True: #If the book in history is read
                if meta['rate'][i] == 0:
                    if user not in ReadUnknown:
                        ReadUnknown[user] = {}
                        ReadUnknown[user]['book_id'] = []
                        ReadUnknown[user]['rate'] = []
                    ReadUnknown[user]['book_id'].append(meta['book_id'][i])
                    ReadUnknown[user]['rate'].append(0)                
                else:
                    if user not in ReadYes:
                        ReadYes[user] = {}
                        ReadYes[user]['book_id'] = []
                        ReadYes[user]['rate'] = []
                    ReadYes[user]['book_id'].append(meta['book_id'][i])
                    ReadYes[user]['rate'].append(meta['rate'][i])
            else:
                if user not in ReadNo:
                    ReadNo[user] = {}
                    ReadNo[user]['book_id'] = []
                    ReadNo[user]['rate'] = []
                ReadNo[user]['book_id'].append(meta['book_id'][i])
                ReadNo[user]['rate'].append(5) 
    result.append(ReadYes)
    result.append(ReadNo)
    result.append(ReadUnknown)
    return result

In [9]:
#This function is used to convert the originally read metaData into a dictionary with user id as the search.
def transToDict(dataUserMeta):
    dict_user = {}
    for meta in dataUserMeta:
        if meta['isRead'] != True:
            continue;
        user_id = meta['user_id'];
        book_id = meta['book_id'];
        if user_id not in dict_user:
            dict_user[user_id] = {};
            dict_user[user_id]['book_id'] = [];
            dict_user[user_id]['rate'] = [];
            dict_user[user_id]['isRead'] = [];
        dict_user[user_id]['book_id'].append(book_id);
        dict_user[user_id]['rate'].append(meta['rating']);
        dict_user[user_id]['isRead'].append(meta['isRead']);
    return dict_user

In [10]:
#Transfer the un-read data with a rating 5
def transFindRemain(dataUserMeta):
    dict_remain = {}
    for meta in dataUserMeta:
        if meta['isRead'] == True:
            continue;
        user_id = meta['user_id'];
        book_id = meta['book_id'];
        if user_id not in dict_user:
            dict_user[user_id] = {};
            dict_user[user_id]['book_id'] = [];
            dict_user[user_id]['rate'] = [];
        dict_user[user_id]['book_id'].append(book_id);
        dict_user[user_id]['rate'].append(5);
    return dict_user

In [11]:
#Transfer the dict-array data structure to dict-dict
def trans_array_to_dict(dicto):
    result = {}
    for username in dicto:
        result[username] = {}
        for i in range(len(dicto[username]['book_id'])):
            result[username][dicto[username]['book_id'][i]] = dicto[username]['rate'][i]
    return result;

In [6]:
#Extract the user-index dict from the data
def buildUserIndex(datas):
    result = {}
    count = 0
    for user in datas:
        result[user] = count;
        count +=1
    return result

In [7]:
#Extract the book-index dict from the data
def buildBookIndex(inputValue):
    result = {}
    count = 0
    for user in inputValue:
        for book in inputValue[user]['book_id']:
            if book not in result:
                result[book] = count;
                count += 1;
    return result

In [8]:
#merge the data from different dict
def mergedata(target, origin):
    for user in origin:
        if user not in target:
            target[user] = {}
            target[user]['book_id'] = []
            target[user]['rate'] = []
            target[user]['isRead'] = []
        target[user]['book_id'].extend(origin[user]['book_id'])
        target[user]['rate'].extend(origin[user]['rate'])
        target[user]['isRead'].extend(origin[user]['isRead'])
    return target

In [12]:
#Calculate the average ratings for every user
def cal_avg(d):
    result = {}
    for user in d:
        count = 0;
        sums = 0;
        for key in d[user]:
            sums += d[user][key];
            count += 1;
        if count == 0:
            result[user] = -1;
            continue;
        result[user] = sums / count;
    return result

In [13]:
#The function calculate similarity by original Pearson Correlation
def get_single_sim(train_dict, avge_dict, user1, user2):
    top = 0;
    add1 = 0;
    add2 = 0;
    avg1 = avge_dict[user1]
    avg2 = avge_dict[user2]
    for book in train_dict[user1].keys()&train_dict[user2].keys():
        top += ((train_dict[user1][book] - avg1)*(train_dict[user2][book] - avg2))
        add1 += pow(train_dict[user1][book] - avg1, 2)
        add2 += pow(train_dict[user2][book] - avg2, 2)
    if(add1*add2) == 0:
        return 0;
    result = top / np.sqrt(add1 * add2)
    return result

In [14]:
#The function calculate similarity by original Pearson Correlation with the weight of the number of rating same books
def get_single_sim_ps_improve(train_dict, avge_dict, user1, user2):
    top = 0;
    add1 = 0;
    add2 = 0;
    avg1 = avge_dict[user1]
    avg2 = avge_dict[user2]
    count = 0;
    for book in train_dict[user1].keys()&train_dict[user2].keys():
        top += ((train_dict[user1][book] - avg1)*(train_dict[user2][book] - avg2))
        add1 += pow(train_dict[user1][book] - avg1, 2)
        add2 += pow(train_dict[user2][book] - avg2, 2)
        count += 1;
    if(add1*add2) == 0:
        return 0; #
    weight = count / 5;
#    if count > 2:
#        pprint(count)
    if weight > 1:
        weight = 1;
    result = top / np.sqrt(add1 * add2) * weight #Using weight to scale
    return result    

In [229]:
#The function calculate similarity by cosine similarity
def get_single_sim_cos(train_dict, avge_dict, user1, user2):
    top = 0;
    add1 = 0;
    add2 = 0;
    avg1 = avge_dict[user1]
    avg2 = avge_dict[user2]
    count = 0;
    flag1 = 0;
    flag2 = 0;
    for book in train_dict[user1].keys()|train_dict[user2].keys():
        if book in train_dict[user1].keys():
            eval1 = train_dict[user1][book]
            flag1 = 1;
        else:
            eval1 = 0;
            flag1 = 0;

        if book in train_dict[user2].keys():
            eval2 = train_dict[user2][book]
            flag2 = 1;
        else:
            eval2 = 0;
            flag2 = 0;
        top += (eval1 * eval2)
        add1 += pow(eval1, 2)
        add2 += pow(eval2, 2)
        count += (flag1 * flag2);
    if(add1*add2) == 0:
        return 0; 
#    if count > 3:
#        count = 3;
    result = top / np.sqrt(add1 * add2)
    return result

In [221]:
#Predict the ratings by Pearson Correlation
def pred_test(train_, test_, ranges):
    k = 0.02
    avg = cal_avg(train_) #Calculate the average 
    result = {}
    book_set = gen_bookset(train_) #得到book set
    count = 0;
    for username in test_: 
        result[username] = {}
        for book in test_[username]: 
            r = avg[username]; #Add the average at first
            if r < 0:
                continue;
            if book in book_set:
                for user_other in book_set[book]: 
                        r += k * get_single_sim(train_, avg, username, user_other) * (train_[user_other][book] - avg[user_other])
            if r > 5:
                r = 5;
            if r < 1:
                r = 1;
            result[username][book] = r;
        count += 1;
        if count >= ranges:
            return result
    return result

In [222]:
#Predict the ratings by improved Pearson Correlation
def pred_test_im(train_, test_, ranges):
    k = 0.02
    avg = cal_avg(train_) 
    result = {}
    book_set = gen_bookset(train_) 
    count = 0;
    for username in test_: 
        result[username] = {}
        for book in test_[username]: 
            r = avg[username];
            if r < 0:
                continue;
            if book in book_set:
                for user_other in book_set[book]:
                        #different
                        r += k * get_single_sim_ps_improve(train_, avg, username, user_other) * (train_[user_other][book] - avg[user_other])
            if r > 5:
                r = 5;
            if r < 1:
                r = 1;
            result[username][book] = r;
        count += 1;
        if count >= ranges:
            return result
    return result

In [223]:
#Predict the ratings by Cosine
def pred_test_cos(train_, test_, ranges):
    k = 0.02
    avg = cal_avg(train_) 
    result = {}
    book_set = gen_bookset(train_) 
    count = 0;
    for username in test_:
        result[username] = {}
        for book in test_[username]: 
            r = avg[username]; 
            if r < 0:
                continue;
            if book in book_set:
                for user_other in book_set[book]: 
                        r += k * get_single_sim_cos(train_, avg, username, user_other) * (train_[user_other][book] - avg[user_other])
            if r > 5:
                r = 5;
            if r < 1:
                r = 1;
            result[username][book] = r;
        count += 1;
        if count >= ranges:
            return result
    return result

In [17]:
#Get the book set 
def gen_bookset(train_):
    result = {}
    for username in train_:
        for book in train_[username]:
            if book not in result:
                result[book] = set()
            result[book].add(username)
    return result

In [18]:
#Calculate the MAE for dict data.
def cal_mae(test, pred):
    count = 0;
    sum_mae = 0;
    for user in pred:
        for book in pred[user]:
            sum_mae += np.abs(pred[user][book] - test[user][book]);
            count += 1;
    return sum_mae / count

In [19]:
#Calculate the RMSE for dict data.
def cal_rmse(test, pred):
    count = 0;
    sum_rmse = 0;
    for user in pred:
        for book in pred[user]:
            sum_rmse += ((pred[user][book] - test[user][book])** 2);
            count += 1;
    return math.sqrt(sum_rmse / count)

In [232]:
#Using factorized matrixes to predict
def prediction(P, Q):
    result = np.dot(P.T, Q)
#    if result > 5:
#        result = 5
#    else:
#        if result < 0:
#            result = 0;
    return result

In [233]:
#Predict ratings for the books rated in the test set by MF
def predictions(P, Q, T):
    user, book = T.nonzero()
    Z = np.zeros((len(P.T), len(Q[0])))   
    for u, i in zip(user, book):
        pred = prediction(P[:,u], Q[:, i])
        if pred > 5:
            pred = 5;
        else:
            if pred < 1:
                pred = 1;
        Z[u, i] = pred
    return Z

In [234]:
#Convert the book/user-index dict to index-book/user dict.
def convert_dict(dicts):
    result = {}
    for user in dicts:
        result[dicts[user]] = user
    return result;


In [242]:
#Calculate the RMSE for matrix data.
def rmse(I, R, M):
    count = np.sum(I);
    return np.sqrt(np.sum((I * (R - M)) ** 2) / count)

In [243]:
#Calculate the MAE for matrix data.
def mae(I, R, M):
    count = np.sum(I);
    return np.sum(np.abs(I * (R - M))) / count

In [73]:
#Load the book_id to title data
poetry_dict = loadFile('poemTitle.json')[0]
pprint(len(poetry_dict))

36514


In [220]:
with open('source_dict.json', 'wt') as file_obj:
    json.dump(dict_user, file_obj)

In [215]:
en = loadFile('rc_po_5_less.json')
pprint(en)

[[['22151696', 'Lullabies'],
  ['29431081', 'The Universe of Us'],
  ['23513349', 'Milk and Honey'],
  ['25384844', 'Black Butterfly'],
  ['23434371', 'Beautiful Chaos'],
  ['13123245', 'B'],
  ['13105527', 'I Wrote This For You'],
  ['35606560', 'The Sun and Her Flowers'],
  ['25746714', 'The Type'],
  ['19230408', 'I Wrote This For You: Just the Words'],
  ['23534', 'Love Is a Dog from Hell'],
  ['18288210', 'No Matter the Wreckage'],
  ['29457318', 'Habang Wala Pa Sila: Mga Tula ng Pag-ibig'],
  ['13376363', 'Teaching My Mother How to Give Birth'],
  ['32468495', 'Pillow Thoughts'],
  ['7824768', 'ليتها تقرأ'],
  ['6017893', 'قهوة وشيكولاتة'],
  ['980426', 'Love Poems'],
  ['20821097', 'Chasers of the Light: Poems from the Typewriter Series'],
  ['11625', 'Ariel: The Restored Edition'],
  ['23522212', 'Mouthful of Forevers'],
  ['29758714', 'Dirty Pretty Things'],
  ['31443393', 'Note to Self'],
  ['6944946', 'يوميات امرأة لا مبالية'],
  ['25986828', 'Today Means Amen'],
  ['1294049

In [76]:
rate_list = []
for i in range(len(book_rate)):
    ins = (book_rate[i],train_[target_user][book_rate[i]], poetry_dict[book_rate[i]])
    rate_list.append(ins)
pprint(rate_list)

[('764332', 4, 'Jason and the Golden Fleece'),
 ('1519', 4, 'The Oresteia  (Ορέστεια, #1-3)'),
 ('1715', 4, 'Metamorphoses'),
 ('12914', 4, 'The Aeneid'),
 ('1371', 5, 'The Iliad'),
 ('1381', 5, 'The Odyssey'),
 ('2696', 4, 'The Canterbury Tales')]


In [210]:
res_list = []
for i in range(len(book_list)):
    ins = (book_list[i], poetry_dict[book_list[i]])
    res_list.append(ins)
pprint(res_list)


[('22151696', 'Lullabies'),
 ('29431081', 'The Universe of Us'),
 ('23513349', 'Milk and Honey'),
 ('25384844', 'Black Butterfly'),
 ('23434371', 'Beautiful Chaos'),
 ('13123245', 'B'),
 ('13105527', 'I Wrote This For You'),
 ('35606560', 'The Sun and Her Flowers'),
 ('25746714', 'The Type'),
 ('19230408', 'I Wrote This For You: Just the Words'),
 ('23534', 'Love Is a Dog from Hell'),
 ('18288210', 'No Matter the Wreckage'),
 ('29457318', 'Habang Wala Pa Sila: Mga Tula ng Pag-ibig'),
 ('13376363', 'Teaching My Mother How to Give Birth'),
 ('32468495', 'Pillow Thoughts'),
 ('7824768', 'ليتها تقرأ'),
 ('6017893', 'قهوة وشيكولاتة'),
 ('980426', 'Love Poems'),
 ('20821097', 'Chasers of the Light: Poems from the Typewriter Series'),
 ('11625', 'Ariel: The Restored Edition'),
 ('23522212', 'Mouthful of Forevers'),
 ('29758714', 'Dirty Pretty Things'),
 ('31443393', 'Note to Self'),
 ('6944946', 'يوميات امرأة لا مبالية'),
 ('25986828', 'Today Means Amen'),
 ('1294049', 'Love Songs'),
 ('24688

In [78]:
pprint(len(res_list))

50


In [212]:
with open('rc_po_5_less.json', 'wt') as file_obj:
    json.dump(res_list, file_obj)

#with open('rc_po_rate_4.json', 'wt') as file_obj:
#    json.dump(rate_list, file_obj)

In [520]:
rc = loadFile('rcTitle_new.json')[0]
pprint(rc)
rc_list = []
for meta in rc:
    rc_li
    rc_list.append(meta)

{'10041732': 'Core Samples from the World',
 '104831': 'آرش کمانگیر',
 '1056805': 'Sir Gawain and the Green Knight: A New Verse Translation',
 '1089337': 'Words for the Wind: The Collected Verse',
 '111044': 'Stop Pretending: What Happened When My Big Sister Went Crazy',
 '11255697': 'مهندس العالم',
 '113209': 'The Best of Robert Service',
 '1160501': 'The Ballad of the White Horse',
 '120724': 'Selected Poems',
 '12352685': 'The Wild Book',
 '12429335': 'Water Sings Blue: Ocean Poems',
 '12675105': 'Balloon Pop Outlaw Black',
 '1330089': 'Oh Forbidden',
 '1371100': 'The Great Poets: Rudyard Kipling',
 '1388192': 'The Elder Edda: A Book of Viking Lore',
 '141600': 'Doré\'s Illustrations for "Paradise Lost"',
 '152532': 'Auden: Poems',
 '165251': 'Song of the Simple Truth: The Complete Poems of Julia de Burgos',
 '1785216': 'Mahabharata',
 '178746': 'The Selected Poetry of Yevgeny Yevtushenko',
 '18901': 'The Book of Nightmares',
 '193603': 'The Complete Poems',
 '20658': 'Blind Huber',

In [525]:
set1 = ('1785216', '7083492', '881508', '104831')
set2 = ('1160501','1388192','141600','2355014','236188','46231','304433','50479','6061538','752886','75494','7953223')

In [527]:
book_re_list = []
for i in range(len(book_list)):
    if book_list[i] in set1:
        book_re_list.append(book_list[i])

for i in range(len(book_list)):
    if book_list[i] in set2:
        book_re_list.append(book_list[i])
        
for i in range(len(book_list)):
    if (book_list[i] not in set1) and (book_list[i] not in set2):
        book_re_list.append(book_list[i])

In [528]:
pprint(len(book_re_list))

50


In [529]:
with open('rc_po_2.json', 'wt') as file_obj:
    json.dump(book_re_list, file_obj)

In [523]:
book_ly = []
book_st = []
for i in range(50):
    if i < 25:
        book_ly.append(rc_list[i])
    else:
        book_st.append(rc_list[i])

In [524]:
pprint(book_st)

['295149',
 '7083492',
 '193603',
 '120724',
 '20658',
 '732112',
 '761271',
 '50479',
 '1388192',
 '1089337',
 '657504',
 '236188',
 '8850512',
 '10041732',
 '46231',
 '1330089',
 '897487',
 '104831',
 '3348584',
 '1785216',
 '881508',
 '1371100',
 '111044',
 '165251',
 '6421376']


In [12]:
bookList = loadFile('mix_data.json')
idSet = set();
for meta in bookList:
    idSet.add(meta['book_id']);
pprint(len(idSet));

13568


In [21]:
pprint(bookList[0])

{'asin': 'B00NLXQ534',
 'authors': [{'author_id': '8551671', 'role': ''}],
 'average_rating': '4.12',
 'book_id': '25742454',
 'country_code': 'US',
 'description': 'Lillian Ann Cross is forced to live the worst nightmare of '
                'her life. She is an everyday middle class American, striving '
                'to survive in an everyday changing world. Her life was '
                'abruptly<br />turned upsidedown forever as she was kidnapped '
                'and forced into a world called "Hen Fighting."<br /><br />A '
                'world in which women fight and bets are made upon their '
                'bloodshed.Lillian is forced to comply due to the threats made '
                "upon her mother's life. Being a loving person her whole life, "
                'Lillian finds difficulty grasping her new functions. As she '
                'is conditioned to live in her new world, she is subjected to '
                'an experimental procedure. A procedure which ha

In [19]:
idList = [];
idDict = {};
idConv = {}
count = 0;
for bookid in idSet:
    idList.append(bookid)
    idDict[bookid] = count
    idConv[count] = bookid
    count += 1;
print(idList[0])
print(idConv[1])
print(idDict['10894849'])

10894849
3224278
0


In [85]:
dataUserMeta1 = loadFileDict('goodreads_interactions_poetry.json', idSet)
dataUserMeta2 = loadFileDict('goodreads_interactions_children.json', idSet)
dataUserMeta3 = loadFileDict('goodreads_interactions_comics_graphic.json', idSet)
dataUserMeta4 = loadFileDict('goodreads_interactions_fantasy_paranormal.json', idSet)
dataUserMeta5 = loadFileDict('goodreads_interactions_history_biography.json', idSet)
dataUserMeta6 = loadFileDict('goodreads_interactions_mystery_thriller_crime.json', idSet)
dataUserMeta7 = loadFileDict('goodreads_interactions_romance.json', idSet)
dataUserMeta8 = loadFileDict('goodreads_interactions_young_adult.json', idSet)

'goodreads_interactions_poetry.json'
59452
'goodreads_interactions_children.json'
58555
'goodreads_interactions_comics_graphic.json'
63603
'goodreads_interactions_fantasy_paranormal.json'
70000
'goodreads_interactions_history_biography.json'
70000
'goodreads_interactions_mystery_thriller_crime.json'
70000
'goodreads_interactions_romance.json'
70000
'goodreads_interactions_young_adult.json'
70000


In [135]:
#全部写完后合并数据集
dict_all = {}
dict_all = mergedata(dict_all, dataUserMeta1)
dict_all = mergedata(dict_all, dataUserMeta2)
dict_all = mergedata(dict_all, dataUserMeta3)
dict_all = mergedata(dict_all, dataUserMeta4)
dict_all = mergedata(dict_all, dataUserMeta5)
dict_all = mergedata(dict_all, dataUserMeta6)
dict_all = mergedata(dict_all, dataUserMeta7)
dict_all = mergedata(dict_all, dataUserMeta8)
pprint(len(dict_all))

267739


In [136]:
#对数据进行储存，并且进行写入验证
with open('user.json', 'wt') as file_obj:
    json.dump(dict_all, file_obj)

datas = []
with open('user.json') as f:
    for line in f:
        datas = json.loads(line)
    f.close()
pprint(len(datas))

267739


In [343]:
dict_re = {};
for user in dict_all:
    if len(dict_all[user]['isRead']) >= 5:
        dict_re[user] = dict_all[user]
pprint(len(dict_re))

15389


In [416]:
#分割两个数组

In [421]:
#找到目标用户
for user in dict_user:
    if len(dict_user[user]['rate']) > 7:
        target_user = user
        break;
pprint(target_user)

target_book = dict_user[target_user]['book_id']
target_rating = dict_user[target_user]['rate']
pprint(target_book)
pprint(target_rating)
if target_user in dict_remain:
    pprint(dict_remain[target_user]['book_id'])
if target_user in dict_unknown:
    pprint(dict_unknown[target_user]['book_id'])

'b03f5b741a7b3b8e4a79eb76a99a6860'
['178911',
 '587318',
 '1085771',
 '477338',
 '137918',
 '129655',
 '179176',
 '15801762']
[5, 5, 5, 4, 2, 1, 5, 5]


In [129]:
print(len(dict_re['b03f5b741a7b3b8e4a79eb76a99a6860']))

3


In [422]:
#对用户进行分析： 和划分集合重复
train_dict = dict_user
test_dict = {}
test_dict[target_user] = {}
test_dict[target_user]['book_id'] = []
test_dict[target_user]['rate'] = []
for book in idSet:
    if book not in dict_all[target_user]['book_id']:
        test_dict[target_user]['book_id'].append(book)
        test_dict[target_user]['rate'].append(0)

train_ = trans_array_to_dict(train_dict)
test_ = trans_array_to_dict(test_dict)
res = pred_test_im(train_, test_, 100000);

In [448]:
#print(len(dict_unknown))
pprint(len(res))
result = res[target_user]
sorted_x=sorted(result.items(), key = operator.itemgetter(1))
#pprint(sorted_x[len(sorted_x) - 1])
start = len(sorted_x) - 51;
end = len(sorted_x) - 1;
pprint(sorted_x[end][0])
result_id = []
for i in range(end, start, -1):
    if i == end:
        print(i)
    if i == start:
        print(i)
    result_id.append(sorted_x[i])

1
'8369681'
13559


In [449]:
id_list = []
for i in range(len(result_id)):
    id_list.append(result_id[i][0])
pprint(len(id_list))

50


In [452]:
user_info = {}
user_info['user_id'] = target_user;
user_info['book_id'] = dict_user[target_user]['book_id']
user_info['rate'] = dict_user[target_user]['rate']

with open('user_info.json', 'wt') as file_obj:
    json.dump(user_info, file_obj)

data_in_user = []
with open('user_info.json') as f:
    for line in f:
        data_in_user = json.loads(line)
    f.close()
pprint(data_in_user)

{'book_id': ['178911',
             '587318',
             '1085771',
             '477338',
             '137918',
             '129655',
             '179176',
             '15801762'],
 'rate': [5, 5, 5, 4, 2, 1, 5, 5],
 'user_id': 'b03f5b741a7b3b8e4a79eb76a99a6860'}


In [461]:
title_dict = loadFile('title.json')[0]
title_list = []
for book in user_info['book_id']:
    title_list.append(title_dict[book])
for i in range(len(title_list)):
    print('The title: ', title_list[i]);
    print('The rating:', user_info['rate'][i])

The title:  Annabel Lee
The rating: 5
The title:  The Last Battle (The Chronicles of Narnia, #7)
The rating: 5
The title:  The Walking Dead, Vol. 07: The Calm Before
The rating: 5
The title:  Blue is for Nightmares (Blue is for Nightmares, #1)
The rating: 4
The title:  More, Now, Again: A Memoir of Addiction
The rating: 2
The title:  The Fatal Fashione (Elizabeth I, #8)
The rating: 1
The title:  The Werewolf's Sin (Voodoo Moon, #3)
The rating: 5
The title:  Zom-B Angels
The rating: 5


In [462]:
rc_list = []
for book in id_list:
    rc_list.append(title_dict[book])
pprint(rc_list)

['Prom and Prejudice',
 'Revenant (Buffy the Vampire Slayer: Season 3, #11)',
 "Don't Let the Pigeon Drive the Bus!",
 'I Know What You Did Last Summer',
 'The Worry Week',
 'The Pleasures of the Damned',
 'The Name of the Wind (The Kingkiller Chronicle, #1)',
 'Claire de Lune (Claire de Lune, #1)',
 'Bons Sonhos, Meu Amor',
 'Strangers',
 '50 Below Zero',
 'Le due guerriere (Le Guerre del Mondo Emerso #2)',
 "Emily's Secret Book of Strange (Emily the Strange Graphic Novels, #2)",
 'MARS: Horse With No Name',
 'William Wilson',
 'The Paris Wife',
 'Shadowlands',
 'The Legend of the Poinsettia',
 'Born a Crime: Stories From a South African Childhood',
 'A Northern Light',
 'rock',
 "Don't Ever Change",
 'Nightwing #5 (Nightwing 2016, #5)',
 'A Little Something Different',
 'Good-Bye, Chunky Rice',
 'Injustice: Gods Among Us #24',
 "Baby-sitters' Island Adventure (The Baby-Sitters Club Super Special, #4)",
 "There's Someone Inside Your House",
 'Abraham Lincoln: Vampire Hunter',
 'Beauty

In [450]:
with open('rc_book_id.json', 'wt') as file_obj:
    json.dump(id_list, file_obj)

data_in_id = []
with open('rc_book_id.json') as f:
    for line in f:
        data_in_id = json.loads(line)
    f.close()
pprint(data_in_id)

['8369681',
 '293450',
 '191113',
 '47763',
 '835990',
 '220682',
 '186074',
 '6658573',
 '6396080',
 '15676',
 '560262',
 '9703431',
 '424604',
 '543980',
 '6798492',
 '8683812',
 '17428654',
 '14063',
 '29780253',
 '64481',
 '23153351',
 '23361053',
 '30740909',
 '20801166',
 '37264',
 '18131324',
 '290512',
 '15797848',
 '9421274',
 '6629481',
 '3922195',
 '21823465',
 '25624089',
 '1406392',
 '6536909',
 '175245',
 '16093690',
 '24357334',
 '111044',
 '2854614',
 '14069',
 '111002',
 '27246906',
 '226946',
 '1327609',
 '20582289',
 '108064',
 '29501904',
 '16117517',
 '23267836']


In [1]:
bookMeta = loadFile('goodreads_interactions_poetry.json')

NameError: name 'loadFile' is not defined

In [None]:
rc_all = {}
rate_all = {}
for meta in bookMeta:
    book_id = meta['book_id']
    if book_id in book_list:
        rc_all.append(rc_all)
    if book_id in book_rate:
        rc_all.append()

In [87]:
dataMetaAll = loadFile('goodreads_interactions_poetry.json')

In [88]:
dataDict = transToDict(dataMetaAll)

In [89]:
dict_all = dataDict

In [90]:
#先build两个Index
user_index = buildUserIndex(dict_all)
pprint(len(user_index))
book_index = buildBookIndex(dict_all);
pprint(len(book_index))


282415
36374


In [159]:
user_more = 0;
count = 0
for user in dict_all:
    if len(dict_all[user]['book_id']) > 20:
        count += 1
        user_more = user
        if count > 9:
            break;
pprint(user_more)
pprint(dict_all[user_more])

'80d52f5e70f023bd0098ab96599a3530'
{'book_id': ['23913',
             '47180',
             '563782',
             '19351',
             '20413',
             '65336',
             '304079',
             '402128',
             '72155',
             '782580',
             '95819',
             '112204',
             '23919',
             '142080',
             '99944',
             '2547',
             '30118',
             '534647',
             '6295',
             '30119',
             '46231'],
 'isRead': [True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True],
 'rate': [4, 4, 4, 4, 4, 3, 3, 4, 5, 5, 5, 5, 5, 5, 4, 4, 5, 4, 5, 5, 4]}


In [160]:
for book in dict_all[user_more]['book_id']:
    pprint(poetry_dict[book])

'The Marriage of Heaven and Hell'
'Kaddish and Other Poems'
'Collected Poems, 1947-1980'
'The Epic of Gilgamesh'
"A Child's Garden of Verses"
'Selected Poems'
'The Essential Rumi'
"Old Possum's Book of Practical Cats"
'The Collected Poems, Vol. 1: 1909-1939'
'The Complete Poetry and Prose'
'The Poetry of Robert Frost'
'The Complete Poems of Emily Dickinson'
'The Complete Stories and Poems'
'Collected Poems, 1909-1962'
'The Bhagavad Gita'
'The Prophet'
'A Light in the Attic'
'The Portable Beat Reader'
'Howl and Other Poems'
'Where the Sidewalk Ends'
'The Portable Dorothy Parker'


In [192]:
user_min = 0;
count = 0
for user in dict_all:
    if len(dict_all[user]['book_id']) == 3:
        count += 1
        user_min = user
        if count > 20:
            break;
pprint(user_min)
pprint(dict_all[user_min])

'fab3e0cea61720dc85881bfc09d06b97'
{'book_id': ['34023590', '25330489', '18003300'],
 'isRead': [True, True, True],
 'rate': [2, 4, 3]}


In [193]:
for book in dict_all[user_min]['book_id']:
    pprint(poetry_dict[book])

'Sad Girls'
'Memories'
'Love & Misadventure'


In [194]:
#'1fa34209b2a0797942f7961ca8d69e2e'
dict_less = {}
dict_less['user_id'] = user_min
dict_less['rate'] = {}
for i in range(len(dict_all[user_min]['book_id'])):
    dict_less['rate'][dict_all[user_min]['book_id'][i]] = dict_all[user_min]['rate'][i]
pprint(dict_less)

{'rate': {'18003300': 3, '25330489': 4, '34023590': 2},
 'user_id': 'fab3e0cea61720dc85881bfc09d06b97'}


In [195]:
dict_more = {}
dict_more['user_id'] = '26b5bed05bcabbabdaec4ee08fc43244'
user_more = '26b5bed05bcabbabdaec4ee08fc43244'
dict_more['rate'] = {}
for i in range(len(dict_all[user_more]['book_id'])):
    dict_more['rate'][dict_all[user_more]['book_id'][i]] = dict_all[user_more]['rate'][i]
pprint(dict_more)
for book in dict_all[user_more]['book_id']

{'rate': {'12914': 4,
          '1371': 5,
          '1381': 5,
          '1519': 4,
          '1715': 4,
          '2696': 4,
          '764332': 4},
 'user_id': '26b5bed05bcabbabdaec4ee08fc43244'}


In [214]:
for book in dict_all[user_more]['book_id']:
    pprint(book)
    pprint(poetry_dict[book])

'764332'
'Jason and the Golden Fleece'
'1519'
'The Oresteia  (Ορέστεια, #1-3)'
'1715'
'Metamorphoses'
'12914'
'The Aeneid'
'1371'
'The Iliad'
'1381'
'The Odyssey'
'2696'
'The Canterbury Tales'
'764332'
'Jason and the Golden Fleece'
'1519'
'The Oresteia  (Ορέστεια, #1-3)'
'1715'
'Metamorphoses'
'12914'
'The Aeneid'
'1371'
'The Iliad'


In [196]:
restore = []
restore.append(dict_less)
restore.append(dict_more)
pprint(restore)

[{'rate': {'18003300': 3, '25330489': 4, '34023590': 2},
  'user_id': 'fab3e0cea61720dc85881bfc09d06b97'},
 {'rate': {'12914': 4,
           '1371': 5,
           '1381': 5,
           '1519': 4,
           '1715': 4,
           '2696': 4,
           '764332': 4},
  'user_id': '26b5bed05bcabbabdaec4ee08fc43244'}]


In [211]:
with open('two_user_end.json', 'wt') as file_obj:
    json.dump(restore, file_obj)

In [197]:
dict_re = {};
for user in dict_all:
    if len(dict_all[user]['isRead']) >= 10 or user == user_min:
        dict_re[user] = dict_all[user]
pprint(len(dict_re)) #把User_min加进去

27978


In [198]:
pprint(dict_re['1fa34209b2a0797942f7961ca8d69e2e'])

KeyError: '1fa34209b2a0797942f7961ca8d69e2e'

In [216]:
array_2 = []
array_2 = sep_data(dict_re)
dict_user = array_2[0]
dict_remain = array_2[1]
dict_unknown = array_2[2]

target_user = 'fab3e0cea61720dc85881bfc09d06b97'

user_index = buildUserIndex(dict_re)
pprint(len(user_index))
book_index = buildBookIndex(dict_re);
pprint(len(book_index))

train_dict = dict_user
test_dict = {}
test_dict[target_user] = {}
test_dict[target_user]['book_id'] = []
test_dict[target_user]['rate'] = []

for book in book_index:
    if book not in dict_all[target_user]['book_id']:
        test_dict[target_user]['book_id'].append(book)
        test_dict[target_user]['rate'].append(1)

train_ = trans_array_to_dict(train_dict)
test_ = trans_array_to_dict(test_dict)

pprint(len(train))

27978
33926
27873


In [None]:
s

In [219]:
pprint(dict_user[target_user])

{'book_id': ['34023590', '25330489', '18003300'], 'rate': [2, 4, 3]}


In [218]:
pprint(len(dict_user))

27873


In [200]:
user_index = buildUserIndex(dict_re)
pprint(len(user_index))
book_index = buildBookIndex(dict_all);
pprint(len(book_index))

27978
36374


In [201]:
pprint(train_['1fa34209b2a0797942f7961ca8d69e2e'])

KeyError: '1fa34209b2a0797942f7961ca8d69e2e'

In [202]:
#pprint(test_)

In [203]:
res = pred_test_im(train_, test_, 1000000);

In [204]:
for i in res:
    result = res[i]
pprint(len(result))

33923


In [205]:
sorted_x=sorted(result.items(), key = operator.itemgetter(1))
start = len(sorted_x) - 51;
end = len(sorted_x) - 1;
#pprint(sorted_x[end][0])
result_id = []
for i in range(end, start, -1):
    result_id.append(sorted_x[i])

In [208]:
pprint(sorted_x[len(sorted_x) - 10])

('19230408', 3.057364807746158)


In [71]:
pprint(len(sorted_x))

36367


In [209]:
book_list = []
for i in range(50):
    book_list.append(result_id[i][0])
pprint(book_list)
book_rate = []
for book_id in train_[target_user]:
    book_rate.append(book_id)
pprint(book_rate)
print(len(book_list))
#book_list.extend(book_rate)
#pprint(book_list)

['22151696',
 '29431081',
 '23513349',
 '25384844',
 '23434371',
 '13123245',
 '13105527',
 '35606560',
 '25746714',
 '19230408',
 '23534',
 '18288210',
 '29457318',
 '13376363',
 '32468495',
 '7824768',
 '6017893',
 '980426',
 '20821097',
 '11625',
 '23522212',
 '29758714',
 '31443393',
 '6944946',
 '25986828',
 '1294049',
 '24688932',
 '25334576',
 '19265831',
 '26850255',
 '27494',
 '47713',
 '400412',
 '3049',
 '3109162',
 '5868421',
 '539143',
 '34296927',
 '1434',
 '1371',
 '33667125',
 '42051',
 '5865732',
 '11958571',
 '12122965',
 '8098264',
 '26702564',
 '29752702',
 '29335538',
 '24717410']
['34023590', '25330489', '18003300']
50


In [509]:
with open('rc_po_1.json', 'wt') as file_obj:
    json.dump(book_list, file_obj)

In [70]:
with open('rc_po_3.json', 'wt') as file_obj:
    json.dump(book_list, file_obj)

with open('rc_po_rate_1.json', 'wt') as file_obj:
    json.dump(book_rate, file_obj)

In [502]:
pprint(train_[target_user])

{'12914': 4, '1371': 5, '1381': 5, '1519': 4, '1715': 4, '2696': 4, '764332': 4}


In [225]:
#完全转化为dict-dict 第一个key是user的名字，第二个key是book的名字
train_ = trans_array_to_dict(train_dict)
test_ = trans_array_to_dict(test_dict)
#userset = get_set([],train_dict);
print(len(train_))
#print(train_['8842281e1d1347389f2ab93d60773d4d'])

27873


In [224]:
#划分测试集和数据集

target_book = dict_user[target_user]['book_id']
target_rating = dict_user[target_user]['rate']
pprint(target_book)
pprint(target_rating)
if target_user in dict_remain:
    pprint(dict_remain[target_user]['book_id'])
if target_user in dict_unknown:
    pprint(dict_unknown[target_user]['book_id'])

test_dict = {}
train_dict = {}
for userMeta in dict_user:
    ## 注意，这里是跳过了评论少于10个的
#    if len(dict_user[userMeta]['book_id']) < 10:
#        continue;
    test_dict[userMeta] = {};
    train_dict[userMeta] = {};
    all_id = dict_user[userMeta]['book_id'];
    all_rating = dict_user[userMeta]['rate'];
    x_train, x_test, y_train, y_test = train_test_split(all_id,all_rating,test_size=0.33)
    test_dict[userMeta]['book_id'] = x_test;
    test_dict[userMeta]['rate'] = y_test;
    train_dict[userMeta]['book_id'] = x_train;
    train_dict[userMeta]['rate'] = y_train;

#for userMeta in dict_remain:
#    if userMeta not in train_dict:
#        train_dict[userMeta] = {}
#        train_dict[userMeta]['book_id'] = []
#        train_dict[userMeta]['rate'] = []
#    train_dict[userMeta]['book_id'].extend(dict_remain[userMeta]['book_id']);
#    train_dict[userMeta]['rate'].extend(dict_remain[userMeta]['rate']);

#for userMeta in dict_unknown:
#    if userMeta not in train_dict:
#        train_dict[userMeta] = {}
#        train_dict[userMeta]['book_id'] = []
#        train_dict[userMeta]['rate'] = []
#    train_dict[userMeta]['book_id'].extend(dict_unknown[userMeta]['book_id']);
#    train_dict[userMeta]['rate'].extend(dict_unknown[userMeta]['rate']);
pprint(len(train_dict))
pprint(len(test_dict)) #打印筛选后的用户数量

['34023590', '25330489', '18003300']
[2, 4, 3]
27873
27873


In [390]:
print(conv_user[1])
print(train_[conv_user[1]])

1afe8b35c5e568e95bc17e5b5cdbfd1b
{'17333426': 5, '20613635': 4}


In [227]:
#进行预测和评估
pred_res = pred_test(train_, test_, 10000);
mae = cal_mae(test_, pred_res);
rmse = cal_rmse(test_, pred_res);
print("The MAE value is:", mae)
print("The RMSE value is:", rmse)

The MAE value is: 0.6927776219125726
The RMSE value is: 0.9349987563510308


In [398]:
#pprint(pred_res)

In [228]:
#进行改进版预测
pred_res_im = pred_test_im(train_, test_, 10000);
mae_im = cal_mae(test_, pred_res_im);
rmse_im = cal_rmse(test_, pred_res_im);
print(mae_im)
print(rmse_im)

0.6562182740254755
0.8625402381412787


In [231]:
#对cos进行计算
pred_res_cos = pred_test_cos(train_, test_, 1000);
mae_cos = cal_mae(test_, pred_res_cos);
rmse_cos = cal_rmse(test_, pred_res_cos);
print(mae_cos)
print(rmse_cos)

0.664604524863647
0.8687005229252904


In [None]:
#对cos进行计算
pred_res_cos = pred_test_cos(train_, test_, 10000);
mae_cos = cal_mae(test_, pred_res_cos);
rmse_cos = cal_rmse(test_, pred_res_cos);
print(mae_cos)
print(rmse_cos)

{'book_id': '46231',
 'date_added': 'Tue Aug 25 19:04:35 -0700 2015',
 'date_updated': 'Tue Aug 25 19:04:36 -0700 2015',
 'isRead': False,
 'rating': 0,
 'read_at': '',
 'review_id': '6aa2d9f629fc4531dff4bfd2eab3ab0a',
 'started_at': '',
 'user_id': '854d7eea57ee70acf835e2ba5262e3d9'}


In [126]:
#得到根据book得到的信息的所有结果
dataBooks = loadFile('goodreads_books_poetry.json')
#pprint(dataBooks[1])

KeyboardInterrupt: 

In [17]:
#得到根据user得到信息的所有结果
dataUserMeta = loadFile('goodreads_interactions_poetry.json')
pprint(dataUserMeta[1])

{'book_id': '1376',
 'date_added': 'Wed May 09 09:33:18 -0700 2007',
 'date_updated': 'Wed May 09 09:33:18 -0700 2007',
 'isRead': True,
 'rating': 4,
 'read_at': '',
 'review_id': '403a2391eca7dc8651e89de396e436e7',
 'started_at': '',
 'user_id': '8842281e1d1347389f2ab93d60773d4d'}


In [17]:
#想法，进行评论，用什么进行排序呢？ 进行CB
#pprint(dataUserMeta[2])

In [216]:
dict_user = {}
dict_user = transToDict(dataUserMeta);
#pprint(dict_user['8842281e1d1347389f2ab93d60773d4d']) #将最基本的meta数据转化为dict-array数据

NameError: name 'transToDict' is not defined

In [19]:
pprint(len(dict_user))

282415


In [20]:
dict_remain = {}
dict_remain = transFindRemain(dataUserMeta);

In [21]:
test_dict = {}
train_dict = {}
for userMeta in dict_user:
    ## 注意，这里是跳过了评论少于10个的
#    if len(dict_user[userMeta]['book_id']) < 10:
#        continue;
    test_dict[userMeta] = {};
    train_dict[userMeta] = {};
    all_id = dict_user[userMeta]['book_id'];
    all_rating = dict_user[userMeta]['rate'];
    x_train, x_test, y_train, y_test = train_test_split(all_id,all_rating,test_size=0.33)
    test_dict[userMeta]['book_id'] = x_test;
    test_dict[userMeta]['rate'] = y_test;
    train_dict[userMeta]['book_id'] = x_train;
    train_dict[userMeta]['rate'] = y_train;

for userMeta in dict_remain:
    all_id = dict_user[userMeta]['book_id'];
    all_rating = dict_user[userMeta]['rate'];
    if userMeta not in train_dict:
        train_dict[userMeta] = {}
        train_dict[userMeta]['book_id'] = []
        train_dict[userMeta]['rate'] = []
    train_dict[userMeta]['book_id'].extend(dict_remain[userMeta]['book_id']);
    train_dict[userMeta]['rate'].extend(dict_remain[userMeta]['rate']);

In [22]:
#pprint(train_dict['8842281e1d1347389f2ab93d60773d4d'])
#pprint(test_dict['8842281e1d1347389f2ab93d60773d4d'])
pprint(len(train_dict))
pprint(len(test_dict)) #打印筛选后的用户数量

377799
377799


In [23]:
#此时已经对数据进行了筛选
dict_user = {}
book_index = {}
book_count = 0;
user_index = {}
user_count = 0;
for username in test_dict:
    if username not in user_index:
        user_index[username] = user_count;
        user_count += 1;
    for book in test_dict[username]['book_id']:
        if book not in book_index:
            book_index[book] = book_count;
            book_count += 1;

for username in train_dict:
    if username not in user_index:
        user_index[username] = user_count;
        user_count += 1;
    for book in train_dict[username]['book_id']:
        if book not in book_index:
            book_index[book] = book_count;
            book_count += 1;
            
pprint(len(user_index)) #用meta一个个存book_index和user_index 可能会用到
pprint(len(book_index))

377799
36514


In [24]:
#完全转化为dict-dict 第一个key是user的名字，第二个key是book的名字
train_ = trans_array_to_dict(train_dict)
test_ = trans_array_to_dict(test_dict)
#userset = get_set([],train_dict);
print(len(train_))
#print(train_['8842281e1d1347389f2ab93d60773d4d'])

377799


In [46]:
#train_avg = cal_avg(train_)
#test_avg = cal_avg(test_) 
#测试cal_avg函数计算平均值

In [53]:
#进行预测
pred_res = pred_test(train_, test_, 1000);

In [54]:
#进行评估
pred_res = pred_test(train_, test_, 1000);
mae = cal_mae(test_, pred_res);
rmse = cal_rmse(test_, pred_res);
print("The MAE value is:", mae)
print("The RMSE value is:", rmse)

The MAE value is: 0.5368075849751278
The RMSE value is: 1.061675099643044


In [50]:
#打印前20个预测结果
count = 0;
for user in pred_res:
    print("Origin:", test_[user]);
    print("Predict:",pred_res[user]);
    count += 1;
    if count > 20:
        break;

Origin: {'1376': 4}
Predict: {'1376': 4.3165343528175155}
Origin: {'30119': 3}
Predict: {}
Origin: {'30119': 4}
Predict: {'30119': 4.0}
Origin: {'30119': 5, '1420': 5}
Predict: {'30119': 5, '1420': 5}
Origin: {'2547': 4}
Predict: {}
Origin: {'1381': 5, '18743': 5}
Predict: {'1381': 5.0, '18743': 5.0}
Origin: {'30119': 5}
Predict: {}
Origin: {'1381': 4}
Predict: {'1381': 4.0}
Origin: {'35606560': 4}
Predict: {}
Origin: {'2696': 4}
Predict: {}
Origin: {'23513349': 5}
Predict: {}
Origin: {'15812153': 4}
Predict: {}
Origin: {'42038': 5, '53022': 5}
Predict: {'42038': 5.0, '53022': 5.0}
Origin: {'402128': 5}
Predict: {'402128': 4.0}
Origin: {'908708': 4}
Predict: {'908708': 3.97093354507458}
Origin: {'19351': 3}
Predict: {'19351': 4.0}
Origin: {'1420': 5}
Predict: {'1420': 5.0}
Origin: {'253264': 5}
Predict: {}
Origin: {'23534': 5}
Predict: {'23534': 5.0}
Origin: {'30119': 5}
Predict: {'30119': 5.0}
Origin: {'1432': 5}
Predict: {}


In [64]:
#进行改进版预测
pred_res_im = pred_test_im(train_, test_, 1000);
mae_im = cal_mae(test_, pred_res_im);
rmse_im = cal_rmse(test_, pred_res_im);
print(mae_im)
print(rmse_im)

0.8230888470445363
1.1405403810129706


In [65]:
#对改进版进行预测
pred_res_cos = pred_test_cos(train_, test_, 1000);
mae_cos = cal_mae(test_, pred_res_cos);
rmse_cos = cal_rmse(test_, pred_res_cos);
print(mae_cos)
print(rmse_cos)

0.8214173117507653
1.137028247304502


In [66]:
a = np.arange(1,5).reshape(2,2)
b = np.arange(5,9).reshape(2,2)
c = np.dot(a, b)
pprint(c)

array([[19, 22],
       [43, 50]])


In [69]:
z = np.zeros((len(a), len(b[0])))
for i in range(len(a)):
    z[i, :] = np.dot(a[i, :], b)
print(z)

[[19. 22.]
 [43. 50.]]


In [55]:
rm_all = 0;
mae_all = 0;
count = 0;
for user in pred_res_im:
    for book in pred_res_im[user]:
        diff = np.abs(pred_res_im[user][book] - test_[user][book]);
        mae_all += diff
        rm_all += (diff * diff);
        count += 1;
mae = mae_all / count;
rm = (rm_all / count) ** 0.5
print(mae);
print(rm);
print(count)
print(mae_all/count)
print(rm_all/count)

1.2547175167898095
1.6589688704114107
4872
1.2547175167898095
2.752177712994112


In [235]:
pprint(len(book_index))
pprint(len(user_index))
#利用user_index和book_index构建二维矩阵

33926
27978


In [236]:
conv_user = convert_dict(user_index)
conv_book = convert_dict(book_index)
#转化为通过数字找user和book的字典

In [237]:
print(conv_user[1])

f88032f4ad97b46654fe59ce3387cf5d


In [252]:
lmbda = 0.01 #Regularisation weight
k = 20
m = len(user_index) #user的矩阵长度
#m = 1000
#m = len(user_index)
n = len(book_index) #book的矩阵长度
n_epochs = 50
gamma = 0.001 #learning rate
P = 1 * np.random.rand(k, m)
Q = 1 * np.random.rand(k, n)

#len_user = 10000
#R = np.zeros((len(user_index), len(book_index)));
#I = np.zeros((len(user_index), len(book_index)));
R = np.zeros((m, len(book_index)));
I = np.zeros((m, len(book_index)));
count = 0;
for user_in in range(m):
    user = conv_user[user_in] #得到user是哪个
    if user not in train_:
        continue;
    for book in train_[user]:
        R[user_index[user]][book_index[book]] = train_[user][book]
        I[user_index[user]][book_index[book]] = 1
    count += 1;

count = 0;
#T = np.zeros((len(user_index), len(book_index)));
#I2 = np.zeros((len(user_index), len(book_index)));
T = np.zeros((m, len(book_index)));
I2 = np.zeros((m, len(book_index)));
for user_in in range(m):
    user = conv_user[user_in]
    if user not in test_:
        continue;
#    pprint(user);
    for book in test_[user]:
        T[user_index[user]][book_index[book]] = test_[user][book] #test set
        I2[user_index[user]][book_index[book]] = 1
#构建两个矩阵，R和T分别存储train和test的信息
#矩阵I I2分别存储项对应的信息

In [257]:
print(len(P[1]))

27978


In [258]:
k = 20
lmbda = 0.02
n_epochs = 50
gamma = 0.005 #learning rate
train_errors = []
test_errors = []
user, book = R.nonzero()
for epoch in range(n_epochs):
    if epoch % 10 == 0:
        print(epoch)
    for u, i in zip(user, book):
        e = R[u, i] - prediction(P[:,u], Q[:, i])
        P[:, u] += gamma * (e * Q[:, i] - lmbda * P[:, u])
        Q[:, i] += gamma * (e * P[:, u] - lmbda * Q[:, i])

0
10
20
30
40


In [312]:
pprint(P[:, 1])

array([0.46562236, 0.67916067, 0.44680695, 0.18390048, 0.64692949,
       0.01755266, 0.09910645, 0.52350605, 0.04623374, 0.6169816 ,
       0.66349279, 0.61015071, 0.40297021, 0.78651147, 0.72557995,
       0.41856451, 0.36838965, 0.45838551, 0.70199411, 0.52532535])


In [259]:
PredM = predictions(P, Q, T)
pprint("done")

'done'


In [120]:
pprint(sum(sum(PredM)))

167272.55971899832


In [138]:
target_index = user_index['26b5bed05bcabbabdaec4ee08fc43244']
pprint(conv_user[0])
pprint(train_['26b5bed05bcabbabdaec4ee08fc43244'])

dict_s = {}
for i in range(len(PredM[0])):
    if PredM[0][i] != 0:
        book = conv_book[i];
        dict_s[book] = PredM[0][i]

pprint(len(res))
result = dict_s
sorted_x=sorted(result.items(), key = operator.itemgetter(1))
#pprint(sorted_x[len(sorted_x) - 1])
start = len(sorted_x) - 20000;
end = len(sorted_x) - 19950;
#pprint(sorted_x[end][0])
result_id = []
for i in range(end, start, -1):
    if i == end:
        print(i)
    if i == start:
        print(i)
    result_id.append(sorted_x[i])

pprint(result_id)

'26b5bed05bcabbabdaec4ee08fc43244'
{'12914': 4, '1371': 5, '1381': 5, '1519': 4, '1715': 4, '2696': 4, '764332': 4}
1
13969
[('2696138', 4.894076124127441),
 ('574266', 4.894072835925565),
 ('30366501', 4.894001643840528),
 ('130229', 4.893988139079161),
 ('421012', 4.893971158023882),
 ('249105', 4.893943060730648),
 ('6033224', 4.8939172280449945),
 ('7042263', 4.893906349574056),
 ('6786717', 4.893885544049061),
 ('732354', 4.893858291337981),
 ('1406602', 4.893740752176383),
 ('569326', 4.893674691800695),
 ('567698', 4.893616644138188),
 ('1389984', 4.893428089150204),
 ('13368928', 4.8934232639710675),
 ('16029464', 4.89322380227537),
 ('1408793', 4.893115354372867),
 ('1320624', 4.893081648715032),
 ('11826882', 4.892985310504306),
 ('10757352', 4.892974568363418),
 ('1136122', 4.892965017671003),
 ('17789295', 4.892944561298616),
 ('217967', 4.8928639730324734),
 ('1654579', 4.892848076079172),
 ('23358157', 4.892752451059529),
 ('12482896', 4.892731881592242),
 ('1689292', 4.8

In [128]:
print(sum(PredM[0]))

167272.55971899832


In [236]:
pprint(len(R))
pprint(len(R[0]))
pprint(len(T))
pprint(len(T[0]))
pprint(len(I))
pprint(len(I[0]))
pprint(len(I2))
pprint(len(I2[0]))

15389
6988
15389
6988
15389
6988
15389
6988


In [260]:
#Tr = T[0:10000, :]
MAE = mae(I2, T, PredM)
RMSE = rmse(I2, T, PredM)
print("MAE by Matrix Factorization with Stochastic Gradient Descent is :", MAE)
print("RMSE by Matrix Factorization with Stochastic Gradient Descent is :", RMSE)



MAE by Matrix Factorization with Stochastic Gradient Descent is : 0.6990096911033689
RMSE by Matrix Factorization with Stochastic Gradient Descent is : 0.9173291196783132


In [129]:
print(np.sum(P[2]))

13594.37411840311
