# Basic Recommandation Part: Collaborative Filtering

## 1. Function definition and realize


In [1]:
import json
import math
from pprint import pprint
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import sys
import operator

In [2]:
#The function that load the json file and store it in an array
def loadFile(filename):
    datas = []
    with open(filename) as f:
        for line in f:
            datas.append(json.loads(line))
    return datas

In [3]:
#Load the files and store the book id including in idSet, this fuction won't be used in the project
def loadFileDict(filename, idSet):
    datax = []
    with open(filename) as f:
        for line in f:
            new = json.loads(line)
            if new['book_id'] in idSet:
                datax.append(json.loads(line))
        f.close()
    result = {}
    for meta in datax:
        user = meta['user_id']
        if user not in result:
            if len(result) >= 70000:
                break;
            else:
                result[user] = {}
                result[user]['book_id'] = []
                result[user]['rate'] = []
                result[user]['isRead'] = []
            result[user]['book_id'].append(meta['book_id'])
            result[user]['rate'].append(meta['rating'])
            result[user]['isRead'].append(meta['isRead'])
    pprint(filename)
    pprint(len(result))
    return result

In [4]:
#Seperate the data, store the rated data in the first dict, the read but un-rated data in the second dict
#the un-read data in the third dict
def sep_data(datas):
    result = [];
    ReadYes = {}
    ReadNo = {}
    ReadUnknown = {}
    for user in datas:
        meta = datas[user];
        for i in range(len(meta['isRead'])):
            if meta['isRead'][i] == True: 
                if meta['rate'][i] == 0:
                    if user not in ReadUnknown:
                        ReadUnknown[user] = {}
                        ReadUnknown[user]['book_id'] = []
                        ReadUnknown[user]['rate'] = []
                    ReadUnknown[user]['book_id'].append(meta['book_id'][i])
                    ReadUnknown[user]['rate'].append(0)                
                else:
                    if user not in ReadYes:
                        ReadYes[user] = {}
                        ReadYes[user]['book_id'] = []
                        ReadYes[user]['rate'] = []
                    ReadYes[user]['book_id'].append(meta['book_id'][i])
                    ReadYes[user]['rate'].append(meta['rate'][i])
            else:
                if user not in ReadNo:
                    ReadNo[user] = {}
                    ReadNo[user]['book_id'] = []
                    ReadNo[user]['rate'] = []
                ReadNo[user]['book_id'].append(meta['book_id'][i])
                ReadNo[user]['rate'].append(5) 
    result.append(ReadYes)
    result.append(ReadNo)
    result.append(ReadUnknown)
    return result

In [5]:
#This function is used to convert the originally read metaData into a dictionary with user id as the search.
def transToDict(dataUserMeta):
    dict_user = {}
    for meta in dataUserMeta:
        if meta['isRead'] != True:
            continue;
        user_id = meta['user_id'];
        book_id = meta['book_id'];
        if user_id not in dict_user:
            dict_user[user_id] = {};
            dict_user[user_id]['book_id'] = [];
            dict_user[user_id]['rate'] = [];
            dict_user[user_id]['isRead'] = [];
        dict_user[user_id]['book_id'].append(book_id);
        dict_user[user_id]['rate'].append(meta['rating']);
        dict_user[user_id]['isRead'].append(meta['isRead']);
    return dict_user

In [6]:
#Transfer the un-read data with a rating 5
def transFindRemain(dataUserMeta):
    dict_remain = {}
    for meta in dataUserMeta:
        if meta['isRead'] == True:
            continue;
        user_id = meta['user_id'];
        book_id = meta['book_id'];
        if user_id not in dict_user:
            dict_user[user_id] = {};
            dict_user[user_id]['book_id'] = [];
            dict_user[user_id]['rate'] = [];
        dict_user[user_id]['book_id'].append(book_id);
        dict_user[user_id]['rate'].append(5);
    return dict_user

In [7]:
#Transfer the dict-array data structure to dict-dict
def trans_array_to_dict(dicto):
    result = {}
    for username in dicto:
        result[username] = {}
        for i in range(len(dicto[username]['book_id'])):
            result[username][dicto[username]['book_id'][i]] = dicto[username]['rate'][i]
    return result;

In [8]:
#Extract the user-index dict from the data
def buildUserIndex(datas):
    result = {}
    count = 0
    for user in datas:
        result[user] = count;
        count +=1
    return result

In [9]:
#Extract the book-index dict from the data
def buildBookIndex(inputValue):
    result = {}
    count = 0
    for user in inputValue:
        for book in inputValue[user]['book_id']:
            if book not in result:
                result[book] = count;
                count += 1;
    return result

In [10]:
#merge the data from different dict
def mergedata(target, origin):
    for user in origin:
        if user not in target:
            target[user] = {}
            target[user]['book_id'] = []
            target[user]['rate'] = []
            target[user]['isRead'] = []
        target[user]['book_id'].extend(origin[user]['book_id'])
        target[user]['rate'].extend(origin[user]['rate'])
        target[user]['isRead'].extend(origin[user]['isRead'])
    return target

In [11]:
#Calculate the average ratings for every user
def cal_avg(d):
    result = {}
    for user in d:
        count = 0;
        sums = 0;
        for key in d[user]:
            sums += d[user][key];
            count += 1;
        if count == 0:
            result[user] = -1;
            continue;
        result[user] = sums / count;
    return result

In [12]:
#The function calculate similarity by original Pearson Correlation
def get_single_sim(train_dict, avge_dict, user1, user2):
    top = 0;
    add1 = 0;
    add2 = 0;
    avg1 = avge_dict[user1]
    avg2 = avge_dict[user2]
    for book in train_dict[user1].keys()&train_dict[user2].keys():
        top += ((train_dict[user1][book] - avg1)*(train_dict[user2][book] - avg2))
        add1 += pow(train_dict[user1][book] - avg1, 2)
        add2 += pow(train_dict[user2][book] - avg2, 2)
    if(add1*add2) == 0:
        return 0; 
    result = top / np.sqrt(add1 * add2)
    return result

In [13]:
#The function calculate similarity by original Pearson Correlation with the weight of the number of rating same books
def get_single_sim_ps_improve(train_dict, avge_dict, user1, user2):
    top = 0;
    add1 = 0;
    add2 = 0;
    avg1 = avge_dict[user1]
    avg2 = avge_dict[user2]
    count = 0;
    for book in train_dict[user1].keys()&train_dict[user2].keys():
        top += ((train_dict[user1][book] - avg1)*(train_dict[user2][book] - avg2))
        add1 += pow(train_dict[user1][book] - avg1, 2)
        add2 += pow(train_dict[user2][book] - avg2, 2)
        count += 1;
    if(add1*add2) == 0:
        return 0; 
    weight = count / 5;
#    if count > 2:
#        pprint(count)
    if weight > 1:
        weight = 1;
    result = top / np.sqrt(add1 * add2) * weight 
    return result    

In [14]:
#The function calculate similarity by cosine similarity
def get_single_sim_cos(train_dict, avge_dict, user1, user2):
    top = 0;
    add1 = 0;
    add2 = 0;
    avg1 = avge_dict[user1]
    avg2 = avge_dict[user2]
    count = 0;
    flag1 = 0;
    flag2 = 0;
    for book in train_dict[user1].keys()|train_dict[user2].keys():
        if book in train_dict[user1].keys():
            eval1 = train_dict[user1][book]
            flag1 = 1;
        else:
            eval1 = 0;
            flag1 = 0;

        if book in train_dict[user2].keys():
            eval2 = train_dict[user2][book]
            flag2 = 1;
        else:
            eval2 = 0;
            flag2 = 0;
        top += (eval1 * eval2)
        add1 += pow(eval1, 2)
        add2 += pow(eval2, 2)
        count += (flag1 * flag2);
    if(add1*add2) == 0:
        return 0; 
#    if count > 3:
#        count = 3;
    result = top / np.sqrt(add1 * add2)
    return result

In [15]:
#Predict the ratings by Pearson Correlation

def pred_test(train_, test_, ranges):
    k = 0.02
    avg = cal_avg(train_) 
    result = {}
    book_set = gen_bookset(train_) 
    count = 0;
    for username in test_:
        result[username] = {}
        for book in test_[username]: 
            r = avg[username]; 
            if r < 0:
                continue;
            if book in book_set:
                for user_other in book_set[book]: 
                        r += k * get_single_sim(train_, avg, username, user_other) * (train_[user_other][book] - avg[user_other])
            if r > 5:
                r = 5;
            if r < 1:
                r = 1;
            result[username][book] = r;
        count += 1;
        if count >= ranges:
            return result
    return result

In [16]:
#Predict the ratings by improved Pearson Correlation
def pred_test_im(train_, test_, ranges):
    k = 0.02
    avg = cal_avg(train_) 
    result = {}
    book_set = gen_bookset(train_) 
    count = 0;
    for username in test_: 
        result[username] = {}
        for book in test_[username]: 
            r = avg[username]; 
            if r < 0:
                continue;
            if book in book_set:
                for user_other in book_set[book]: 
                        r += k * get_single_sim_ps_improve(train_, avg, username, user_other) * (train_[user_other][book] - avg[user_other])
            if r > 5:
                r = 5;
            if r < 1:
                r = 1;
            result[username][book] = r;
        count += 1;
        if count >= ranges:
            return result
    return result

In [17]:
#Predict the ratings by Cosine
def pred_test_cos(train_, test_, ranges):
    k = 0.02
    avg = cal_avg(train_) 
    result = {}
    book_set = gen_bookset(train_) 
    count = 0;
    for username in test_: 
        result[username] = {}
        for book in test_[username]: 
            r = avg[username]; 
            if r < 0:
                continue;
            if book in book_set:
                for user_other in book_set[book]: 
                        r += k * get_single_sim_cos(train_, avg, username, user_other) * (train_[user_other][book] - avg[user_other])
            if r > 5:
                r = 5;
            if r < 1:
                r = 1;
            result[username][book] = r;
        count += 1;
        if count >= ranges:
            return result
    return result

In [18]:
#Get the book set 
def gen_bookset(train_):
    result = {}
    for username in train_:
        for book in train_[username]:
            if book not in result:
                result[book] = set()
            result[book].add(username)
    return result

In [19]:
#Calculate the MAE for dict data.
def cal_mae(test, pred):
    count = 0;
    sum_mae = 0;
    for user in pred:
        for book in pred[user]:
            sum_mae += np.abs(pred[user][book] - test[user][book]);
            count += 1;
    return sum_mae / count

In [20]:
#Calculate the RMSE for dict data.
def cal_rmse(test, pred):
    count = 0;
    sum_rmse = 0;
    for user in pred:
        for book in pred[user]:
            sum_rmse += ((pred[user][book] - test[user][book])** 2);
            count += 1;
    return math.sqrt(sum_rmse / count)

In [21]:
#Using factorized matrixes to predict
def prediction(P, Q):
    result = np.dot(P.T, Q)
#    if result > 5:
#        result = 5
#    else:
#        if result < 0:
#            result = 0;
    return result

In [22]:
#Predict ratings for the books rated in the test set by MF
def predictions(P, Q, T):
    user, book = T.nonzero()
    Z = np.zeros((len(P.T), len(Q[0])))   
    for u, i in zip(user, book):
        pred = prediction(P[:,u], Q[:, i])
        if pred > 5:
            pred = 5;
        else:
            if pred < 1:
                pred = 1;
        Z[u, i] = pred
    return Z

In [23]:
#Convert the book/user-index dict to index-book/user dict.
def convert_dict(dicts):
    result = {}
    for user in dicts:
        result[dicts[user]] = user
    return result;


In [24]:
#Calculate the RMSE for matrix data.
def rmse(I, R, M):
    count = np.sum(I);
    return np.sqrt(np.sum((I * (R - M)) ** 2) / count)

In [25]:
#Calculate the MAE for matrix data.
def mae(I, R, M):
    count = np.sum(I);
    return np.sum(np.abs(I * (R - M))) / count

## 2. Data Load

In [26]:
#Load the book_id to title data
poetry_dict = loadFile('poemTitle.json')[0]
pprint(len(poetry_dict))

36514


In [27]:
#Load the data from json file
dataMetaAll = loadFile('goodreads_interactions_poetry.json')

In [28]:
#Trransfer the data to the dict that indexed by user_id
dataDict = transToDict(dataMetaAll)
dict_all = dataDict

In [29]:
#Build the user-index dict and book-index dict
user_index = buildUserIndex(dict_all)
pprint(len(user_index))
book_index = buildBookIndex(dict_all);
pprint(len(book_index))


282415
36374


## 3. Presonalize books for 2 actual users using Pearson method
In the code of this part, it only shows the process of find two users rating data, and recommand for the user 2, We can also use this code to predict the result by changing the part of "Build the train set and test set for the user 2". The target_user can be changed to the user_id of user 1

In [192]:
#Find the User that rate less book (User2)
user_min = 0;
count = 0
for user in dict_all:
    if len(dict_all[user]['book_id']) == 3:
        count += 1
        user_min = user
        if count > 20:
            break;
pprint(user_min)
pprint(dict_all[user_min])

'fab3e0cea61720dc85881bfc09d06b97'
{'book_id': ['34023590', '25330489', '18003300'],
 'isRead': [True, True, True],
 'rate': [2, 4, 3]}


In [193]:
#Print the books he reated
for book in dict_all[user_min]['book_id']:
    pprint(poetry_dict[book])

'Sad Girls'
'Memories'
'Love & Misadventure'


In [194]:
#Print the information that will store, it user_id is 'fab3e0cea61720dc85881bfc09d06b97'
dict_less = {}
dict_less['user_id'] = user_min
dict_less['rate'] = {}
for i in range(len(dict_all[user_min]['book_id'])):
    dict_less['rate'][dict_all[user_min]['book_id'][i]] = dict_all[user_min]['rate'][i]
pprint(dict_less)

{'rate': {'18003300': 3, '25330489': 4, '34023590': 2},
 'user_id': 'fab3e0cea61720dc85881bfc09d06b97'}


In [195]:
#Find the User's id is '26b5bed05bcabbabdaec4ee08fc43244'
dict_more = {}
dict_more['user_id'] = '26b5bed05bcabbabdaec4ee08fc43244'
user_more = '26b5bed05bcabbabdaec4ee08fc43244'
dict_more['rate'] = {}
for i in range(len(dict_all[user_more]['book_id'])):
    dict_more['rate'][dict_all[user_more]['book_id'][i]] = dict_all[user_more]['rate'][i]
pprint(dict_more)
for book in dict_all[user_more]['book_id']

{'rate': {'12914': 4,
          '1371': 5,
          '1381': 5,
          '1519': 4,
          '1715': 4,
          '2696': 4,
          '764332': 4},
 'user_id': '26b5bed05bcabbabdaec4ee08fc43244'}


In [214]:
#Print the rate records, the duplicate records will be merge
for book in dict_all[user_more]['book_id']:
    pprint(book)
    pprint(poetry_dict[book])

'764332'
'Jason and the Golden Fleece'
'1519'
'The Oresteia  (Ορέστεια, #1-3)'
'1715'
'Metamorphoses'
'12914'
'The Aeneid'
'1371'
'The Iliad'
'1381'
'The Odyssey'
'2696'
'The Canterbury Tales'
'764332'
'Jason and the Golden Fleece'
'1519'
'The Oresteia  (Ορέστεια, #1-3)'
'1715'
'Metamorphoses'
'12914'
'The Aeneid'
'1371'
'The Iliad'


In [196]:
#Build the data structure that store the rate informating of that two users
restore = []
restore.append(dict_less)
restore.append(dict_more)
pprint(restore)

[{'rate': {'18003300': 3, '25330489': 4, '34023590': 2},
  'user_id': 'fab3e0cea61720dc85881bfc09d06b97'},
 {'rate': {'12914': 4,
           '1371': 5,
           '1381': 5,
           '1519': 4,
           '1715': 4,
           '2696': 4,
           '764332': 4},
  'user_id': '26b5bed05bcabbabdaec4ee08fc43244'}]


In [211]:
#Store the users information
with open('two_user_end.json', 'wt') as file_obj:
    json.dump(restore, file_obj)

In [None]:
#Build the train set of user 2
dict_re = {};
for user in dict_all:
    if len(dict_all[user]['isRead']) >= 10 or user == user_min:
        dict_re[user] = dict_all[user]

In [216]:
#Build the train set and test set for the user 2
array_2 = []
array_2 = sep_data(dict_re)
dict_user = array_2[0]
dict_remain = array_2[1]
dict_unknown = array_2[2]

target_user = 'fab3e0cea61720dc85881bfc09d06b97'

user_index = buildUserIndex(dict_re)
pprint(len(user_index))
book_index = buildBookIndex(dict_re);
pprint(len(book_index))

train_dict = dict_user
test_dict = {}
test_dict[target_user] = {}
test_dict[target_user]['book_id'] = []
test_dict[target_user]['rate'] = []

for book in book_index:
    if book not in dict_all[target_user]['book_id']:
        test_dict[target_user]['book_id'].append(book)
        test_dict[target_user]['rate'].append(1)

train_ = trans_array_to_dict(train_dict)
test_ = trans_array_to_dict(test_dict)

pprint(len(train))

27978
33926
27873


In [218]:
pprint(len(dict_user))

27873


In [219]:
pprint(dict_user[target_user])

{'book_id': ['34023590', '25330489', '18003300'], 'rate': [2, 4, 3]}


In [220]:
#Store the data of rated books
with open('source_dict.json', 'wt') as file_obj:
    json.dump(dict_user, file_obj)

In [200]:
#Build the user-index dict and book-index dict
user_index = buildUserIndex(dict_re)
pprint(len(user_index))
book_index = buildBookIndex(dict_all);
pprint(len(book_index))

27978
36374


In [37]:
#Transfer the train set and test set from dict-array to dict-dict
train_ = trans_array_to_dict(train_dict)
test_ = trans_array_to_dict(test_dict)
#userset = get_set([],train_dict);
print(len(train_))
#print(train_['8842281e1d1347389f2ab93d60773d4d'])

246699


In [203]:
#Training the data
res = pred_test_im(train_, test_, 1000000);

In [204]:
for i in res:
    result = res[i]
pprint(len(result))

33923


In [205]:
#Sort the predict result
sorted_x=sorted(result.items(), key = operator.itemgetter(1))
start = len(sorted_x) - 51;
end = len(sorted_x) - 1;
#pprint(sorted_x[end][0])
result_id = []
for i in range(end, start, -1):
    result_id.append(sorted_x[i])

In [209]:
#Print the book id result of user 2
book_list = []
for i in range(50):
    book_list.append(result_id[i][0])
pprint(book_list)
book_rate = []
for book_id in train_[target_user]:
    book_rate.append(book_id)
pprint(book_rate)
print(len(book_list))
#book_list.extend(book_rate)
#pprint(book_list)

['22151696',
 '29431081',
 '23513349',
 '25384844',
 '23434371',
 '13123245',
 '13105527',
 '35606560',
 '25746714',
 '19230408',
 '23534',
 '18288210',
 '29457318',
 '13376363',
 '32468495',
 '7824768',
 '6017893',
 '980426',
 '20821097',
 '11625',
 '23522212',
 '29758714',
 '31443393',
 '6944946',
 '25986828',
 '1294049',
 '24688932',
 '25334576',
 '19265831',
 '26850255',
 '27494',
 '47713',
 '400412',
 '3049',
 '3109162',
 '5868421',
 '539143',
 '34296927',
 '1434',
 '1371',
 '33667125',
 '42051',
 '5865732',
 '11958571',
 '12122965',
 '8098264',
 '26702564',
 '29752702',
 '29335538',
 '24717410']
['34023590', '25330489', '18003300']
50


In [210]:
#Print the result with the title
res_list = []
for i in range(len(book_list)):
    ins = (book_list[i], poetry_dict[book_list[i]])
    res_list.append(ins)
pprint(res_list)


[('22151696', 'Lullabies'),
 ('29431081', 'The Universe of Us'),
 ('23513349', 'Milk and Honey'),
 ('25384844', 'Black Butterfly'),
 ('23434371', 'Beautiful Chaos'),
 ('13123245', 'B'),
 ('13105527', 'I Wrote This For You'),
 ('35606560', 'The Sun and Her Flowers'),
 ('25746714', 'The Type'),
 ('19230408', 'I Wrote This For You: Just the Words'),
 ('23534', 'Love Is a Dog from Hell'),
 ('18288210', 'No Matter the Wreckage'),
 ('29457318', 'Habang Wala Pa Sila: Mga Tula ng Pag-ibig'),
 ('13376363', 'Teaching My Mother How to Give Birth'),
 ('32468495', 'Pillow Thoughts'),
 ('7824768', 'ليتها تقرأ'),
 ('6017893', 'قهوة وشيكولاتة'),
 ('980426', 'Love Poems'),
 ('20821097', 'Chasers of the Light: Poems from the Typewriter Series'),
 ('11625', 'Ariel: The Restored Edition'),
 ('23522212', 'Mouthful of Forevers'),
 ('29758714', 'Dirty Pretty Things'),
 ('31443393', 'Note to Self'),
 ('6944946', 'يوميات امرأة لا مبالية'),
 ('25986828', 'Today Means Amen'),
 ('1294049', 'Love Songs'),
 ('24688

In [212]:
#Store user2's information
with open('rc_po_5_less.json', 'wt') as file_obj:
    json.dump(res_list, file_obj)

## 4. The predict and evaluation from multiple methods

In [224]:
#Get the test set and train set for the evaluation part
target_book = dict_user[target_user]['book_id']
target_rating = dict_user[target_user]['rate']
pprint(target_book)
pprint(target_rating)
if target_user in dict_remain:
    pprint(dict_remain[target_user]['book_id'])
if target_user in dict_unknown:
    pprint(dict_unknown[target_user]['book_id'])

test_dict = {}
train_dict = {}
for userMeta in dict_user:
#    if len(dict_user[userMeta]['book_id']) < 10:
#        continue;
    test_dict[userMeta] = {};
    train_dict[userMeta] = {};
    all_id = dict_user[userMeta]['book_id'];
    all_rating = dict_user[userMeta]['rate'];
    x_train, x_test, y_train, y_test = train_test_split(all_id,all_rating,test_size=0.33)
    test_dict[userMeta]['book_id'] = x_test;
    test_dict[userMeta]['rate'] = y_test;
    train_dict[userMeta]['book_id'] = x_train;
    train_dict[userMeta]['rate'] = y_train;

#for userMeta in dict_remain:
#    if userMeta not in train_dict:
#        train_dict[userMeta] = {}
#        train_dict[userMeta]['book_id'] = []
#        train_dict[userMeta]['rate'] = []
#    train_dict[userMeta]['book_id'].extend(dict_remain[userMeta]['book_id']);
#    train_dict[userMeta]['rate'].extend(dict_remain[userMeta]['rate']);

#for userMeta in dict_unknown:
#    if userMeta not in train_dict:
#        train_dict[userMeta] = {}
#        train_dict[userMeta]['book_id'] = []
#        train_dict[userMeta]['rate'] = []
#    train_dict[userMeta]['book_id'].extend(dict_unknown[userMeta]['book_id']);
#    train_dict[userMeta]['rate'].extend(dict_unknown[userMeta]['rate']);
pprint(len(train_dict))
pprint(len(test_dict)) 

['34023590', '25330489', '18003300']
[2, 4, 3]
27873
27873


In [227]:
#Predict and evaluate for naive Pearson Correlation
pred_res = pred_test(train_, test_, 10000);
mae = cal_mae(test_, pred_res);
rmse = cal_rmse(test_, pred_res);
print("The MAE value is:", mae)
print("The RMSE value is:", rmse)

The MAE value is: 0.6927776219125726
The RMSE value is: 0.9349987563510308


In [228]:
#Predict and evaluate for optimized Pearson Correlation
pred_res_im = pred_test_im(train_, test_, 10000);
mae_im = cal_mae(test_, pred_res_im);
rmse_im = cal_rmse(test_, pred_res_im);
print(mae_im)
print(rmse_im)

0.6562182740254755
0.8625402381412787


In [231]:
#Predict and evaluate for cosine similarity
pred_res_cos = pred_test_cos(train_, test_, 1000);
mae_cos = cal_mae(test_, pred_res_cos);
rmse_cos = cal_rmse(test_, pred_res_cos);
print(mae_cos)
print(rmse_cos)

0.664604524863647
0.8687005229252904


In [31]:
#Build the train set and test set for evaluation
dict_re = {};
for user in dict_all:
    if len(dict_all[user]['isRead']) >= 10:
        dict_re[user] = dict_all[user]

array_2 = []
array_2 = sep_data(dict_re)
dict_user = array_2[0]
dict_remain = array_2[1]
dict_unknown = array_2[2]

In [32]:
#Get the books that marked as un-read by users
dict_remain = {}
dict_remain = transFindRemain(dataMetaAll);

In [34]:
#Rebuild the train set so that we can add the data of dict remain for the matrix factorization
#It uses the data that having 
test_dict = {}
train_dict = {}
for userMeta in dict_user:
#    if len(dict_user[userMeta]['book_id']) < 10:
#        continue;
    test_dict[userMeta] = {};
    train_dict[userMeta] = {};
    all_id = dict_user[userMeta]['book_id'];
    all_rating = dict_user[userMeta]['rate'];
    x_train, x_test, y_train, y_test = train_test_split(all_id,all_rating,test_size=0.33)
    test_dict[userMeta]['book_id'] = x_test;
    test_dict[userMeta]['rate'] = y_test;
    train_dict[userMeta]['book_id'] = x_train;
    train_dict[userMeta]['rate'] = y_train;


for userMeta in dict_remain:
    all_id = dict_user[userMeta]['book_id'];
    all_rating = dict_user[userMeta]['rate'];
    if userMeta not in train_dict:
        train_dict[userMeta] = {}
        train_dict[userMeta]['book_id'] = []
        train_dict[userMeta]['rate'] = []
    train_dict[userMeta]['book_id'].extend(dict_remain[userMeta]['book_id']);
    train_dict[userMeta]['rate'].extend(dict_remain[userMeta]['rate']);

In [40]:
user_index = buildUserIndex(dict_user)
book_index = buildBookIndex(dict_user);

In [41]:
#Convert the user-index and book-index dict
conv_user = convert_dict(user_index)
conv_book = convert_dict(book_index)

In [46]:
#Define the parameter and the Train and Test matrix
lmbda = 0.01 #Regularisation weight
k = 20
#m = len(user_index) #The length of user matrix
m = 10000
#m = len(user_index)
n = len(book_index) 
n_epochs = 50
gamma = 0.001 #learning rate
P = 1 * np.random.rand(k, m)
Q = 1 * np.random.rand(k, n)

#len_user = 10000
#R = np.zeros((len(user_index), len(book_index)));
#I = np.zeros((len(user_index), len(book_index)));
R = np.zeros((m, len(book_index)));
I = np.zeros((m, len(book_index)));
count = 0;
for user_in in range(m):
    user = conv_user[user_in] 
    if user not in train_:
        continue;
    for book in train_[user]:
        R[user_index[user]][book_index[book]] = train_[user][book]
        I[user_index[user]][book_index[book]] = 1
    count += 1;

count = 0;
#T = np.zeros((len(user_index), len(book_index)));
#I2 = np.zeros((len(user_index), len(book_index)));
T = np.zeros((m, len(book_index)));
I2 = np.zeros((m, len(book_index)));
for user_in in range(m):
    user = conv_user[user_in]
    if user not in test_:
        continue;
#    pprint(user);
    for book in test_[user]:
        T[user_index[user]][book_index[book]] = test_[user][book] #test set
        I2[user_index[user]][book_index[book]] = 1

In [47]:
print(len(P[1]))

10000


In [48]:
#Iterate calculation
k = 20
lmbda = 0.02
n_epochs = 50
gamma = 0.005 #learning rate
train_errors = []
test_errors = []
user, book = R.nonzero()
for epoch in range(n_epochs):
    if epoch % 10 == 0:
        print(epoch)
    for u, i in zip(user, book):
        e = R[u, i] - prediction(P[:,u], Q[:, i])
        P[:, u] += gamma * (e * Q[:, i] - lmbda * P[:, u])
        Q[:, i] += gamma * (e * P[:, u] - lmbda * Q[:, i])

0
10
20
30
40


In [312]:
#Print one matrix
pprint(P[:, 1])

array([0.46562236, 0.67916067, 0.44680695, 0.18390048, 0.64692949,
       0.01755266, 0.09910645, 0.52350605, 0.04623374, 0.6169816 ,
       0.66349279, 0.61015071, 0.40297021, 0.78651147, 0.72557995,
       0.41856451, 0.36838965, 0.45838551, 0.70199411, 0.52532535])


In [49]:
#Calculate the result
PredM = predictions(P, Q, T)
pprint("done")

'done'


In [50]:
#Tr = T[0:10000, :]
MAE = mae(I2, T, PredM)
RMSE = rmse(I2, T, PredM)
print("MAE by Matrix Factorization with Stochastic Gradient Descent is :", MAE)
print("RMSE by Matrix Factorization with Stochastic Gradient Descent is :", RMSE)

MAE by Matrix Factorization with Stochastic Gradient Descent is : 0.3586821446934908
RMSE by Matrix Factorization with Stochastic Gradient Descent is : 0.5105274090698148
