In [66]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split

In [297]:
ratings_df = pd.read_csv('./Dataset/ratings.txt',sep=" ",header=None)
trust_df = pd.read_csv('./Dataset/trust.txt',sep=" ",header=None)

In [151]:
ratings_df.rename(columns = {0:'user_id', 
                       1:'item_id',
                         2:'ratings'}, 
            inplace = True)

In [298]:
trust_df.rename(columns = {0:'trustors', 
                       1:'trustee',
                         2:'trust_value'}, 
            inplace = True)

In [152]:
ratings_df.head()

Unnamed: 0,user_id,item_id,ratings
0,1,1,2.0
1,1,2,4.0
2,1,3,3.5
3,1,4,3.0
4,1,5,4.0


In [299]:
trust_df.head()

Unnamed: 0,trustors,trustee,trust_value
0,2,966,1
1,2,104,1
2,5,1509,1
3,6,1192,1
4,7,1510,1


In [47]:
train_user_id = []
train_item_id = []
train_ratings = []
test_user_id = []
test_item_id = []
test_ratings = []

for user in list(ratings_df['user_id'].unique()):
    temp_user_df = ratings_df[ratings_df['user_id'] == user]
    user_len = temp_user_df.shape[0]
    train_ratio = int(round(user_len*0.8))
    test_ratio = user_len - train_ratio
    
    for i in range(train_ratio):
        train_user_id.append(user)
        train_item_id.append(temp_user_df['item_id'].iloc[i])
        train_ratings.append(temp_user_df['ratings'].iloc[i])
        
    if test_ratio > 0:
        for j in range(test_ratio):
            test_user_id.append(user)
            test_item_id.append(temp_user_df['item_id'].iloc[train_ratio+j])
            test_ratings.append(temp_user_df['ratings'].iloc[train_ratio+j])

In [48]:
train = pd.DataFrame()
test = pd.DataFrame()

train['user_id'] = train_user_id
train['item_id'] = train_item_id
train['ratings'] = train_ratings
test['user_id'] = test_user_id
test['item_id'] = test_item_id
test['ratings'] = test_ratings

In [54]:
def common_elements(list1, list2):
    result = []
    for element in list1:
        if element in list2:
            result.append(element)
    return result

In [83]:
def pearson_similarity(u,v):
    u = np.array(u)
    v = np.array(v)
    u_ = np.mean(u)
    v_ = np.mean(v)
    res = np.dot((u-u_),(v-v_))/np.sqrt(np.sum((u-u_)**2))*np.sqrt(np.sum((v-v_)**2))
    return round(res,3)

## Scenario 1

In [285]:
sim_dict = {}

count = 0

for test_user,test_item in zip(list(test['user_id']),list(test['item_id'])):
    user_similarity_list = []
    for user in list(train[train['item_id']==test_item]['user_id']):
        common_items = common_elements(list(train[train['user_id']==test_user]['item_id']),list(train[train['user_id']==user]['item_id']))
        u = []
        v = []

        for item in common_items:
            u.append(train[(train['user_id']==test_user)&(train['item_id']==item)]['ratings'].iloc[0])

        for item in common_items:
            v.append(train[(train['user_id']==user)&(train['item_id']==item)]['ratings'].iloc[0])

        sim = pearson_similarity(u,v)

        user_similarity_list.append((user,sim))
        
    key = str(test_user)+' '+str(test_item)

    sim_dict[key] = user_similarity_list
    
    count += 1
    
    if count >= 200:
        break

  res = np.dot((u-u_),(v-v_))/np.sqrt(np.sum((u-u_)**2))*np.sqrt(np.sum((v-v_)**2))


In [286]:
test_user_neighbor_dict = {}
test_user_neighbor_sim_dict = {}
for k in list(sim_dict.keys()):
    temp_dict = {}
    for user,sim in sim_dict[k]:
        if sim >= 0 or sim < 0:
            temp_dict[user] = sim

    sorted_temp_dict = dict(sorted(temp_dict.items(), key=lambda x:x[1], reverse=True))
    k_neighbors_user_id = list(sorted_temp_dict.keys())[:20]
    test_user_neighbor_dict[k] = k_neighbors_user_id
    temp_sim_list = []
    for u in k_neighbors_user_id:
        temp_sim_list.append(temp_dict[u])
    test_user_neighbor_sim_dict[k] = temp_sim_list

In [287]:
def prediction(key):
    
    k_user_id_list = test_user_neighbor_dict[key]
    k_user_sim_list = test_user_neighbor_sim_dict[key]
    
    test_user_id = key.split()[0]
    test_item_id = key.split()[1]
    
    temp_df = train[train['item_id'] == test_item_id]
    
    ratings = []
    for user in k_user_id_list:
        ratings.append(train[train['user_id'] == user]['ratings'].iloc[0])
        
    k_user_sim_list = np.array(k_user_sim_list)
    ratings = np.array(ratings)
    
    res = np.sum(np.multiply(k_user_sim_list,ratings))/np.sum(k_user_sim_list)    
    
    return round(res,1)

In [353]:
def mae(y_true,y_pred):
    nan_index = list(np.argwhere(np.isnan(y_pred)).flatten())
    te = len(y_true)
    y_true = np.array(y_true)
    y_pred = np.delete(y_pred, nan_index)
    y_true = np.delete(y_true, nan_index)
    res = np.sum(np.abs(y_true-y_pred))/te
    return round(res,3),nan_index

In [354]:
def rmse(y_true,y_pred):
    nan_index = list(np.argwhere(np.isnan(y_pred)).flatten())
    te = len(y_true)
    y_true = np.array(y_true)
    y_pred = np.delete(y_pred, nan_index)
    y_true = np.delete(y_true, nan_index)
    res = np.sqrt(np.sum((y_true-y_pred)**2)/te)
    return round(res,3),nan_index

In [288]:
y_pred = []
for k in list(sim_dict.keys()):
    y_pred.append(prediction(k))

  res = np.sum(np.multiply(k_user_sim_list,ratings))/np.sum(k_user_sim_list)


In [289]:
y_pred = np.array(y_pred)

In [290]:
y_pred.shape

(200,)

In [291]:
y_true = list(test['ratings'])[:200]

In [292]:
len(y_true)

200

In [293]:
mae_score, unpred_index = mae(y_true,y_pred)
mae_score

1.023

In [294]:
rmse_score, unpred_index = rmse(y_true,y_pred)
rmse_score

1.375

In [295]:
rc = (y_pred.shape[0]-len(unpred_index))/len(y_true)

In [296]:
rc

0.965

## Scenario 2 

In [325]:
for i in np.linspace(0.2,1,5):
    y_pred = []
    for test_user,test_item in zip(list(test['user_id']),list(test['item_id'])):
        try:
            trustee_list = list(trust_df[trust_df['trustors']==test_user]['trustee'])
            slice_value = round(len(trustee_list)*i)
            temp_df = train[train['item_id'] == test_item] 
            ratings = []
            for user in trustee_list[:slice_value]:
                ratings.append(train[train['user_id'] == user]['ratings'].iloc[0])
            y_pred.append(np.sum(ratings)/len(trustee_list))
        except:
            y_pred.append(np.nan)

    y_pred = np.array(y_pred)
    mae_score, unpred_index = mae(list(test['ratings']),y_pred)
    rmse_score, unpred_index = rmse(list(test['ratings']),y_pred)
    rc = (y_pred.shape[0]-len(unpred_index))/len(list(test['ratings']))
    
    print('For top '+str(i*100)+'% nearest neighbors, MAE is '+str(mae_score))
    print('For top '+str(i*100)+'% nearest neighbors, RMSE is '+str(rmse_score))
    print('For top '+str(i*100)+'% nearest neighbors, rate coverage is '+str(rc))

  y_pred.append(np.sum(ratings)/len(trustee_list))


For top 20.0% nearest neighbors, MAE is 1.039
For top 20.0% nearest neighbors, RMSE is 2.867
For top 20.0% nearest neighbors, rate coverage is 0.38657243816254416
For top 40.0% nearest neighbors, MAE is 0.819
For top 40.0% nearest neighbors, RMSE is 2.481
For top 40.0% nearest neighbors, rate coverage is 0.36706713780918726
For top 60.00000000000001% nearest neighbors, MAE is 0.441
For top 60.00000000000001% nearest neighbors, RMSE is 1.548
For top 60.00000000000001% nearest neighbors, rate coverage is 0.3395053003533569
For top 80.0% nearest neighbors, MAE is 0.332
For top 80.0% nearest neighbors, RMSE is 1.284
For top 80.0% nearest neighbors, rate coverage is 0.3236749116607774
For top 100.0% nearest neighbors, MAE is 0.297
For top 100.0% nearest neighbors, RMSE is 1.224
For top 100.0% nearest neighbors, rate coverage is 0.3185865724381625


## Scenario 3

In [347]:
weight_dict = {}

count = 0

for test_user,test_item in zip(list(test['user_id']),list(test['item_id'])):
    user_weight_list = []
    user_list = train[train['item_id'] == test_item]['user_id']
    trustee_list = list(trust_df[trust_df['trustors']==test_user]['trustee'])
    w = 0
    for user in user_list:
        common_items = common_elements(list(train[train['user_id']==test_user]['item_id']),list(train[train['user_id']==user]['item_id']))
        if user in trustee_list:
            trust_value = trust_df[(trust_df['trustors'] == test_user)&(trust_df['trustee'] == user)]['trust_value'].iloc[0]
            if len(common_items) > 0:
                u = []
                v = []

                for item in common_items:
                    u.append(train[(train['user_id']==test_user)&(train['item_id']==item)]['ratings'].iloc[0])

                for item in common_items:
                    v.append(train[(train['user_id']==user)&(train['item_id']==item)]['ratings'].iloc[0])

                sim = pearson_similarity(u,v)
                
                w = (2*sim*trust_value)/(sim+trust_value)
            else:
                w = trust_value
        else:
            if len(common_items) > 0:
                u = []
                v = []

                for item in common_items:
                    u.append(train[(train['user_id']==test_user)&(train['item_id']==item)]['ratings'].iloc[0])

                for item in common_items:
                    v.append(train[(train['user_id']==user)&(train['item_id']==item)]['ratings'].iloc[0])

                sim = pearson_similarity(u,v)

                w = sim
            else:
                w = 0
        
        user_weight_list.append((user,w))
        
    key = str(test_user)+' '+str(test_item)
    
    weight_dict[key] = user_weight_list
    
    count += 1
    
    print(count)
    
    if count >= 100:
        break

  res = np.dot((u-u_),(v-v_))/np.sqrt(np.sum((u-u_)**2))*np.sqrt(np.sum((v-v_)**2))


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


In [348]:
test_user_neighbor_dict = {}
test_user_neighbor_weight_dict = {}
for k in list(weight_dict.keys()):
    temp_dict = {}
    for user,weight in weight_dict[k]:
        if weight >= 0 or weight < 0:
            temp_dict[user] = weight

    sorted_temp_dict = dict(sorted(temp_dict.items(), key=lambda x:x[1], reverse=True))
    k_neighbors_user_id = list(sorted_temp_dict.keys())[:20]
    test_user_neighbor_dict[k] = k_neighbors_user_id
    temp_weight_list = []
    for u in k_neighbors_user_id:
        temp_weight_list.append(temp_dict[u])
    test_user_neighbor_weight_dict[k] = temp_weight_list

In [331]:
def prediction(key):
    
    k_user_id_list = test_user_neighbor_dict[key]
    k_user_weight_list = test_user_neighbor_weight_dict[key]
    
    test_user_id = key.split()[0]
    test_item_id = key.split()[1]
    
    temp_df = train[train['item_id'] == test_item_id]
    
    ratings = []
    for user in k_user_id_list:
        ratings.append(train[train['user_id'] == user]['ratings'].iloc[0])
        
    k_user_weight_list = np.array(k_user_weight_list)
    ratings = np.array(ratings)
    
    res = np.sum(np.multiply(k_user_weight_list,ratings))/np.sum(k_user_weight_list)    
    
    return round(res,1)

In [349]:
y_pred = []
for k in list(weight_dict.keys()):
    y_pred.append(prediction(k))

  res = np.sum(np.multiply(k_user_weight_list,ratings))/np.sum(k_user_weight_list)


In [350]:
y_true = list(test['ratings'])[:100]

In [356]:
mae_score, unpred_index = mae(y_true,y_pred)
mae_score

1.068

In [357]:
rmse_score, unpred_index = rmse(y_true,y_pred)
rmse_score

1.544

In [359]:
rc = (len(y_pred)-len(unpred_index))/len(y_true)

In [360]:
rc

0.96