In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [None]:
class DataLoader:
    """Read a ratings file and re-index raw user/item ids to dense integers.

    Attributes:
        user_mapper: dict mapping each raw user id to its dense index.
        item_mapper: dict mapping each raw item id to its dense index.
        user_counter: number of distinct users seen so far (next free index).
        item_counter: number of distinct items seen so far (next free index).

    The mappers and counters live on the instance, so loading a second file
    with the same loader extends the existing mapping instead of restarting
    from zero.
    """

    def __init__(self):
        self.user_mapper = {}
        self.item_mapper = {}
        self.user_counter = 0
        self.item_counter = 0

    def load_ratings_train(self, file_path):
        """Load a space-separated, header-less ``user item rating`` file.

        Args:
            file_path: path (or file-like object) accepted by ``pd.read_csv``.

        Returns:
            Tuple ``(df, user_counter, item_counter)`` where ``df`` has the
            columns ``userId``, ``itemId`` and ``rating`` with ids replaced by
            dense indices and ratings replaced by their absolute value.
        """
        df = pd.read_csv(file_path, sep=" ", header=None)
        df.columns = ['userId', 'itemId', 'rating']

        (df, self.user_mapper, self.item_mapper,
         self.user_counter, self.item_counter) = self._map_entities(df)

        return df, self.user_counter, self.item_counter

    def _map_entities(self, df):
        """Replace raw ids by dense indices and take ``abs`` of the ratings.

        Indices are handed out in order of first appearance in each column,
        continuing from the loader's current counters. ``df`` is modified in
        place (its three columns are reassigned) and also returned.

        Args:
            df: frame with ``userId``, ``itemId`` and ``rating`` columns.

        Returns:
            Tuple ``(df, user_mapper, item_mapper, user_counter, item_counter)``.
        """
        # `unique()` keeps first-appearance order, so the indices match what a
        # row-by-row scan would assign, without iterating over every row.
        for raw_user in df['userId'].unique():
            if raw_user not in self.user_mapper:
                self.user_mapper[raw_user] = self.user_counter
                self.user_counter += 1
        for raw_item in df['itemId'].unique():
            if raw_item not in self.item_mapper:
                self.item_mapper[raw_item] = self.item_counter
                self.item_counter += 1

        # Vectorized rewrite of the three columns (no per-cell `df.at` writes
        # while the frame is being iterated).
        df['userId'] = df['userId'].map(self.user_mapper)
        df['itemId'] = df['itemId'].map(self.item_mapper)
        df['rating'] = df['rating'].abs()

        return df, self.user_mapper, self.item_mapper, self.user_counter, self.item_counter


In [None]:
# Build the loader and read the training ratings. `df` holds the re-indexed
# ratings; `userNum` / `itemNum` are the loader's user and item counters.
data_loader = DataLoader()
df, userNum, itemNum = data_loader.load_ratings_train('.../train.txt')


In [None]:
# Per-user rating sum (`r_mean`, divided by the count in a later cell) and
# per-user rating count (`r_count`), keyed by the value in column 0 of `df`.
# A single pass over the two columns replaces the former two passes of scalar
# `df.iloc[i, j]` lookups, which are very slow on large frames.
r_mean = {}
r_count = {}
for user, rating in zip(df.iloc[:, 0], df.iloc[:, 2]):
    r_mean[user] = r_mean.get(user, 0) + rating
    r_count[user] = r_count.get(user, 0) + 1

In [None]:
# Turn the per-user rating sums stored in `r_mean` into per-user means.
# NOTE: this overwrites `r_mean` in place, so running the cell twice divides
# by the count twice.
for user_id, rating_sum in r_mean.items():
    r_mean[user_id] = rating_sum / r_count[user_id]

In [None]:
# Map each user (column 0 of `df`) to the list of items (column 1) they rated,
# in file order. One pass over the columns replaces two passes of scalar
# `df.iloc[i, j]` lookups.
user_itemlist = {}
for user, item in zip(df.iloc[:, 0], df.iloc[:, 1]):
    user_itemlist.setdefault(user, []).append(item)
            

In [None]:
# Every distinct item id (column 1 of `df`) mapped to 1, in order of first
# appearance; later cells only use its keys and its length.
item_list = {item: 1 for item in df.iloc[:, 1]}

In [None]:
from scipy.sparse import lil_matrix

# Allocate the empty sparse containers:
#   rate - (users x items) rating matrix, filled in the next cell
#   Sim  - (users x users) similarity matrix, filled by the similarity cell
n_users, n_items = len(r_mean), len(item_list)
rate = lil_matrix((n_users, n_items))
Sim = lil_matrix((n_users, n_users))

In [None]:
# Copy every (user, item, rating) row of `df` into the sparse rating matrix.
# Iterating the three columns together avoids three scalar `df.iloc` lookups
# per row; if a (user, item) pair repeats, the last rating still wins.
for user, item, rating in zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2]):
    rate[user, item] = rating

Similarity between users based on their ratings

In [None]:
# User-user similarity on mean-centred ratings:
#   a = sum over co-rated items of (r_u1,i - mean_u1) * (r_u2,i - mean_u2)
#   b, c = sum over ALL items of each user of the squared centred rating
#   Sim[u1, u2] = Sim[u2, u1] = a / (sqrt(b) * sqrt(c))
# Pairs with a == 0, b == 0 or c == 0 are left unset (implicit zero).
#
# Everything that depends on a single user (centred ratings, squared norm,
# item set) is computed once here instead of once per pair.
centered = {}
norm_sq = {}
item_sets = {}
for user, items in user_itemlist.items():
    user_mean = r_mean[user]
    deviations = {item: rate[user, item] - user_mean for item in items}
    # Summed over the raw item list (not the dict) so a repeated
    # (user, item) row contributes exactly as it did before.
    total = 0
    for item in items:
        total += deviations[item] * deviations[item]
    centered[user] = deviations
    norm_sq[user] = total
    item_sets[user] = set(items)

for user1 in user_itemlist:
    for user2 in user_itemlist:
        if user1 >= user2:
            continue  # each unordered pair is handled once, then mirrored
        dev1 = centered[user1]
        dev2 = centered[user2]
        a = 0
        for item in item_sets[user1] & item_sets[user2]:
            a += dev1[item] * dev2[item]
        b = norm_sq[user1]
        c = norm_sq[user2]
        if a != 0 and b != 0 and c != 0:
            similarity = a / (np.sqrt(b) * np.sqrt(c))
            Sim[user1, user2] = similarity
            Sim[user2, user1] = similarity

In [None]:
from scipy.sparse import coo_matrix
# COO-format copy of the similarity matrix `Sim`; `aaa` is what the following
# cells densify and rank.
aaa = coo_matrix(Sim)

In [None]:
# Dense NumPy copy of the similarity matrix, used for direct [user, neighbour]
# lookups in the prediction cell. Memory grows with the square of the user count.
Sim2 = aaa.toarray()

In [None]:
def sortSparseMatrix(m, rev=True, only_indices=True):
    """Rank the stored entries of every row of a sparse matrix by value.

    Args:
        m: SciPy sparse matrix supporting ``getrow``; each row is read through
            the ``indices`` / ``data`` attributes of the returned row matrix.
        rev: if True (default) sort each row by decreasing value, otherwise by
            increasing value.
        only_indices: if True (default) return only column indices; otherwise
            return ``(column_index, value)`` pairs.

    Returns:
        dict mapping each row index ``i`` to the list of sorted column indices
        (or ``(column_index, value)`` pairs) of that row's stored entries.
        Rows without stored entries map to an empty list.
    """
    col_dict = dict()
    for i in range(m.shape[0]):
        d = m.getrow(i)
        s = zip(d.indices, d.data)
        # Honour `rev`; it used to be ignored because reverse=True was hard-coded.
        sorted_s = sorted(s, key=lambda v: v[1], reverse=rev)
        if only_indices:
            col_dict[i] = [element[0] for element in sorted_s]
        else:
            col_dict[i] = sorted_s
    return col_dict

In [None]:
# For every row of the similarity matrix: column indices of its stored entries,
# ordered from most to least similar (default arguments of sortSparseMatrix).
Sim_sort = sortSparseMatrix(aaa)

In [None]:
k = 5  # number of nearest neighbors

# Keep only the first k entries of every ranked neighbour list.
sim_sort2 = {user: ranked[:k] for user, ranked in Sim_sort.items()}

In [None]:

# Predict ratings for (user, item) pairs the user has not rated:
#   prediction = mean_user + sum_tk sim(user, tk) * (r_tk,item - mean_tk) / Sim_sum
# where tk runs over the user's top-k neighbours with positive similarity that
# rated the item. Only strictly positive predictions are stored.
#
# NOTE(review): Sim_sum adds up the similarity of ALL positive top-k
# neighbours, including those that did not rate the item - confirm that this
# damping is intended rather than normalising over the raters only.
rate_add = lil_matrix((len(r_mean), len(item_list)))
for user in r_mean:
    # Neighbour filter and Sim_sum do not depend on the item, so compute them
    # once per user instead of once per (user, item) pair.
    neighbors = [tk for tk in sim_sort2[user] if Sim2[user, tk] > 0]
    if not neighbors:
        continue
    Sim_sum = 0
    for tk in neighbors:
        Sim_sum += Sim2[user, tk]

    # Only items rated by at least one positive neighbour can receive a
    # prediction, so there is no need to scan every item in the catalogue.
    candidate_items = set()
    for tk in neighbors:
        candidate_items.update(user_itemlist[tk])

    for item in candidate_items:
        if rate[user, item] != 0:
            continue  # already rated by this user
        a = 0
        in_ = False
        for tk in neighbors:
            if rate[tk, item] != 0:
                a += Sim2[user, tk] * (rate[tk, item] - r_mean[tk])
                in_ = True
        if in_:
            prediction = r_mean[user] + a / Sim_sum
            if prediction > 0:
                rate_add[user, item] = prediction

In [None]:
# Write every stored prediction as "<user> <item> <rating>" lines, ordered by
# user and then item. The file name embeds k, e.g. "rate_add_5nn.txt".
# (The previous path expression had a misplaced quote and did not parse.)
with open('.../rate_add_' + str(k) + 'nn.txt', 'w') as file_out:
    predictions = rate_add.tocoo()
    # Iterate over stored entries only instead of probing every (user, item)
    # cell; sorting keeps the row-major output order explicit.
    entries = sorted(zip(predictions.row.tolist(),
                         predictions.col.tolist(),
                         predictions.data))
    for i, j, value in entries:
        if value != 0:
            file_out.write(str(i) + ' ' + str(j) + ' ' + str(value) + '\n')