## Colaborative Filtering with various distances

In [8]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

In [9]:
class K_NN_Classifier:

    def __init__(self, K, distance):
        self.K = K
        self.distance = distance
        self.sources = None

    def train(self, sources):
        self.sources = sources
    
    # Pearson correlation coefficient
    def pearson_correlation_coefficient(self, x):

        mx = x.mean()
        xm = x - mx

        dist = list()
        for idx, y in self.sources.iterrows():

            my = y.mean()
            ym = y - my

            num = np.add.reduce(xm * ym)
            den = np.sqrt(np.sum(xm * xm, axis=0) * np.sum(ym * ym, axis=0))
            if den == 0:
                r = 0
            else:
                r = num / den

            dist.append(r)

        return np.array(dist)
    
    # Cosine distance
    def cosine_distance(self, x):
        
        sum_x = np.sum(np.sqrt(x**2))
        
        dist = list()
        for idx, y in self.sources.iterrows():
            
            num = np.sum(np.dot(x, y))
            den = np.sum(np.sqrt(y**2)) * sum_x
            
            if den == 0:
                r = 0
            else:
                r = num / den
            
            dist.append(r)
            
        return np.array(dist)
    
    # Absolute distance
    def abs_distance(self, x):
        
        sum_x = np.sum(x)
        
        dist = list()
        for idx, y in self.sources.iterrows():
            
            num = np.sum(np.abs(x, y))
            den = np.sum(y) * sum_x
            
            if den == 0:
                r = 0
            else:
                r = num / den
            
            dist.append(r)
        
        return np.array(dist)

    def predict(self, item):
        
        if self.distance == 'pcc':
            dists = self.pearson_correlation_coefficient(item)
        elif self.distance == 'cosine':
            dists = self.cosine_distance(item)
        else:
            dists = self.abs_distance(item)
        
        indices = dists.argsort()[-self.K - 1:]
        indices = indices[::-1]
        vectors = []
        for index in indices:
            vectors.append(self.sources.iloc[index])
        return np.sort(dists)[::-1][:self.K + 1], vectors

In [10]:
def predict_itembased(user_id, item_id, sample, log=True):

    similarities, vectors = model_knn.predict(sample)

    mean_i = sample.mean()
    nom = 0
    den = similarities.sum()

    for i, vector in enumerate(vectors):
        nom += similarities[i]*(vector.iloc[user_id] - mean_i)

    if den == 0:
        return 0
    prediction = mean_i + nom/den
    prediction = round(prediction)

    if log:
        print(f'\nPredicted rating for user {user_id} -> item {item_id}: {prediction}')

    return prediction

In [11]:
def evaluate(matrix, sample_size=1000):

    iteration = list()
    times = list()

    n_items = matrix.shape[0]
    n_users = matrix.shape[1]

    prediction = pd.DataFrame(0, index=range(n_items), columns=range(n_users))
    ground_truth = pd.DataFrame(0, index=range(n_items), columns=range(n_users))

    start = time.time()

    for i in tqdm(range(sample_size)):
        
        while True:
            rand_i = random.randint(0, n_users - 1)
            rand_j = random.randint(0, n_items - 1)
            sol = matrix.iloc[rand_j, rand_i]
            if sol != 0:
                break

        ground_truth.at[rand_j, rand_i] = sol

        vector = matrix.iloc[rand_j]
        vector[rand_i] = 0

        pred = predict_itembased(rand_i, rand_j, vector, False)
        prediction.at[rand_j, rand_i] = pred

        if i % 50 == 0:
            iteration.append(i)
            temp_time = time.time() - start
            times.append(temp_time)

    result = np.sqrt(np.sum((np.array(prediction) - np.array(ground_truth)) ** 2) / sample_size)
    print(f"RMSE value is: {result}")
    result = np.sum(np.abs(np.array(prediction) - np.array(ground_truth))) / sample_size
    print(f"MAE value is: {result}")

    return result, times, iteration

In [12]:
train = pd.read_csv('D:/Pycharm_Project/data/networks/filmtrust/train.csv', header=0, index_col=[0])
test = pd.read_csv('D:/Pycharm_Project/data/networks/filmtrust/test.csv', header=0, index_col=[0])

k = 5
model_knn = K_NN_Classifier(K=k, distance='pcc')
model_knn.train(train)

res, times, it = evaluate(test, 20)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:36<00:00,  1.82s/it]

RMSE value is: 1.6278820596099706
MAE value is: 1.15



