In [1]:
"""
the :mod:`knns` module includes some k-NN inspired algorithms.
"""
import heapq
import numpy as np
import pandas as pd

from surprise import AlgoBase, Dataset, Reader, accuracy, KNNWithMeans, KNNBasic, KNNBaseline, SVD
from surprise.model_selection import cross_validate
from surprise import PredictionImpossible
from surprise.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim

import os

current_directory = os.getcwd()
print("Current working directory:", current_directory)
print('asasa')

Current working directory: C:\Users\User\Desktop\MRes\src\Cred
asasa


In [None]:

def jaccard_sim(list1, list2):
    new_list1 = [x for i, x in enumerate(list1) if x != 0 or list2[i] != 0]
    new_list2 = [x for i, x in enumerate(list2) if x != 0 or list1[i] != 0]

    # Compute the intersection of the two lists, where each 1 or 0 is considered a value
    intersection = sum([1 for i in range(len(list1)) if list1[i] == 1 and list2[i] == 1])

    # Compute the union of the two lists, where each 1 or 0 is considered a value
    union = sum([1 for i in range(len(list1)) if list1[i] == 1 or list2[i] == 1])

    # Compute the Jaccard similarity
    jaccard_similarity = intersection / union
    return jaccard_similarity
    
# Load movie semantic data
u_header = ['item_id', 'title', 'genres']
items_df = pd.read_csv('ml-1m/movies.dat', sep='::', names=u_header, encoding='latin', engine='python')
items_df = items_df.drop('title', axis = 1)

items_dict = items_df.set_index('item_id').T.to_dict('dict')
genre_list = ['Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery','Romance','Sci-Fi','Thriller','War','Western']

itemSemanticMatrix = pd.DataFrame(columns=['item_id1', 'item_id2', 'jaccard_sim_value'])

for item_id1, item_genres1 in items_dict.items():
    genre_flags1 = [int(g in item_genres1['genres']) for g in genre_list]
    for item_id2, item_genres2 in items_dict.items():
        genre_flags2 = [int(g in item_genres2['genres']) for g in genre_list]
        jaccard_list = [item_id1, item_id2, jaccard_sim(genre_flags1, genre_flags2)]
        jaccard_df = pd.DataFrame([jaccard_list], columns=['item_id1', 'item_id2', 'jaccard_sim'])
        #itemSemanticMatrix.append(jaccard_list, ignore_index=True)
        itemSemanticMatrix = pd.concat([itemSemanticMatrix, jaccard_df], ignore_index=True)

itemSemanticMatrix = itemSemanticMatrix.set_index(['item_id1', 'item_id2'])

print('Calculated Semantic Matrix')

print(itemSemanticMatrix.loc[(3, 6), 'jaccard_sim_value'])

In [4]:
def classify_age(age):
    if age <= 13: #17
        return 1 #'Children'
    elif age <= 25: #30
        return 2 #'Teenagers'
    elif age <= 40: #45
        return 3 #'Young Adults'
    elif age <= 50: #60
        return 4 #'Adults'
    else:
        return 5 #'Seniors'
    
# Load movie semantic data
u_header = ['user_id', 'age', 'sex','job', 'zipcode']
users_df = pd.read_csv('data/u.user', sep='|', names=u_header, encoding='latin')
users_df = users_df.drop('zipcode', axis = 1)
#users_df = users_df.drop('job', axis = 1)
#users_df = users_df.drop('sex', axis = 1)

users_df['ageGroup'] = users_df['age'].apply(classify_age)
users_df = users_df.drop('age', axis = 1)

users_dict = users_df.set_index('user_id').T.to_dict('dict')
#print(users_dict)

occup_list = ["administrator", "artist", "doctor", "educator", "engineer", "entertainment", "executive", "healthcare", "homemaker", "lawyer", "librarian", "marketing", "none", "other", "programmer", "retired", "salesman", "scientist", "student", "technician", "writer"]
age_group = [1, 2, 3, 4, 5]

userDemoGraphicMatrix = np.zeros((len(users_df), len(users_df)))

for user_id1, user_demo1 in users_dict.items():
    
    # 5 age group, 1 gender, 21 occupation
    demo_values1 = [0 for i in range(27)]    
    values1 = list(user_demo1.values())
    if(values1[0] == "M"):
        demo_values1[0] = 1
    for oc in range(len(occup_list)):
        if(values1[1] == occup_list[oc]):
            demo_values1[1+oc] = 1 
    for ag in range(len(age_group)):
        if(values1[2] == age_group[ag]):
            demo_values1[22+ag] = 1

    for user_id2, user_demo2 in users_dict.items():
        demo_values2 = [0 for i in range(27)]
        values2 = list(user_demo2.values())
        #print(user_id1, values1[0], values1[1], values1[2])
        if(values2[0] == "M"):
            demo_values2[0] = 1
        for oc2 in range(len(occup_list)):
            if(values2[1] == occup_list[oc2]):
                demo_values2[1+oc2] = 1
        for ag2 in range(len(age_group)):
            if(values2[2] == age_group[ag2]):
                demo_values2[22+ag2] = 1
        userDemoGraphicMatrix[user_id1-1, user_id2-1] = jaccard_sim(demo_values1, demo_values2)
        
print(userDemoGraphicMatrix)
print('Calculated Demographic Matrix')

[[1.  0.  0.5 ... 0.5 0.  0.5]
 [0.  1.  0.  ... 0.  0.  0. ]
 [0.5 0.  1.  ... 0.5 0.  0.5]
 ...
 [0.5 0.  0.5 ... 1.  0.  1. ]
 [0.  0.  0.  ... 0.  1.  0. ]
 [0.5 0.  0.5 ... 1.  0.  1. ]]
Calculated Demographic Matrix


In [3]:
# Important note: as soon as an algorithm uses a similarity measure, it should
# also allow the bsl_options parameter because of the pearson_baseline
# similarity. It can be done explicitly (e.g. KNNBaseline), or implicetely
# using kwargs (e.g. KNNBasic).

class SymmetricAlgo2(AlgoBase):
    """This is an abstract class aimed to ease the use of symmetric algorithms.
    A symmetric algorithm is an algorithm that can can be based on users or on
    items indifferently, e.g. all the algorithms in this module.
    When the algo is user-based x denotes a user and y an item. Else, it's
    reversed.
    """
    def __init__(self, sim_options={}, verbose=True, **kwargs):

        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)
        self.verbose = verbose
        self.isSemantic = sim_options["isSemantic"]
        self.isDemographic = sim_options["isDemographic"]
        self.isCr = sim_options["isCr"]
        self.printLimit = 10
        if(self.isCr):
            %store -r Cr
            self.Cr = Cr
            print(Cr[0], Cr[1])
        
        if(self.isDemographic):
           print('hihiihihi')

    def fit(self, trainset):

        AlgoBase.fit(self, trainset)

        ub = self.sim_options["user_based"]
        self.n_x = self.trainset.n_users if ub else self.trainset.n_items
        self.n_y = self.trainset.n_items if ub else self.trainset.n_users
        self.xr = self.trainset.ur if ub else self.trainset.ir
        self.yr = self.trainset.ir if ub else self.trainset.ur
                
        # Initilizating Semantic matrix
        if self.isSemantic:
            self.SemanticMatrix = np.zeros((self.n_x, self.n_x))
            for i in self.trainset.all_items():
                for j in self.trainset.all_items():
                    self.SemanticMatrix[i, j] = itemSemanticMatrix[int(self.trainset.to_raw_iid(i))-1, int(self.trainset.to_raw_iid(j))-1]
                    
        # Initilizating Demographic matrix
        if self.isDemographic:
            self.DemographicMatrix = np.zeros((self.n_x, self.n_x))
            for i in self.trainset.all_users():
                for j in self.trainset.all_users():
                    self.DemographicMatrix[i, j] = userDemoGraphicMatrix[int(self.trainset.to_raw_uid(i))-1, int(self.trainset.to_raw_uid(j))-1]
                    
        # Initilizating Cr matrix
        if self.isCr:
            self.CrMatrix = np.zeros((self.n_x, self.n_x))
            for i in self.trainset.all_users():
                #if(self.printLimit<20):
                #        print(self.Cr[int(self.trainset.to_raw_uid(i))-1], 'id:', int(self.trainset.to_raw_uid(i)))
                #        self.printLimit += 1
                for j in self.trainset.all_users():
                    self.CrMatrix[i, j] = self.Cr[int(self.trainset.to_raw_uid(i))-1]
                    if(i == j):
                        self.CrMatrix[i, j] = 1
        
        return self

    def switch(self, u_stuff, i_stuff):
        """Return x_stuff and y_stuff depending on the user_based field."""

        if self.sim_options["user_based"]:
            return u_stuff, i_stuff
        else:
            return i_stuff, u_stuff

class KNNBasic2(SymmetricAlgo2):
    """A basic collaborative filtering algorithm.
    Args:
        k(int): The (max) number of neighbors to take into account for
            aggregation (see :ref:`this note <actual_k_note>`). Default is
            ``40``.
        min_k(int): The minimum number of neighbors to take into account for
            aggregation. If there are not enough neighbors, the prediction is
            set to the global mean of all ratings. Default is ``1``.
        sim_options(dict): A dictionary of options for the similarity
            measure. See :ref:`similarity_measures_configuration` for accepted
            options.
        verbose(bool): Whether to print trace messages of bias estimation,
            similarity, etc.  Default is True.
    """

    def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs):

        SymmetricAlgo2.__init__(self, sim_options=sim_options, verbose=verbose, **kwargs)
        self.k = k
        self.min_k = min_k
        self.count = 0        

    def fit(self, trainset):

        SymmetricAlgo2.fit(self, trainset)

        if(self.isSemantic):
            self.sim = self.compute_similarities() *self.SemanticMatrix
            #self.sim = self.SemanticMatrix
        elif(self.isDemographic):
            self.sim = self.compute_similarities()*self.DemographicMatrix
        else:
            self.sim = self.compute_similarities()
                 
        #print(self.sim)
        return self

    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        x, y = self.switch(u, i)

        neighbors = [(self.sim[x, x2], r) for (x2, r) in self.yr[y]]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
        
        #print(neighbors)
        
        # compute weighted average
        sum_sim = sum_ratings = actual_k = 0
        for (sim, r) in k_neighbors:
            if sim > 0:
                sum_sim += sim
                sum_ratings += sim * r
                actual_k += 1

        if actual_k < self.min_k:
            raise PredictionImpossible("Not enough neighbors.")

        est = sum_ratings / sum_sim
        
        self.count += 1
        #print('Count-', self.count, ':x = ',x,'y = ', y)

        details = {"actual_k": actual_k}
        return est, details

#================================ KNNBaseline ==============================#

class KNNBaseline2(SymmetricAlgo2):
    """A basic collaborative filtering algorithm taking into account a *baseline* rating.
    Args:
        k(int): The (max) number of neighbors to take into account for
            aggregation (see :ref:`this note <actual_k_note>`). Default is
            ``40``.
        min_k(int): The minimum number of neighbors to take into account for
            aggregation. If there are not enough neighbors, the neighbor
            aggregation is set to zero (so the prediction ends up being
            equivalent to the baseline). Default is ``1``.
        sim_options(dict): A dictionary of options for the similarity
            measure. See :ref:`similarity_measures_configuration` for accepted
            options. It is recommended to use the :func:`pearson_baseline
            <surprise.similarities.pearson_baseline>` similarity measure.
        bsl_options(dict): A dictionary of options for the baseline estimates
            computation. See :ref:`baseline_estimates_configuration` for
            accepted options.
        verbose(bool): Whether to print trace messages of bias estimation,
            similarity, etc.  Default is True.
    """

    def __init__(
        self, k=40, min_k=1, sim_options={}, bsl_options={}, verbose=True, **kwargs
    ):

        SymmetricAlgo2.__init__(
            self,
            sim_options=sim_options,
            bsl_options=bsl_options,
            verbose=verbose,
            **kwargs
        )

        self.k = k
        self.min_k = min_k

    def fit(self, trainset):

        SymmetricAlgo2.fit(self, trainset)

        self.bu, self.bi = self.compute_baselines()
        self.bx, self.by = self.switch(self.bu, self.bi)
        
        if(self.isSemantic):
            self.sim = self.compute_similarities()*self.SemanticMatrix
        elif(self.isDemographic):
            #%store -r Cr
            self.sim = self.compute_similarities()*self.DemographicMatrix
            #print(self.sim.shape())
        else:
            self.sim = self.compute_similarities()

        return self

    def estimate(self, u, i):

        est = self.trainset.global_mean
        if self.trainset.knows_user(u):
            est += self.bu[u]
        if self.trainset.knows_item(i):
            est += self.bi[i]

        x, y = self.switch(u, i)

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            return est

        neighbors = [(x2, self.sim[x, x2], r) for (x2, r) in self.yr[y]]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[1])

        # compute weighted average
        sum_sim = sum_ratings = actual_k = 0
        for (nb, sim, r) in k_neighbors:
            if sim > 0:
                sum_sim += sim
                nb_bsl = self.trainset.global_mean + self.bx[nb] + self.by[y]
                sum_ratings += sim * (r - nb_bsl)
                actual_k += 1

        if actual_k < self.min_k:
            sum_ratings = 0

        try:
            est += sum_ratings / sum_sim
        except ZeroDivisionError:
            pass  # just baseline again

        details = {"actual_k": actual_k}
        return est, details

    
#================================ KNNWithMeans2 ==============================#

class KNNWithMeans2(SymmetricAlgo2):
    """A basic collaborative filtering algorithm, taking into account the mean
    ratings of each user.
    Args:
        k(int): The (max) number of neighbors to take into account for
            aggregation (see :ref:`this note <actual_k_note>`). Default is
            ``40``.
        min_k(int): The minimum number of neighbors to take into account for
            aggregation. If there are not enough neighbors, the neighbor
            aggregation is set to zero (so the prediction ends up being
            equivalent to the mean :math:`\\mu_u` or :math:`\\mu_i`). Default is
            ``1``.
        sim_options(dict): A dictionary of options for the similarity
            measure. See :ref:`similarity_measures_configuration` for accepted
            options.
        verbose(bool): Whether to print trace messages of bias estimation,
            similarity, etc.  Default is True.
    """

    def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs):

        SymmetricAlgo2.__init__(self, sim_options=sim_options, verbose=verbose, **kwargs)

        self.k = k
        self.min_k = min_k

    def fit(self, trainset):

        SymmetricAlgo2.fit(self, trainset)
        
        self.sim = self.compute_similarities()
        
        if (self.isCr):
            #print(self.sim[0][0:4])
            #print(self.CrMatrix[0][0:4])
            self.sim = self.sim * self.CrMatrix
            #print(self.sim[0][0:4])
        
        if(self.isSemantic):
            self.sim2 = self.SemanticMatrix #self.sim*self.SemanticMatrix
        elif(self.isDemographic):
            self.sim2 = self.DemographicMatrix
        else:
            self.sim2 =  self.sim
            
        self.means = np.zeros(self.n_x)
        for x, ratings in self.xr.items():
            self.means[x] = np.mean([r for (_, r) in ratings])   

        return self

    def estimate(self, u, i):
        
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        x, y = self.switch(u, i)

        est = self.means[x]
        est1 = 0
        est2 = 0
                  
        # =========== general CF calculation =========== #
        neighbors = [(x_others, self.sim[x, x_others], r) for (x_others, r) in self.yr[y]]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[1])
        
        # compute weighted average
        sum_sim = sum_ratings = actual_k = 0
        for (nb, sim, r) in k_neighbors:
            if sim > 0:
                sum_sim += sim
                sum_ratings += sim * (r - self.means[nb])
                actual_k += 1 
        
        if actual_k < self.min_k:
            sum_ratings = 0
        try:
            est1 += sum_ratings / sum_sim
        except ZeroDivisionError:
            pass  # return mean
        
        
        # =========== semantic or demographic based CF calculation =========== #
        if(self.isSemantic or self.isDemographic):
            neighbors2 = [(x_others, self.sim2[x, x_others], r) for (x_others, r) in self.yr[y]]
            k_neighbors2 = heapq.nlargest(self.k, neighbors2, key=lambda t: t[1])
           
            # compute weighted average
            sum_sim2 = sum_ratings2 = actual_k2 = 0
            for (nb2, sim2, r2) in k_neighbors2:
                if sim2 > 0:
                    sum_sim2 += sim2
                    sum_ratings2 += sim2 * (r2 - self.means[nb2])
                    actual_k2 += 1
                   
            if actual_k2 < self.min_k:
                sum_ratings2 = 0
            try:
                est2 += sum_ratings2 / sum_sim2
            except ZeroDivisionError:
                pass  # return mean
        
            if est1 == 0 and est2 != 0:
                est += est2
            elif est2 == 0 and est1 != 0:
                est += est1
            elif est1 + est2 == 0:
                if(est1 > est2):
                    est += est1
                else:
                    est += est2
            else:
                #est += (2 * est1 * est2)/(est1 + est2)
                est += (est1 + est2)/2
                #if(self.printLimit<20):
                #    print('est1: ', est1,'est2: ', est2, 'est1 + est2: ', est, 'est1 by only:', est + est1, 'est2 by only:', est + est2,)
                #    self.printLimit+=1
        
            details = {"actual_k": actual_k, "actual_k2": actual_k2}
        else:
            est += est1
            details = {"actual_k": actual_k}
         
        return est, details
    
class HybridRecommender(nn.Module):
    def __init__(self):
        super(HybridRecommender, self).__init__()
        self.fc = nn.Linear(2, 1)

    def forward(self, x):
        x = self.fc(x)
        return x

In [4]:
#========================================= Main functions starts here =============================================#

data = Dataset.load_builtin("ml-1m")

trainset, testset = train_test_split(data, test_size=0.05)


sim_options_forUserBased = {
    "name": "cosine",
    "user_based": True,  # compute  similarities between items
    "isSemantic": False,
    "isDemographic": False,
    "isCr": False
}

sim_options_forItemBased = {
    "name": "cosine",
    "user_based": False,  # compute  similarities between items
    "isSemantic": True,
    "isDemographic": False,
    "isCr": False
}

#algo = KNNBasic2(5, sim_options=sim_options)
#algo = KNNBaseline2(10, sim_options=sim_options)
#algo = KNNWithMeans2(10, sim_options=sim_options_forUserBased)
#cross_validate(algo, data, verbose=True)

# User-based CF
user_based_cf = KNNWithMeans2(40, sim_options=sim_options_forUserBased)
user_based_cf.fit(trainset)


# Item-based CF
item_based_cf = KNNWithMeans2(40, sim_options=sim_options_forItemBased)
item_based_cf.fit(trainset)

#user_based_pred = user_based_cf.test(testset)
#item_based_pred = item_based_cf.test(testset)

#print('User based:', accuracy.rmse(user_based_pred), accuracy.mae(user_based_pred))
#print('Item based:', accuracy.rmse(item_based_pred), accuracy.mae(item_based_pred))

print("Trained using User based and Item based ways")

Computing the cosine similarity matrix...
Done computing similarity matrix.


NameError: name 'itemSemanticMatrix' is not defined

In [97]:
model = HybridRecommender()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Prepare input data
inputs = []
targets = []
count = 0
for user, item, true_rating in trainset.all_ratings():
    uid = trainset.to_raw_uid(user)
    iid = trainset.to_raw_iid(item)
    user_based_pred = user_based_cf.predict(uid, iid).est
    item_based_pred = item_based_cf.predict(uid, iid).est
    
    #if(count < 10):
    #    print(trainset.to_raw_uid(user), trainset.to_raw_iid(item), true_rating, user_based_pred, item_based_pred ) 
    #    count += 1
        
    inputs.append([user_based_pred, item_based_pred])
    targets.append([float(true_rating)])

for d in range(10):
    print(inputs[d], targets[d])    

inputs = torch.tensor(inputs)
targets = torch.tensor(targets)


[2.6662694193880094, 2.712232066412871] [3.0]
[3.273006743015662, 3.262453833010792] [4.0]
[3.8090113976077475, 3.721872541865211] [4.0]
[2.953618856282781, 2.5387204215062913] [1.0]
[3.521026756202444, 3.562936717453293] [4.0]
[3.5327635374503403, 3.744584818917531] [4.0]
[3.0537295465103935, 3.0444891737334436] [3.0]
[3.2539109617930912, 3.2980736287512156] [4.0]
[3.068388254048331, 3.0124678252458126] [3.0]
[3.4428726800329255, 3.262218280870847] [3.0]


In [98]:
inputs = inputs.to(torch.float)
targets = targets.to(torch.float)

print(inputs.shape)
print(targets.shape)

torch.Size([95000, 2])
torch.Size([95000, 1])


In [99]:
# Train the model
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()


total_mse = 0
total_rmse = 0
total_mae = 0

for uid, iid, true_r in testset:
    test_user_based_pred = user_based_cf.predict(uid, iid).est
    test_item_based_pred = item_based_cf.predict(uid, iid).est
    
    test_input_data = torch.tensor([[test_user_based_pred, test_item_based_pred]])
    test_input_data = test_input_data.to(torch.float)
    test_predicted_rating = model(test_input_data).item()
    total_mse += (test_predicted_rating - true_r) ** 2
    
    diff = test_predicted_rating - true_r
    total_rmse += diff ** 2
    total_mae += abs(diff)

mse = total_mse / len(testset)
print(f"Mean Squared Error: {mse}")

n_testset = len(testset)
rmse = (total_rmse / n_testset) ** 0.5
mae = total_mae / n_testset

print("RMSE:", rmse)
print("MAE:", mae)

#print('User based:', accuracy.rmse(user_based_pred), accuracy.mae(user_based_pred))
#print('Item based:', accuracy.rmse(item_based_pred), accuracy.mae(item_based_pred))

Mean Squared Error: 0.8961436838519771
RMSE: 0.9466486591402203
MAE: 0.7499297465801239
