In [2]:
import pandas as pd
import numpy as np


### Downloading Dataset
- Following Script has been used to extract dataset into `csv` folder
- Make sure to install `go`, `git`, `gdown` and `7z`.
- `softwareengineering.stackexchange.com.7z` should also be present in same directory.
- The dataset is now directly read from `csv` folder. Kindly run the script before executing. 

```bash
gdown --id 19UhOq9Z5IVqM926cC3hvxcl726CTv-kT
mkdir xml csv
7z e softwareengineering.stackexchange.com.7z -oxml
git clone https://github.com/SkobelevIgor/stackexchange-xml-converter
cd stackexchange-xml-converter/
go build
./stackexchange-xml-converter -result-format=csv -source-path=../xml -store-to-dir=../csv
```

In [3]:
question_answer = pd.read_csv('csv/Posts.csv') # Posts.csv should be there in csv/ extracted using above code.
question_answer.set_index('Id', inplace=True)
answers = question_answer[question_answer['PostTypeId'] == 2]
questions = question_answer[question_answer['PostTypeId'] == 1]
# this observation has come from the fact that, PostTypeId == 2, have a parent Id, while those with 1 have answerCount field non empty, rest all ids are wikis etc.

In [4]:
# We are only counting the unique questions answered by a user, so we are using set here.
answerer = answers.groupby('OwnerUserId')['ParentId'].apply(set).to_dict()

# Computing question tags
question_tags = questions['Tags'].apply(lambda x: list(filter(lambda x: x != '', x.split('|'))))
exploded_tags = question_tags.explode()
tags = exploded_tags.value_counts()
count_tag = tags.to_dict()

In [5]:
answers['OwnerUserId'].isna().sum()
# 6822 answers are such that their owner is missing

6822

This is to count only unique answers

In [6]:
count_answers_per_user = {}
for key, val in answerer.items():
  count_answers_per_user[key] = len(val)

count_answers_per_user = dict(sorted(count_answers_per_user.items(), key=lambda x: x[1], reverse=True))

Converting TagNames to TagIds

In [7]:
tags_df = pd.read_csv('csv/Tags.csv')

In [8]:
# tags_df.set_index('Id', inplace=True)
tag_cols = tags_df.columns
req_tag_cols = ['Id', 'TagName']
drop_cols = list(filter(lambda x: x not in req_tag_cols, tag_cols))
tag_cols, drop_cols # we will drop drop cols

(Index(['Id', 'ExcerptPostId', 'WikiPostId', 'TagName', 'Count'], dtype='object'),
 ['ExcerptPostId', 'WikiPostId', 'Count'])

In [9]:
new_tags_df = tags_df.drop(drop_cols, axis=1)
new_tags_df.set_index('TagName', inplace=True)
tag_dict = new_tags_df.to_dict()['Id']
# some tags not there in the tags.csv file, don't know why :(
count_tag = {tag_dict[k]: v for k, v in count_tag.items() if k in tag_dict} 

In [10]:
answerer_table = pd.DataFrame(count_answers_per_user.items(), columns=['UserId', 'AnsweredQuestionCount'])
answerer_table.head()

Unnamed: 0,UserId,AnsweredQuestionCount
0,9113.0,2838
1,177980.0,2318
2,1204.0,2042
3,123788.0,1672
4,131624.0,1602


In [11]:
tags_table = pd.DataFrame(count_tag.items(), columns=['TagId', 'Count'])
tags_table.head()

Unnamed: 0,TagId,Count
0,609,5162
1,249,4931
2,76,4928
3,391,4449
4,790,3510


In [12]:
# getting top 3 most used tags
sorted_tags_table = tags_table.sort_values(by='Count', ascending=False)
sorted_tags_table.head(3)

Unnamed: 0,TagId,Count
0,609,5162
1,249,4931
2,76,4928


In [13]:
# TOP 3 answerers
sorted_answerer_table = answerer_table.sort_values(by='AnsweredQuestionCount', ascending=False)
sorted_answerer_table.head(3)

Unnamed: 0,UserId,AnsweredQuestionCount
0,9113.0,2838
1,177980.0,2318
2,1204.0,2042


# Second Part

In [14]:
threshold = 20
answerer_table = answerer_table[answerer_table['AnsweredQuestionCount'] >= threshold]
tags_table = tags_table[tags_table['Count'] >= threshold]

In [15]:
# user_tag_count wlll be a dictionary of dictionaries, with UserId as key and a dictionary of tags as value.
user_tag_count = {}
tags_table = tags_table.sort_values(['TagId'])
answerer_table = answerer_table.sort_values(['UserId'])
users = answerer_table['UserId'].values
tags = tags_table['TagId'].values
q_tag_map = question_tags.to_dict()


for user in users:
  tag_count = {tag : 0 for tag in tags}
  for question in answerer[user]:
    for tag in q_tag_map[question]:
      try:
        tag_count[tag_dict[tag]] += 1
      except KeyError:
        pass
  user_tag_count[user] = tag_count
del q_tag_map
del users
del tags # to free up memory

In [16]:
expert = pd.DataFrame.from_dict(user_tag_count, orient='index')
del user_tag_count

In [17]:
expert.head()

Unnamed: 0,1,3,4,7,8,9,11,12,13,14,...,4639,4646,4661,4682,4683,4687,4690,4704,4720,4750
4.0,13,0,6,6,61,55,8,3,0,0,...,0,0,0,2,1,1,0,0,1,0
6.0,0,0,8,0,6,4,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
11.0,1,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
14.0,0,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
15.0,1,0,2,1,4,4,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
expert.shape # 1160 users, 973 tags

(1160, 973)

# Third Part

In [19]:
expert = expert.map(lambda x: float(x//3) if x < 15 else 5.0)
# expert

In [20]:
expert_matrix = expert.to_numpy()
expert_shape = expert_matrix.shape
# expert

In [21]:
test_start = (int(0.85 * expert_shape[0] ), int(0.85 * expert_shape[1]))
test_users, test_tags = test_start
test_start
# As the number 0.85 * expert_shape[0] is not an integer, we can take its floor.
# Some may choose to take the ceiling, hence a difference of 1 is possible in the test_users, test_tags

(986, 827)

In [22]:
#! Not using a copy for test_matrix will lead to changes in expert_matrix as well, as numpy arrays are mutable
# This is a common mistake, but could lead to better answers ;) 
test_matrix = np.copy(expert_matrix[test_users:, test_tags:])
test_matrix.shape

(174, 146)

In [23]:
train_matrix = np.copy(expert_matrix)
train_matrix[test_users:, test_tags:] = 0

In [24]:
"""
Report the following about your utility matrix:
Summation value of the utility matrix
Highest row sum of the utility matrix
Highest column sum of the utility matrix

"""

sum_utility = np.nansum(expert_matrix)
highest_row_sum = np.nansum(expert_matrix, axis = 1).max()
highest_col_sum = np.nansum(expert_matrix, axis = 0).max()
sum_utility, highest_row_sum, highest_col_sum

(41180.0, 1162.0, 1403.0)

In [25]:
"""
Report the following for your train and test data
Summation value of the train matrix
Dimension of the test matrix
Summation value of test matrix
"""

sum_train = np.nansum(train_matrix)
dim_test = test_matrix.shape
sum_test = np.nansum(test_matrix)
sum_train, dim_test, sum_test


(40538.0, (174, 146), 642.0)

# Fourth Part

Class `CollaborativeFilter` is sort of an abstract class which will provide implementable methods for the actual collaborative filter classes. The actual collaborative filter classes will inherit from this class and implement the methods.

In [26]:
test_users_start, test_tags_start = test_start

In [27]:
from abc import ABC, abstractmethod
import numpy as np
class CollaborativeFilter():
  def __init__(self, utility_matrix, function='weighted'):
    self.utility_matrix = utility_matrix
    self.epsilon = 1e-9 # a small value to avoid division by zero
    
    self.final_rating_function = None
    if function == 'weighted':
      self.final_rating_function = self.weighted_average
    elif function=='regular_average':
      self.final_rating_function = self.average
    else:
      raise Exception("Function not allowed")
    
  
  def weighted_average(self, vector: np.ndarray, scores: np.ndarray):
    return np.dot(vector, scores)/(np.sum(scores) + self.epsilon)
  
  def average(self, vector: np.ndarray, scores: np.ndarray):
    return np.mean(vector)
    

  @abstractmethod
  def predict(self):
    pass
  
  
  @abstractmethod
  def compute_similarities(self):
    pass

`UserBasedCollaborativeFilter` class implements the collaborative filtering algorithm based on users. It will inherit from `CollaborativeFilter` and implement the methods.It computes the similarity between users based on the ratings they have given to the items. It then predicts the rating of a user for an item by taking `k` most similar users who have rated the item and computing the appropriate average rating.

In [28]:
class UserBasedCollaborativeFilter(CollaborativeFilter):
    def __init__(self,expert_matrix ,k , function):
      """
      utility_matrix : 2D numpy array
      k : int
      
      k is the number of similar users to consider for prediction
      utility_matrix is the matrix of user ratings, nan filled with 0
      """
      super().__init__(expert_matrix, function)
      self.k = k
      self.sim = dict()
        
    
    def compute_similarities(self, user_vector : np.ndarray, index: int):
      """
      user_vector : 1D numpy array
      """
      if index in self.sim:
        return self.sim[index]
      
      utility_rating_means = np.nanmean(self.utility_matrix, axis=0)
      user_rating_mean = np.nanmean(user_vector)

      utility_matrix_centered = self.utility_matrix - utility_rating_means
      
      user_vector_centered = user_vector - user_rating_mean
      
      user_vector_norm = np.sqrt(np.nansum(user_vector_centered**2))
      utility_matrix_norm = np.sqrt(np.nansum(utility_matrix_centered**2, axis=1))
      
      dot_product = np.nansum(utility_matrix_centered *user_vector_centered, axis = 1)

      similarities = dot_product/(user_vector_norm * utility_matrix_norm + self.epsilon)
      self.sim[index] = similarities
      return similarities
    
    def find_k_nearest_users(self, user_vector : np.ndarray, i: int, user_index: int):
      """
      user_vector : 1D numpy array
      
      returns the indices of the k most similar users to user_vector
      """
      
      similarities = self.compute_similarities(user_vector, user_index)
      users_to_consider = np.where(~np.isnan(self.utility_matrix[:, i]))[0]
      k = min(self.k, len(users_to_consider))
      sorted_indices = np.argsort(similarities[users_to_consider])[-k:]
      users_to_consider = users_to_consider[sorted_indices]
      similarities = similarities[users_to_consider]
      return similarities, users_to_consider
    
    
    def predict(self, user_vector : np.ndarray, i: int, user_index: int):
      """
      user_vector : 1D numpy array
      i : int
      
      i is the index of the item to predict the rating for
      """
      
      # find the k most similar users who rated this item 
      similarities, users_to_consider = self.find_k_nearest_users(user_vector, i, user_index)
      # similarity_scores = similarities[users_to_consider]
      
      # predict the rating for the item
      prediction = self.final_rating_function(self.utility_matrix[users_to_consider, i], similarities)
      return prediction
      

In [29]:
# to run tests on the model
def test_user_collaborator(train_matrix: np.ndarray, expert_matrix: np.ndarray, user_start: int, item_start: int, model: UserBasedCollaborativeFilter) -> float:
    rmse_loss = 0
    count = 0
    for i in range(user_start, expert_matrix.shape[0]):
        for j in range(item_start, expert_matrix.shape[1]):
            prediction = model.predict(train_matrix[i], j, i)
            # print(prediction, i, j)
            rmse_loss += (prediction - expert_matrix[i, j])**2
            count += 1
    rmse_loss = np.sqrt(rmse_loss/count)
    
    return rmse_loss


In [30]:
answers_dict = dict()

In [31]:
for i in [2, 3, 5]:
  user_cf = UserBasedCollaborativeFilter(train_matrix, i, 'regular_average')
  answers_dict[(i, 'user_regular_average')] = test_user_collaborator(train_matrix, expert_matrix, test_users_start, test_tags_start, user_cf)

In [32]:
for i in [2, 3, 5]:
  user_cf = UserBasedCollaborativeFilter(train_matrix, i, 'weighted')
  answers_dict[(i, 'user_weighted')] = test_user_collaborator(train_matrix, expert_matrix, test_users_start, test_tags_start, user_cf)

`ItemBasedCollaborativeFilter` class implements the collaborative filtering algorithm based on items. It will inherit from `CollaborativeFilter` and implement the methods.It computes the similarity between items based on the ratings given by the users. It then predicts the rating of a user for an item by taking `k` most similar items and computing the appropriate average rating.

In [33]:
class ItemBasedCollaborativeFilter(CollaborativeFilter):
    def __init__(self,expert_matrix ,k, function):
      """
      utility_matrix : 2D numpy array
      k : int
      
      k is the number of similar items to consider for prediction
      utility_matrix is the matrix of user ratings, nan filled with 0
      """
      super().__init__(expert_matrix, function)
      self.k = k
      self.sim = dict()
    
    def compute_similarities(self, item_vector : np.ndarray, index: int):
      """
      item_vector : 1D numpy array

      """
      if index in self.sim:
        return self.sim[index]
      utility_rating_means = np.nanmean(self.utility_matrix.T, axis=1)
      item_rating_mean = np.nanmean(item_vector)
      
      utility_matrix_centered = self.utility_matrix.T - utility_rating_means[:, np.newaxis]

      item_vector_centered = item_vector - item_rating_mean

      item_vector_norm = np.sqrt(np.nansum(item_vector_centered**2))
      utility_matrix_norm = np.sqrt(np.nansum(utility_matrix_centered**2, axis=1))

      dot_product = np.nansum(utility_matrix_centered *item_vector_centered, axis=1)

      similarities = dot_product/(item_vector_norm * utility_matrix_norm + self.epsilon)

      self.sim[index] = similarities

      return similarities
    
    
    def find_k_nearest_items(self, item_vector : np.ndarray, u: int, index: int):
      """
      item_vector : 1D numpy array
      
      returns the indices of the k most similar items to item_vector
      """
      
      similarities = self.compute_similarities(item_vector, index)
      items_to_consider = np.where(~np.isnan(self.utility_matrix[u]))[0]
      k = min(self.k, len(items_to_consider))
      sorted_indices = np.argsort(similarities[items_to_consider])[-k:]
      items_to_consider = items_to_consider[sorted_indices]
      similarities = similarities[items_to_consider]
      return similarities, items_to_consider
    
    
    def predict(self, item_vector : np.ndarray, u: int, index: int):
      """
      item_vector : 1D numpy array
      u : int
      u is the index of the user to predict the rating for
      """
      similarity_scores, items_to_consider = self.find_k_nearest_items(item_vector, u, index)
      
      # predict the rating for the item
      prediction = self.final_rating_function(self.utility_matrix[u, items_to_consider], similarity_scores)
      return prediction

In [34]:
# to run tests on the model
def test_item_item_filter(train_matrix: np.ndarray, expert_matrix: np.ndarray, user_start: int, item_start: int, model: ItemBasedCollaborativeFilter) -> float:
    rmse_loss = 0
    count = 0
    for i in range(user_start, expert_matrix.shape[0]):
        for j in range(item_start, expert_matrix.shape[1]):
            prediction = model.predict(train_matrix[:, j], i, j)
            # print(prediction, i, j)
            rmse_loss += (prediction - expert_matrix[i, j])**2
            count += 1
    rmse_loss = np.sqrt(rmse_loss/count)
    
    return rmse_loss

In [35]:
for i in [2, 3, 5]:
  item_cf = ItemBasedCollaborativeFilter(train_matrix, i, 'regular_average')
  answers_dict[(i, 'item_regular_average')] = test_item_item_filter(train_matrix, expert_matrix, test_users_start, test_tags_start, item_cf)

In [36]:
for i in [2, 3, 5]:
  item_cf = ItemBasedCollaborativeFilter(train_matrix, i, 'weighted')
  answers_dict[(i, 'item_weighted')] = test_item_item_filter(train_matrix, expert_matrix, test_users_start, test_tags_start, item_cf)

# Part 5

`LatentFactorDecomposition` uses matrix factorization/SVD to predict the ratings of users for items. It decomposes the user-item matrix into two matrices, one for users and one for items. It then predicts the rating of a user for an item by taking the dot product of the corresponding user and item vectors. Each user and item is represented by a vector of latent factors.

In [36]:
class LatentFactorDecomposition:
    def __init__(self, utility_matrix: np.ndarray, f: int, regp: int= 0, regq: int = 0):
        """
        utility_matrix: 2D numpy array, Expert matrix
        f: Number of Latent Factors
        regp: regularization rate over p
        regq: regularization rate over q
        """
        self.utility_matrix = utility_matrix
        self.f = f
        self.epsilon = 1e-9
        self.regp = regp
        self.regq = regq
        num_users, num_items = utility_matrix.shape
        self.P = np.random.randn(num_users, f)/np.sqrt(num_users*f)
        self.Q = np.random.randn(num_items, f)/np.sqrt(num_items*f)

    def predict(self, u, i):
        return np.dot(self.P[u], self.Q[i])

    def train(self, epochs = 1000, alpha = 0.0005):
        for epoch in range(epochs):
            pred_matrix = np.dot(self.P, self.Q.T)
            diffs = pred_matrix - self.utility_matrix
            loss = np.nansum(diffs**2) + self.regp*np.nansum(self.P**2) + self.regq*np.nansum(self.Q**2)

            p_reg_changes = 2*self.regp*self.P 
            q_reg_changes = 2*self.regq*self.Q


            diffs[np.isnan(diffs)] = 0
            # from observation each p[i,j] affects prediction of only user i for each item k. 
            # So, its GD will get changes for user i and each item k.
            p_rmse_changes = 2 * (np.dot(diffs, self.Q))
            # from observation each q[i,j] affects prediction of only item i for each user k. 
            # So, its GD will get changes for item i and each user k.
            q_rmse_changes = 2*(np.dot(diffs.T, self.P))

            self.P -= alpha*(p_rmse_changes + p_reg_changes)
            self.Q -= alpha*(q_rmse_changes + q_reg_changes)

            if epoch % 50 == 0:
                print(f"Epoch {epoch+1}/{epochs} : Loss = {loss}")
            
    def get_factors(self):
        return self.P, self.Q

In [37]:
# to run tests on the model
def test_lfd(model: LatentFactorDecomposition, expert_matrix: np.ndarray, user_start: int, item_start : int,f = 5):
    rmse_loss = 0
    count = 0
    for i in range(user_start, expert_matrix.shape[0]):
        for j in range(item_start, expert_matrix.shape[1]):
            prediction = model.predict(i, j)
            # print(prediction, i, j)
            rmse_loss += (prediction - expert_matrix[i, j])**2
            count += 1
    rmse_loss = np.sqrt(rmse_loss/count)
    return rmse_loss

In [38]:
for i in [2, 5, 10]:
  model = LatentFactorDecomposition(train_matrix, f = i)
  model.train(epochs=200)
  answers_dict[(i, 'SVD_Without')] = test_lfd(model, expert_matrix, test_users_start, test_tags_start)
  print(answers_dict[(i, 'SVD_Without')])

Epoch 1/200 : Loss = 108682.45423809403
Epoch 51/200 : Loss = 54548.79407066548
Epoch 101/200 : Loss = 50925.957594271145
Epoch 151/200 : Loss = 49350.73444101097
0.2285103252763705
Epoch 1/200 : Loss = 108682.46646106955
Epoch 51/200 : Loss = 54121.234991069454
Epoch 101/200 : Loss = 41586.83060232552
Epoch 151/200 : Loss = 36155.67962402595
0.21408036037841702
Epoch 1/200 : Loss = 108682.29243428483
Epoch 51/200 : Loss = 54330.75544438642
Epoch 101/200 : Loss = 38857.484483217304
Epoch 151/200 : Loss = 32282.319704725876
0.21306155966748108


In [39]:
for i in [2, 5, 10]:
  model = LatentFactorDecomposition(train_matrix, f = i, regp=0.001, regq=0.003)
  model.train(epochs=200)
  answers_dict[(i, 'SVD_With_0.001_0.003')] = test_lfd(model, expert_matrix, test_users_start, test_tags_start)
  print(answers_dict[(i, 'SVD_With_0.001_0.003')])

Epoch 1/200 : Loss = 108683.39604270893
Epoch 51/200 : Loss = 54448.67166662742
Epoch 101/200 : Loss = 47305.95672149307
Epoch 151/200 : Loss = 47124.285467495196
0.22091363150362173
Epoch 1/200 : Loss = 108682.33936755583
Epoch 51/200 : Loss = 54333.64864867695
Epoch 101/200 : Loss = 40333.4339028831
Epoch 151/200 : Loss = 36138.45988096543
0.2131792184625132
Epoch 1/200 : Loss = 108681.94093299267
Epoch 51/200 : Loss = 54240.34315062918
Epoch 101/200 : Loss = 38492.98363717051
Epoch 151/200 : Loss = 32780.67853778636
0.21557558741291857


In [40]:
for i in [2, 5, 10]:
  model = LatentFactorDecomposition(train_matrix, f = i, regp=0.05, regq=0.05)
  model.train(epochs=200)
  answers_dict[(i, 'SVD_With_0.05_0.05')] = test_lfd(model, expert_matrix, test_users_start, test_tags_start)
  print(answers_dict[(i, 'SVD_With_0.05_0.05')])

Epoch 1/200 : Loss = 108683.27350784901
Epoch 51/200 : Loss = 54241.24522602493
Epoch 101/200 : Loss = 47142.114327056915
Epoch 151/200 : Loss = 47128.09891241025
0.22145298235078978
Epoch 1/200 : Loss = 108681.74177835233
Epoch 51/200 : Loss = 54181.20487920439
Epoch 101/200 : Loss = 41491.689355902556
Epoch 151/200 : Loss = 36177.78008982759
0.21433817310380507
Epoch 1/200 : Loss = 108682.04215148147
Epoch 51/200 : Loss = 54033.50538528124
Epoch 101/200 : Loss = 38334.93551883678
Epoch 151/200 : Loss = 32244.997886542253
0.21469486749449163


In [41]:
for i in [2, 5, 10]:
  model = LatentFactorDecomposition(train_matrix, f = i, regp=0.5, regq=0.75)
  model.train(epochs=200)
  answers_dict[(i, 'SVD_With_0.5_0.75')] = test_lfd(model, expert_matrix, test_users_start, test_tags_start)
  print(answers_dict[(i, 'SVD_With_0.5_0.75')])

Epoch 1/200 : Loss = 108684.39489216299
Epoch 51/200 : Loss = 54798.230606556484
Epoch 101/200 : Loss = 48035.61950506613
Epoch 151/200 : Loss = 47562.070135485345
0.22206790184243405
Epoch 1/200 : Loss = 108683.57499126432
Epoch 51/200 : Loss = 54653.683362976444
Epoch 101/200 : Loss = 40209.25805401928
Epoch 151/200 : Loss = 37575.42367985966
0.21403378723754646
Epoch 1/200 : Loss = 108682.97417545425
Epoch 51/200 : Loss = 54290.531064528564
Epoch 101/200 : Loss = 39107.82921752086
Epoch 151/200 : Loss = 33591.80624163666
0.220214371145173


In [42]:
answers_dict

{(2, 'user_regular_average'): 0.2523897746165076,
 (3, 'user_regular_average'): 0.24193819276795364,
 (5, 'user_regular_average'): 0.2323844213545756,
 (2, 'user_weighted'): 0.2522930769032603,
 (3, 'user_weighted'): 0.2428522206789541,
 (5, 'user_weighted'): 0.23381228893778747,
 (2, 'item_regular_average'): 0.2519409772599003,
 (3, 'item_regular_average'): 0.25313168238934886,
 (5, 'item_regular_average'): 0.2597371586659773,
 (2, 'item_weighted'): 0.25207872457326164,
 (3, 'item_weighted'): 0.25250284085752034,
 (5, 'item_weighted'): 0.25014382125273715,
 (2, 'SVD_Without'): 0.2285103252763705,
 (5, 'SVD_Without'): 0.21408036037841702,
 (10, 'SVD_Without'): 0.21306155966748108,
 (2, 'SVD_With_0.001_0.003'): 0.22091363150362173,
 (5, 'SVD_With_0.001_0.003'): 0.2131792184625132,
 (10, 'SVD_With_0.001_0.003'): 0.21557558741291857,
 (2, 'SVD_With_0.05_0.05'): 0.22145298235078978,
 (5, 'SVD_With_0.05_0.05'): 0.21433817310380507,
 (10, 'SVD_With_0.05_0.05'): 0.21469486749449163,
 (2, 'SVD

## Part-6

In [43]:
!pip install scikit-learn scikit-surprise

Defaulting to user installation because normal site-packages is not writeable


In [44]:
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNBasic
from surprise import Dataset, SVD
from surprise import accuracy

Generating Train Data

In [45]:
dataset_df = pd.DataFrame(train_matrix)
long_df = dataset_df.stack().reset_index()
long_df.columns = ['user_id', 'item_id', 'rating']
# Define reader with rating scale (adjust if necessary)
reader = Reader(rating_scale=(0, 5))

# Load into Surprise dataset
train_data = Dataset.load_from_df(long_df, reader)
# Build the full training set
trainset = train_data.build_full_trainset()


Generating Test Data

In [46]:
# generating test data in format of surprise dataset
test_matrix = np.copy(expert_matrix)
test_df = pd.DataFrame(test_matrix)
test_long_df = test_df.stack().reset_index()
test_long_df.columns = ['user_id', 'item_id', 'rating']
test_df_final = test_long_df[(test_long_df['user_id'] >= test_users_start) & (test_long_df['item_id'] >= test_tags_start)].reset_index(drop=True)
test_data = Dataset.load_from_df(test_df_final, reader)
testset = [tuple(x) for x in test_df_final[['user_id', 'item_id', 'rating']].values]
testset[:10]

[(986.0, 827.0, 0.0),
 (986.0, 828.0, 0.0),
 (986.0, 829.0, 0.0),
 (986.0, 830.0, 0.0),
 (986.0, 831.0, 0.0),
 (986.0, 832.0, 0.0),
 (986.0, 833.0, 0.0),
 (986.0, 834.0, 0.0),
 (986.0, 835.0, 0.0),
 (986.0, 836.0, 0.0)]

In [47]:
def test_surprise_colaborative_filter(user_based=True,k =2):
    model = KNNBasic(sim_options = {'name': 'pearson', 'user_based': user_based}, k=k)
    model.fit(trainset)
    predictions = model.test(testset)
    return accuracy.rmse(predictions)

def test_surprise_svd():
    model = SVD()
    model.fit(trainset)
    predictions = model.test(testset)
    return accuracy.rmse(predictions)

In [48]:
surprise_dict= dict()

**Note:** This takes a lot of time to run, so commented out

In [49]:

# # testing user based collaborative filter
# print("Testing user based collaborative filter")
# for i in [2, 3, 5]:
#     surprise_dict[(i, 'user_based')] = test_surprise_colaborative_filter(user_based=True, k=i)
    
# # testing item based collaborative filter
# print("Testing item based collaborative filter")
# for i in [2, 3, 5]:
#     surprise_dict[(i, 'item_based')] = test_surprise_colaborative_filter(user_based=False, k=i)
    
# # testing SVD
# print("Testing SVD")
# for i in [2, 5, 10]:
#     surprise_dict[(i, 'SVD')] = test_surprise_svd()


Testing user based collaborative filter
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.2511
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.2409
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.2266
Testing item based collaborative filter
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.2521
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.2525
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.2501
Testing SVD
RMSE: 0.2247
RMSE: 0.2248
RMSE: 0.2247


In [50]:
for key, val in surprise_dict.items():
  print(key, val)

(2, 'user_based') 0.2511263447799646
(3, 'user_based') 0.2408922378736791
(5, 'user_based') 0.22657016945231917
(2, 'item_based') 0.25207872457284747
(3, 'item_based') 0.25250284085757
(5, 'item_based') 0.25014613646770295
(2, 'SVD') 0.22467881155312133
(5, 'SVD') 0.22479555494685505
(10, 'SVD') 0.22472219224050763
