In [40]:
import numpy as np
from numpy import random as rand

In [152]:
# Create a 2-dimensional n*m matrix filled with random numbers from the uniform distribution 
def create_matrix(n_rows, n_columns):
    scores_array = np.around(rand.uniform(1,10,size=(n_rows, n_columns)), decimals = 2)
    return scores_array
 

In [220]:
# Test function
scores = create_matrix(10,10)

print(scores)

[[5.08 3.82 5.66 4.65 1.25 5.92 9.75 4.24 3.75 7.9 ]
 [9.27 4.27 2.75 4.99 1.93 1.39 2.44 9.03 6.39 7.4 ]
 [6.21 9.99 8.   3.19 5.51 4.66 6.86 5.51 4.87 9.58]
 [5.42 7.55 4.72 2.98 5.37 9.92 8.05 5.61 3.46 9.63]
 [1.38 5.16 5.27 8.56 9.68 3.56 5.61 6.68 2.44 4.95]
 [4.82 8.23 6.98 1.13 5.98 1.39 3.44 9.45 7.19 2.7 ]
 [5.43 9.48 6.01 9.93 3.12 1.62 5.97 2.18 7.88 5.42]
 [7.07 9.7  7.15 4.41 8.01 4.65 9.29 7.67 5.94 7.92]
 [6.1  7.14 2.49 4.71 6.96 2.92 3.94 6.87 3.   6.59]
 [3.38 9.6  4.34 9.42 4.3  5.12 1.25 9.22 7.96 8.13]]


In [221]:
# from the array filled with numbers from uniform distribution, choose only a percent of the matrix
# and the rest of the percent, fill it in with nan(not a number) values, in order to predict them
def calculate_scores_for_prediction(scores_array, percent):
    scores_for_prediction = scores_array.copy()
    number_of_nans = int(np.round((1-percent)*scores_array.shape[0]*scores_array.shape[1]))
    choices = np.random.choice(scores_array.size, number_of_zeros, replace=False)
    scores_for_prediction.ravel()[choices] = np.nan
    
    return scores_for_prediction

In [222]:
# Test function
scores_predict = calculate_scores_for_prediction(scores, 0.75)

print(scores_predict)

[[ nan  nan  nan  nan  nan 5.92 9.75 4.24 3.75 7.9 ]
 [9.27  nan  nan 4.99  nan  nan  nan 9.03 6.39 7.4 ]
 [6.21 9.99 8.   3.19  nan 4.66 6.86 5.51  nan 9.58]
 [5.42 7.55 4.72 2.98 5.37 9.92 8.05 5.61 3.46 9.63]
 [1.38 5.16  nan 8.56 9.68 3.56 5.61 6.68 2.44 4.95]
 [4.82  nan 6.98  nan 5.98 1.39  nan 9.45 7.19 2.7 ]
 [5.43 9.48 6.01  nan 3.12 1.62 5.97 2.18 7.88 5.42]
 [ nan 9.7  7.15 4.41 8.01 4.65 9.29 7.67  nan  nan]
 [6.1  7.14 2.49  nan 6.96 2.92 3.94 6.87  nan  nan]
 [3.38 9.6  4.34  nan 4.3  5.12 1.25 9.22 7.96  nan]]


In [223]:
def Jaccard_Similarity(matrix, column_01, column_02):
    # Create sets from x and y by removing the nan values
    x = {i for i in matrix[:,column_01] if not np.isnan(i)}
    y = {i for i in matrix[:,column_02] if not np.isnan(i)}
    
    # Calculate Jaccard Similarity
    intersection = len(set(x) & set(y))
    union = len(set(x) | set(y))
    return intersection/float(union)

In [224]:
# Test function
Jaccard_Similarity(scores_predict[:,0], scores_predict[:,4])

TypeError: Jaccard_Similarity() missing 1 required positional argument: 'column_02'

In [225]:
def Dice_Similarity(matrix, column_01, column_02):
    # Create sets from x and y by removing the nan values
    x = {i for i in matrix[:,column_01] if not np.isnan(i)}
    y = {i for i in matrix[:,column_02] if not np.isnan(i)}
    
    # Calculate Dice Similarity
    intersection = len(set(x) & set(y))
    return (2*intersection)/(len(x) + len(y))

In [226]:
# Test function
Dice_Similarity(scores_predict, 0, 10)

IndexError: index 10 is out of bounds for axis 1 with size 10

In [227]:
def Cosine_Similarity(matrix,column_01, column_02):
    
    # Get the 2 columns from matrix
    x = matrix[:, column_01]
    y = matrix[:, column_02]
    
    # Calculate the dot product, ignoring nan values
    dot_product = np.nansum(x*y)
    
    # calculate the norm of the x and y columns, ignoring nan values
    x_norm = np.linalg.norm(x[~np.isnan(x)])
    y_norm = np.linalg.norm(y[~np.isnan(y)])
    
    # return the result
    return dot_product/(x_norm*y_norm)

In [228]:
# Test function
Cosine_Similarity(scores_predict, 0, 3)

0.5015890985730859

In [184]:
def Adjusted_Cosine_Similarity(matrix,column_01, column_02):
    # Calculate the mean of each row, ignoring the nan values
    row_mean = np.nanmean(matrix,axis=1)
    
    # Get the 2 columns from matrix
    x = matrix[:,column_01]
    y = matrix[:,column_02]
    
    # substract each element of the 2 columns by their row average
    x_adjusted = x - row_mean
    y_adjusted = y - row_mean
    
    # create a copy of the initial scores matrix prediction and 
    # set the new columns with the adjusted ones
    copy_matrix = np.array(matrix)
    copy_matrix[:,column_01] = x_adjusted
    copy_matrix[:,column_02] = y_adjusted
    
    # Calculate cosine similarity on the adjusted columns
    return Cosine_Similarity(copy_matrix, column_01, column_02)

In [185]:
# Test function
Adjusted_Cosine_Similarity(scores_predict, 0,3)

-0.25612211110411365

In [292]:
# this function predicts the scores in the scores matrix filled with a certain percent of values, using a certain method
# of calculation (such as simple average, weighted average or hybrid prediction), a certain method of prediction
# such as(cosine or adjusted cosine) and a certain value of k
def predict_scores_in_matrix(scores_predict, scores_actual, method_prediction, method_calculation, k):
    absolute_error = 0
    number_of_nans = 0
    # loop through the 2d matrix
    for idx, x in np.ndenumerate(scores_predict):
        # if the value is not a number, then calculate the predicted value and find the absolute error of the predicted value
        if np.isnan(x):
            predicted_score = predict_score_in_place(idx[0], idx[1], scores_predict, method_prediction, method_calculation, k)
            actual_score = scores_actual[idx[0],idx[1]]
            absolute_error += np.absolute(predicted_score - actual_score)
            number_of_nans += 1
    # it returns the Mean Absolute Error of the predicted values
    return absolute_error/number_of_nans            
            

In [293]:
predict_scores_in_matrix(scores_predict,scores, method_prediction = 'Cosine', method_calculation = 'Simple Average', k = 7)

2.6386857142857143

In [265]:
# this function predicts the score of a certain value in the scores matrix. It takes as argument the row and the column, where
# the score is located at the score matrix, the score matrix, the method prediction (such as simple average, weighted average or hybrid prediction), 
# the method of prediction such as(cosine or adjusted cosine) and a certain value of k
def predict_score_in_place(n_row, n_col, score_matrix, method_prediction, method_calculation, k):
    # for each item(column) in the score matrix,except the column index where the predicted item is located
    # calculate the similarities between the column, where the item is located, and the other columns of 
    # the score matrix
    similarities = []
    for j in range(len(score_matrix[n_row])):
        current_column = j
        if current_column == n_col:
            continue
        else:
            similarity = choose_comparision_method(method_prediction,score_matrix, current_column, n_col)
            similarities.append((similarity, current_column))
    if method_calculation == 'Simple Average':
        return Simple_Average(score_matrix, similarities, n_row, k)
    elif method_calculation == 'Weighted Average':
        return Weighted_Average(score_matrix, similarities, n_row, k)
    elif method_calculation == 'Hybrid Prediction':
        return Hybrid_Prediction(score_matrix, similarities, n_row, k)

In [266]:
# calculate the similarity of 2 columns based on a certain similarity method(e.g Dice, Cosine, ..) 
def choose_comparision_method(name_of_method, matrix, index_column_01, index_column_02):
    if name_of_method == 'Jaccard':
        return Jaccard_Similarity(matrix, index_column_01, index_column_02)
    elif name_of_method == 'Dice':
        return Dice_Similarity(matrix, index_column_01, index_column_02)
    elif name_of_method == 'Cosine':
        return Cosine_Similarity(matrix, column_01, column_02)
    elif name_of_method == 'Adjusted Cosine':
        return Adjusted_Cosine_Similarity(matrix, column_01, column_02)

In [268]:
# This fuction is the upper function of the item based collcaborative filtering. It takes as arguments the number of rows and 
# columns of the matrix, the percentage of the known values in the score matrix, the method of prediction(like Cosine), 
# the method of calculation(like Simple Average), the k value, and the iterations of the program
def item_to_item_collaborative_filtering(n_rows, n_cols, percent, method_prediction, method_calculation, k, n_iters):
    # Create a matrix based on the dimensions the user wants
    scores_matrix = create_matrix(n_rows, n_cols)
    
    sum = 0
    # for each iteration of the program, fill the matrix with certain values based on the percent the user wants, predict the
    # scores and calculate the mean absolute error of the predictions
    for i in range(n_iters):
        scores_to_predict = calculate_scores_for_prediction(scores_matrix, percent)
        mean_absolute_error = predict_scores_in_matrix(scores_to_predict, scores_matrix,method_prediction,method_calculation, k)
        print("For iteration ", i+1, " the mae is: ", mean_absolute_error)
        sum += mean_absolute_error
        
    # return the average of all mean absolute errors
    average = sum/n_iters
    print("Average MAE is: ", average)

In [291]:
    item_to_item_collaborative_filtering(50,50,0.5,'Adjusted Cosine','Weighted Average',10,20)

For iteration  1  the mae is:  2.422485748903161
For iteration  2  the mae is:  2.583177071635511
For iteration  3  the mae is:  2.384330050184875
For iteration  4  the mae is:  2.4433550841405074
For iteration  5  the mae is:  2.4475166838021454
For iteration  6  the mae is:  2.4379857795028994
For iteration  7  the mae is:  2.3732925070152255
For iteration  8  the mae is:  2.3865821801421956
For iteration  9  the mae is:  2.3907864900506803
For iteration  10  the mae is:  2.380703018639413
For iteration  11  the mae is:  2.4293142709824473
For iteration  12  the mae is:  2.389242989307957
For iteration  13  the mae is:  2.388593175741769
For iteration  14  the mae is:  2.409971944016976
For iteration  15  the mae is:  2.301152835689793
For iteration  16  the mae is:  2.372427945633334
For iteration  17  the mae is:  2.3269433784188545
For iteration  18  the mae is:  2.3665396304336594
For iteration  19  the mae is:  2.33101426791522
For iteration  20  the mae is:  2.363910456292621
A

In [269]:
def Simple_Average(score_predict, similarity_matrix, row, k):
    # similarity matrix is an array with tuple elements of the form: (similarity_value, column_index)
    #Sort the similarity matrix based on the similarity value of item
    similarity_matrix.sort(key=lambda x:x[0], reverse=True)
    
    # Calculate the simple average with the k values that
    # have the highest score prediction value. If k < number_of_not_nan_values,
    # then calculate the simple average of the number_of_not_nan_values
    sum_scores = 0
    
    # i is for counting all the loops inside while loop
    i = 0
    
    # j is for counting the loops, where score is not nan
    j = 0
    
    # for the k values that are not nan with the highest prediction score,
    # calculate the simple average
    while j < k:
        column = similarity_matrix[i][1]
        score = score_predict[row][column]
        
        # check if a score is not nan,
        # then add score to the variable sum_scores
        if not np.isnan(score):
            sum_scores += score
            j += 1;
        i += 1
        
        # if i is higher or equal than the number of columns of scores matrix, break out of the loop
        if i >= score_predict.shape[1] - 1:
            break
    return sum_scores/j

In [270]:
def Weighted_Average(score_predict, similarity_matrix, row, k):
    # similarity matrix is an array with tuple elements of the form: (similarity_value, column_index)
    #Sort the similarity matrix based on the similarity value of item
    similarity_matrix.sort(key=lambda x:x[0], reverse=True)
    
    # Calculate the weighted average with the k values that
    # have the highest score prediction value and use the score prediction
    # values as weights for the weighted average calculation. If k < number_of_not_nan_values,
    # then calculate the weighted average of the number_of_not_nan_values
    sum_scores = 0
    sum_weights = 0
    
    # i is for counting all the loops inside while loop
    i = 0
    
    # j is for counting the loops, where score is not nan
    j = 0
    
    # for the k values that are not nan with the highest prediction score,
    # calculate the weighted average
    while j < k:
        column = similarity_matrix[i][1]
        weight = similarity_matrix[i][0]
        score = score_predict[row][column]
        
        # check if a score is not equal to nan,
        # then add weighted_score and weight to their variables sums
        if not np.isnan(score):
            weighted_score = weight*score
            sum_scores += weighted_score
            sum_weights += weight
            j += 1
        i += 1
        
        # if i is higher or equal than the number of columns of scores matrix, break out of the loop
        if i>=score_predict.shape[1] - 1:
            break
    return sum_scores/sum_weights

In [271]:
def Hybrid_Prediction(score_predict, similarity_matrix, row, k):
    simple_average = Simple_Average(score_predict, similarity_matrix, row, k);
    weighted_average = Weighted_Average(score_predict, similarity_matrix, row, k)
    return (simple_average + weighted_average)/2