In [143]:
import pandas as pd
import numpy as np
import json
import ast
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.metrics import ndcg_score

In [2]:
# Load the OkCupid profiles, tag every row with a sequential integer ID
# (inserted as the first column) and keep only the first 5000 profiles.
dataset = pd.read_csv("dataset/okcupid_profile_data.csv")
dataset.insert(loc=0, column='ID', value=range(len(dataset)))
dataset = dataset.iloc[:5000]

# Number of profiles for every (orientation, sex) combination.
group_counts = dataset.groupby(by=['orientation', 'sex']).size()
print(group_counts)

orientation  sex
bisexual     f        96
             m        32
gay          f        68
             m       189
straight     f      1810
             m      2805
dtype: int64


In [3]:
# Load the precomputed compatibility scores. Entries are looked up by the
# stringified user ID and hold parallel 'index' / 'score' lists.
with open("dataset/compatibility_5k.json") as f:
    comp_scores = json.load(f)

def get_dict(row):
    """Return the compatibility dictionary for one profile row.

    The precomputed entry is looked up in ``comp_scores`` under the
    stringified ``row['ID']``. When no entry exists, a random placeholder
    with 50 candidate indices and 50 scores is returned instead.

    Parameters:
    - row: Mapping-like object (e.g. a DataFrame row) exposing an 'ID' entry.

    Returns:
    - The stored ``comp_scores`` entry, or a placeholder dict with 'index'
      and 'score' lists of length 50.
    """
    key = str(row['ID'])
    if key in comp_scores:
        return comp_scores[key]

    # Fallback only: build the placeholder lazily, and draw candidate indices
    # from the IDs that actually exist in `dataset` (0 .. len(dataset) - 1).
    # The previous hard-coded 0..9999 range produced indices that fall outside
    # the rating matrices, which are sized from the (5000-row) dataset.
    max_id = len(dataset) - 1
    index = [random.randint(0, max_id) for _ in range(50)]
    score = [random.random() for _ in range(50)]
    return {'index': index, 'score': score}

# Attach each profile's compatibility dictionary as a new column.
dataset['top_compatible'] = dataset.apply(get_dict, axis=1)
# Keep only the profiles whose compatibility dictionary is not empty.
test_dataset = dataset.loc[dataset['top_compatible'] != {}]
len(test_dataset)

5000

In [5]:
# Two independent square (user x user) matrices of zeros, one row/column per
# profile in test_dataset.
og_rating_matrix = np.zeros((len(test_dataset),) * 2)
rating_matrix = np.zeros_like(og_rating_matrix)

In [6]:
# Populate the ORIGINAL compatibility matrix
# Every (user, candidate) pair listed in 'top_compatible' is written in both
# directions so the matrix stays symmetric.
for idx, row in test_dataset.iterrows():
    user_id = row['ID']
    compatibilities = row['top_compatible']

    for compatible_id, score in zip(compatibilities['index'], compatibilities['score']):
        og_rating_matrix[user_id, compatible_id] = score
        og_rating_matrix[compatible_id, user_id] = score

# A user is never their own match: clear the diagonal.
np.fill_diagonal(og_rating_matrix, 0)

print(og_rating_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [7]:
# Populate the compatibility matrix TO USE
# Only the first 10 candidates of every user are written (symmetrically); the
# remaining ones exist only in og_rating_matrix.
for idx, row in test_dataset.iterrows():
    user_id = row['ID']
    compatibilities = row['top_compatible']

    for compatible_id, score in zip(compatibilities['index'][:10], compatibilities['score'][:10]):
        rating_matrix[user_id, compatible_id] = score
        rating_matrix[compatible_id, user_id] = score

# A user is never their own match: clear the diagonal.
np.fill_diagonal(rating_matrix, 0)

print(rating_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [8]:
def predict_ratings(user_item_matrix, target_user_index, X, n_neighbors=20, n_recommendations=10):
    """
    Recommend items for one user with a modified k-NN approach that skips the
    X closest neighbors before aggregating ratings.

    Parameters:
    - user_item_matrix: 2D numpy array with rows as users and columns as items.
      Cell values are ratings; values <= 0 are treated as "not rated".
    - target_user_index: Row index of the user to produce recommendations for.
    - X: Number of closest neighbors to exclude (0 keeps the closest ones).
    - n_neighbors: Number of neighbors used for prediction after excluding the first X.
    - n_recommendations: Maximum number of recommendations to return.

    Returns:
    - top_recommendations: dict {item_index: predicted_rating} with at most
      n_recommendations entries, ordered by predicted rating (highest first).
      Only items the target user has not rated and that received a positive
      prediction are included. Keys and values are plain Python int / float so
      the dict's text form stays parseable by ast.literal_eval after a CSV
      round trip.

    Raises:
    - ValueError: if n_neighbors + X is not smaller than the number of users.
    """
    # Ensure n_neighbors is less than the number of users minus the ones we skip
    total_users = user_item_matrix.shape[0]
    if n_neighbors + X >= total_users:
        raise ValueError("n_neighbors + X must be less than the total number of users.")

    # One extra neighbor is requested because the query row is itself part of
    # the fitted data and is normally returned as its own nearest neighbor.
    # The check above guarantees enough rows exist, so kneighbors always
    # returns exactly n_neighbors + X + 1 columns.
    nbrs = NearestNeighbors(n_neighbors=n_neighbors + X + 1, metric='euclidean')
    nbrs.fit(user_item_matrix)
    distances, indices = nbrs.kneighbors(user_item_matrix[target_user_index].reshape(1, -1))
    distances = distances.ravel()
    indices = indices.ravel()

    # Drop the target user by identity rather than by position: when another
    # row is identical to the target's (distance 0) the target is not
    # guaranteed to come first. Then skip the X closest remaining neighbors.
    self_index = target_user_index % total_users  # normalizes negative indices
    not_self = indices != self_index
    relevant_indices = indices[not_self][X:X + n_neighbors]
    relevant_distances = distances[not_self][X:X + n_neighbors]

    # Inverse-distance weights; the small epsilon avoids division by zero.
    weights = 1.0 / (relevant_distances + 1e-5)

    # Aggregate all items at once. Rows are neighbors, columns are items, and
    # only neighbors that actually rated an item (> 0) contribute to it.
    neighbor_ratings = user_item_matrix[relevant_indices]
    rated = neighbor_ratings > 0
    column_weights = weights[:, np.newaxis]
    total_rating = (np.where(rated, neighbor_ratings, 0.0) * column_weights).sum(axis=0)
    total_weight = np.where(rated, column_weights, 0.0).sum(axis=0)

    # Predict only items the target has not rated and that at least one
    # relevant neighbor rated; everything else stays at 0.
    unrated_by_target = ~(user_item_matrix[target_user_index] > 0)
    predictable = unrated_by_target & (total_weight > 0)
    predicted_ratings = np.zeros(user_item_matrix.shape[1])
    predicted_ratings[predictable] = total_rating[predictable] / total_weight[predictable]

    # Rank the positive predictions, highest first. The stable sort keeps
    # tied items in ascending item-index order.
    candidates = np.flatnonzero(predicted_ratings > 0)
    order = np.argsort(-predicted_ratings[candidates], kind='stable')
    top_items = candidates[order][:n_recommendations]

    top_recommendations = {int(item): float(predicted_ratings[item]) for item in top_items}

    return top_recommendations

In [22]:
# Serendipity run for user 0: skip the 10 closest neighbours, use the next 60
# and keep the 50 best predictions.
top_recommendations_s = predict_ratings(
    rating_matrix,
    target_user_index=0,
    X=10,
    n_neighbors=60,
    n_recommendations=50,
)  # with serendipity
print(top_recommendations_s)

{2546: 0.4000639336588939, 3326: 0.3785120093869527, 1919: 0.3754953822642921, 510: 0.374913654250061, 819: 0.37453978153230283, 3383: 0.37417344990423734, 1448: 0.3696381774739505, 1838: 0.36533397642703225, 3125: 0.35977615932354995, 2705: 0.359621148457998, 109: 0.3574145057269234, 2479: 0.3574145057269234, 2213: 0.35703479932482507, 4342: 0.35703479932482507, 3917: 0.35620211294930565, 4650: 0.3559664367735224, 2199: 0.35564546750558135, 3467: 0.35534909083091554, 3041: 0.35358170309784437, 2170: 0.35353446301304725, 272: 0.3532579671606447, 1477: 0.3512783648513138, 3185: 0.3505350389173542, 695: 0.3499487828207367, 2042: 0.3498448191532034, 3796: 0.3494616328578646, 799: 0.3494280274653218, 1446: 0.34939669916315946, 3238: 0.34791394088549166, 4790: 0.347351546618459, 3718: 0.34663523937574797, 1735: 0.3457003427932689, 3429: 0.345649229721008, 4534: 0.3456322884048714, 1869: 0.345462190031814, 3: 0.34528835716008743, 1417: 0.34527009344247217, 1469: 0.34500745713087494, 825: 0.3

In [26]:
# Same user, both settings: 20 recommendations each.
# With serendipity: skip the 10 closest neighbours, use the next 30.
top_recommendations_s = predict_ratings(
    rating_matrix, target_user_index=0, X=10, n_neighbors=30, n_recommendations=20
)
print(top_recommendations_s)
# Without serendipity: use the 20 closest neighbours directly.
top_recommendations = predict_ratings(
    rating_matrix, target_user_index=0, X=0, n_neighbors=20, n_recommendations=20
)
print(top_recommendations)

{3326: 0.3785120093869527, 819: 0.37453978153230283, 3383: 0.37417344990423734, 1448: 0.3696381774739505, 3917: 0.3611838390725039, 2705: 0.359621148457998, 3467: 0.3582110318668064, 109: 0.3574145057269234, 2479: 0.3574145057269234, 2213: 0.35703479932482507, 4342: 0.35703479932482507, 2199: 0.35564546750558135, 1477: 0.35401440089687874, 3041: 0.35358170309784437, 2170: 0.35353446301304725, 391: 0.35294117647058826, 694: 0.35294117647058826, 3185: 0.35110594645247517, 695: 0.3499487828207367, 2042: 0.3498448191532034}
{3917: 0.38406434084614316, 3185: 0.3824058831861537, 3308: 0.37944040479394364, 2170: 0.3787072866550178, 799: 0.3786993745854057, 1446: 0.3786694732152871, 3041: 0.378593836500585, 3326: 0.3785120093869527, 819: 0.37453978153230283, 3383: 0.37417344990423734, 241: 0.3737901799469377, 2119: 0.37358730671843837, 3999: 0.37358730671843837, 911: 0.37339899484968553, 273: 0.369915619158906, 1448: 0.3696381774739505, 1477: 0.36516397145269286, 3467: 0.3648941138971157, 223:

In [32]:
# Recommendations WITH serendipity for every user: skip the 10 closest
# neighbours, use the next 30 and keep the top 20 predictions.
results_serendipity = {
    i: predict_ratings(rating_matrix, i, 10, 30, 20)
    for i in range(len(test_dataset))
}

# The column has to be called 'scores': calc_rmse, calc_ndcg_at_k and
# calc_precision_at_k read it back from the saved CSV as `row.scores`, so any
# other name makes a fresh Restart & Run All fail with AttributeError.
results_serendipity_df = pd.DataFrame(results_serendipity.items(), columns=['ID', 'scores'])
results_serendipity_df.head()

Unnamed: 0,ID,scores
0,0,"{3326: 0.3785120093869527, 819: 0.374539781532..."
1,1,"{3468: 0.40364603521503745, 2989: 0.4028731378..."
2,2,"{4342: 0.3767480246888364, 2213: 0.37569636824..."
3,3,"{4448: 0.36025621693398535, 3245: 0.3547693906..."
4,4,"{1254: 0.38124031124909735, 3176: 0.3803370972..."


In [31]:
# Recommendations WITHOUT serendipity for every user: use the 20 closest
# neighbours directly and keep the top 20 predictions.
results_wo_serendipity = {
    i: predict_ratings(rating_matrix, i, 0, 20, 20)
    for i in range(len(test_dataset))
}

# The column has to be called 'scores_wo_serendipity': calc_rmse,
# calc_ndcg_at_k and calc_precision_at_k read it back from the saved CSV as
# `row.scores_wo_serendipity`, so any other name makes a fresh
# Restart & Run All fail with AttributeError.
results_wo_serendipity_df = pd.DataFrame(results_wo_serendipity.items(), columns=['ID', 'scores_wo_serendipity'])
results_wo_serendipity_df.head()

Unnamed: 0,ID,scores_wo_serendipity
0,0,"{3917: 0.38406434084614316, 3185: 0.3824058831..."
1,1,"{2857: 0.37889702227935546, 2409: 0.3774439090..."
2,2,"{347: 0.37504157037857033, 1597: 0.37298791244..."
3,3,"{381: 0.3875333601955412, 1828: 0.386575334678..."
4,4,"{2560: 0.3812612864740654, 1254: 0.38124031124..."


In [67]:
# One record per user in comp_scores: the integer ID plus a
# {candidate_id: score} mapping built from the parallel 'index'/'score' lists.
rows = [
    {
        'ID': int(key),
        'Original_Compatibility_Scores': dict(zip(value['index'], value['score'])),
    }
    for key, value in comp_scores.items()
]

# Create DataFrame
original_score_df = pd.DataFrame(rows)
print(original_score_df)

        ID                      Original_Compatibility_Scores
0        0  {3916: 0.404574517131177, 786: 0.4005645957215...
1        1  {896: 0.40955449736165245, 2127: 0.40863726632...
2        2  {4264: 0.4048764780749057, 4745: 0.40487647807...
3        3  {1827: 0.36888572989623986, 4930: 0.3652856970...
4        4  {2328: 0.40846110821772325, 2261: 0.4084468473...
...    ...                                                ...
4995  4995  {34: 0.34947783312687497, 3578: 0.349306202124...
4996  4996  {3317: 0.3801343320274302, 2244: 0.37996413459...
4997  4997  {2448: 0.4089048901031154, 477: 0.404827391525...
4998  4998  {1238: 0.40453592405476707, 2017: 0.4045077953...
4999  4999  {587: 0.3798193194696908, 804: 0.3791391765241...

[5000 rows x 2 columns]


In [52]:
# Join both recommendation tables with the original scores on the user ID and
# persist the combined table for the evaluation cells.
merged_df = (
    results_serendipity_df
    .merge(results_wo_serendipity_df, on='ID')
    .merge(original_score_df, on='ID')
)

merged_df.to_csv("dataset/recommendations_all.csv", index=False)

In [78]:
# Reload the saved recommendations. The dictionary columns come back as text
# and are parsed with ast.literal_eval by the metric functions.
scores_df = pd.read_csv("dataset/recommendations_all.csv")
scores_df.head(5)

Unnamed: 0,ID,scores,scores_wo_serendipity,Original_Compatibility_Scores
0,0,"{3326: 0.3785120093869527, 819: 0.374539781532...","{3917: 0.38406434084614316, 3185: 0.3824058831...","{3916: 0.404574517131177, 786: 0.4005645957215..."
1,1,"{3468: 0.40364603521503745, 2989: 0.4028731378...","{2857: 0.37889702227935546, 2409: 0.3774439090...","{896: 0.40955449736165245, 2127: 0.40863726632..."
2,2,"{4342: 0.3767480246888364, 2213: 0.37569636824...","{347: 0.37504157037857033, 1597: 0.37298791244...","{4264: 0.4048764780749057, 4745: 0.40487647807..."
3,3,"{4448: 0.36025621693398535, 3245: 0.3547693906...","{381: 0.3875333601955412, 1828: 0.386575334678...","{1827: 0.36888572989623986, 4930: 0.3652856970..."
4,4,"{1254: 0.38124031124909735, 3176: 0.3803370972...","{2560: 0.3812612864740654, 1254: 0.38124031124...","{2328: 0.40846110821772325, 2261: 0.4084468473..."


In [141]:
def calc_rmse(scores_df, no_of_recs):
    """
    Compute the RMSE between predicted and original compatibility scores for
    the recommendations made with and without serendipity.

    Parameters:
    - scores_df: DataFrame with the text columns 'Original_Compatibility_Scores',
      'scores' and 'scores_wo_serendipity'; each cell is the literal form of a
      {candidate_id: score} dict.
    - no_of_recs: Maximum number of recommendations evaluated per user (the
      first no_of_recs entries of each prediction dict).

    Returns:
    - (rmse_with_serendipity, rmse_without_serendipity): square root of the
      mean of the per-user MSE values. A candidate missing from the original
      scores counts as an original score of 0.0. NaN when no user has any
      prediction to evaluate.
    """
    def _row_mse(predicted, original):
        # MSE over the predictions actually present. Dividing by a fixed
        # no_of_recs would understate the error of users that received fewer
        # recommendations.
        evaluated = list(predicted.items())[:no_of_recs]
        if not evaluated:
            return None
        squared_diff_sum = 0
        for candidate_id, predict in evaluated:
            squared_diff_sum += (predict - original.get(candidate_id, 0.0)) ** 2
        return squared_diff_sum / len(evaluated)

    def _pooled_rmse(mse_values):
        # Users without predictions carry no error information and are left
        # out of the average instead of being counted as a perfect 0.
        kept = [mse for mse in mse_values if mse is not None]
        if not kept:
            return float('nan')
        return np.sqrt(np.mean(kept))

    mse_values_spty = []
    mse_values_wo_spty = []
    for row in scores_df.itertuples():
        original_scores = ast.literal_eval(row.Original_Compatibility_Scores)
        predicted_scores_spty = ast.literal_eval(row.scores)
        predicted_scores_wo_spty = ast.literal_eval(row.scores_wo_serendipity)

        mse_values_spty.append(_row_mse(predicted_scores_spty, original_scores))
        mse_values_wo_spty.append(_row_mse(predicted_scores_wo_spty, original_scores))

    return _pooled_rmse(mse_values_spty), _pooled_rmse(mse_values_wo_spty)

In [174]:
# RMSE over the 20 recommendations stored per user, for both settings.
rmse_spty, rmse_wo_spty = calc_rmse(scores_df, no_of_recs=20)
print("RMSE with serendipity = ", rmse_spty)
print("RMSE without serendipity = ", rmse_wo_spty)

RMSE with serendipity =  0.2757859597632517
RMSE without serendipity =  0.265034329036201


In [168]:
def calc_ndcg_at_k(scores_df, k):
    """
    Average NDCG of the top-k recommendations made with and without serendipity.

    For every user the first k recommended candidates are taken in stored
    order. Their original compatibility scores (0.0 when the candidate is
    absent from the original scores) are the true relevances, and the
    predicted scores define the ranking that is evaluated.

    Parameters:
    - scores_df: DataFrame with the text columns 'Original_Compatibility_Scores',
      'scores' and 'scores_wo_serendipity'; each cell is the literal form of a
      {candidate_id: score} dict.
    - k: Number of leading recommendations to evaluate per user.

    Returns:
    - (average_ndcg_with_serendipity, average_ndcg_without_serendipity).

    NOTE: the ideal ordering is computed among the k recommended candidates
    only, not among all candidates present in the original scores.
    """
    def _row_ndcg(predicted, original):
        top = list(predicted.items())[:k]
        relevance = [original.get(candidate_id, 0.0) for candidate_id, _ in top]
        ranking_scores = [score for _, score in top]

        # NDCG over fewer than two documents is degenerate (and newer
        # scikit-learn versions refuse to compute it): a single relevant
        # recommendation is a perfect list, anything else scores 0.
        if len(top) < 2:
            return 1.0 if relevance and relevance[0] > 0 else 0.0

        # ndcg_score(y_true, y_score): y_true holds the true relevances and
        # y_score the predicted scores that induce the ranking. The previous
        # code passed the sorted relevances as y_true and the unsorted
        # relevances as y_score, so the predicted scores were never used.
        return ndcg_score(np.asarray([relevance]), np.asarray([ranking_scores]))

    ndcg_spty = []
    ndcg_wo_spty = []
    for row in scores_df.itertuples():
        original_scores = ast.literal_eval(row.Original_Compatibility_Scores)
        predicted_scores_spty = ast.literal_eval(row.scores)
        predicted_scores_wo_spty = ast.literal_eval(row.scores_wo_serendipity)

        ndcg_spty.append(_row_ndcg(predicted_scores_spty, original_scores))
        ndcg_wo_spty.append(_row_ndcg(predicted_scores_wo_spty, original_scores))

    average_ndcg_spty = np.mean(ndcg_spty)
    average_ndcg_wo_spty = np.mean(ndcg_wo_spty)

    return average_ndcg_spty, average_ndcg_wo_spty


In [176]:
# NDCG@k for k = 5, 10, 15 and 20, each reported with and without serendipity.
ndcg_spty, ndcg_wo_spty = calc_ndcg_at_k(scores_df, 5)
print("Print ndcg@5 with serendipity =", ndcg_spty)
print("Print ndcg@5 without serendipity  =", ndcg_wo_spty)

ndcg_spty, ndcg_wo_spty = calc_ndcg_at_k(scores_df, 10)
print("Print ndcg@10 with serendipity  =", ndcg_spty)
print("Print ndcg@10 without serendipity  =", ndcg_wo_spty)

ndcg_spty, ndcg_wo_spty = calc_ndcg_at_k(scores_df, 15)
print("Print ndcg@15 with serendipity  =", ndcg_spty)
print("Print ndcg@15 without serendipity  =", ndcg_wo_spty)

ndcg_spty, ndcg_wo_spty = calc_ndcg_at_k(scores_df, 20)
print("Print ndcg@20 with serendipity  =", ndcg_spty)
print("Print ndcg@20 without serendipity  =", ndcg_wo_spty)

[0.3, 0.2, 0, 1]
[1, 1, 0, 1]


In [181]:
def calc_precision_at_k(scores_df, k):
    """
    Average precision@k of the recommendations made with and without serendipity.

    A recommended candidate counts as a hit when its original compatibility
    score is greater than 0 (candidates absent from the original scores count
    as 0). Per user, the hits among the first k recommendations are divided
    by k, and the per-user values are averaged.

    Parameters:
    - scores_df: DataFrame with the text columns 'Original_Compatibility_Scores',
      'scores' and 'scores_wo_serendipity'; each cell is the literal form of a
      {candidate_id: score} dict.
    - k: Number of leading recommendations to evaluate per user.

    Returns:
    - (average_precision_with_serendipity, average_precision_without_serendipity).
    """
    def _hit_rate(recommended, truth):
        # 1 for every leading candidate with a positive original score.
        hits = [1 if truth.get(candidate, 0) > 0 else 0 for candidate in list(recommended)[:k]]
        return sum(hits) / k

    per_user_spty = []
    per_user_wo_spty = []
    for record in scores_df.itertuples():
        truth = ast.literal_eval(record.Original_Compatibility_Scores)
        recommended_spty = ast.literal_eval(record.scores)
        recommended_wo_spty = ast.literal_eval(record.scores_wo_serendipity)

        per_user_spty.append(_hit_rate(recommended_spty, truth))
        per_user_wo_spty.append(_hit_rate(recommended_wo_spty, truth))

    return np.mean(per_user_spty), np.mean(per_user_wo_spty)


In [182]:
# Precision@k for k = 5, 10, 15 and 20, each reported with and without serendipity.
prec_spty, prec_wo_spty = calc_precision_at_k(scores_df, 5)
print("Print prec@5 with serendipity =", prec_spty)
print("Print prec@5 without serendipity  =", prec_wo_spty)

prec_spty, prec_wo_spty = calc_precision_at_k(scores_df, 10)
print("Print prec@10 with serendipity  =", prec_spty)
print("Print prec@10 without serendipity  =", prec_wo_spty)

prec_spty, prec_wo_spty = calc_precision_at_k(scores_df, 15)
print("Print prec@15 with serendipity  =", prec_spty)
print("Print prec@15 without serendipity  =", prec_wo_spty)

prec_spty, prec_wo_spty = calc_precision_at_k(scores_df, 20)
print("Print prec@20 with serendipity  =", prec_spty)
print("Print prec@20 without serendipity  =", prec_wo_spty)

Print prec@5 with serendipity = 0.50388
Print prec@5 without serendipity  = 0.58116
Print prec@10 with serendipity  = 0.45498
Print prec@10 without serendipity  = 0.5241399999999999
Print prec@15 with serendipity  = 0.40954666666666667
Print prec@15 without serendipity  = 0.46588
Print prec@20 with serendipity  = 0.37051000000000006
Print prec@20 without serendipity  = 0.4154
