In [234]:
import pandas as pd
import numpy as np
import json
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import ndcg_score

In [235]:
# Load the profile table and tag every row with a positional ID (0..n-1) before truncating,
# so the IDs of the retained rows are exactly 0..4999.
dataset = pd.read_csv("dataset/okcupid_profile_data.csv")
dataset.insert(0, 'ID', range(len(dataset)))

# Keep the first 5000 profiles. head() hands back a slice of the full frame; .copy() detaches
# it so that columns assigned to `dataset` in later cells are written to an independent frame
# instead of triggering pandas' SettingWithCopyWarning.
dataset = dataset.head(5000).copy()

# Sanity check: number of profiles per (orientation, sex) combination.
group_counts = dataset.groupby(['orientation', 'sex']).size()
print(group_counts)

orientation  sex
bisexual     f        96
             m        32
gay          f        68
             m       189
straight     f      1810
             m      2805
dtype: int64


In [236]:
# Load the precomputed compatibility lists. The lookups below treat the JSON as an object
# keyed by the profile ID rendered as a string, each value holding parallel 'index' and
# 'score' lists.
comp_scores_path = "dataset/compatibility_5k.json"
with open(comp_scores_path) as f:
    comp_scores = json.load(f)

def get_dict(row):
    """Return the compatibility entry for ``row['ID']``.

    If the ID is present in ``comp_scores`` its stored entry is returned unchanged.
    Otherwise a placeholder with 50 random partner indices and 50 random scores in [0, 1)
    is fabricated. Partner indices are drawn from ``0 .. len(dataset) - 1`` so that they are
    always valid positions in the user-by-user matrices built from this column (the previous
    upper bound of 9999 could exceed the number of loaded profiles).

    NOTE(review): the placeholder is random and unseeded, so any row that falls back to it
    makes the notebook non-reproducible -- confirm whether missing IDs should be dropped
    instead.
    """
    key = str(row['ID'])
    if key in comp_scores:
        return comp_scores[key]

    n_users = len(dataset)
    return {
        'index': [random.randrange(n_users) for _ in range(50)],
        'score': [random.random() for _ in range(50)],
    }

# Adding new column with corresponding dictionary
dataset['top_compatible'] = dataset.apply(get_dict, axis=1)
# Rows whose compatibility entry is an empty dict are left out of the evaluation set.
test_dataset = dataset[dataset['top_compatible'] != {}]
len(test_dataset)

5000

In [237]:
# Preview the first five profiles together with their attached compatibility entries.
test_dataset.head(n=5)

Unnamed: 0,ID,age,status,sex,orientation,body_type,diet,drinks,drugs,education,...,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,top_compatible
0,0,35,single,m,straight,fit,,socially,,graduated from masters program,...,i do the corporate tech industry grind at an a...,what am i not good at? i am a jack of all trad...,i'm going to go with humor. or that i am just ...,"film: robert altman, coen brothers, terry gill...","well, when the world goes into economic and po...","art/film projects, the world's politics and ec...","lounging around the house watching tv/movie, e...","well, it's really not that private, but i weav...",if the above seems interesting and you fit at ...,"{'index': [3916, 786, 1818, 2134, 3736, 1896, ..."
1,1,27,single,m,straight,athletic,mostly anything,socially,never,graduated from masters program,...,right now i'm working on my ph.d. in stanford....,listening to people. learning new skills. tole...,i got a twin brother who also does his ph.d. a...,books: generally investigative journalism movi...,music friends sunshine ice cream sports iphone,...how to make people laugh ...how to make the...,...up for anything but staying at home!,ask me.,you wanna know more about me :),"{'index': [896, 2127, 3259, 4845, 2003, 3534, ..."
2,2,31,single,m,straight,fit,,socially,never,working on masters program,...,finishing graduate school. will be going for a...,"cooking italian/roman specialties, translating...","easy going, white teeth, nice shoes.","movies: bill and ted's excellent adventure, th...",family and friends strong coffee music beads o...,it might sound cliche but its true: how to liv...,ideally a live music show or having dinner wit...,have three visible scars on my body.,"you want to leisurely enjoy sf, want to get a ...","{'index': [4264, 4745, 2284, 4537, 4812, 587, ..."
3,3,28,single,f,straight,average,,not at all,never,graduated from college/university,...,i've been in real estate for most of my career...,"salsa dancing, hula hooping, singing, dancing ...",that i can get along and find something in com...,"books: too many to list! angela's ashes, ready...",lucy the greatest dog ever live music stimulat...,"the present, the next fun thing, my career, my...","salsa dancing or dancing with friends, going t...","when i'm in a really good mood, or if im in a ...",you aren't at all sneaky creepy ... you're pas...,"{'index': [1827, 4930, 4695, 1012, 2359, 2115,..."
4,4,24,single,f,straight,average,mostly anything,socially,,graduated from college/university,...,well i just got a job working as a program spe...,directions! my friends and family joke that i'...,,"books: harry potter, nicholas sparks books (gu...",family friends books music i'll come up with t...,the power of our thoughts. and now that i just...,probably enjoying a quiet night to wind down f...,,~ you made it this far ~ you're looking for so...,"{'index': [2328, 2261, 3617, 2149, 4212, 2684,..."


In [246]:
# Two square, zero-initialised matrices with one row and one column per profile in
# test_dataset. Later cells address them by the 'ID' column.
matrix_shape = (len(test_dataset), len(test_dataset))
og_rating_matrix = np.zeros(matrix_shape)
rating_matrix = np.zeros(matrix_shape)

In [247]:
# Populate the ORIGINAL compatibility matrix
# Every (user, partner, score) pair from the full compatibility list is written in both
# directions, so the matrix stays symmetric; a later write to the same pair overwrites an
# earlier one.
for user_id, compat in zip(test_dataset['ID'], test_dataset['top_compatible']):
    for partner_id, score in zip(compat['index'], compat['score']):
        og_rating_matrix[user_id, partner_id] = score
        og_rating_matrix[partner_id, user_id] = score

# Clear self-matches on the diagonal.
np.fill_diagonal(og_rating_matrix, 0)

print(og_rating_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [248]:
# Populate the compatibility matrix TO USE
# Same symmetric fill as for the original matrix, but only the first 10 (index, score)
# pairs of each profile's compatibility list are written.
for _, profile in test_dataset.iterrows():
    uid = profile['ID']
    top_compatible = profile['top_compatible']
    leading_pairs = list(zip(top_compatible['index'], top_compatible['score']))[:10]

    for other_id, value in leading_pairs:
        rating_matrix[uid, other_id] = value
        rating_matrix[other_id, uid] = value

# Clear self-matches on the diagonal.
np.fill_diagonal(rating_matrix, 0)

print(rating_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [249]:
def predict_ratings(user_item_matrix, target_user_index, X, n_neighbors=20, n_recommendations=10):
    """
    Recommend items for one user using a modified k-NN approach that excludes the
    first X closest neighbors.

    The target user's row is used as the query against every row of ``user_item_matrix``
    (Euclidean distance). The first X + 1 hits -- the query row itself, which is expected to
    come back first at distance 0, plus the X closest neighbors -- are discarded. The
    remaining ``n_neighbors`` rows predict a rating for every item the target user has not
    rated yet, as the inverse-distance-weighted mean of the neighbors' ratings for that item.

    Parameters:
    - user_item_matrix: 2D numpy array with rows as users and columns as items. Cell values are
      ratings; values <= 0 are treated as "not rated".
    - target_user_index: Index of the target user for whom we want recommendations.
    - X: Number of closest neighbors to exclude.
    - n_neighbors: Number of neighbors to consider for prediction after excluding the first X.
    - n_recommendations: Maximum number of items to return.

    Returns:
    - top_recommendations: dict mapping item index -> predicted rating, with at most
      n_recommendations entries ordered from highest to lowest predicted rating (ties keep
      ascending item order). Items the target user has already rated, and items that none of
      the considered neighbors rated, never appear.

    Raises:
    - ValueError: if n_neighbors + X is not less than the total number of users.
    """

    # Ensure n_neighbors is less than the number of users minus the ones we skip
    total_users = user_item_matrix.shape[0]
    if n_neighbors + X >= total_users:
        raise ValueError("n_neighbors + X must be less than the total number of users.")

    # Request X + 1 extra hits so that n_neighbors remain after dropping the query row and
    # the X closest neighbors. The guard above guarantees enough rows exist, so kneighbors
    # always returns exactly n_neighbors + X + 1 hits.
    nbrs = NearestNeighbors(n_neighbors=n_neighbors+X+1, metric='euclidean')
    nbrs.fit(user_item_matrix)
    distances, indices = nbrs.kneighbors(user_item_matrix[target_user_index].reshape(1, -1))

    # Exclude the first X+1 closest hits to skip the user itself and the X closest neighbors
    relevant_indices = indices.flatten()[X+1:]
    relevant_distances = distances.flatten()[X+1:]

    # Use inverse distance as weight, add a small epsilon to avoid division by zero
    weights = 1.0 / (relevant_distances + 1e-5)

    # Shape (neighbors, items). Only strictly positive neighbor ratings take part in the mean.
    neighbor_ratings = user_item_matrix[relevant_indices]
    rated_mask = neighbor_ratings > 0
    weighted_sum = (np.where(rated_mask, neighbor_ratings, 0.0) * weights[:, None]).sum(axis=0)
    weight_sum = (rated_mask * weights[:, None]).sum(axis=0)

    # Items no neighbor rated keep a prediction of 0 (the `where` mask skips the division).
    predicted_ratings = np.zeros(user_item_matrix.shape[1])
    np.divide(weighted_sum, weight_sum, out=predicted_ratings, where=weight_sum > 0)

    # Never recommend something the target user has already rated.
    predicted_ratings[user_item_matrix[target_user_index] > 0] = 0

    # Rank the items that received a positive prediction; the stable sort keeps ties in
    # ascending item order.
    candidates = np.flatnonzero(predicted_ratings > 0)
    ranking = candidates[np.argsort(-predicted_ratings[candidates], kind='stable')]

    top_recommendations = {int(item): predicted_ratings[item] for item in ranking[:n_recommendations]}

    return top_recommendations

In [250]:
# "With serendipity": for user 0, skip the 10 closest neighbours, let the next 60 vote and
# keep at most 50 recommendations.
top_recommendations_s = predict_ratings(
    rating_matrix,
    target_user_index=0,
    X=10,
    n_neighbors=60,
    n_recommendations=50,
)
print(top_recommendations_s)

{2546: 0.4000639336588939, 3326: 0.3785120093869527, 1919: 0.3754953822642921, 510: 0.374913654250061, 819: 0.37453978153230283, 3383: 0.37417344990423734, 1448: 0.3696381774739505, 1838: 0.36533397642703225, 3125: 0.35977615932354995, 2705: 0.359621148457998, 109: 0.3574145057269234, 2479: 0.3574145057269234, 2213: 0.35703479932482507, 4342: 0.35703479932482507, 3917: 0.35620211294930565, 4650: 0.3559664367735224, 2199: 0.35564546750558135, 3467: 0.35534909083091554, 3041: 0.35358170309784437, 2170: 0.35353446301304725, 272: 0.3532579671606447, 1477: 0.3512783648513138, 3185: 0.3505350389173542, 695: 0.3499487828207367, 2042: 0.3498448191532034, 3796: 0.3494616328578646, 799: 0.3494280274653218, 1446: 0.34939669916315946, 3238: 0.34791394088549166, 4790: 0.347351546618459, 3718: 0.34663523937574797, 1735: 0.3457003427932689, 3429: 0.345649229721008, 4534: 0.3456322884048714, 1869: 0.345462190031814, 3: 0.34528835716008743, 1417: 0.34527009344247217, 1469: 0.34500745713087494, 825: 0.3

In [251]:
# "Without serendipity": for user 50, skip no neighbours, let the 50 closest vote and keep at
# most 50 recommendations.
top_recommendations = predict_ratings(
    rating_matrix,
    target_user_index=50,
    X=0,
    n_neighbors=50,
    n_recommendations=50,
)
print(top_recommendations)

{4333: 0.3501470369662893, 3513: 0.3492951459540673, 4636: 0.34672659592062116, 2777: 0.3446542240320253, 4532: 0.3445031600243194, 224: 0.3408351351873404, 3293: 0.3408153876012288, 4320: 0.34078743267035644, 1342: 0.3356494204788103, 3777: 0.3356494204788103, 1215: 0.33215511148766386, 3426: 0.33125164376843896, 1145: 0.3310354424773807, 853: 0.32616146820164194, 4692: 0.32561999629119426, 878: 0.32553511728887413, 4539: 0.3242889687689726, 1247: 0.3240515574884495, 1549: 0.323830378057959, 3858: 0.32380071813011385, 1739: 0.3237509448829102, 3377: 0.3237255417049516, 3353: 0.32371824481111844, 2001: 0.32369547182524433, 4480: 0.32369547182524433, 3263: 0.3229990326073339, 2064: 0.3215041276957525, 4296: 0.3151357922562135, 1719: 0.3146404835900575, 1924: 0.3142116271497224, 1015: 0.3137254901960785, 2781: 0.30653312183688114, 2071: 0.2991696925988605, 3720: 0.2978865880765604, 2125: 0.2975606179015286, 4970: 0.296803020017712, 4279: 0.29624988814402203, 4345: 0.29604966432867397, 31

In [None]:
def calculate_serendipity(threshold, n_exclude=0):
    """
    Fraction of "unexpected" k-NN recommendations that are also "useful", over all users.

    For every user (row of the module-level ``rating_matrix``):
    - the baseline list is the 50 highest strictly positive entries of that user's row in
      ``og_rating_matrix``;
    - ``predict_ratings`` is run on ``rating_matrix`` with 50 neighbors and 50
      recommendations, skipping the ``n_exclude`` closest neighbors;
    - a recommendation counts as "unexpected" when its item also appears in the baseline
      list, and as "useful" when, in addition, its predicted rating is >= ``threshold``.

    NOTE(review): serendipity metrics usually call a recommendation "unexpected" when it is
    NOT in the baseline list; this code requires membership. Behavior kept as is -- confirm
    the intended definition.

    Parameters:
    - threshold: Minimum predicted rating for an unexpected recommendation to count as useful.
    - n_exclude: Number of closest neighbors that predict_ratings should skip.

    Returns:
    - srdp: len(useful) / len(unexpected) accumulated over all users, or 0.0 when no
      recommendation overlaps any baseline list (the ratio is undefined in that case).

    Side effects: prints the user index every 100 users as a progress indicator, and prints
    the final value.
    """
    baseline_size = 50
    n_neighbors = 50
    n_recommendations = 50

    unexp = []
    useful_in_unexp = []
    for user in range(len(rating_matrix)):
        if user % 100 == 0:
            print(user)

        # Baseline: the user's highest-scored partners in the full compatibility matrix.
        pop_recs = {i: r for i, r in enumerate(og_rating_matrix[user]) if r > 0}
        pop_recs_sorted = dict(
            sorted(pop_recs.items(), key=lambda item: item[1], reverse=True)[:baseline_size]
        )

        # k-NN recommendations computed from the reduced matrix.
        top_recommendations = predict_ratings(
            rating_matrix, user, n_exclude, n_neighbors, n_recommendations
        )

        unexp_temp = [[x, top_recommendations[x]] for x in top_recommendations if x in pop_recs_sorted]

        # Honor the caller's threshold (it used to be ignored in favor of a fixed 0.356).
        useful_in_unexp_temp = [x[0] for x in unexp_temp if x[1] >= threshold]

        unexp.extend(unexp_temp)
        useful_in_unexp.extend(useful_in_unexp_temp)

    # Guard against an empty overlap instead of raising ZeroDivisionError.
    srdp = len(useful_in_unexp) / len(unexp) if unexp else 0.0
    print(srdp)
    return srdp
    

In [None]:
# Serendipity run with threshold 0.356 and no neighbours skipped.
srdp_1 = calculate_serendipity(threshold=0.356, n_exclude=0)
print(srdp_1)

0
100
200
300
400
500
600
700
800
900


In [None]:
# Serendipity run with threshold 0.350 and the 5 closest neighbours skipped.
srdp_2 = calculate_serendipity(threshold=0.350, n_exclude=5)
print(srdp_2)