In [2]:
import pandas as pd
import json
import random
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD
from surprise import accuracy

In [46]:
# random sampling of data: To be done only once

# df = pd.read_csv("okcupid_profiles.csv")
# weights = {
#     ('straight', 'm'): 1,
#     ('straight', 'f'): 1,
#     ('gay', 'm'): 0.5,
#     ('gay', 'f'): 0.5,
#     ('bisexual', 'm'): 0.5,
#     ('bisexual', 'f'): 0.5
# }
# n = 10000
# # Calculate the weight for each row based on orientation and sex
# df['weight'] = df.apply(lambda row: weights.get((row['orientation'], row['sex']), 0), axis=1)

# # Sample n rows with replacement based on the weights
# sampled_df = df.sample(n=n, replace=True, weights='weight')

# # Drop the weight column
# sampled_df.drop(columns=['weight'], inplace=True)

# sampled_df.count()
# group_counts = sampled_df.groupby(['orientation', 'sex']).size()
# print(group_counts)
# # sampled_df.to_csv('okcupid_profile_data.csv',index=False)

In [3]:
dataset = pd.read_csv("okcupid_profile_data.csv")
dataset.insert(0, 'ID', range(0, len(dataset)))

dataset.head()
dataset.count()
dataset['orientation'].unique()
dataset.isnull().sum()

group_counts = dataset.groupby(['orientation', 'sex']).size()
print(group_counts)
# dataset['sex'].unique()

orientation  sex
bisexual     f       178
             m        69
gay          f       142
             m       387
straight     f      3676
             m      5548
dtype: int64


In [4]:
# adding possible matches according to sex and orientation : not required

males = list(dataset[dataset['sex'] == 'm']["ID"])
females = list(dataset[dataset['sex'] == 'f']["ID"])
both = males + females

dataset['possible_match_ids'] = dataset.apply(lambda row:  
                                         females if row['sex'] == 'm' and row['orientation'] == 'straight' 
                                         else (males if row['sex'] == 'f' and row['orientation'] == 'straight'
                                            else (males if row['sex'] == 'm' and row['orientation'] == 'gay'
                                                else (females if row['sex'] == 'f' and row['orientation'] == 'gay'
                                                    else both      
                                                     )))
                                         , axis=1)
dataset.head()

Unnamed: 0,ID,age,status,sex,orientation,body_type,diet,drinks,drugs,education,...,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,possible_match_ids
0,0,35,single,m,straight,fit,,socially,,graduated from masters program,...,i do the corporate tech industry grind at an a...,what am i not good at? i am a jack of all trad...,i'm going to go with humor. or that i am just ...,"film: robert altman, coen brothers, terry gill...","well, when the world goes into economic and po...","art/film projects, the world's politics and ec...","lounging around the house watching tv/movie, e...","well, it's really not that private, but i weav...",if the above seems interesting and you fit at ...,"[3, 4, 9, 10, 11, 12, 14, 17, 18, 19, 21, 24, ..."
1,1,27,single,m,straight,athletic,mostly anything,socially,never,graduated from masters program,...,right now i'm working on my ph.d. in stanford....,listening to people. learning new skills. tole...,i got a twin brother who also does his ph.d. a...,books: generally investigative journalism movi...,music friends sunshine ice cream sports iphone,...how to make people laugh ...how to make the...,...up for anything but staying at home!,ask me.,you wanna know more about me :),"[3, 4, 9, 10, 11, 12, 14, 17, 18, 19, 21, 24, ..."
2,2,31,single,m,straight,fit,,socially,never,working on masters program,...,finishing graduate school. will be going for a...,"cooking italian/roman specialties, translating...","easy going, white teeth, nice shoes.","movies: bill and ted's excellent adventure, th...",family and friends strong coffee music beads o...,it might sound cliche but its true: how to liv...,ideally a live music show or having dinner wit...,have three visible scars on my body.,"you want to leisurely enjoy sf, want to get a ...","[3, 4, 9, 10, 11, 12, 14, 17, 18, 19, 21, 24, ..."
3,3,28,single,f,straight,average,,not at all,never,graduated from college/university,...,i've been in real estate for most of my career...,"salsa dancing, hula hooping, singing, dancing ...",that i can get along and find something in com...,"books: too many to list! angela's ashes, ready...",lucy the greatest dog ever live music stimulat...,"the present, the next fun thing, my career, my...","salsa dancing or dancing with friends, going t...","when i'm in a really good mood, or if im in a ...",you aren't at all sneaky creepy ... you're pas...,"[0, 1, 2, 5, 6, 7, 8, 13, 15, 16, 20, 22, 23, ..."
4,4,24,single,f,straight,average,mostly anything,socially,,graduated from college/university,...,well i just got a job working as a program spe...,directions! my friends and family joke that i'...,,"books: harry potter, nicholas sparks books (gu...",family friends books music i'll come up with t...,the power of our thoughts. and now that i just...,probably enjoying a quiet night to wind down f...,,~ you made it this far ~ you're looking for so...,"[0, 1, 2, 5, 6, 7, 8, 13, 15, 16, 20, 22, 23, ..."


In [5]:
# read compatibility scores from file

import json
comp_scores = "compatibility.json"
with open(comp_scores) as f:
    comp_scores = json.loads(f.read())

def get_dict(row):
    return comp_scores.get(str(row['ID']), {})

# Adding new column with corresponding dictionary
dataset['top_compatible'] = dataset.apply(get_dict, axis=1)
test_dataset = dataset[dataset['top_compatible'] != {}]
len(test_dataset)

0

In [6]:
# add random compatibility scores - temp solution
def get_dict(row):
    index = [random.randint(0, 9999) for _ in range(50)]
    score = [random.random() for _ in range(50)]
    return({'index':index, 'score':score})

dataset['top_compatible'] = dataset.apply(get_dict, axis=1)

In [7]:
# generating compatibility dataset of size n * 50
compatibility_data = {'ID': [], 'compatible_ID': [], 'compatibility_score': []}
for index, row in dataset.iterrows():
    id_value = row['ID']
    compatible_ids = row['top_compatible']['index']
    compatibility_scores = row['top_compatible']['score']
    
    # Appending data for each compatible ID
    for compatible_id, compatibility_score in zip(compatible_ids, compatibility_scores):
        compatibility_data['ID'].append(id_value)
        compatibility_data['compatible_ID'].append(compatible_id)
        compatibility_data['compatibility_score'].append(compatibility_score)

compatibility_data = pd.DataFrame(compatibility_data)
# compatibility_data.head()
len(compatibility_data)

500000

In [8]:
# creating a surprise object

reader = Reader(rating_scale=(0, 1))
data   = Dataset.load_from_df(compatibility_data, reader)


# Split the data into training & testing sets. Python's surprise documentation has the steps detailed out
# https://surprise.readthedocs.io/en/stable/FAQ.html

raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                 # shuffle dataset

threshold   = int(len(raw_ratings)*0.8)

train_raw_ratings = raw_ratings[:threshold] # 80% of data is trainset
test_raw_ratings  = raw_ratings[threshold:] # 20% of data is testset

data.raw_ratings = train_raw_ratings        # data is now the trainset
trainset         = data.build_full_trainset() 
testset          = data.construct_testset(test_raw_ratings)

In [9]:
# Trying KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms using default model parameters

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline(),SVD()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [10]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
matrix_factorization.SVD,0.262844,0.309838,3.025808,0.551444
knns.KNNBasic,0.283953,0.342702,3.243219,2.147061
knns.KNNBaseline,0.284609,0.343441,2.908174,2.261356
knns.KNNWithZScore,0.284818,0.34402,2.966284,2.22458
knns.KNNWithMeans,0.285346,0.344542,2.71048,2.061376


In [13]:
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(compatibility_data, reader)
trainset = data.build_full_trainset()

In [18]:
def generate_recommendationsKNN(userID=13552, like_recommend=5, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'cosine','min_support':3,'user_based':True}
    similarity_matrix = KNNWithMeans(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [19]:
recommendationsKNN = generate_recommendationsKNN(userID=2, like_recommend=5, get_recommend=10)
recommendationsKNN

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[8664, 2562, 5515, 4448, 749, 1755, 1575, 8485, 169, 1217, 3099]

In [16]:
ids_to_match = [2, 4870, 1491, 7875, 7011, 4145, 2551, 9707, 3455, 21, 3119, 4575]
matched_rows = dataset[dataset['ID'].isin(ids_to_match)]
print(matched_rows)

        ID  age     status sex orientation body_type               diet  \
2        2   31     single   m    straight       fit                NaN   
21      21   26     single   f    bisexual     curvy  mostly vegetarian   
1491  1491   29     single   m    straight       NaN                NaN   
2551  2551   41     single   m    straight  athletic       mostly other   
3119  3119   43     single   m    straight  athletic                NaN   
3455  3455   32     single   m    straight   average                NaN   
4145  4145   34     single   m    straight   average    mostly anything   
4575  4575   38  available   m    bisexual       NaN                NaN   
4870  4870   28     single   m    straight       fit                NaN   
7011  7011   32     single   f    straight   average                NaN   
7875  7875   30     single   f         gay   average           anything   
9707  9707   41     single   m    straight   average  strictly anything   

          drinks      dr

In [20]:
ids_to_match = [2, 8664, 2562, 5515, 4448, 749, 1755, 1575, 8485, 169, 1217, 3099]
matched_rows = dataset[dataset['ID'].isin(ids_to_match)]
print(matched_rows)

        ID  age     status sex orientation body_type               diet  \
2        2   31     single   m    straight       fit                NaN   
169    169   34     single   f    straight       fit  mostly vegetarian   
749    749   48     single   f    straight  athletic                NaN   
1217  1217   42     single   m    straight       fit  strictly anything   
1575  1575   26     single   m    straight       fit    mostly anything   
1755  1755   33     single   f    straight   average                NaN   
2562  2562   19     single   f    straight   average                NaN   
3099  3099   34     single   m    straight   average  mostly vegetarian   
4448  4448   32  available   m    straight   average  strictly anything   
5515  5515   27     single   f    straight    skinny    mostly anything   
8485  8485   27     single   f    straight   average    mostly anything   
8664  8664   54     single   m    straight    skinny                NaN   

          drinks  drugs 