### JESTER JOKE RECOMMENDER SYSTEM 
##### &emsp;&emsp;- Collaborative Filtering -Item-Based Recommender Algorithm 
##### _Over 1.7 million ratings of 150 jokes from 59,132 users_
Xuyang Ji <br> Mar 9st, 2023

>_Data Abstract: The dataset "jester_ratings.csv", collected between Nov 2006 - May 2009, contains over 1.7 million continuous ratings (-10.00 to +10.00) of 150 jokes from 59,132 users, where each row is formatted as userID, itemID, and ratings. The dataset "jester_item.csv" maps item ID to the actual text of the jokes._

In [1]:
import os
os.getcwd()

'/Users/celine/Desktop/JESTER DS'

In [2]:
import math
import numpy as np
from numpy import *
from numpy import linalg as la
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from collections import Counter #finding the majority 
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA #Dimension Reduction (LSA)
from sklearn.preprocessing import Normalizer, LabelBinarizer 
%matplotlib inline

In [3]:
rating_df= pd.read_csv("jester_ratings.csv")
items_df= pd.read_csv("jester_items.csv")

In [4]:
rating_df= rating_df.pivot(index='userId',columns='jokeId', values='rating')

In [5]:
scalar= MinMaxScaler(feature_range=(1,21))
rating_df[rating_df.columns]= scalar.fit_transform(rating_df[rating_df.columns])

In [6]:
rating_df.fillna(0,inplace=True)
rating_df= rating_df.to_numpy()

In [7]:
#inA and inB are coluumn vectors 
def ecludSim(inA,inB):
    return 1.0 / (1.0 + la.norm(inA - inB))

def pearsSim(inA,inB):
    if len(inA) < 3 : 
        return 1.0
    return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar = 0)[0][1]

def cosSim(inA,inB):
    num = float(inA.T * inB)
    denom = la.norm(inA)*la.norm(inB)
    return 0.5 + 0.5 * (num / denom)

In [8]:
cosSim(mat(rating_df)[:,0], mat(rating_df)[:,3])

0.5381343806127434

In [9]:
#item-based recommender sys.
def standEst(dataMat, user, simMeas, item):
    #calculates the estimated rating a user would give an item for 
    #a given similarity measure. 
    n = shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0: 
            continue
        # Find items rated by users and find indices of the trues in the array
        overLap = np.nonzero(np.array(np.logical_and(dataMat[:,item]>0, \
                                      dataMat[:,j]>0)))[0]
        if len(overLap) == 0: 
            similarity = 0
        else: 
            similarity = simMeas(dataMat[overLap,item], \
                                   dataMat[overLap,j])
        #print 'the %d and %d similarity is: %f' % (item, j, similarity)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: 
        return 0
    else: 
        return ratSimTotal/simTotal

In [10]:
standEst(rating_df,5,ecludSim,6)

10.891311834574973

In [11]:
U,Sigma,VT = la.svd(mat(rating_df), full_matrices=False)
sum((Sigma**2)[:50]) > sum(Sigma**2)*0.9

True

In [12]:
#using SVD 
def svdEst(data, user, simMeas, item):
    #Estimate rating for a given items for a given user by using SVD
    n = shape(data)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    data=mat(data)
    U,Sigma,VT = la.svd(data, full_matrices=False)
    Sig4 = np.mat(np.eye(50)*Sigma[:50]) #arrange Sig4 into a diagonal matrix
    xformedItems = data.T * U[:,:50] * Sig4.I  #create transformed items
    for j in range(n):
        userRating = data[user,j]
        if userRating == 0 or j==item: continue
        similarity = simMeas(xformedItems[item,:].T,\
                             xformedItems[j,:].T)
        #print 'the %d and %d similarity is: %f' % (item, j, similarity)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal

In [13]:
svdEst(rating_df,5,ecludSim,2)

8.64383732106096

In [14]:
def cross_validate_user(dataMat, user, test_ratio, estMethod=standEst, simMeas=pearsSim):
    number_of_items = np.shape(dataMat)[1]
    
    rated_items_by_user = np.array([i for i in range(number_of_items) if dataMat[user,i]>0])
    test_size = int(test_ratio * len(rated_items_by_user))
    test_indices = np.random.randint(0, len(rated_items_by_user), test_size)
    
    withheld_items = rated_items_by_user[test_indices]
    original_user_profile = np.copy(dataMat[user])
    dataMat[user, withheld_items] = 0 # So that the withheld test items is not used in the rating estimation below
    error_u = 0.0
    count_u = len(withheld_items)

	# Compute absolute error for user u over all test items
    for item in withheld_items:
        # Estimate rating on the withheld item
        estimatedScore = estMethod(dataMat, user, simMeas, item)
        error_u = error_u + (estimatedScore - original_user_profile[item])**2
    
    for item in withheld_items:
        dataMat[user, item] = original_user_profile[item]
    return error_u, count_u


def test(dataMat, test_ratio, estMethod):
    # iterate over all users and for each 
    # perform evaluation by calling the above cross_validate_user function on each user.
    total_error=0.0
    total_count=0
    for user in range(dataMat.shape[0]):
        if estMethod == standEst:
            error_u, count_u = cross_validate_user(dataMat,user,test_ratio, standEst)
            total_error += error_u
            total_count += count_u
        if estMethod == svdEst:
            if user % 50 ==0:
                error_u, count_u = cross_validate_user(dataMat,user,test_ratio,svdEst)
                total_error += error_u
                total_count += count_u
    RMSE= np.sqrt(total_error/total_count)
    print('Mean Absoloute Error for ',estMethod,' : ', RMSE)
    return RMSE

By iterating all users and for each performs evaluation with 20% test ratio, the overall Mean Absolute Error (MAE) using standard item-based collaborative filtering based on the rating prediction function "standEst" is approximately 4.6. Meantime, the MAE of using the SVD-based version of the rating item-based CF as the prediction engine, is approximately 4. Notice that the computing time for using 'svdEst' function is much more efficient than using 'standEst' function/ 

In [15]:
test(rating_df[::50], 0.2, standEst)

Mean Absoloute Error for  <function standEst at 0x7fe572fea040>  :  4.581238825488467


4.581238825488467

In [16]:
test(rating_df[::50],0.2,svdEst)

Mean Absoloute Error for  <function svdEst at 0x7fe572fea0d0>  :  4.278577840402492


4.278577840402492

In [17]:
def print_most_similar_jokes(dataMat, jokes, queryJoke, k, metric=pearsSim):
    # Get the ratings for the query joke
    queryJokeRatings = dataMat[:, queryJoke].flatten()
    
    # Calculate similarity between the query joke and all other jokes
    similarities = []
    for i in range(dataMat.shape[1]):
        jokeRatings = dataMat[:, i].flatten()
        similarity = metric(queryJokeRatings, jokeRatings)
        similarities.append(similarity)
    
    # Sort the similarities in descending order and get the indices of the top-k similar jokes
    sorted_similarities_indices = np.argsort(similarities)[::-1][:k]
    
    # Create a dataframe to store the top-k similar jokes and their similarity scores
    similar_jokes_df = pd.DataFrame({'joke_id': sorted_similarities_indices, 
                                      'similarity_score': [similarities[i] for i in sorted_similarities_indices],
                                      'joke_text': [jokes.loc[jokes['jokeId'] == i, 'jokeText'].iloc[0] for i in sorted_similarities_indices]})
    
    # Print the dataframe
    print(f"The {k} most similar jokes to joke {queryJoke} ({jokes.loc[jokes['jokeId'] == queryJoke, 'jokeText'].iloc[0]}) are:")
    print(similar_jokes_df)


In [18]:
print_most_similar_jokes(rating_df, items_df, 3, 5, pearsSim)

The 5 most similar jokes to joke 3 (Q. What's 200 feet long and has 4 teeth? 

A. The front row at a Willie Nelson Concert.
) are:
   joke_id  similarity_score  \
0        3          1.000000   
1        4          0.706858   
2        2          0.695585   
3        5          0.657029   
4        6          0.643112   

                                           joke_text  
0  Q. What's 200 feet long and has 4 teeth? \n\nA...  
1  Q. What's the difference between a man and a t...  
2  This couple had an excellent relationship goin...  
3  Q.\tWhat's O. J. Simpson's Internet address? \...  
4  Bill & Hillary are on a trip back to Arkansas....  
