In [7]:
import pandas as pd
import numpy as np

## Item-Based Joke Recommendation

### Description of the Dataset:
- One file (modified_jester_data.csv) contains ratings of 100 jokes by 1000 users (each row is a user profile)
- Ratings have been normalized to the range 1 to 21, with 1 being the lowest; a 0 indicates a missing rating
- The other file (jokes.csv) contains the text of the 100 jokes, which align with the columns of modified_jester_data.csv

In [2]:
# Load the joke texts from jokes.csv.
# header=None makes pandas treat the first line as data and number the
# columns 0, 1, ... (per the preview below: column 0 is the joke id,
# column 1 is the joke text).
joke_text = pd.read_csv('jokes.csv', header=None)
joke_text.head()

Unnamed: 0,0,1
0,0,"A man visits the doctor. The doctor says ""I ha..."
1,1,This couple had an excellent relationship goin...
2,2,Q. What's 200 feet long and has 4 teeth? A. Th...
3,3,Q. What's the difference between a man and a t...
4,4,Q. What's O. J. Simpson's Internet address? A....


In [23]:
# Convert the joke DataFrame to a NumPy array (one [id, text] row per joke,
# as shown in the output below) and preview the first two rows.
jokeText = joke_text.values
jokeText[:2]

array([[0,
        'A man visits the doctor. The doctor says "I have bad news for you.You have cancer and Alzheimer\'s disease". The man replies "Well thank God I don\'t have cancer!"'],
       [1,
        'This couple had an excellent relationship going until one day he came home from work to find his girlfriend packing. He asked her why she was leaving him and she told him that she had heard awful things about him. "What could they possibly have said to make you move out?" "They told me that you were a pedophile." He replied "That\'s an awfully big word for a ten year old." ']],
      dtype=object)

In [3]:
# Load the user-by-joke rating matrix from modified_jester_data.csv.
# header=None makes pandas treat the first line as data and number the
# columns 0, 1, ... so that column j lines up with joke id j.
jokes = pd.read_csv('modified_jester_data.csv', header=None)
jokes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,3.18,19.79,1.34,2.84,3.48,2.5,1.15,15.17,2.02,6.24,...,13.82,0.0,0.0,0.0,0.0,0.0,5.37,0.0,0.0,0.0
1,15.08,10.71,17.36,15.37,8.62,1.34,10.27,5.66,19.88,20.22,...,13.82,6.05,10.71,18.86,10.81,8.86,14.06,11.34,6.68,12.07
2,0.0,0.0,0.0,0.0,20.03,20.27,20.03,20.27,0.0,0.0,...,0.0,0.0,0.0,20.08,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,19.35,0.0,0.0,12.8,19.16,8.18,17.21,0.0,12.84,...,0.0,0.0,0.0,11.53,0.0,0.0,0.0,0.0,0.0,0.0
4,19.5,15.61,6.83,5.61,12.36,12.6,18.04,15.61,10.56,16.73,...,16.19,16.58,15.27,16.19,16.73,12.55,14.11,17.55,12.8,12.6


In [10]:
# Ratings as a plain NumPy array (rows = users, columns = jokes); this is the
# dataMat handed to the estimator / evaluation functions below.
# NOTE: cross_validate_user temporarily zeroes entries of this array in place
# and restores them afterwards.
jokes_np = jokes.values

#### Problem 2B
- Complete definition for the function 'test'
- function iterates over all users and returns the error information necessary to compute the MAE
- Run the test function with standard item-based collaborative filtering and with the SVD-based version of the rating estimator

In [6]:
from numpy import linalg as la

In [4]:
def ecludSim(inA,inB):
    """Euclidean-distance similarity mapped into the interval (0, 1].

    Parameters
    ----------
    inA, inB : array-like
        Two vectors of equal shape.

    Returns
    -------
    float
        1 / (1 + d), where d is the norm of (inA - inB): 1.0 for identical
        vectors, approaching 0 as the vectors move apart.
    """
    distance = la.norm(inA - inB)
    return 1.0 / (1.0 + distance)

def pearsSim(inA,inB):
    """Pearson-correlation similarity rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    inA, inB : array-like
        Two rating vectors of equal length (1-D arrays or column matrices).

    Returns
    -------
    float
        1.0 when fewer than three observations are given (too few to
        estimate a correlation); otherwise 0.5 + 0.5 * r, where r is the
        Pearson correlation coefficient from np.corrcoef. If np.corrcoef
        yields NaN (e.g. one vector is constant), that NaN is returned
        unchanged.
    """
    if len(inA) < 3:
        return 1.0
    # Qualified with `np.`: the notebook only does `import numpy as np`, so a
    # bare `corrcoef` raises NameError on a fresh kernel.
    return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar=0)[0][1]

def cosSim(inA,inB):
    """Cosine similarity rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    inA, inB : array-like
        Two vectors with the same number of elements. 1-D arrays as well as
        row/column matrices are accepted; both are flattened first.

    Returns
    -------
    float
        0.5 + 0.5 * cos(angle between inA and inB). The value is undefined
        (division by zero) when either vector has zero norm; that case is
        not guarded here.
    """
    # Flatten so the dot product is a true inner product for ndarrays too.
    # The previous `inA.T * inB` was only an inner product for np.matrix
    # column vectors; for ndarrays it is element-wise and float() fails.
    vecA = np.asarray(inA, dtype=float).ravel()
    vecB = np.asarray(inB, dtype=float).ravel()
    num = float(np.dot(vecA, vecB))
    denom = la.norm(vecA) * la.norm(vecB)
    return 0.5 + 0.5 * (num / denom)

def standEst(dataMat, user, simMeas, item):
    """Estimate a user's rating of an item with item-based collaborative filtering.

    For every other item j that `user` has rated (rating != 0), the
    similarity between column `item` and column j is computed over the users
    who rated BOTH items, and the user's rating of j is weighted by it.

    Parameters
    ----------
    dataMat : 2-D array (users x items)
        Rating matrix; 0 marks a missing rating.
    user : int
        Row index of the user.
    simMeas : callable(vecA, vecB) -> float
        Similarity measure applied to the two co-rated rating vectors.
    item : int
        Column index of the item whose rating is estimated.

    Returns
    -------
    float
        Similarity-weighted average of the user's known ratings, or 0 when
        the accumulated similarity is 0 (nothing to base an estimate on).
    """
    # All numpy helpers are `np.`-qualified: the notebook imports numpy as
    # `np`, so the bare names (shape, nonzero, logical_and) are undefined.
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0:
            continue  # the user has not rated item j
        # Row indices of users who rated both `item` and j.
        overLap = np.nonzero(np.logical_and(dataMat[:, item] > 0,
                                            dataMat[:, j] > 0))[0]
        if len(overLap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overLap, item],
                                 dataMat[overLap, j])
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal
    
def svdEst(dataMat, user, simMeas, item):
    """Estimate a user's rating of an item using SVD-reduced item vectors.

    The rating matrix is decomposed with SVD and every item is projected
    onto the leading singular directions (up to 4). Similarities between
    items are then measured in that low-dimensional space and used to weight
    the user's known ratings.

    Parameters
    ----------
    dataMat : 2-D array or matrix (users x items)
        Rating matrix; 0 marks a missing rating.
    user : int
        Row index of the user.
    simMeas : callable(colA, colB) -> float
        Similarity measure; it receives two column matrices of shape (k, 1).
    item : int
        Column index of the item whose rating is estimated.

    Returns
    -------
    float
        Similarity-weighted average of the user's ratings on the other
        items, or 0 when the accumulated similarity is 0.
    """
    # numpy helpers are `np.`-qualified (the notebook imports numpy as `np`);
    # np.asmatrix replaces `mat`, which no longer exists in NumPy 2.0.
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    data = np.asmatrix(dataMat)
    # Only the leading columns of U are used, so the reduced SVD is enough
    # and avoids building the full (users x users) U on every call.
    U, Sigma, _ = la.svd(data, full_matrices=False)
    U = np.asmatrix(U)
    k = min(4, len(Sigma))  # keep at most 4 singular directions
    SigK = np.asmatrix(np.diag(Sigma[:k]))  # leading singular values as a diagonal matrix
    xformedItems = data.T * U[:, :k] * SigK.I  # one k-dimensional row per item
    for j in range(n):
        userRating = data[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T,
                             xformedItems[j, :].T)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

In [20]:
# This function performs evaluation on a single user based on the test_ratio
# For example, with test_ratio = 0.2, a randomly selected 20 percent of rated 
# items by the user are withheld and the rest are used to estimate the withheld ratings

def cross_validate_user(dataMat, user, test_ratio, estMethod=standEst, simMeas=pearsSim):
    """Hold out part of one user's ratings and measure the estimation error.

    A random `test_ratio` fraction of the items rated by `user` is withheld
    (set to 0 in `dataMat`), each withheld rating is re-estimated with
    `estMethod`, and the absolute errors are summed. The user's row in
    `dataMat` is restored before returning, even if the estimator raises.

    Parameters
    ----------
    dataMat : 2-D ndarray (users x items)
        Rating matrix; 0 marks a missing rating. Modified in place
        temporarily and restored afterwards.
    user : int
        Row index of the user to evaluate.
    test_ratio : float
        Fraction of the user's rated items to withhold (e.g. 0.2).
    estMethod : callable(dataMat, user, simMeas, item) -> float
        Rating estimator.
    simMeas : callable
        Similarity measure forwarded to `estMethod`.

    Returns
    -------
    (error_u, count_u) : (float, int)
        Sum of absolute errors over the withheld items and the number of
        withheld items. (0.0, 0) when nothing can be withheld.
    """
    # Integer column indices of the items this user has rated.
    rated_items_by_user = np.flatnonzero(np.asarray(dataMat[user]) > 0)
    test_size = min(int(test_ratio * len(rated_items_by_user)),
                    len(rated_items_by_user))
    if test_size <= 0:
        return 0.0, 0

    # Sample WITHOUT replacement: drawing random indices with replacement can
    # pick the same item several times, which withholds fewer distinct items
    # than requested and counts their errors more than once.
    withheld_items = np.random.choice(rated_items_by_user, size=test_size, replace=False)
    original_user_profile = np.array(dataMat[user]).ravel()
    error_u = 0.0

    # Hide the withheld ratings so they cannot be used in the estimation.
    dataMat[user, withheld_items] = 0
    try:
        for item in withheld_items:
            estimatedScore = estMethod(dataMat, user, simMeas, item)
            error_u += abs(estimatedScore - original_user_profile[item])
    finally:
        # Always restore the withheld ratings to the user profile.
        dataMat[user, withheld_items] = original_user_profile[withheld_items]

    # Callers accumulate these over all users to compute the MAE.
    return error_u, len(withheld_items)

In [18]:
def test(dataMat, test_ratio, estMethod):
    """Compute and print the Mean Absolute Error of an estimator over all users.

    Calls cross_validate_user for every row of `dataMat` and accumulates the
    absolute errors and the number of test cases.

    Parameters
    ----------
    dataMat : 2-D ndarray (users x items)
        Rating matrix; 0 marks a missing rating.
    test_ratio : float
        Fraction of each user's rated items to withhold for testing.
    estMethod : callable
        Rating estimator passed through to cross_validate_user.

    Returns
    -------
    float
        Total absolute error divided by the total number of test cases, or
        NaN when no test case was produced (e.g. no users, or a test_ratio
        so small that nothing is withheld). In a notebook, end the call with
        `;` if the echoed return value is not wanted.
    """
    total_error = 0.0
    total_count = 0
    for user in range(len(dataMat)):
        error_u, count_u = cross_validate_user(dataMat, user, test_ratio, estMethod)
        total_error += error_u
        total_count += count_u
    # Guard against ZeroDivisionError when nothing was withheld.
    MAE = total_error / total_count if total_count else float('nan')
    print ('Mean Absoloute Error for ',estMethod,' : ', MAE)
    return MAE

In [21]:
# Evaluate the standard item-based estimator, withholding 20% of each user's
# rated jokes (cross_validate_user falls back to its default simMeas, pearsSim).
# The withheld items are drawn randomly without a fixed seed, so the MAE
# varies from run to run.
test(jokes_np, .2, standEst)

Mean Absoloute Error for  <function standEst at 0x0000012E836B1E18>  :  3.6743728796049253


In [22]:
# Same evaluation with the SVD-based estimator (20% of each user's rated
# jokes withheld, default pearsSim similarity). The withheld items are drawn
# randomly without a fixed seed, so the MAE varies from run to run.
test(jokes_np, .2, svdEst)

Mean Absoloute Error for  <function svdEst at 0x0000012E836B1D90>  :  3.652462669153546


### Problem 2C
- write new function to print most similar jokes
- prints k most similar jokes in addition to the joke being queried

In [43]:
def print_most_similar_jokes(dataMat, jokes, queryJoke, k, metric=pearsSim):
    """Print the text of a joke and of its k most similar jokes.

    Similarity is computed between rating vectors, i.e. this is a
    k-nearest-neighbour search over the rows of `dataMat`.

    Parameters
    ----------
    dataMat : 2-D ndarray (jokes x users)
        One row of user ratings per joke (the transpose of the user x joke
        rating matrix).
    jokes : sequence of str
        Joke texts, indexed by joke id (row index of `dataMat`).
    queryJoke : int
        Id of the joke to find neighbours for.
    k : int
        Number of similar jokes to print; fewer are printed when fewer
        candidates exist.
    metric : callable(vecA, vecB) -> float
        Similarity measure; larger means more similar.

    Returns
    -------
    None
        The query joke and its neighbours are printed.
    """
    n_jokes = len(dataMat)
    # Start from -inf so that anything not scored can never rank at the top.
    sim = np.full(n_jokes, -np.inf)
    query_ratings = dataMat[queryJoke, :].T
    for i in range(n_jokes):
        if i != queryJoke:
            sim[i] = metric(query_ratings, dataMat[i, :].T)
    # argsort places NaN last, so after reversing an undefined similarity
    # would come out as the "most similar" joke; push NaNs to the bottom.
    sim[np.isnan(sim)] = -np.inf
    ranked = np.argsort(sim)[::-1]
    # Never report the query joke as its own neighbour.
    ids = [i for i in ranked if i != queryJoke][:k]
    print('Jokes most similar to: \n', jokes[queryJoke])
    print()
    for i in ids:
        print(jokes[i])
        print()

In [44]:
# jokes_np.T puts one joke per row (its ratings across all users);
# jokeText[:,1] is the column holding the joke texts.
# Query joke id 3 and print its 3 most similar jokes (default metric: pearsSim).
print_most_similar_jokes(jokes_np.T, jokeText[:,1], 3, 3)

Jokes most similar to: 
 Q. What's the difference between a man and a toilet? A. A toilet doesn't follow you around after you use it.

What do you get when you run over a parakeet with a lawnmower? Shredded tweet.

A country guy goes into a city bar that has a dress code and the maitred' demands he wear a tie. Discouraged the guy goes to his car to sulk when inspiration strikes: He's got jumper cables in the trunk! So he wraps them around his neck sort of like a string tie (a bulky string tie to be sure) and returns to the bar. The maitre d' is reluctant but says to the guy "Okay you're a pretty resourceful fellow you can come in... but just don't start anything"!  

Q. What's 200 feet long and has 4 teeth? A. The front row at a Willie Nelson Concert.

