In [1]:
import pandas as pd
import numpy as np
import random as rand
np.set_printoptions(precision = 3)

df = pd.read_csv('jester-data-1.csv')

df_copy = df.copy()

In [2]:
# Label approx 10% of the dataset cells as 99, to denote they are part of the validation set.
# Keep the the actual values of the cells so you can use them later.

users = df_copy.shape[0]
jokes = df_copy.shape[1]
counts = users * jokes

counts_not_specified = (df_copy.iloc[:,:] == 99).sum().sum()
counts_not_specified_percent = (counts_not_specified / counts) * 100

counts_specified = counts - counts_not_specified
counts_to_change = round(counts_specified * 0.1).astype(int)

row = 0
col = 0
for x in range(0, counts_to_change):  
    while (df_copy.iloc[row, col]) == 99:
        row = rand.randint(0, users - 1)
        col = rand.randint(0, jokes - 1)
    df_copy.at[row, col] = 99    

df_copy.head()

Unnamed: 0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,99.18,99.19,99.20,99.21,99.22,-5.63,99.23,99.24,99.25,0
0,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07,99.0
1,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0,
2,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0,
3,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6,
4,100,-6.17,-3.54,0.44,-8.5,-7.09,-4.32,-8.69,-0.87,-6.65,...,-6.89,-0.68,-2.96,-2.18,-3.35,0.05,-9.08,-5.05,-3.45,


In [3]:
f = 3

# Use latent factor modeling to infer the hidden ratings of the users
# (they are labeled as "99" in the dataset) on the training set
latent_item_features = np.random.random((jokes, f))
latent_user_preferences = np.random.random((users, f))

In [4]:
# Calculate the performance of the algorithm on the validation dataset

def predict_rating(user_row, joke_col):
    """ Predict a rating given a user_row and an item_row. """
    
    user_values = latent_user_preferences[user_row]
    item_values = latent_item_features[joke_col]
    return user_values.dot(item_values)

# Training function.
def train(user_row, item_row, rating, alpha = 0.001):
    """ Adapt the values of user_preferences and item_factors to match
    the ones predicted by the users. """
    
    err = alpha * (rating - predict_rating(user_row, item_row))
    latent_user_preferences[user_row] += err * latent_item_features[item_row]
    latent_item_features[item_row] += err * latent_user_preferences[user_row]
    return err


def sgd(iterations = 100):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        training_error = []
        for user_row in range(0, 100):
            for joke_col in range(0, jokes):
                rating = df_copy.iloc[user_row][joke_col]
                if(not np.isnan(rating) and rating < 99):
                    training_error.append(train(user_row, joke_col, rating))
    mse = (np.array(training_error) ** 2).mean()          
    print (mse)

In [5]:
sgd()

1.583362432464443e-05


In [6]:
print('Latent item features = ' + repr(latent_item_features))
print('Latent user preferences = ' + repr(latent_user_preferences))

Latent item features = array([[ 0.591,  0.726,  0.424],
       [ 1.503,  1.33 , -0.119],
       [ 0.635,  1.744,  0.349],
       [ 0.069,  2.458,  0.3  ],
       [ 0.083,  2.689, -0.972],
       [ 0.778,  1.66 , -0.411],
       [ 1.939,  0.424,  1.353],
       [ 1.895,  0.615, -0.425],
       [ 0.291,  1.395, -0.876],
       [-0.005,  2.777, -1.193],
       [ 0.367,  2.205, -0.333],
       [ 0.5  ,  2.109,  0.489],
       [ 0.859,  1.528,  1.711],
       [-1.714,  2.63 ,  0.142],
       [ 0.814,  1.582,  1.819],
       [-1.05 ,  1.563, -1.425],
       [-0.857,  0.795, -1.983],
       [-0.674,  1.013, -0.092],
       [ 0.215,  1.21 , -0.75 ],
       [ 1.67 ,  0.125,  0.147],
       [ 1.784,  0.52 , -0.853],
       [ 0.944,  0.878,  2.41 ],
       [ 0.116,  2.416,  0.688],
       [ 1.405,  1.954, -0.368],
       [-0.592,  2.954, -1.206],
       [ 0.197,  2.087,  0.64 ],
       [ 0.389,  1.649,  1.714],
       [ 1.094,  0.796,  2.634],
       [ 2.191,  0.993,  0.217],
       [ 1.06 ,  0.6

In [7]:
#MSE
validation_error = []
for user in range(0, 100):
    for joke in range(0, jokes):
        rating_training_data = df.iloc[user][joke]
        rating_validation_data = df_copy.iloc[user][joke]
        if (not rating_training_data == rating_validation_data):
            prediction = predict_rating(user, joke)
            validation_error.append(rating_training_data - predict_rating(user, joke))
            #print("Expected: " + str(rating_training_data) + "\tActual: " + str(predict_rating(user, joke)))
print("Validation MSE: " + str((np.array(validation_error) ** 2).mean()))

Validation MSE: 5716.000425411026
