In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, recall_score, confusion_matrix

In [2]:
#Working directory
working_directory = "C:/Users/Eric/Documents/Offline Uni/Social Networks Assignment 3/ml-100k/"
#Get our data
train_set = pd.read_csv(working_directory + "u1.base.csv", header=None, usecols=[0,1,2], names=['userId', 'movieId', 'rating'])
test_set = pd.read_csv(working_directory + "u1.test.csv", header=None, usecols=[0,1,2], names=['userId', 'movieId', 'rating'])
movie_info = pd.read_csv(working_directory + "u.item.csv", header=None, encoding='latin-1', usecols=[0,1], names=['movieId', 'Title'])

In [3]:
def pivot(data, val, ind, col):
    pivoted_data = pd.pivot_table(data, values=val, index=ind, columns=col, fill_value=0)
    return pivoted_data

In [4]:
def reduce_low_counts(pivot_data, amount):
    pivot_data = pivot_data.replace(0, np.NaN)
    pivot_data['counts'] = pivot_data.count(axis=1)
    pivot_data = pivot_data[pivot_data['counts'] > amount]
    pivot_data  = pivot_data.drop(columns=['counts'])
    pivot_data  = pivot_data.fillna(0)
    return pivot_data

In [5]:
#Calculate cosine value
def cosine(data):
    #Get cosine values
    cosined_table = pd.DataFrame(cosine_similarity(data), index=data.index.values.tolist(), columns=data.index.values.tolist())
    return cosined_table

In [6]:
#Predict test values with item-item CF weighted average
def item_item_cf_weighted_avg(cosine_dataset, training_dataset_pivoted, testing_dataset, n):
    predicted_values = []
    for i in testing_dataset.index:
        #Remove all diagonal values since they are all 1
        np.fill_diagonal(cosine_dataset.values, 0)
        #Get a user, movie from testing set and find the n most similar movies
        current_user = testing_dataset.at[i, 'userId']
        current_movie = testing_dataset.at[i, 'movieId']
        
        #Check if movie exists in the training dataset/cosine similarity matrix
        if current_movie in cosine_dataset.columns:
            current_df = cosine_dataset[[current_movie]].nlargest(n, current_movie)

            #List of cosines
            similar_movies_cosine = current_df[current_movie].values.tolist()
            #List of movies
            similar_movies_id = current_df.index.values.tolist()
            #Get numerator and denominator for weighted average
            part_numerator = 0
            part_denominator = 0
            for j in range(n):
                #Numerator - users rating for similar movie * movie cosine similarity - summed for number of similar movies
                part_numerator = part_numerator + training_dataset_pivoted.at[similar_movies_id[j], current_user] * similar_movies_cosine[j]
                #Denominoator
                part_denominator = part_denominator + similar_movies_cosine[j]
            #Round predicted value and add to list
            predicted_values.append(round(part_numerator / part_denominator))
        #Set rating as 2.5 if it doesn't exist in the testing set
        else:
            predicted_values.append(2.5)
    return predicted_values

In [10]:
#Preprocess the training set stuff
training_set_pivot = pivot(train_set, 'rating', 'movieId', 'userId')
reduced_training_set = reduce_low_counts(training_set_pivot, 0)
training_set_cosine = cosine(reduced_training_set)

In [11]:
#Obtain the predicted and actual values
predicted = item_item_cf_weighted_avg(training_set_cosine, reduced_training_set, test_set, 20)
actual = test_set['rating'].values.tolist()

In [24]:
#Change predicted to whole numbers
predicted = np.array(predicted)
predicted = predicted.astype(np.int64)
predicted = predicted.tolist()

In [25]:
#Calculate the rmse and recall
rmse = mean_squared_error(actual, predicted, squared=False)
print("rmse is: " + str(rmse))
recall = recall_score(actual, predicted, average='micro')
print("recall is: " + str(recall))

rmse is: 2.7254449178069993
recall is: 0.0562


In [None]:
test_actual = [[i] for i in actual]
test_predicted = [[i] for i in predicted]

In [None]:
rmse = mean_squared_error(actual, predicted, squared=False)
recall = recall_score(actual, predicted, average='micro')

In [None]:
#ORIGINAL
#rmse is: 2.7269121731364945
#recall is: 0.0562
#REMOVE MOVIES WITH LESS THAN 50 RATINGS
#rmse is: 2.8333196078098917
#recall is: 0.04615