In [1]:
import pandas as pd
import numpy as np

In [2]:
# process the data, This is specific to the movie data and is not used for the joke
def process_data(df):
    
    #copy the dataframe
    temp_df = df.copy()
    
    # replace all empty strings with Nan
    temp_df.replace(r'^\s*$', np.NaN, regex=True, inplace=True)

    # replace all NaN with 0
    temp_df.fillna(value=0, inplace=True)

    # rename the first column Users
    temp_df.rename(columns={temp_df.columns[0]: 'Users'}, inplace = True)

    # Set Users as the index
    temp_df.set_index('Users', inplace= True)

    # Drop the row without values
    temp_df.drop([0,0], inplace = True)
    
    return temp_df

# method to calculate the pearson coor value
# x is the target row
def pearson(temp_df, x , user):
    
    row_total = []
    target_total = []
    
    cor = 0
    x_cor = 0
    user_cor = 0
    
    
    p_cor = 0
    
    i = 0
    
    # append the values for each row if both x and user are not NaN
    for i in range(temp_df.shape[1]):
        # skips any NanN
        if not np.isnan(temp_df.iat[x,i]) and not np.isnan(temp_df.iat[user,i]):
            row_total.append(temp_df.iat[x,i])
            target_total.append(temp_df.iat[user,i])
    
    # calc the mean for both
    row_mean = np.mean(row_total)
    target_mean = np.mean(target_total)
    
    
    i = 0
    
    # apply the pearson equation
    for i in range(len(row_total)):
        cor = cor + ((row_total[i]-row_mean)*(target_total[i]-target_mean))
        x_cor = x_cor + ((row_total[i]-row_mean)**2)
        user_cor = user_cor + ((target_total[i]-target_mean)**2)
    
    p_cor = cor/(np.sqrt(x_cor * user_cor))
    
    return p_cor


def find_knn(df, user, test_size):
    
    # temp copy of the df
    temp_df = df.copy()

    #replace all 0 with Nan
    temp_df.replace(0,np.NaN, inplace = True)
    
    # create the p_corr column
    temp_df['P_corr'] = np.NaN
    
    # calculate the pearson coor for each row, excluding the testing section
    for i in range(temp_df.shape[0]-test_size):
        temp_df.iat[i,temp_df.shape[1]-1] = pearson(temp_df, i, user)
    
    # Sort values based on the value in a specific column
    temp_df.sort_values(['P_corr'], ascending=[False], inplace = True)
    
    return temp_df

def predict_rating(df, movie_target, k):
    # Prediction of rating

    pred_temp = 0
    sim_sum = 0
    pred_rating= 0

    i = 0

    # iterate through the the knn and calculate the predicted weighted rating
    for i in range(k):
        
        if not np.isnan(df.iat[i,movie_target]):

            pred_temp = pred_temp + (df.iat[i,movie_target]* df.iat[i,df.shape[1]-1])

            sim_sum = sim_sum + df.iat[i,df.shape[1]-1]
    
    # in case there are no knn with a rating within k use the entire training section for a predicted rating
    if sim_sum == 0:
        return predict_rating(df, movie_target, 20) 
    
    else:
        pred_rating = pred_temp/sim_sum     

    return pred_rating
 
def get_recommendations(user, df, k, n, test_size):
    # find recommendations
    
    temp_df = find_knn(df, user, test_size)
    
    i = 0
    
    # locate where the target user is located and use that as a pointer
    user_name = temp_df.index.get_loc(df.index[user])
    
    if user_name <= k:
        k = k +1

    recommendations = []

    for i in range(temp_df.shape[1]-1):
        
        if np.isnan(temp_df.iat[user_name,i]):
            recommendations.append([temp_df.columns[i],predict_rating(temp_df, i, k)]) # is k
    
    # sort the recommendations so they are sorted from largest to smallest
    recommendations.sort(key=lambda tup: tup[1], reverse=True)
    
    # print out the recommendations
    for i in range(len(recommendations[:n])):
        print("We recommend the following {0} with a predicted rating of {1:.2f}: ".format(recommendations[i][0], recommendations[i][1]))
    
    return recommendations

# Mean Absolute Error
# This will iterate through each input user for each k that is input 
# and return a 2d array with the results

def mae_calc(df, users, k, test_size):
    
    combined_mae = []
    
    j = 0
    
    # iterate through each user
    for j in range(len(users)):
        
        # find the knn for the user
        temp_df = find_knn(df, users[j], test_size)
        
        #initialize the individual user mae
        user_mae = []
        
        v = 0
        
        # iterate through all k values that were input
        for v in range(len(k)):
            i = 0
            temp = 0
            mae_sum = []
            
            
            # iterate through each movie
            for i in range(temp_df.shape[1]):
                
                # check to see if a rating exists
                if not np.isnan(temp_df.iat[users[j],i]):
                    
                    #calculate the absolute difference between the two
                    temp = abs(predict_rating(temp_df, i, k[v]) - temp_df.iat[users[j],i])

                    mae_sum.append(temp)
                    
            # check to see if the resulting difference is 0 to avoid a divide by zero        
            if mae_sum == 0:
                user_mae.append(mae_sum)
                
            # if the difference is not 0 then calculate the mae
            else:
                mae = sum(mae_sum)/len(mae_sum)
                user_mae.append(mae)
                
        #append each user to the combined and return
        combined_mae.append(user_mae)
        
    return combined_mae


# Run the movie KNN

In [3]:
#import the ratings file to dataframe
movie_df = pd.read_excel("knn-csc480-a4.xls")

In [4]:
movie_df = process_data(movie_df)

In [5]:
# Mean Absolute Error

# NU1 - NU5, k = 3, MAE 
users = [20,21,22,23,24]    
k = [3]
test_size = len(users)

mae_testing = mae_calc(movie_df, users, k, test_size)

#return
np.average(mae_testing)

0.951776813234291

In [6]:
# NU1 recommendations and predicted rating
users = 20
k = 3
n = 3
test_size = 5

recs = get_recommendations(users, movie_df, k, n, test_size)


We recommend the following THE DA VINCI CODE with a predicted rating of 3.48: 
We recommend the following RUNNY BABBIT with a predicted rating of 2.57: 


In [7]:
# NU2 recommendations and predicted rating
users = 21
k = 3
n = 3
test_size = 5

recs = get_recommendations(users, movie_df, k, n, test_size)

We recommend the following TRUE BELIEVER with a predicted rating of 2.00: 
We recommend the following HARRY POTTER with a predicted rating of 1.51: 
We recommend the following THE KITE RUNNER with a predicted rating of 1.50: 


In [13]:
# Finding the top 3 recommendations for U2, U5, U13, and U20, with k=4 
users = [1,4,12,19]
k = 4
n = 3
test_size = 5

for user in users:
    print("\nTop {} Recommendations for U{}: ".format(n, user+1))
    get_recommendations(user, movie_df, k, n, test_size)


Top 3 Recommendations for U2: 
We recommend the following MY LIFE SO FAR with a predicted rating of 3.33: 
We recommend the following HARRY POTTER with a predicted rating of 3.25: 
We recommend the following THE WORLD IS FLAT with a predicted rating of 2.33: 

Top 3 Recommendations for U5: 
We recommend the following MY LIFE SO FAR with a predicted rating of 3.84: 
We recommend the following THE TAKING with a predicted rating of 3.37: 
We recommend the following HARRY POTTER with a predicted rating of 2.53: 

Top 3 Recommendations for U13: 
We recommend the following THE DA VINCI CODE with a predicted rating of 4.24: 
We recommend the following TRUE BELIEVER with a predicted rating of 3.00: 
We recommend the following THE KITE RUNNER with a predicted rating of 3.00: 

Top 3 Recommendations for U20: 
We recommend the following RUNNY BABBIT with a predicted rating of 3.11: 


  p_cor = cor/(np.sqrt(x_cor * user_cor))


# Extra Credit: Joke Recommender

In [9]:
#import the ratings file to dataframe

joke_df = pd.read_csv("modified_jester_data.csv", header=None)
jokes_df = pd.read_csv("jokes.csv", header=None)

In [10]:
#keep the last 100 rows
joke_df.drop(joke_df.index[0:900], inplace=True)

# randomly mix up the rows
joke_df = joke_df.sample(frac=1).copy()

In [11]:
# Mean Absolute Error
users = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99]  
k = [47]
test_size = len(users)

mae_testing = mae_calc(joke_df, users, k, test_size)

#return
np.average(mae_testing)

3.332276519662198

In [12]:
# working
test_size = 20
users = 96
k = 47
n = 3


recs = get_recommendations(users, joke_df, k, n, test_size)

if not len(recs) == 0:
    for i in range(n):
        print("\nFor user {0}, We recommend the following: \n{1} \nwith a predicted rating of {2:.2f}: ".format(joke_df.index[users], jokes_df.iat[recs[i][0],1], recs[i][1]))
if len(recs) == 0:
    print("\nIt appears that this user has rated every joke. Please try another user.")

We recommend the following 71 with a predicted rating of 16.37: 
We recommend the following 88 with a predicted rating of 15.64: 
We recommend the following 92 with a predicted rating of 15.49: 

For user 987, We recommend the following: 
On the first day of college the Dean addressed the students pointing out some of the rules:"The female dormitory will be out-of-bounds for all male students and the male dormitory to the female students. Anybody caught breaking this rule will be fined $20 the first time." He continued "Anybody caught breaking this rule the second time will be fined $60. Being caught a third time will cost you a fine of $180. Are there any questions ?"At this point a male student in the crowd inquired:"How much for a season pass ?" 
with a predicted rating of 16.37: 

For user 987, We recommend the following: 
A radio conversation of a US naval ship with Canadian authorities ... Americans: Please divert your course 15 degrees to the North to avoid a collision.Canadians

In [None]:
joke_df.tail()