In [2]:
'''
Team Members

ZWE MIN MAW                                     
SAW ZWE WAI YAN                    
THANARIT KANJANAMATAWAT      
'''
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Part I: A simple recommendation algorithm

## 1.1 : Read train set file

In [53]:
rating_trainset = pd.read_csv("/kaggle/input/miniproject2/rating_trainset.csv")
rating_trainset.head()

Unnamed: 0,userID,placeID,rating
0,U1001,132825,2
1,U1001,132830,1
2,U1001,135025,2
3,U1001,135033,1
4,U1001,135039,1


## 1.2 : Read test set file

In [54]:
rating_testset = pd.read_csv("/kaggle/input/miniproject2/rating_testset.csv")
rating_testset.head()

Unnamed: 0,userID,PlaceID,Rating
0,U1003,132825,2
1,U1003,135079,2
2,U1006,132825,1
3,U1006,135079,1
4,U1009,132834,2


## 2 : Create pivoted data for train dataset

In [55]:
# Read the data into a pandas dataframe
data = rating_trainset
data.columns = ['user', 'place', 'rating']

# Pivot the data to transform it into a table with unique users and their ratings for each place
pivoted_data = data.pivot(index='user', columns='place', values='rating')

# Rename the columns to include 'Place' before the place id
pivoted_data.columns = ['Place ' + str(col) for col in pivoted_data.columns]

# Save the pivoted data to a new csv file
# pivoted_data.to_csv('pivoted_data.csv')

# Replace Nan Values with 0
# pivoted_data.fillna(0, inplace=True)
pivoted_data

Unnamed: 0_level_0,Place 132560,Place 132561,Place 132564,Place 132572,Place 132583,Place 132584,Place 132594,Place 132608,Place 132609,Place 132613,...,Place 135080,Place 135081,Place 135082,Place 135085,Place 135086,Place 135088,Place 135104,Place 135106,Place 135108,Place 135109
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,,,,,,,,,,,...,,,,0.0,,,,,,
U1002,,,,,,,,,,,...,,,,1.0,,,,1.0,,
U1003,,,,,,,,,,,...,2.0,,,,,,,,,
U1004,,,,,,,,,,,...,,,,,,,,2.0,,
U1005,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U1134,,,,0.0,,,,,,,...,1.0,,,2.0,,,,,,
U1135,,,,,,,,,,,...,,,,0.0,,,,0.0,,
U1136,,,,,,,,,,,...,,,,,,,,,,
U1137,,,,,,,,,,,...,,,,2.0,,,,,,


## 3 : Calcualte cosine similarity matrix between users

In [75]:
# Similarity Matrix without function
pivoted_data_copy = pivoted_data.copy()
pivoted_data_copy.fillna(0, inplace=True)

# Compute cosine similarity matrix
cos_sim_matrix = np.zeros((len(pivoted_data_copy), len(pivoted_data_copy)))

for i in range(len(pivoted_data_copy)):
    for j in range(i, len(pivoted_data_copy)):
        dot_product = np.dot(pivoted_data_copy.iloc[i], pivoted_data_copy.iloc[j])
        norm_i = np.linalg.norm(pivoted_data_copy.iloc[i])
        norm_j = np.linalg.norm(pivoted_data_copy.iloc[j])
        cos_sim = dot_product / (norm_i * norm_j)
        cos_sim_matrix[i, j] = cos_sim
        cos_sim_matrix[j, i] = cos_sim

# Convert the matrix to a dataframe with user IDs as index and columns
cos_sim_df = pd.DataFrame(cos_sim_matrix, index=pivoted_data_copy.index, columns=pivoted_data_copy.index)

# Replace NaN values with 0
cos_sim_df.fillna(0, inplace=True)

# Print the resulting dataframe
cos_sim_df.to_csv('Group1_Part1_COSINE_11.csv', index=True)
print(cos_sim_df)

user      U1001     U1002     U1003     U1004     U1005     U1006     U1007  \
user                                                                          
U1001  1.000000  0.177522 -0.064804 -0.060746  0.004865 -0.061940  0.138926   
U1002  0.177522  1.000000  0.086377  0.100876  0.035967 -0.069679  0.011163   
U1003 -0.064804  0.086377  1.000000 -0.067709 -0.065078  0.174632 -0.068822   
U1004 -0.060746  0.100876 -0.067709  1.000000  0.115228  0.021572  0.075265   
U1005  0.004865  0.035967 -0.065078  0.115228  1.000000 -0.062202  0.189893   
...         ...       ...       ...       ...       ...       ...       ...   
U1134  0.010440  0.128066  0.326048 -0.083627 -0.080376 -0.011122  0.035109   
U1135  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
U1136 -0.066319  0.271774 -0.003261  0.230976 -0.066600 -0.026496 -0.070432   
U1137  0.099967  0.335226  0.429536 -0.076596 -0.073619  0.134903  0.037158   
U1138 -0.035629  0.332092  0.203237 -0.037226 -0.035

## 4 : Create 7-K nearest neighbour for each user

In [57]:
similarity_matrix = cos_sim_df.copy()

# create a new DataFrame to store the NN information
nn_data = pd.DataFrame(index=similarity_matrix.index, columns=[f'{i}thNNUserID' for i in range(1, 8)])

# for each row in the similarity matrix, find the 7 most similar users
for user in similarity_matrix.index:
    sim_series = similarity_matrix.loc[user].sort_values(ascending=False)
    nn_series = sim_series.iloc[1:8]
    nn_tuples = [(idx, round(nn_series.loc[idx], 2)) for idx in nn_series.index]
    nn_data.loc[user] = nn_tuples
    
nn_data

Unnamed: 0_level_0,1thNNUserID,2thNNUserID,3thNNUserID,4thNNUserID,5thNNUserID,6thNNUserID,7thNNUserID
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
U1001,"(U1054, 0.47)","(U1036, 0.45)","(U1024, 0.43)","(U1071, 0.42)","(U1092, 0.4)","(U1116, 0.4)","(U1055, 0.4)"
U1002,"(U1029, 0.58)","(U1009, 0.52)","(U1090, 0.44)","(U1045, 0.43)","(U1027, 0.4)","(U1132, 0.4)","(U1078, 0.4)"
U1003,"(U1029, 0.47)","(U1137, 0.47)","(U1061, 0.39)","(U1134, 0.38)","(U1009, 0.34)","(U1048, 0.3)","(U1108, 0.3)"
U1004,"(U1132, 0.35)","(U1016, 0.35)","(U1061, 0.31)","(U1097, 0.28)","(U1136, 0.28)","(U1078, 0.26)","(U1104, 0.26)"
U1005,"(U1075, 0.63)","(U1014, 0.58)","(U1125, 0.56)","(U1010, 0.45)","(U1053, 0.38)","(U1018, 0.35)","(U1016, 0.35)"
...,...,...,...,...,...,...,...
U1134,"(U1036, 0.52)","(U1059, 0.51)","(U1108, 0.4)","(U1003, 0.38)","(U1029, 0.38)","(U1122, 0.37)","(U1136, 0.35)"
U1135,"(U1095, 0.0)","(U1089, 0.0)","(U1090, 0.0)","(U1091, 0.0)","(U1092, 0.0)","(U1093, 0.0)","(U1094, 0.0)"
U1136,"(U1033, 0.41)","(U1134, 0.35)","(U1112, 0.34)","(U1089, 0.33)","(U1002, 0.32)","(U1038, 0.31)","(U1057, 0.3)"
U1137,"(U1003, 0.47)","(U1029, 0.45)","(U1002, 0.39)","(U1009, 0.37)","(U1045, 0.34)","(U1116, 0.34)","(U1077, 0.33)"


## 5 : Calculate the mean ratings of users

In [58]:
# create a new DataFrame to store the means
mean_data = pd.DataFrame(columns=['R_Mean'])

# for each user, calculate the mean rating
for user, row in pivoted_data_copy.iterrows():
    mean = row.sum() / row.count()
    mean_data.loc[user] = {'R_Mean': mean}
mean_data

Unnamed: 0,R_Mean
U1001,0.076923
U1002,0.107692
U1003,0.130769
U1004,0.115385
U1005,0.092308
...,...
U1134,0.176923
U1135,0.000000
U1136,0.123077
U1137,0.169231


## 6 : Create Prediciton Matrix

In [59]:
rating_data = pivoted_data.copy()
K_NNUsers = nn_data
R_Mean = mean_data

# create a copy of rating_data that replaces NaN with 0
rating_data_copy = rating_data.fillna(0)

# create an empty DataFrame for the predictions
Prediction_Matrix = pd.DataFrame(columns=rating_data.columns, index=rating_data.index)

# for each user and place, calculate the predicted rating
for user in rating_data.index:
    for place in rating_data.columns:
        if pd.isna(rating_data.loc[user, place]):
            r_mean = R_Mean.loc[user, 'R_Mean']
            knn_data = K_NNUsers.loc[user]
            num = 0
            den = 0
            for i in range(1, 8):
                nn_user, sim = knn_data[f'{i}thNNUserID']
                nn_rating = rating_data_copy.loc[nn_user, place]
                nn_mean = R_Mean.loc[nn_user, 'R_Mean']
                if nn_rating != 0:
                    num += sim * (nn_rating - nn_mean)
                    den += sim
            if den != 0:
                pred_rating = r_mean + num / den
            else:
                pred_rating = r_mean
            Prediction_Matrix.loc[user, place] = pred_rating
        else:
            Prediction_Matrix.loc[user, place] = rating_data.loc[user, place]

Prediction_Matrix

Unnamed: 0_level_0,Place 132560,Place 132561,Place 132564,Place 132572,Place 132583,Place 132584,Place 132594,Place 132608,Place 132609,Place 132613,...,Place 135080,Place 135081,Place 135082,Place 135085,Place 135086,Place 135088,Place 135104,Place 135106,Place 135108,Place 135109
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,0.076923,0.076923,0.076923,1.484615,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,...,0.076923,0.076923,0.076923,0.0,0.076923,0.076923,0.076923,1.428638,0.076923,0.076923
U1002,0.107692,0.107692,0.107692,0.969231,0.107692,0.107692,0.107692,0.107692,0.107692,0.107692,...,0.107692,1.030769,0.107692,1.0,0.107692,0.107692,0.107692,1.0,0.107692,0.107692
U1003,0.130769,0.130769,0.130769,0.95786,0.130769,0.130769,0.130769,0.130769,0.130769,0.130769,...,2.0,0.130769,0.130769,1.524834,0.95786,0.130769,0.130769,0.130769,0.130769,0.130769
U1004,0.115385,0.115385,0.115385,0.892308,0.115385,0.115385,0.115385,0.115385,0.115385,0.115385,...,1.892308,1.038462,0.115385,1.986619,1.388787,0.115385,0.115385,2.0,0.115385,0.115385
U1005,0.092308,0.092308,0.092308,0.092308,0.092308,0.092308,0.092308,0.092308,0.092308,0.092308,...,0.092308,1.475456,2.030769,1.930769,0.092308,0.092308,0.092308,1.930769,1.030769,0.092308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U1134,0.176923,0.176923,0.176923,0.0,0.176923,0.176923,0.176923,0.176923,0.176923,0.176923,...,1.0,0.176923,0.176923,2.0,1.069231,0.176923,0.176923,0.176923,0.176923,0.176923
U1135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1136,0.123077,0.123077,0.123077,1.0,0.123077,0.123077,0.123077,0.123077,0.123077,0.123077,...,0.992308,0.123077,0.123077,1.346231,0.123077,0.123077,0.123077,1.015385,0.123077,0.123077
U1137,0.169231,0.169231,0.169231,0.169231,0.169231,0.169231,0.169231,0.169231,0.169231,0.169231,...,2.038462,0.169231,0.169231,2.0,0.169231,0.169231,0.169231,1.061538,0.169231,0.169231


## 7.1 : Recommend 5 not-yet visted to user with place and predicited ratings

In [60]:
# recommended_places

# sort the predicted rating in descending order with its corresponding place
Prediction_Sorted = Prediction_Matrix.apply(lambda x: pd.Series(x.sort_values(ascending=False)
                                                         .iloc[:5].index.tolist()), axis=1)

# compare the place id with test data and skip the ones that are already in the test data
test_data_places = set(rating_testset["PlaceID"].unique().tolist())

Recommendations = Prediction_Sorted.apply(lambda x: [(place, Prediction_Matrix.loc[user, place]) for place in x
                                                      if place not in test_data_places][:5], axis=1)


Recommendations

user
U1001    [(Place 135025, 1.9000000000000004), (Place 13...
U1002    [(Place 132825, 1.4246963562753037), (Place 13...
U1003    [(Place 135052, 1.2634992458521872), (Place 13...
U1004    [(Place 135030, 0.038461538461538464), (Place ...
U1005    [(Place 135082, 0.038461538461538464), (Place ...
                               ...                        
U1134    [(Place 135032, 0.038461538461538464), (Place ...
U1135    [(Place 132560, 0.038461538461538464), (Place ...
U1136    [(Place 135041, 1.9307692307692306), (Place 13...
U1137    [(Place 135038, 0.038461538461538464), (Place ...
U1138    [(Place 132921, 2.0), (Place 132922, 2.0), (Pl...
Length: 138, dtype: object

## 7.2 : Seperate the Data Series to be each columns

In [61]:
# create an empty list to store the data
data = []

# iterate through the rows of the Series
for user, ratings in Recommendations.iteritems():
    # iterate through the ratings for each user
    for place, rating in ratings:
        # append the data as a tuple to the list
        data.append((user, place, rating))

# create a DataFrame from the list of tuples
Recommendations_Seperated = pd.DataFrame(data, columns=['user', 'PlaceID', 'predicted_rating'])

# output the DataFrame
Recommendations_Seperated

Unnamed: 0,user,PlaceID,predicted_rating
0,U1001,Place 135025,1.900000
1,U1001,Place 132825,1.424696
2,U1001,Place 135052,1.263499
3,U1001,Place 135062,1.263499
4,U1001,Place 135047,0.969231
...,...,...,...
685,U1138,Place 132921,2.000000
686,U1138,Place 132922,2.000000
687,U1138,Place 135058,1.946154
688,U1138,Place 135065,1.946154


## 8 : Merge nn-data and recommendation for each users

In [62]:
# merge nn_data and Recommendations_Seperated on 'user' column
merged_df = pd.merge(nn_data.reset_index(), Recommendations_Seperated, on='user')

# rename 'user' column to 'userID'
merged_df = merged_df.rename(columns={'user': 'userID'})

# select only the required columns
merged_df = merged_df[['userID', '1thNNUserID', '2thNNUserID', '3thNNUserID', '4thNNUserID', '5thNNUserID', 
                       '6thNNUserID', '7thNNUserID', 'PlaceID', 'predicted_rating']]

# output the merged dataframe
merged_df



Unnamed: 0,userID,1thNNUserID,2thNNUserID,3thNNUserID,4thNNUserID,5thNNUserID,6thNNUserID,7thNNUserID,PlaceID,predicted_rating
0,U1001,"(U1054, 0.47)","(U1036, 0.45)","(U1024, 0.43)","(U1071, 0.42)","(U1092, 0.4)","(U1116, 0.4)","(U1055, 0.4)",Place 135025,1.900000
1,U1001,"(U1054, 0.47)","(U1036, 0.45)","(U1024, 0.43)","(U1071, 0.42)","(U1092, 0.4)","(U1116, 0.4)","(U1055, 0.4)",Place 132825,1.424696
2,U1001,"(U1054, 0.47)","(U1036, 0.45)","(U1024, 0.43)","(U1071, 0.42)","(U1092, 0.4)","(U1116, 0.4)","(U1055, 0.4)",Place 135052,1.263499
3,U1001,"(U1054, 0.47)","(U1036, 0.45)","(U1024, 0.43)","(U1071, 0.42)","(U1092, 0.4)","(U1116, 0.4)","(U1055, 0.4)",Place 135062,1.263499
4,U1001,"(U1054, 0.47)","(U1036, 0.45)","(U1024, 0.43)","(U1071, 0.42)","(U1092, 0.4)","(U1116, 0.4)","(U1055, 0.4)",Place 135047,0.969231
...,...,...,...,...,...,...,...,...,...,...
685,U1138,"(U1086, 0.41)","(U1029, 0.4)","(U1002, 0.36)","(U1090, 0.34)","(U1027, 0.32)","(U1046, 0.31)","(U1112, 0.3)",Place 132921,2.000000
686,U1138,"(U1086, 0.41)","(U1029, 0.4)","(U1002, 0.36)","(U1090, 0.34)","(U1027, 0.32)","(U1046, 0.31)","(U1112, 0.3)",Place 132922,2.000000
687,U1138,"(U1086, 0.41)","(U1029, 0.4)","(U1002, 0.36)","(U1090, 0.34)","(U1027, 0.32)","(U1046, 0.31)","(U1112, 0.3)",Place 135058,1.946154
688,U1138,"(U1086, 0.41)","(U1029, 0.4)","(U1002, 0.36)","(U1090, 0.34)","(U1027, 0.32)","(U1046, 0.31)","(U1112, 0.3)",Place 135065,1.946154


## 9 : Merge and Recommend users in test set

In [63]:
# Find unique users in test data
unique_users = rating_testset['userID'].unique()
unique_users_df = pd.DataFrame({'userID': unique_users})

result = pd.merge(unique_users_df.reset_index(), merged_df, on='userID')
result = result.drop(['index'], axis=1)
result.to_csv('Group1_Part1_RECOMMEND_12.csv', index=False)
result

Unnamed: 0,userID,1thNNUserID,2thNNUserID,3thNNUserID,4thNNUserID,5thNNUserID,6thNNUserID,7thNNUserID,PlaceID,predicted_rating
0,U1003,"(U1029, 0.47)","(U1137, 0.47)","(U1061, 0.39)","(U1134, 0.38)","(U1009, 0.34)","(U1048, 0.3)","(U1108, 0.3)",Place 135052,1.263499
1,U1003,"(U1029, 0.47)","(U1137, 0.47)","(U1061, 0.39)","(U1134, 0.38)","(U1009, 0.34)","(U1048, 0.3)","(U1108, 0.3)",Place 135032,0.038462
2,U1003,"(U1029, 0.47)","(U1137, 0.47)","(U1061, 0.39)","(U1134, 0.38)","(U1009, 0.34)","(U1048, 0.3)","(U1108, 0.3)",Place 132937,1.315929
3,U1003,"(U1029, 0.47)","(U1137, 0.47)","(U1061, 0.39)","(U1134, 0.38)","(U1009, 0.34)","(U1048, 0.3)","(U1108, 0.3)",Place 135059,1.477328
4,U1003,"(U1029, 0.47)","(U1137, 0.47)","(U1061, 0.39)","(U1134, 0.38)","(U1009, 0.34)","(U1048, 0.3)","(U1108, 0.3)",Place 132755,0.038462
...,...,...,...,...,...,...,...,...,...,...
70,U1137,"(U1003, 0.47)","(U1029, 0.45)","(U1002, 0.39)","(U1009, 0.37)","(U1045, 0.34)","(U1116, 0.34)","(U1077, 0.33)",Place 135038,0.038462
71,U1137,"(U1003, 0.47)","(U1029, 0.45)","(U1002, 0.39)","(U1009, 0.37)","(U1045, 0.34)","(U1116, 0.34)","(U1077, 0.33)",Place 135051,0.900000
72,U1137,"(U1003, 0.47)","(U1029, 0.45)","(U1002, 0.39)","(U1009, 0.37)","(U1045, 0.34)","(U1116, 0.34)","(U1077, 0.33)",Place 135025,1.900000
73,U1137,"(U1003, 0.47)","(U1029, 0.45)","(U1002, 0.39)","(U1009, 0.37)","(U1045, 0.34)","(U1116, 0.34)","(U1077, 0.33)",Place 135080,0.038462


# Part II:A simple collaborative based filtering algorithm and evaluation

## 1 : Pivoted Data

In [64]:
# Read the data into a pandas dataframe
data = rating_trainset
data.columns = ['user', 'place', 'rating']

# Pivot the data to transform it into a table with unique users and their ratings for each place
pivoted_data = data.pivot(index='user', columns='place', values='rating')

# Rename the columns to include 'Place' before the place id
pivoted_data.columns = ['Place ' + str(col) for col in pivoted_data.columns]

user_profile = pivoted_data.copy()
user_profile.fillna(0, inplace=True)
user_profile

Unnamed: 0_level_0,Place 132560,Place 132561,Place 132564,Place 132572,Place 132583,Place 132584,Place 132594,Place 132608,Place 132609,Place 132613,...,Place 135080,Place 135081,Place 135082,Place 135085,Place 135086,Place 135088,Place 135104,Place 135106,Place 135108,Place 135109
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
U1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
U1005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U1134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
U1135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2 : Adjusted Mean

In [65]:
# create a new DataFrame to store the means
mean_data = pd.DataFrame(columns=['R_Mean'])

# for each user, calculate the mean rating
for user, row in pivoted_data_copy.iterrows():
    mean = row.sum() / row.count()
    mean_data.loc[user] = {'R_Mean': mean}
mean_data

Unnamed: 0,R_Mean
U1001,0.076923
U1002,0.107692
U1003,0.130769
U1004,0.115385
U1005,0.092308
...,...
U1134,0.176923
U1135,0.000000
U1136,0.123077
U1137,0.169231


## 3 : User Profile

In [76]:
# Load the user profile data into a pandas DataFrame
user_profile = user_profile.copy()

# Load the R_Mean data into a pandas DataFrame
R_Mean = mean_data.copy()

# Subtract the R_Mean value from each column in the user profile
for user in user_profile.index:
    user_profile.loc[user] = user_profile.loc[user] - R_Mean.loc[user]['R_Mean']

user_profile.to_csv('Group1_Part2_PROFILE_21.csv', index=True)
# Output the modified user profile DataFrame
user_profile

Unnamed: 0_level_0,Place 132560,Place 132561,Place 132564,Place 132572,Place 132583,Place 132584,Place 132594,Place 132608,Place 132609,Place 132613,...,Place 135080,Place 135081,Place 135082,Place 135085,Place 135086,Place 135088,Place 135104,Place 135106,Place 135108,Place 135109
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,...,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846,-0.153846
U1002,-0.215385,-0.215385,-0.215385,-0.215385,-0.215385,-0.215385,-0.215385,-0.215385,-0.215385,-0.215385,...,-0.215385,-0.215385,-0.215385,0.784615,-0.215385,-0.215385,-0.215385,0.784615,-0.215385,-0.215385
U1003,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,...,1.738462,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538,-0.261538
U1004,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,...,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,-0.230769,1.769231,-0.230769,-0.230769
U1005,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,...,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615,-0.184615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U1134,-0.353846,-0.353846,-0.353846,-0.353846,-0.353846,-0.353846,-0.353846,-0.353846,-0.353846,-0.353846,...,0.646154,-0.353846,-0.353846,1.646154,-0.353846,-0.353846,-0.353846,-0.353846,-0.353846,-0.353846
U1135,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
U1136,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,...,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154,-0.246154
U1137,-0.338462,-0.338462,-0.338462,-0.338462,-0.338462,-0.338462,-0.338462,-0.338462,-0.338462,-0.338462,...,-0.338462,-0.338462,-0.338462,1.661538,-0.338462,-0.338462,-0.338462,-0.338462,-0.338462,-0.338462


## 4 : Similarity Matrix using adjusted cosine

In [78]:
# Compute mean adjusted ratings
mean_adjusted_ratings = user_profile.sub(user_profile.mean(axis=0), axis=1)

# Compute adjusted cosine similarity matrix
adj_cos_sim_matrix = np.zeros((len(mean_adjusted_ratings.columns), len(mean_adjusted_ratings.columns)))

for i in range(len(mean_adjusted_ratings.columns)):
    for j in range(i, len(mean_adjusted_ratings.columns)):
        dot_product = np.dot(mean_adjusted_ratings.iloc[:, i], mean_adjusted_ratings.iloc[:, j])
        norm_i = np.linalg.norm(mean_adjusted_ratings.iloc[:, i])
        norm_j = np.linalg.norm(mean_adjusted_ratings.iloc[:, j])
        adj_cos_sim = dot_product / (norm_i * norm_j)
        adj_cos_sim_matrix[i, j] = adj_cos_sim
        adj_cos_sim_matrix[j, i] = adj_cos_sim

# Convert the matrix to a dataframe with place IDs as index and columns
adj_cos_sim_df = pd.DataFrame(adj_cos_sim_matrix, index=mean_adjusted_ratings.columns, columns=mean_adjusted_ratings.columns)

# Replace NaN values with 0
adj_cos_sim_df.fillna(0, inplace=True)

adj_cos_sim_df.to_csv('Group1_Part2_SIMILARITY_22.csv', index=True)
# Print the resulting dataframe
adj_cos_sim_df

Unnamed: 0,Place 132560,Place 132561,Place 132564,Place 132572,Place 132583,Place 132584,Place 132594,Place 132608,Place 132609,Place 132613,...,Place 135080,Place 135081,Place 135082,Place 135085,Place 135086,Place 135088,Place 135104,Place 135106,Place 135108,Place 135109
Place 132560,1.000000,0.380406,0.279328,0.001343,0.266694,0.489841,0.612450,0.242801,0.369639,0.226775,...,0.030004,0.011855,0.137641,-0.256516,0.056713,0.273819,0.422171,0.010027,0.081334,0.284485
Place 132561,0.380406,1.000000,0.261937,0.012145,0.254979,0.193826,0.333666,0.226507,0.351162,0.211330,...,0.051144,0.022350,0.135797,-0.222675,0.071116,0.254678,0.227821,0.021211,0.080295,0.269457
Place 132564,0.279328,0.261937,1.000000,-0.006481,0.183567,0.134839,0.242634,0.360778,0.255631,0.148222,...,0.030363,0.003520,0.092786,-0.193136,0.041872,0.180763,0.160794,0.004148,0.046625,0.194309
Place 132572,0.001343,0.012145,-0.006481,1.000000,-0.014555,-0.021234,-0.008009,-0.011193,0.005608,-0.020281,...,0.038764,-0.118941,-0.053837,-0.090957,0.189761,-0.008797,-0.025466,0.095926,-0.034253,-0.005204
Place 132583,0.266694,0.254979,0.183567,-0.014555,1.000000,0.133593,0.232743,0.158016,0.247723,0.145894,...,0.014099,-0.004810,0.086160,-0.203271,0.028552,0.416506,0.156255,-0.004480,0.042301,0.188685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Place 135088,0.273819,0.254678,0.180763,-0.008797,0.416506,0.128412,0.236785,0.154263,0.248830,0.141836,...,0.031799,0.001743,0.089470,-0.193226,0.041082,1.000000,0.154782,0.002899,0.042451,0.540462
Place 135104,0.422171,0.227821,0.160794,-0.025466,0.156255,0.531880,0.841259,0.330198,0.375349,0.498159,...,0.008806,-0.014459,0.072145,-0.207006,0.018500,0.154782,1.000000,-0.012815,0.027651,0.166923
Place 135106,0.010027,0.021211,0.004148,0.095926,-0.004480,-0.008910,0.002297,-0.000371,0.015075,-0.008062,...,-0.100336,0.004080,-0.040620,-0.086355,-0.099321,0.002899,-0.012815,1.000000,0.131497,0.004345
Place 135108,0.081334,0.080295,0.046625,-0.034253,0.042301,0.021598,0.063863,0.035779,0.074953,0.026395,...,-0.059085,0.131890,0.602403,-0.196035,-0.059486,0.042451,0.027651,0.131497,1.000000,0.050448


## 5 : K-NN for places

In [68]:
similarity_matrix = adj_cos_sim_df.copy()

# create a new DataFrame to store the NN information
nn_data = pd.DataFrame(index=similarity_matrix.index, columns=[f'{i}thNNPlaceID' for i in range(1, 8)])

# for each row in the similarity matrix, find the 7 most similar users
for user in similarity_matrix.index:
    sim_series = similarity_matrix.loc[user].sort_values(ascending=False)
    nn_series = sim_series.iloc[1:8]
    nn_tuples = [(idx, round(nn_series.loc[idx], 2)) for idx in nn_series.index]
    nn_data.loc[user] = nn_tuples
nn_data

Unnamed: 0,1thNNPlaceID,2thNNPlaceID,3thNNPlaceID,4thNNPlaceID,5thNNPlaceID,6thNNPlaceID,7thNNPlaceID
Place 132560,"(Place 132732, 0.81)","(Place 132667, 0.56)","(Place 132594, 0.48)","(Place 132663, 0.48)","(Place 132584, 0.41)","(Place 132630, 0.31)","(Place 132740, 0.31)"
Place 132561,"(Place 132665, 0.73)","(Place 132654, 0.66)","(Place 132626, 0.61)","(Place 132706, 0.57)","(Place 135040, 0.19)","(Place 132560, 0.15)","(Place 132766, 0.15)"
Place 132564,"(Place 132717, 0.81)","(Place 132733, 0.43)","(Place 132715, 0.35)","(Place 132740, 0.3)","(Place 132626, 0.29)","(Place 132608, 0.27)","(Place 132660, 0.23)"
Place 132572,"(Place 135075, 0.5)","(Place 135048, 0.43)","(Place 132884, 0.31)","(Place 135074, 0.31)","(Place 135046, 0.28)","(Place 132875, 0.28)","(Place 135034, 0.22)"
Place 132583,"(Place 134986, 0.58)","(Place 132768, 0.46)","(Place 135001, 0.38)","(Place 135088, 0.32)","(Place 135000, 0.29)","(Place 132773, 0.29)","(Place 135018, 0.16)"
...,...,...,...,...,...,...,...
Place 135088,"(Place 135109, 0.47)","(Place 132768, 0.45)","(Place 134986, 0.37)","(Place 135016, 0.36)","(Place 135018, 0.36)","(Place 132583, 0.32)","(Place 132773, 0.15)"
Place 135104,"(Place 132594, 0.82)","(Place 132667, 0.81)","(Place 132663, 0.63)","(Place 132584, 0.48)","(Place 132613, 0.44)","(Place 132733, 0.39)","(Place 132740, 0.39)"
Place 135106,"(Place 135041, 0.33)","(Place 135052, 0.32)","(Place 135062, 0.26)","(Place 135060, 0.23)","(Place 135028, 0.21)","(Place 135048, 0.2)","(Place 132885, 0.18)"
Place 135108,"(Place 135054, 0.64)","(Place 135082, 0.59)","(Place 135071, 0.48)","(Place 132851, 0.47)","(Place 132854, 0.44)","(Place 135069, 0.41)","(Place 135057, 0.39)"


## 6 : Predicition Matrix

In [69]:
# Pred_Rating(u, j) = Σ sim(i, j) * Rating(u, i) / Σ sim(i, j)
rating_data = pivoted_data.copy()
K_NNPlaces = nn_data

# create a copy of rating_data that replaces NaN with 0
rating_data_copy = rating_data.fillna(0)

# create an empty DataFrame for the predictions
Prediction_Matrixp2 = pd.DataFrame(columns=rating_data.columns, index=rating_data.index)

# for each user and place, calculate the predicted rating
for user in rating_data.index:
    for place in rating_data.columns:
        if pd.isna(rating_data.loc[user, place]):
            knn_data = K_NNPlaces.loc[place]
            num = 0
            den = 0
            for i in range(1, 8):
                nn_place, sim = knn_data[f'{i}thNNPlaceID']
                nn_rating = rating_data_copy.loc[user, nn_place]
                if nn_rating != 0:
                    num += sim * nn_rating
                    den += sim
            if den != 0:
                pred_rating = num / den
            else:
                pred_rating = 0
            Prediction_Matrixp2.loc[user, place] = pred_rating
        else:
            Prediction_Matrixp2.loc[user, place] = rating_data.loc[user, place]

Prediction_Matrixp2

Unnamed: 0_level_0,Place 132560,Place 132561,Place 132564,Place 132572,Place 132583,Place 132584,Place 132594,Place 132608,Place 132609,Place 132613,...,Place 135080,Place 135081,Place 135082,Place 135085,Place 135086,Place 135088,Place 135104,Place 135106,Place 135108,Place 135109
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0,0,0,0,0,0
U1002,0,0,0,0,0,0,0,0,0,0,...,1.0,0,0,1.0,0,0,0,1.0,0,0
U1003,0,0,0,2.0,0,0,0,0,0,0,...,2.0,0,0,1.509434,2.0,0,0,0,0,0
U1004,0,0,0,0,0,0,0,0,0,0,...,2.0,0,0,0,2.0,0,0,2.0,0,0
U1005,0,0,0,0,0,0,0,0,0,0,...,0,1.563291,1.0,0,0,0,0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U1134,0,0,0,0.0,0,0,0,0,0,0,...,1.0,0,0,2.0,0,0,0,0,0,0
U1135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0,0,0,0.0,0,0
U1136,0,0,0,1.27451,0,0,0,0,0,0,...,1.0,0,0,1.490566,2.0,0,0,1.512195,0,0
U1137,0,0,0,2.0,0,0,0,0,0,0,...,2.0,0,0,2.0,2.0,0,0,0,0,0


## 7.1 : Recommend 10 not yet visited and predicited ratings

In [70]:
# sort the predicted rating in descending order with its corresponding place
Prediction_Sortedp2 = Prediction_Matrixp2.apply(lambda x: pd.Series(x.sort_values(ascending=False)
                                                         .iloc[:10].index.tolist()), axis=1)

# compare the place id with test data and skip the ones that are already in the test data
test_data_places = set(rating_testset["PlaceID"].unique().tolist())

Recommendationsp2 = Prediction_Sortedp2.apply(lambda x: [(place, Prediction_Matrix.loc[user, place]) for place in x
                                                      if place not in test_data_places and place not in rating_data.loc[user].dropna().index][:10], axis=1)
Recommendationsp2

user
U1001    [(Place 135047, 0.9692307692307692), (Place 13...
U1002    [(Place 135045, 1.9307692307692308), (Place 13...
U1003    [(Place 135065, 1.9461538461538463), (Place 13...
U1004    [(Place 135051, 0.9), (Place 135044, 0.9461538...
U1005    [(Place 132872, 0.9615384615384616), (Place 13...
                               ...                        
U1134    [(Place 132862, 1.449120879120879), (Place 132...
U1135    [(Place 132560, 0.038461538461538464), (Place ...
U1136    [(Place 132885, 0.038461538461538464), (Place ...
U1137    [(Place 135025, 1.9000000000000004), (Place 13...
U1138    [(Place 132958, 0.038461538461538464), (Place ...
Length: 138, dtype: object

## 7.2 : Seperate the Data Series to be each columns 

In [71]:
# create an empty list to store the data
data = []

# iterate through the rows of the Series
for user, ratings in Recommendationsp2.iteritems():
    # iterate through the ratings for each user
    for place, rating in ratings:
        # append the data as a tuple to the list
        data.append((user, place, rating))

# create a DataFrame from the list of tuples
Recommendations_Seperated = pd.DataFrame(data, columns=['userID', 'PlaceID', 'predicted_rating'])

# output the DataFrame
Recommendations_Seperated

Unnamed: 0,userID,PlaceID,predicted_rating
0,U1001,Place 135047,0.969231
1,U1001,Place 135025,1.900000
2,U1001,Place 132825,1.424696
3,U1001,Place 132834,1.000000
4,U1001,Place 135059,1.477328
...,...,...,...
1340,U1138,Place 132875,0.961538
1341,U1138,Place 135046,0.038462
1342,U1138,Place 132954,1.900000
1343,U1138,Place 132862,1.449121


## 8 : Merge test and recommendation

In [72]:
# Find unique users in test data
unique_users = rating_testset['userID'].unique()
unique_users_df = pd.DataFrame({'userID': unique_users})
result = pd.merge(unique_users_df.reset_index(), Recommendations_Seperated, on='userID')
result = result.drop(['index'], axis=1)
result.to_csv('Group1_Part2_RECOMMEND_23.csv', index=False)
result

Unnamed: 0,userID,PlaceID,predicted_rating
0,U1003,Place 135065,1.946154
1,U1003,Place 135035,0.038462
2,U1003,Place 132754,0.038462
3,U1003,Place 132755,0.038462
4,U1003,Place 132955,0.038462
...,...,...,...
139,U1137,Place 135059,1.477328
140,U1137,Place 132825,1.424696
141,U1137,Place 132951,0.944822
142,U1137,Place 132834,1.000000


## 9 : Calculate RSME, Precision, and Recall

In [73]:
pivoted_data = user_profile
test_data = rating_testset
Prediction_Matrix = Prediction_Matrixp2

# create a list to store the predicted and actual ratings
predicted_ratings = []
actual_ratings = []
userlist = []
placelist = []

# iterate over the rows in the test dataset
for index, row in test_data.iterrows():
    user_id = row['userID']
    place_id = row['PlaceID']
    actual_rating = row['Rating']
    
    # get the predicted rating from the Prediction_Matrix
    predicted_rating = Prediction_Matrix.loc[user_id, f'Place {place_id}']
    
    # add the predicted and actual ratings to the list
    userlist.append(user_id)
    placelist.append(place_id)
    predicted_ratings.append(predicted_rating)
    actual_ratings.append(actual_rating)
    

# compute the MSE and RMSE
mse = np.mean(np.power(np.array(actual_ratings) - np.array(predicted_ratings), 2))
rmse = np.sqrt(mse)

# Set a range of threshold values for the predicted ratings
threshold_values = [1.5]

# Initialize variables to store the best threshold and its corresponding precision and recall
best_threshold = None
best_precision = 0
best_recall = 0
# Initialize lists to store results
rmse_list = []
precision_list = []
recall_list = []

# Iterate over the threshold values
Test = pd.DataFrame({
    'userID': userlist,
    'PlaceID': placelist,
    'Actual Rating': actual_ratings,
    'Predicted Rating': predicted_ratings
})
for threshold in threshold_values:
    # Calculate the precision and recall for the current threshold
    predicted_labels = np.where(Test['Predicted Rating'] >= threshold, 1, 0)
    actual_labels = np.where(Test ['Actual Rating']>= threshold, 1, 0)

    true_positives = np.sum(np.logical_and(predicted_labels == 1, actual_labels == 1))
    false_positives = np.sum(np.logical_and(predicted_labels == 1, actual_labels == 0))
    false_negatives = np.sum(np.logical_and(predicted_labels == 0, actual_labels == 1))

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    # Update the best threshold and its corresponding precision and recall if applicable
    if precision + recall > best_precision + best_recall:
        best_threshold = threshold
        best_precision = precision
        best_recall = recall
rmse_list.append(rmse)
precision_list.append(precision)
recall_list.append(recall)

# Combine lists into a DataFrame
Evaluation = pd.DataFrame({
    'RMSE': rmse_list,
    'Precision': precision_list,
    'Recall': recall_list
})


Evaluation.to_csv("Group1_Part2_EVAL_24.csv",index=False)

Evaluation

Unnamed: 0,RMSE,Precision,Recall
0,0.983406,0.666667,0.5


In [74]:
Test

Unnamed: 0,userID,PlaceID,Actual Rating,Predicted Rating
0,U1003,132825,2,2.0
1,U1003,135079,2,2.0
2,U1006,132825,1,0.0
3,U1006,135079,1,0.0
4,U1009,132834,2,1.323529
5,U1009,135038,2,1.0
6,U1016,132834,2,2.0
7,U1016,135060,2,2.0
8,U1022,135038,2,1.515625
9,U1022,135062,1,2.0
