In [258]:
import pandas as pd
import numpy as np
import math

# Folder holding the downloaded data set; change it here if the data lives elsewhere.
DATA_DIR = "./ml-latest-small"

# Input CSV files read by the notebook.
ratings_path = DATA_DIR + "/ratings.csv"
movies_path = DATA_DIR + "/movies.csv"
tags_path = DATA_DIR + "/tags.csv"

# Output file that receives the pairwise user-similarity table.
similar_users_path = "./similar_users.csv"

In [259]:
# Bounds of the rating scale assumed by the skewness formula below.
RATING_MIN = 0.5
RATING_MAX = 5.0

Ratings = pd.read_csv(ratings_path)

# Centre every rating on the average rating of the user who gave it.
# transform('mean') broadcasts the per-user mean back onto each row, so no
# separate aggregate-and-merge step is needed and the row order is untouched.
Ratings['rating_mean'] = Ratings.groupby('userId', sort=False)['rating'].transform('mean')
Ratings['rating_adjusted'] = Ratings['rating'] - Ratings['rating_mean']

# mean_skewness: how far a rating sits from the user's mean, as a fraction of
# the room left on that side of the scale (always >= 0):
#   above the mean -> adjusted / (RATING_MAX - mean)
#   below the mean -> -adjusted / (mean - RATING_MIN)
#   on the mean    -> 0
# The explicit "on the mean" case matters: a user whose mean equals a scale
# bound would otherwise evaluate 0/0 and leak NaN into the derived columns.
rating_adjusted = Ratings['rating_adjusted']
skew_above = rating_adjusted / (RATING_MAX - Ratings['rating_mean'])
skew_below = -rating_adjusted / (Ratings['rating_mean'] - RATING_MIN)
Ratings['mean_skewness'] = skew_above.where(
    rating_adjusted > 0,
    skew_below.where(rating_adjusted < 0, 0.0),
)

# Stretch the centred rating by a factor that grows with its skewness.
Ratings['rating_unskewed'] = rating_adjusted * np.sqrt(1 + 2 * Ratings['mean_skewness'] ** 2)

# Sorted array of the user ids present in the data.
distinct_users = np.unique(Ratings['userId'])

In [260]:
# Build the table of pairwise user similarities.
#
# For every ordered pair (user1, user2) that shares at least one rated movie:
#   pearson          = sum over shared movies of adjusted1 * adjusted2
#                      / (|adjusted1| * |adjusted2|)
#   unskewed_pearson = sum over shared movies of unskewed1 * unskewed2
#                      / (|adjusted1| * |adjusted2|)
# where |adjusted| is the Euclidean length of ALL of a user's mean-centred
# ratings (not only the shared ones).
similarity_columns = ['userId', 'userId2', 'pearson', 'unskewed_pearson']

# Slice the ratings and compute the vector length once per user, instead of
# re-filtering the whole Ratings table for every pair inside the inner loop.
user_profiles = {}
for profile_user, profile_rows in Ratings.groupby('userId', sort=False):
    user_profiles[profile_user] = (
        profile_rows[['movieId', 'rating_adjusted', 'rating_unskewed']],
        np.sqrt(np.square(profile_rows['rating_adjusted']).sum()),
    )

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame row by row is quadratic and DataFrame.append no
# longer exists in current pandas.
pair_scores = []
for user1 in distinct_users:
    user1_ratings, user1_vector_length = user_profiles[user1]

    for user2 in distinct_users:
        # A user is not their own twin.
        if user2 == user1:
            continue

        user2_ratings, user2_vector_length = user_profiles[user2]
        length_product = user1_vector_length * user2_vector_length
        # A user whose ratings all equal their mean has a zero-length vector;
        # the similarity would be 0/0, so the pair is left out.
        if length_product == 0:
            continue

        # Movies rated by both users; the suffixes give <column>1 / <column>2.
        shared = pd.merge(user1_ratings, user2_ratings, on='movieId',
                          how='inner', sort=False, suffixes=('1', '2'))
        if shared.empty:
            continue

        vector_product = (shared['rating_adjusted1'] * shared['rating_adjusted2']).sum()
        unskewed_product = (shared['rating_unskewed1'] * shared['rating_unskewed2']).sum()
        pair_scores.append((
            user1,
            user2,
            vector_product / length_product,
            unskewed_product / length_product,
        ))

similar_users = pd.DataFrame(pair_scores, columns=similarity_columns)

# Discard any remaining pair whose pearson score reaches 0.99999.
similar_users = similar_users[similar_users['pearson'] < 0.99999]
similar_users.to_csv(similar_users_path)

#results = similar_users.sort_values(['unskewed_pearson'], ascending=False).head(20)

In [None]:
# Display the pairwise user-similarity table computed in the previous cell.
similar_users

In [262]:
def user_twins(userId, max_twins=30):
    """Return the users most similar to ``userId``.

    Reads the notebook-level ``similar_users`` table.

    Parameters
    ----------
    userId : int
        User whose "twins" are looked up (matched against the ``userId`` column).
    max_twins : int, default 30
        Number of best candidates (by ``unskewed_pearson``) that are considered.
        The cap is applied before the positivity filter, so fewer rows than
        ``max_twins`` may come back.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per twin and the columns
        ``[userId2, unskewed_pearson]``, ordered by descending
        ``unskewed_pearson``. Only candidates whose ``pearson`` and
        ``unskewed_pearson`` are both strictly positive are kept. The array has
        shape ``(0, 2)`` when nothing qualifies.
    """
    candidates = similar_users[similar_users['userId'] == userId]
    closest = candidates.sort_values(['unskewed_pearson'], ascending=False).head(max_twins)
    agreeing = closest[(closest['pearson'] > 0) & (closest['unskewed_pearson'] > 0)]
    # Select the output columns by name so extra columns in similar_users
    # cannot shift their positions; .values replaces the removed as_matrix().
    return agreeing[['userId2', 'unskewed_pearson']].values

def user_collaborative_recommendations(userId):
    """Rank movies for ``userId`` from the positively rated movies of their twins.

    For every twin returned by ``user_twins(userId)`` the twin's rows in the
    notebook-level ``Ratings`` table with ``rating_unskewed > 0`` are taken and
    scored as ``rating_unskewed * <twin's unskewed_pearson>``.

    Parameters
    ----------
    userId : int
        User to produce recommendations for.

    Returns
    -------
    numpy.ndarray
        1-D array of ``movieId`` values ordered by descending score. A movie
        appears once per twin that rated it positively, so ids can repeat.
        The array is empty when the user has no twins.
    """
    weighted_frames = []
    for twin_id, proximity in user_twins(userId):
        # user_twins returns a float array, so the id is cast back to int.
        twin_ratings = Ratings[Ratings['userId'] == int(twin_id)]
        liked = twin_ratings[twin_ratings['rating_unskewed'] > 0].sort_values(
            ['rating_unskewed'], ascending=False)
        weighted_frames.append(liked.assign(
            proximity_weight=proximity,
            proximity_rating=liked['rating_unskewed'] * proximity,
        ))

    # Without twins there is nothing to concatenate or sort.
    if not weighted_frames:
        return Ratings['movieId'].values[:0]

    twin_recommendations = pd.concat(weighted_frames, ignore_index=True)
    ranked = twin_recommendations.sort_values(['proximity_rating'], ascending=False)
    return ranked['movieId'].values

In [263]:
# Example: ranked movie ids recommended for user 4.
user_collaborative_recommendations(4)

array([ 1405,  1223,  1242,   110,   260,  1148,  1225,   318,  1210,
         720,   745,  1198,  1196,    47,  6874,  5064,  4226,  3578,
        2918,  2571,  2502,   527,  1961,  1777,  1625,  2329,    50,
          32,   543,   858,   318,   356,  1197,  3949,  2959,   318,
        2762,  2302,  4011,  4963,  3147,  2858,  2791,  5445,   457,
        2194,  4034,   593, 33166,  8784, 33794, 33493,  1204,  7153,
        5952,   293,  1220,   551,   480,   380,   151,  1275,  1288,
         141,   112,   924,   534,   541,   588,  1079,  1287,  1231,
        1136,  1302,   590,  1240,    34,  1080,  1374,   594,   671,
          40, 50068, 48783,  1721,   296,  2761,  1276,  1285,  1259,
        1250,  8874,   597,  1380,  1035,  2081, 33166,   778,   110,
        1235,  2858,  3510,  2028,  2841, 84236,  1378,  2318,  1884,
        8636,  2529,  2072,  2019,   903,   596,   111,  3114,  7361,
        2174,  2692,    50,  1358,  1719,  1923,  2344,  2571,  2826,
        2926,  1611,

In [264]:
# Display the ratings table together with the derived per-user columns
# (rating_mean, rating_adjusted, mean_skewness, rating_unskewed).
Ratings

Unnamed: 0,userId,movieId,rating,timestamp,rating_mean,rating_adjusted,mean_skewness,rating_unskewed
0,1,31,2.5,1260759144,2.550000,-0.050000,0.024390,-0.050030
1,1,1029,3.0,1260759179,2.550000,0.450000,0.183673,0.464933
2,1,1061,3.0,1260759182,2.550000,0.450000,0.183673,0.464933
3,1,1129,2.0,1260759185,2.550000,-0.550000,0.268293,-0.588259
4,1,1172,4.0,1260759205,2.550000,1.450000,0.591837,1.890870
5,1,1263,2.0,1260759151,2.550000,-0.550000,0.268293,-0.588259
6,1,1287,2.0,1260759187,2.550000,-0.550000,0.268293,-0.588259
7,1,1293,2.0,1260759148,2.550000,-0.550000,0.268293,-0.588259
8,1,1339,3.5,1260759125,2.550000,0.950000,0.387755,1.083462
9,1,1343,2.0,1260759131,2.550000,-0.550000,0.268293,-0.588259
