In [1]:
import os
from pathlib import Path

import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
# Folder that holds the dataset. Defaults to the original author's local path;
# set the CF_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = Path(
    os.environ.get(
        "CF_DATA_DIR",
        "/Users/tramy/Documents/IntrotoML/Collaborative Filtering/dataset",
    )
)
# The first CSV column holds the row labels, so it becomes the index.
data = pd.read_csv(DATA_DIR / "toy_dataset.csv", index_col=0)

In [69]:
# Inspect the row labels of the ratings table (one label per user).
data.index

Index(['user 1', 'user 2', 'user 3', 'user 4', 'user 5'], dtype='object')

In [70]:
# Step 1: Fill in missing data
# Missing ratings become 0. Note that every later step (column means,
# correlations) treats these zeros as real ratings, not as "unrated".
# Assignment form instead of inplace=True, which pandas discourages.
data = data.fillna(0)
# Sanity check: the count of remaining NaNs per column should be all zeros.
data.isna().sum()

action1      0
action2      0
action3      0
romantic1    0
romantic2    0
romantic3    0
dtype: int64

In [71]:
# Display the ratings table after missing values were filled with 0.
data

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,0.0,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,0.0
user 3,1.0,0.0,0.0,4.0,5.0,4.0
user 4,0.0,2.0,1.0,4.0,0.0,3.0
user 5,1.0,0.0,2.0,3.0,3.0,4.0


In [72]:
# Step 2: Center the data (subtract the mean)
def standardize(rating):
    """Return ``rating`` shifted so that its mean is zero.

    Only the mean is removed; the values are not divided by the standard
    deviation, so the original scale is kept.

    Parameters
    ----------
    rating : object supporting ``.mean()`` and subtraction
        Called in this notebook with one column of ratings at a time.

    Returns
    -------
    Same kind of object as ``rating``, holding ``rating - rating.mean()``.
    """
    return rating - rating.mean()

In [76]:
# Center each column independently, then transpose so the columns of `data`
# become the rows of `data_std`.
data_std = data.apply(standardize, axis=0).transpose()

In [80]:
# Show the centered, transposed ratings as a plain NumPy array.
data_std.values

array([[ 1.8,  2.8, -1.2, -2.2, -1.2],
       [ 3. ,  1. , -2. ,  0. , -2. ],
       [ 1.2,  1.2, -1.8, -0.8,  0.2],
       [-2.6, -0.6,  1.4,  1.4,  0.4],
       [-0.4, -0.4,  2.6, -2.4,  0.6],
       [-1.4, -2.4,  1.6,  0.6,  1.6]])

In [85]:
# Step 3: Turn the centered ratings into a SciPy CSR sparse matrix
sparse_data = sparse.csr_matrix(data_std.values)
# Step 4: Compute the pairwise cosine similarity between the rows of
# `sparse_data`. dense_output=True (the default) is spelled out because the
# result is passed straight to pd.DataFrame in the next cell.
sim_matrix = cosine_similarity(sparse_data, dense_output=True)

In [87]:
# There are two ways to build the correlation matrix.
## 1 – Label the cosine-similarity matrix with the column names of `data`
corrMatrix = pd.DataFrame(
    data=sim_matrix,
    columns=data.columns,
    index=data.columns,
)
corrMatrix

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [105]:
## 2 – Let pandas compute the Pearson correlation between columns directly
# Pearson correlation is a number between -1 and 1 that indicates how strongly
# two variables are linearly related.
# This rebinds `corrMatrix` from the previous cell; the displayed values match
# the cosine-similarity version.
corrMatrix = data.corr(method="pearson")
corrMatrix

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [110]:
# Step 5: Define a function that scores every movie by its similarity to a rated movie
def get_similar(movie_name, rating, corr_matrix=None, neutral_rating=2.5):
    """Score all movies by their correlation with one rated movie.

    Each correlation with ``movie_name`` is multiplied by
    ``rating - neutral_rating``. A rating above the neutral point therefore
    gives positive scores to positively correlated movies, and a rating below
    it flips the sign.

    Parameters
    ----------
    movie_name : str
        Column label of the rated movie in the correlation matrix.
    rating : float
        The rating given to ``movie_name``.
    corr_matrix : pandas.DataFrame, optional
        Movie-by-movie correlation matrix. Defaults to the notebook-level
        ``corrMatrix`` when omitted.
    neutral_rating : float, optional
        Rating treated as "neither liked nor disliked". Defaults to 2.5.

    Returns
    -------
    pandas.Series
        One score per movie, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the correlation matrix.
    """
    if corr_matrix is None:
        # Fall back to the matrix built earlier in the notebook.
        corr_matrix = corrMatrix
    weighted = corr_matrix[movie_name] * (rating - neutral_rating)
    return weighted.sort_values(ascending=False)

In [111]:
# Try out some examples

# Example viewer as (movie, rating) pairs: one highly rated action movie and
# two low-rated romantic movies.
action_lover = [
    ("action1", 5),
    ("romantic2", 1),
    ("romantic3", 1),
]
type(action_lover)

list

In [112]:
# Score every movie against each movie the viewer rated: one row per rated
# movie, one column per candidate movie.
# The frame is built in a single constructor call because DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row re-copies it each time.
# get_similar is now called once per rated movie instead of twice.
similar_scores = (
    pd.DataFrame([get_similar(movie, rating) for movie, rating in action_lover])
    .reset_index(drop=True)  # rows labelled 0..n-1, as ignore_index=True did
    .sort_index(axis=1)      # deterministic, alphabetical column order
)

In [113]:
# Preview the per-rated-movie scores (shows at most the first 15 rows).
similar_scores.head(15)

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
0,2.5,1.766722,2.034204,-1.998527,-0.06348,-2.285265
1,0.038088,0.778499,0.56997,-0.222059,-1.5,-0.590909
2,1.371159,1.265061,1.203271,-1.08562,-0.590909,-1.5


In [114]:
# Add up each movie's scores across all rated movies and rank them from the
# highest total to the lowest. Movies the viewer already rated are included
# in this ranking.
similar_scores.sum().sort_values(ascending=False)

action1      3.909247
action2      3.810282
action3      3.807445
romantic2   -2.154389
romantic1   -3.306206
romantic3   -4.376174
dtype: float64