In [None]:
import pandas as pd
import numpy as np
import math

# Input CSV files, all located in the ./ml-latest-small directory.
ratings_path = "./ml-latest-small/ratings.csv"
movies_path = "./ml-latest-small/movies.csv"
tags_path = "./ml-latest-small/tags.csv"

# CSV file in which the computed item-similarity table is stored and from
# which later cells try to reload it.
top_similar_items_path = "./top_similar_items.csv"

# Empty placeholder frames; each name is reassigned once the data is loaded.
Ratings, Movies, TopSimilarItems = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

In [None]:
def load_dataset(dataset):
    """Read a CSV file into a DataFrame and print whether it succeeded.

    Parameters
    ----------
    dataset : sequence
        Two items: ``dataset[0]`` is the name used in the printed message and
        ``dataset[1]`` is the path handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or ``None`` when the file could not be read.
    """
    dataset_name, dataset_path = dataset[0], dataset[1]
    try:
        data = pd.read_csv(dataset_path)
    except Exception:
        # Callers test the result against None to detect a missing file, so
        # a failed read is reported and signalled rather than raised.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
        print("CAUTION: {} cannot be read from memory.".format(dataset_name))
        return None
    # ``.format`` must be applied to the string, not to ``print``'s return
    # value, which is None on Python 3.
    print("{} is successfully read from memory.".format(dataset_name))
    return data

In [None]:
def cosine_similarity(movie1, movie2):
    """Cosine of the angle between two rating vectors.

    Both arguments must provide ``.T`` and ``.dot`` (for example a pandas
    Series or a NumPy array). When either vector's length is below 1e-7 the
    similarity is reported as ``0`` instead of dividing by a near-zero value.
    """
    inner = movie1.T.dot(movie2)
    norm1 = math.sqrt(movie1.T.dot(movie1))
    norm2 = math.sqrt(movie2.T.dot(movie2))

    # Guard clause: a (near-)zero vector has no meaningful direction.
    if norm1 < 1e-7 or norm2 < 1e-7:
        return 0

    return inner / norm1 / norm2

In [None]:
def item_similarity_matrix(Ratings, max_items=10, top_n=1000, output_path=None):
    """Build, save and return the table of most-similar movies per movie.

    Each rating is centred on its user's mean and rescaled, the result is
    pivoted into a user x movie matrix, and movies are compared by the cosine
    of their columns. For every processed movie the ids of the other movies
    with a strictly positive similarity are stored in descending similarity
    order, padded with zeros.

    Parameters
    ----------
    Ratings : pandas.DataFrame
        Must contain the columns ``userId``, ``movieId`` and ``rating``. The
        frame is not modified.
    max_items : int or None, optional
        Number of movies (in ascending ``movieId`` order) whose row is
        computed; remaining rows stay all-zero. The default of 10 keeps the
        limit that used to be hard-coded here. Pass ``None`` to process every
        movie.
    top_n : int, optional
        Number of similar-movie slots per row (default 1000, as before).
    output_path : str or None, optional
        Destination CSV. ``None`` uses the module-level
        ``top_similar_items_path``.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by the sorted distinct movie ids, with columns
        ``1..top_n`` holding similar movie ids (0 marks an empty slot).
    """
    zero_norm = 0.0000001  # same cut-off as cosine_similarity in this notebook

    # Mean rating of every user, used to centre that user's ratings.
    user_means = (
        Ratings.groupby('userId', sort=False)['rating']
        .mean()
        .rename('rating_mean')
        .reset_index()
    )
    rated = pd.merge(Ratings, user_means, on='userId', how='left', sort=False)
    adjusted = rated['rating'] - rated['rating_mean']

    # Relative position of the rating between the user's mean and 5 (above
    # the mean) or 0.5 (below it) -- presumably the bounds of the rating
    # scale; TODO confirm against the data set.
    above = adjusted / (5 - rated['rating_mean']) * (adjusted > 0).astype(float)
    below = adjusted / (rated['rating_mean'] - 0.5) * (adjusted < 0).astype(float)
    # A user whose mean equals 5 or 0.5 yields 0/0 here. Treat that as "no
    # skew" so the NaN does not make pivot_table drop cells or whole columns.
    skewness = (above - below).fillna(0.0)
    rated['rating_unskewed'] = adjusted * np.sqrt(1 + (skewness ** 2) * 2)

    distinct_movies = np.unique(rated['movieId'])
    # reindex guarantees that column k of the matrix is distinct_movies[k],
    # even if pivot_table dropped a column.
    ratings_matrix = rated.pivot_table(
        index='userId', columns='movieId', values='rating_unskewed', fill_value=0
    ).reindex(columns=distinct_movies, fill_value=0)

    # One row per movie, one column per user.
    item_vectors = ratings_matrix.values.T.astype(float)
    norms = np.sqrt((item_vectors ** 2).sum(axis=1))
    comparable = norms >= zero_norm

    n_movies = distinct_movies.size
    top_ids = np.zeros((n_movies, top_n), dtype=float)
    n_rows = n_movies if max_items is None else min(max_items, n_movies)

    for movieIndex in range(n_rows):
        # Similarity to (near-)zero vectors is defined as 0.
        similarities = np.zeros(n_movies)
        if comparable[movieIndex]:
            dots = item_vectors.dot(item_vectors[movieIndex])
            similarities[comparable] = (
                dots[comparable] / norms[movieIndex] / norms[comparable]
            )
        # Exclude the movie itself explicitly instead of assuming it is
        # always the first entry after sorting.
        similarities[movieIndex] = -np.inf

        # Stable sort on the negated values: descending similarity, ties in
        # ascending movieId order.
        ranked = np.argsort(-similarities, kind='mergesort')
        ranked = ranked[similarities[ranked] > 0][:top_n]
        top_ids[movieIndex, :ranked.size] = distinct_movies[ranked]
        print("Calculated for {} items out of {}.".format(movieIndex + 1, n_movies))

    TopSimilarItems = pd.DataFrame(
        top_ids, index=distinct_movies, columns=np.arange(top_n) + 1
    )
    if output_path is None:
        output_path = top_similar_items_path
    TopSimilarItems.to_csv(output_path)
    return TopSimilarItems

In [None]:
# Load the raw data sets; each name becomes None if its file cannot be read.
Ratings = load_dataset(("Ratings", ratings_path))
Movies = load_dataset(("Movies", movies_path))

# Reuse the saved similarity table when it exists, otherwise rebuild it.
TopSimilarItems = load_dataset(("TopSimilarItems", top_similar_items_path))
if TopSimilarItems is None:
    print("Item similarity matrix is being recalculated. It might take a while...")
    TopSimilarItems = item_similarity_matrix(Ratings)
else:
    # item_similarity_matrix saves the table with DataFrame.to_csv, which
    # writes the movie-id index as the first CSV column. load_dataset reads
    # it back as ordinary data, so restore it as the index here; otherwise
    # every cached row would start with the movie's own id, unlike a freshly
    # computed table.
    TopSimilarItems = TopSimilarItems.set_index(TopSimilarItems.columns[0])

In [None]:
def item_collaborative_recommendations(movieId):
    """Return the similar-movie ids stored in ``TopSimilarItems`` for a movie.

    The row is found through the position of ``movieId`` among the sorted
    distinct ``movieId`` values of the global ``Ratings`` frame; only entries
    greater than zero are returned.

    Parameters
    ----------
    movieId : scalar
        Movie id to look up.

    Returns
    -------
    numpy.ndarray
        The positive entries of the matching ``TopSimilarItems`` row, or an
        empty float array when a scalar ``movieId`` does not occur in
        ``Ratings``.
    """
    known_movies = np.unique(Ratings['movieId'])
    movieIndex = np.searchsorted(known_movies, movieId)

    # searchsorted returns an insertion point even for ids that are absent,
    # which would silently select another movie's row (or run past the end).
    # The check is limited to scalar ids so array-like input keeps following
    # the original code path.
    if np.ndim(movieId) == 0:
        if movieIndex >= known_movies.size or known_movies[movieIndex] != movieId:
            return np.array([], dtype=float)

    similar_ids = TopSimilarItems.values[movieIndex]
    item_recommendations = np.extract(similar_ids > 0, similar_ids)
    return item_recommendations