In [1]:
import json
import random
import pathlib
import warnings
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

warnings.filterwarnings('ignore')

In [2]:
def calcSimilarity():
    similarity = {}
    for u in userList:
        dfu = df[df.userId == u]
        similarity[str(u)] = {}
        for v in userList:
            if u == v:
                continue

            dfv = df[df.userId == v]
            dfuc = dfu[dfu.movieId.isin(dfv.movieId)]
            dfv = dfv[dfv.movieId.isin(dfu.movieId)]
            
            try:
                similarity[str(u)][str(v)] = similarity[str(v)][str(u)]
            except KeyError:
                if len(dfuc) > 1 and len(dfv) > 1:
                    similarity[str(u)][str(v)] = pearsonr(dfuc["rating"], dfv["rating"])[0]
                else:
                    similarity[str(u)][str(v)] = 0
        if u % 100 == 0 or u == 1:
            print(f"{u}/{totalUsers} calculados, progresso: {u / len(userList) * 100}%")
    print(f"{totalUsers}/{totalUsers} calculados, função concluída.")
    saveSimilarityJson(similarity)
    return similarity

In [3]:
def saveSimilarityJson(similarity):
    with open('result.json', 'w') as fp:
        json.dump(similarity, fp, indent=4)

In [4]:
def loadSimilarityJson():
    with open('result.json', 'r') as fp:
        return json.load(fp)

In [5]:
def resScore(users_and_scores, movie):
    n = 0.0
    d = 0.0
    for user, sim_score in users_and_scores:
        dfv = df[df.userId == user]
        rating = dfv[dfv.movieId == movie]["rating"].to_numpy()
        if len(rating) > 0:
            n += rating[0] * sim_score
            d += abs(sim_score)
    return n/d

In [6]:
def getMovieList(reference_users, user, dfml):
    df.userId = df.userId.astype(str)
    dfu = df[df.userId == user]
    dfml = dfml[~dfml.movieId.isin(dfu.movieId)]
    df_ref_users = df[df.userId.isin([ru[0] for ru in reference_users])]
    dfml = dfml[dfml.movieId.isin(df_ref_users.movieId)]
    movie_scores = []
    for new_movie in dfml.movieId:
        movie_scores.append((new_movie, resScore(reference_users, new_movie)))
        if len(movie_scores) % 100 == 0:
            print(f"{len(movie_scores)}/{len(dfml)}")
    print(f"{len(movie_scores)}/{len(dfml)}")
    return movie_scores, dfml

In [7]:
def getSimilarUsers(user, k, similarity_dict):
    return sortByTupleValue(similarity_dict[str(user)].items(), reversed=True)[:k]

In [8]:
def sortByTupleValue(base_tuple, reversed=False):
    return sorted(base_tuple, key=lambda x: x[1] * -1 if reversed else 1)

In [9]:
ratings_small_dir = pathlib.Path('datasets/ratings_small.csv')
links_small_dir = pathlib.Path('datasets/links_small.csv')
movies_metadata_dir = pathlib.Path('datasets/movies_metadata.csv')

In [10]:
df = pd.read_csv(ratings_small_dir)

userList = df["userId"].unique()
totalUsers = len(df.userId.unique())

loadJson = True

k = 7

In [11]:
similarity = loadSimilarityJson() if loadJson else calcSimilarity()

In [12]:
user = random.choice(userList)

In [13]:
k_similar_users = getSimilarUsers(user, k, similarity)

In [14]:
k_similar_users

[('26', 1.0),
 ('17', 0.5951700641394974),
 ('37', 0.5773502691896258),
 ('2', 0.5625),
 ('36', 0.5222329678670935),
 ('33', 0.5),
 ('34', 0.33333333333333337)]

In [15]:
df_movies_links = pd.read_csv(links_small_dir)
df_movies_links = df_movies_links.drop(columns=['tmdbId'])

In [16]:
recomendations, df_movies_links = getMovieList(k_similar_users, user, df_movies_links)

100/840
200/840
300/840
400/840
500/840
600/840
700/840
800/840
840/840


In [17]:
recomendations = sortByTupleValue(recomendations, reversed=True)[:10]

In [18]:
df_recommended_movies_links = df_movies_links[df_movies_links.movieId.isin([m[0] for m in recomendations])]
df_recommended_movies_links["imdbId"] = df_recommended_movies_links["imdbId"].astype(str)
df_recommended_movies_links["imdbId"] = df_recommended_movies_links["imdbId"].transform(lambda x: "tt" + (7 - len(x)) * "0" + x)

In [19]:
df_movie_metadata = pd.read_csv(movies_metadata_dir)
df_movie_metadata = df_movie_metadata[df_movie_metadata.imdb_id.isin(df_recommended_movies_links.imdbId)]

In [20]:
df_movie_metadata

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
16,False,,16500000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,4584,tt0114388,en,Sense and Sensibility,"Rich Mr. Dashwood dies, leaving his second wif...",...,1995-12-13,135000000.0,136.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Lose your heart and come to your senses.,Sense and Sensibility,False,7.2,364.0
57,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,11010,tt0110877,it,Il postino,Simple Italian postman learns to love poetry w...,...,1994-09-22,0.0,108.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,,The Postman,False,7.6,181.0
218,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",,22625,tt0112679,en,Circle of Friends,'Circle Of Friends' is set in 1950's Ireland. ...,...,1995-03-16,0.0,103.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Circle of Friends,False,6.5,30.0
228,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,10451,tt0111797,zh,飲食男女,The film tells the story of a retired and wido...,...,1994-08-03,7294403.0,123.0,"[{'iso_639_1': 'zh', 'name': '普通话'}]",Released,,Eat Drink Man Woman,False,7.5,76.0
261,False,,2000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.miramax.com/movie/like-water-for-ch...,18183,tt0103994,es,Como agua para chocolate,"Tita is passionately in love with Pedro, but h...",...,1992-04-16,21665468.0,105.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"In Tita's kitchen, ordinary spices become a re...",Like Water for Chocolate,False,6.6,70.0
262,False,,30000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",,4476,tt0110322,en,Legends of the Fall,An epic tale of three brothers and their fathe...,...,1994-12-16,160638883.0,133.0,"[{'iso_639_1': 'kw', 'name': ''}, {'iso_639_1'...",Released,After the Fall from Innocence the Legend begins.,Legends of the Fall,False,7.2,636.0
302,False,"{'id': 131, 'name': 'Three Colors Collection',...",0,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na...",,110,tt0111495,fr,Trois couleurs : Rouge,Red This is the third film from the trilogy by...,...,1994-05-27,0.0,99.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Three Colors: Red,False,7.8,246.0
304,False,"{'id': 131, 'name': 'Three Colors Collection',...",0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,109,tt0111507,fr,Trois couleurs : Blanc,Polish immigrant Karol Karol finds himself out...,...,1994-01-26,0.0,91.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,,Three Colors: White,False,7.3,218.0
311,False,,45000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,2636,tt0111255,en,The Specialist,May Munro is a woman obsessed with getting rev...,...,1994-10-07,170362582.0,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The government taught him to kill. Now he's us...,The Specialist,False,5.5,317.0
317,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,12527,tt0106966,es,Fresa y chocolate,"Havana, Cuba, 1979. Flamboyantly gay artist Di...",...,1993-01-01,0.0,108.0,"[{'iso_639_1': 'es', 'name': 'Español'}]",Released,,Strawberry and Chocolate,False,7.1,16.0
