In [120]:
# Importación de paqueterías necesarias
import pandas as pd
import numpy as np
import random
from sklearn.metrics.pairwise import cosine_similarity
####
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse
#import jovian

In [121]:
# Carga de datos
links_small = pd.read_csv('links_small.csv')
ratings_small = pd.read_csv('ratings_small.csv').drop('timestamp', axis=1)
movies_metadata = pd.read_csv('movies_metadata.csv', usecols=[6, 8], names=['imdbId', 'title'])[1:]


ratings_small.movieId = ratings_small.movieId.astype(object)


# Convertimos ImdbId a enteros 
for index, imdb in enumerate(movies_metadata.imdbId):
    _str = str(imdb)
    if _str != '0' and _str != 'nan':
        try:
            movies_metadata.imdbId[index] = int(_str[2:])
        except:            
            movies_metadata.imdbId[index] = 0


In [128]:
ratings_small.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [129]:
# Hacemos merge
movies_title = links_small.merge(movies_metadata, on='imdbId')
movies_title.head(2)

Unnamed: 0,movieId,imdbId,tmdbId,title
0,2,113497,8844.0,Toy Story
1,3,113228,15602.0,Jumanji


In [130]:
ratings_merged = ratings_small.merge(movies_title, on='movieId')
ratings_merged.head(2)

Unnamed: 0,userId,movieId,rating,imdbId,tmdbId,title
0,1,31,2.5,112792,9909.0,摇啊摇，摇到外婆桥
1,7,31,3.0,112792,9909.0,摇啊摇，摇到外婆桥


In [131]:
# Estas son las peliculas que no han sido rankeadas
non_rated_moviess = movies_title[~movies_title.movieId.isin(ratings_merged.movieId)]
non_rated_moviess

Unnamed: 0,movieId,imdbId,tmdbId,title
3683,4712,64285,1627.0,Theremin: An Electronic Odyssey
3722,4763,263957,106230.0,جمعه
3972,5169,38057,17058.0,Royal Wedding
4035,5289,39204,17487.0,The Atomic Cafe
4401,5984,73115,4561.0,Invaders from Mars
4509,6229,67893,27236.0,The Talk of the Town
4663,6515,29808,52758.0,Ring of Terror
4742,6683,116308,513.0,1947: Earth
4820,6830,45205,19171.0,The Gun in Betty Lou's Handbag
5057,7243,6864,3059.0,Cat Chaser


In [132]:
random.seed(0)
train_df, valid_df = train_test_split(ratings_small, test_size=0.2)

train_df = train_df.reset_index()[['userId', 'movieId', 'rating']]
valid_df = valid_df.reset_index()[['userId', 'movieId', 'rating']]

In [133]:
train_df

Unnamed: 0,userId,movieId,rating
0,608,95,3.0
1,102,1172,4.0
2,174,186,3.5
3,128,160,4.0
4,306,585,4.0
...,...,...,...
79998,345,3543,5.0
79999,232,1079,5.0
80000,587,4179,5.0
80001,647,296,5.0


In [134]:
train_df.head()

Unnamed: 0,userId,movieId,rating
0,608,95,3.0
1,102,1172,4.0
2,174,186,3.5
3,128,160,4.0
4,306,585,4.0


In [135]:
movies_title.head()

Unnamed: 0,movieId,imdbId,tmdbId,title
0,2,113497,8844.0,Toy Story
1,3,113228,15602.0,Jumanji
2,4,114885,31357.0,Grumpier Old Men
3,5,113041,11862.0,Waiting to Exhale
4,6,113277,949.0,Father of the Bride Part II


In [140]:
# Construyendo la matriz Y_ai
y_ia = movies_title.set_index('movieId').join(train_df.set_index('movieId'))
y_ia = y_ia.reset_index()
y_ia.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,userId,rating
0,2,113497,8844.0,Toy Story,224.0,4.0
1,2,113497,8844.0,Toy Story,218.0,2.5
2,2,113497,8844.0,Toy Story,268.0,3.5
3,2,113497,8844.0,Toy Story,253.0,4.0
4,2,113497,8844.0,Toy Story,561.0,3.0


In [141]:
y_ia = pd.DataFrame(y_ia.pivot_table(index='userId', columns='title', values='rating'))
y_ia = pd.DataFrame(y_ia.to_records())

In [143]:
# Eliminando usuario Nan
y_ia = y_ia[pd.notnull(y_ia['userId'])]
# Hacer userId el index
y_ia = y_ia.set_index('userId')
y_ia

Unnamed: 0_level_0,!Women Art Revolution,$ Dollars,$5 a Day,"'night, Mother",(500) Days of Summer,*batteries not included,...All the Marbles,...And Justice for All,...E tu vivrai nel terrore! L'aldilà,1 Chance 2 Dance,...,"장화, 홍련",짝패,취화선,친구사이?,태극기 휘날리며,파이란,포화 속으로,해안선,해운대,후궁: 제왕의 첩
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667.0,,,,,,,,,,,...,,,,,,,,,,
668.0,,,,,,,,,,,...,,,,,,,,,,
669.0,,,,,,,,,,,...,,,,,,,,,,
670.0,,,,,,,,,,,...,,,,,,,,,,


In [146]:
Y_0 = y_ia.fillna(0)
Y_0

Unnamed: 0_level_0,!Women Art Revolution,$ Dollars,$5 a Day,"'night, Mother",(500) Days of Summer,*batteries not included,...All the Marbles,...And Justice for All,...E tu vivrai nel terrore! L'aldilà,1 Chance 2 Dance,...,"장화, 홍련",짝패,취화선,친구사이?,태극기 휘날리며,파이란,포화 속으로,해안선,해운대,후궁: 제왕의 첩
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
## Esto se cambiaría por el fitlrado colaborativo 
def standarized(row):
    new_row = (row-row.mean()) / (row.max()-row.min())
    return new_row

Y_std = Y_0.apply(standarized)

In [9]:
Y_std

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
1,-0.285097,-0.108495,-0.055589,-0.0132,-0.054545,-0.120417,-0.051863,-0.005663,-0.018778,-0.125484,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,
2,-0.285097,-0.108495,-0.055589,-0.0132,-0.054545,-0.120417,-0.051863,-0.005663,-0.018778,0.674516,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,
3,-0.285097,-0.108495,-0.055589,-0.0132,-0.054545,-0.120417,-0.051863,-0.005663,-0.018778,-0.125484,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,
4,-0.285097,-0.108495,-0.055589,-0.0132,-0.054545,-0.120417,-0.051863,-0.005663,-0.018778,0.674516,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,
5,-0.285097,-0.108495,0.744411,-0.0132,-0.054545,-0.120417,-0.051863,-0.005663,-0.018778,-0.125484,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,-0.285097,-0.108495,-0.055589,-0.0132,-0.054545,0.679583,-0.051863,-0.005663,-0.018778,-0.125484,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,
668,-0.285097,-0.108495,-0.055589,-0.0132,-0.054545,-0.120417,-0.051863,-0.005663,-0.018778,-0.125484,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,
669,-0.285097,-0.108495,-0.055589,-0.0132,-0.054545,-0.120417,-0.051863,-0.005663,-0.018778,-0.125484,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,
670,0.514903,-0.108495,-0.055589,-0.0132,-0.054545,-0.120417,-0.051863,-0.005663,-0.018778,-0.125484,...,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,-0.00149,,-0.00149,,


In [10]:
Y_std = Y_std.fillna(0)

In [11]:
similarity = cosine_similarity(Y_std.T)

In [12]:
sim_df = pd.DataFrame(similarity, index=Y_0.columns, columns=Y_0.columns)

In [13]:
# esto es qué tan similares son las películas entre ellas
sim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
1,1.000000,0.223742,0.183266,0.071055,0.105076,0.201503,0.156075,0.019379,0.023699,0.089163,...,0.070607,0.070607,0.070607,0.070607,-0.028157,-0.028157,0.0,0.040978,0.0,0.0
2,0.223742,1.000000,0.123790,0.125014,0.193144,0.085889,0.117211,0.209299,0.053810,0.306685,...,0.073388,0.073388,0.133113,0.058457,-0.016200,-0.016200,0.0,-0.016200,0.0,0.0
3,0.183266,0.123790,1.000000,0.147771,0.317911,0.158071,0.390331,0.109818,0.274638,0.086065,...,0.109898,0.109898,-0.011221,-0.011221,-0.011221,-0.011221,0.0,-0.011221,0.0,0.0
4,0.071055,0.125014,0.147771,1.000000,0.150562,0.024466,0.156876,0.496859,0.238193,0.063511,...,-0.005073,-0.005073,-0.005073,-0.005073,-0.005073,-0.005073,0.0,-0.005073,0.0,0.0
5,0.105076,0.193144,0.317911,0.150562,1.000000,0.186936,0.339605,0.179371,0.339402,0.150292,...,0.111648,0.111648,-0.011165,-0.011165,-0.011165,-0.011165,0.0,-0.011165,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162672,-0.028157,-0.016200,-0.011221,-0.005073,-0.011165,-0.016119,-0.010689,-0.003121,-0.006561,-0.017686,...,-0.001493,-0.001493,-0.001493,-0.001493,1.000000,1.000000,0.0,-0.001493,0.0,0.0
163056,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
163949,0.040978,-0.016200,-0.011221,-0.005073,-0.011165,0.050812,0.071754,-0.003121,-0.006561,-0.017686,...,-0.001493,-0.001493,-0.001493,-0.001493,-0.001493,-0.001493,0.0,1.000000,0.0,0.0
164977,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0


In [14]:
# Aquí entrarían las recomendaciones

def obtener_recomendacion(movie, user_rating):
    similar_score = sim_df[movie]*(user_rating-sim_df[movie].mean())
    similar_score = similar_score.sort_values(ascending=False)
    
    return similar_score

In [15]:
obtener_recomendacion('3', 1) # Se obtiene la recomendación de una película con un rating de.. ??

3        0.980569
802      0.450200
719      0.430396
494      0.396271
7        0.382746
           ...   
7153    -0.080192
4993    -0.081987
79132   -0.083644
5952    -0.084301
4973    -0.084983
Name: 3, Length: 9125, dtype: float64

In [16]:
# Veremos que le podemos recomendar al usuario 1 si nos dio estos ratings
user1 = ratings_small[ratings_small.userId == 1]
user1

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [17]:
similar_movies = pd.DataFrame()
for user, movie, rating, timestamp in user1.values:
    similar_movies = similar_movies.append(obtener_recomendacion(str(int(movie)), rating),ignore_index=True)
    

In [18]:
similar_movies.sum().sort_values(ascending=False)

2105      15.547508
1953      15.247066
2366      13.975904
3108      13.961787
1266      13.913193
            ...    
78637     -1.031995
51084     -1.057467
106487    -1.137312
98491     -1.168576
92259     -2.070977
Length: 9125, dtype: float64

In [19]:
Y_0_1 = Y.fillna(0)

In [20]:
# similar matrix con Pearson
similar_df_matrix = Y.corr(method="pearson")
similar_df_matrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
1,1.000000,0.363256,0.255080,-0.031564,0.279521,0.030998,0.350519,,-0.312002,0.096792,...,,,,,,,,,,
2,0.363256,1.000000,0.187217,0.133631,0.035446,-0.019630,0.425928,0.816497,0.301511,0.438066,...,,,,,,,,,,
3,0.255080,0.187217,1.000000,0.133631,0.560180,-0.075918,0.050120,0.693375,0.496904,0.164039,...,,,,,,,,,,
4,-0.031564,0.133631,0.133631,1.000000,0.612372,-0.500000,0.662842,,-0.577350,0.202465,...,,,,,,,,,,
5,0.279521,0.035446,0.560180,0.612372,1.000000,0.383228,0.012493,,0.225689,0.077864,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162672,,,,,,,,,,,...,,,,,,,,,,
163056,,,,,,,,,,,...,,,,,,,,,,
163949,,,,,,,,,,,...,,,,,,,,,,
164977,,,,,,,,,,,...,,,,,,,,,,


In [21]:
# Aquí entrarían las recomendaciones

def obtener_recomendacion_pearson(movie, user_rating):
    similar_score = similar_df_matrix[movie]*(user_rating-similar_df_matrix[movie].mean())
    similar_score = similar_score.sort_values(ascending=False)
    
    return similar_score

In [22]:
# Veremos que le podemos recomendar al usuario 2 si nos dio estos ratings
user1 = ratings_small[ratings_small.userId == 2]
user1

similar_movies = pd.DataFrame()
for user, movie, rating, timestamp in user1.values:
    similar_movies = similar_movies.append(obtener_recomendacion(str(int(movie)), rating),ignore_index=True)
    
similar_movies.sum().sort_values(ascending=False)

454       82.669014
500       81.946552
597       81.940570
590       81.299146
457       81.260202
            ...    
109487    -9.866858
26776    -10.318387
79132    -10.343755
58559    -10.948288
92259    -12.436425
Length: 9125, dtype: float64