In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt

In [2]:
from io import BytesIO
from zipfile import ZipFile
import requests

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip"

# Fail fast on network problems: bound the wait and surface HTTP errors
# instead of handing an error page to ZipFile.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Despite its name, `filename` holds the raw bytes of the downloaded archive.
filename = response.content

# The archive is opened from memory and intentionally left open, because the
# CSV members are read from `zf` after this point.
zf = ZipFile(BytesIO(filename),'r')

# List the archive members so the CSV paths used later can be checked.
for item in zf.namelist():
    print(item)

ml-latest/
ml-latest/links.csv
ml-latest/movies.csv
ml-latest/ratings.csv
ml-latest/README.txt
ml-latest/tags.csv


In [3]:
# Load the movie catalogue straight from the in-memory archive; the context
# manager closes the archive member once pandas has consumed it.
with zf.open("ml-latest/movies.csv") as movies_file:
    movies_df = pd.read_csv(movies_file)

In [4]:
# Load the ratings table straight from the in-memory archive; the context
# manager closes the archive member once pandas has consumed it.
with zf.open("ml-latest/ratings.csv") as ratings_file:
    ratings_df = pd.read_csv(ratings_file)

In [5]:
# Split the release year out of the 'title' column with regular expressions.
# NOTE: this cell rewrites movies_df / ratings_df in place, so it should be run
# once per load (a second run finds no year in the titles and no columns to drop).

# Extract the four digits found inside parentheses, e.g. "Toy Story (1995)" -> "1995".
# Raw strings keep '\(' and '\d' intact for the regex engine.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d\d\d\d)\)', expand=False)

# Remove the "(YYYY)" marker from the title. regex=True is explicit because the
# pattern must be treated as a regular expression, not as literal text.
# .str.strip() trims the leftover whitespace and tolerates missing titles.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d\d\d\d\)', '', regex=True)
    .str.strip()
)

# Columns that the recommender does not use.
movies_df = movies_df.drop("genres",axis=1)
ratings_df = ratings_df.drop('timestamp',axis=1)

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))','')


## Sistema de recomendación basado en usuarios

In [6]:
# Movies already rated by the user we want recommendations for.
# Titles and scores are kept as parallel lists and zipped into records.
ratedTitles = [
    'Breakfast Club, The',
    'Toy Story',
    'Jumanji',
    'Pulp Fiction',
    'Akira',
]
ratedScores = [5, 3.5, 2, 5, 4.5]

userInput = [
    {'title': movieTitle, 'rating': movieScore}
    for movieTitle, movieScore in zip(ratedTitles, ratedScores)
]

# One row per rated movie, with columns 'title' and 'rating'.
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [7]:
# Catalogue rows whose title is one of the movies rated by the input user.
matchesUserTitle = movies_df['title'].isin(inputMovies['title'].tolist())
inputId = movies_df[matchesUserTitle]

# Attach the catalogue columns to the user's ratings (merge on the columns the
# two frames share), then discard the release year, which is not needed here.
inputMovies = inputId.merge(inputMovies).drop(columns='year')
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [15]:
# Keep only the ratings given to the movies that the input user has rated.
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]

# Group the rows by user. The key is passed as a scalar (not a one-element
# list) so that every group is labelled with a plain userId instead of a 1-tuple.
userSubsetGroup = userSubset.groupby('userId')

# Turn the groups into a list of (userId, ratings) pairs ordered so that the
# users sharing the most movies with the input user come first.
userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)
len(userSubsetGroup)

  userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]),reverse=True)


116140

In [17]:
# Pearson correlation between the input user and each candidate user.
# Only the first 100 entries of userSubsetGroup are scored.
userSubsetGroup_filter = userSubsetGroup[:100]

pearsonCorrelationDict = {}

# Loop-invariant: sort the input user's ratings by movieId once, so they line up
# position by position with each (sorted) group below.
inputMovies = inputMovies.sort_values(by='movieId')

for name,group in userSubsetGroup_filter:

    # Groups built from a one-element list key are labelled with a 1-tuple;
    # unwrap it so the dictionary is keyed by the plain userId.
    if isinstance(name, tuple) and len(name) == 1:
        name = name[0]

    # Sort the candidate's ratings the same way as the input user's.
    group = group.sort_values(by='movieId')

    # Number of ratings in this group.
    nRatings = len(group)

    # Ratings of the movies that the group and the input user have in common.
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()

    # Sums of squares / cross-products used by the Pearson formula.
    sumInput = sum(tempRatingList)
    sumGroup = sum(tempGroupList)
    Sxx = sum(i**2 for i in tempRatingList) - sumInput**2/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - sumGroup**2/float(nRatings)
    Sxy = sum(i*j for i,j in zip(tempRatingList,tempGroupList)) - sumInput*sumGroup/float(nRatings)

    # Both variance terms must be strictly positive. Requiring > 0 (rather than
    # != 0) also covers a term that rounding pushed slightly below zero, which
    # would otherwise make sqrt() fail. Such users get a correlation of 0.
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

pearsonCorrelationDict.items()

dict_items([(75, 0.8272781516947562), (106, 0.5860090386731182), (686, 0.8320502943378437), (815, 0.5765566601970551), (1040, 0.9434563530497265), (1130, 0.2891574659831201), (1502, 0.8770580193070299), (1599, 0.4385290096535153), (1625, 0.716114874039432), (1950, 0.179028718509858), (2065, 0.4385290096535153), (2128, 0.5860090386731196), (2432, 0.1386750490563073), (2791, 0.8770580193070299), (2839, 0.8204126541423674), (2948, -0.11720180773462392), (3025, 0.45124262819713973), (3040, 0.89514359254929), (3186, 0.6784622064861935), (3271, 0.26989594817970664), (3429, 0.0), (3734, -0.15041420939904673), (4099, 0.05860090386731196), (4208, 0.29417420270727607), (4282, -0.4385290096535115), (4292, 0.6564386345361464), (4415, -0.11183835382312353), (4586, -0.9024852563942795), (4725, -0.08006407690254357), (4818, 0.4885967564883424), (5104, 0.7674257668936507), (5165, -0.4385290096535153), (5547, 0.17200522903844556), (6082, -0.04728779924109591), (6207, 0.9615384615384616), (6366, 0.65779

In [18]:
# Turn the {userId: correlation} mapping into a table: one row per user, with
# the correlation in 'similarityIndex' and the dictionary key in 'userId'.
similarityByUser = pd.DataFrame.from_dict(pearsonCorrelationDict,orient='index')
similarityByUser.columns = ['similarityIndex']

# Copy the user ids out of the index into a column, then renumber rows from 0.
pearson_df = (
    similarityByUser
    .assign(userId=similarityByUser.index)
    .reset_index(drop=True)
)
pearson_df.head()

Unnamed: 0,similarityIndex,userId
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [19]:
# Keep the 50 users with the highest similarity to the input user.
rankedBySimilarity = pearson_df.sort_values(by='similarityIndex',ascending=False)
topUser = rankedBySimilarity.iloc[:50]
topUser.head()

Unnamed: 0,similarityIndex,userId
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040


In [20]:
# Build the recommendation input: every rating given by the most similar
# users, each row carrying that user's similarity index.
topUsersRating = pd.merge(
    topUser,
    ratings_df,
    left_on='userId',
    right_on='userId',
    how='inner',
)
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5


In [22]:
# Weight each rating by the similarity of the user who gave it.
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']

# Per movie, add up the similarity indexes and the weighted ratings of the
# similar users. The two needed columns are selected before aggregating, so
# the other columns are never summed.
tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex','weightedRating']].sum()

# Column names (including their spelling) are relied on by later cells.
tempTopUsersRating.columns = ['sum_similartityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similartityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.376281,140.800834
2,38.376281,96.656745
3,10.253981,27.254477
4,0.929294,2.787882
5,11.723262,27.151751


In [24]:
# Score of each movie: sum of the weighted ratings divided by the sum of the
# similarity indexes of the users who rated it.
weightedAverage = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similartityIndex']

# One row per movie, indexed by movieId, which is also exposed as a column.
recommendation_df = weightedAverage.to_frame(name='weighted average recommendation score')
recommendation_df['movieId'] = tempTopUsersRating.index

# Best-scored movies first.
recommendation_df = recommendation_df.sort_values(
    by='weighted average recommendation score',
    ascending=False,
)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
5073,5.0,5073
3329,5.0,3329
2284,5.0,2284
26801,5.0,26801
6776,5.0,6776


In [25]:
# Look up the catalogue entries of the ten best-scored movies.
# The boolean mask keeps movies_df row order, so rows are not listed by score.
topMovieIds = recommendation_df.head(10)['movieId'].tolist()
movies_df.loc[movies_df['movieId'].isin(topMovieIds)]

Unnamed: 0,movieId,title,year
2200,2284,Bandit Queen,1994
3243,3329,"Year My Voice Broke, The",1987
3669,3759,Fun and Fancy Free,1947
3679,3769,Thunderbolt and Lightfoot,1974
3685,3775,Make Mine Music,1946
4978,5073,"Son's Room, The (Stanza del figlio, La)",2001
6563,6672,War Photographer,2001
6667,6776,Lagaan: Once Upon a Time in India,2001
9064,26801,Dragon Inn (Sun lung moon hak chan),1992
18106,90531,Shame,2011
