In [2]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on 
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>

# ! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

In [3]:
import torch

print("Versión de PyTorch:", torch.__version__)
print("CUDA disponible:", torch.cuda.is_available())
print("Nombre de la GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU disponible")


Versión de PyTorch: 2.1.0+cu121
CUDA disponible: True
Nombre de la GPU: NVIDIA GeForce RTX 2060


In [4]:
#import zipfile
#with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
#    zip_ref.extractall('data')

In [5]:
# import the dataset
import pandas as pd
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [6]:
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [7]:
# Take a look at movies_df
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Take a look at ratings_df
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Movie ID to movie name mapping
movie_names = movies_df.set_index('movieId')['title'].to_dict()
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("La matriz de calificación completa tendrá:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Por lo tanto: ", len(ratings_df) / (n_users*n_items) * 100, '% de la matriz está llena.')
print("""
Tenemos una matriz increíblemente escasa con la que trabajar aquí.
Y... como puedes imaginar, a medida que crezca el número de usuarios y productos, el número de elementos aumentará en n*2
Necesitará mucha memoria para trabajar con escala global... almacenar una matriz completa en la memoria sería un desafío. 
Una ventaja aquí es que la factorización matricial puede realizar la matriz de calificación implícitamente, por lo que no necesitamos todos los datos
      """)

Number of unique users: 610
Number of unique movies: 9724
La matriz de calificación completa tendrá: 5931640 elements.
----------
Number of ratings: 100836
Por lo tanto:  1.6999683055613624 % de la matriz está llena.

Tenemos una matriz increíblemente escasa con la que trabajar aquí.
Y... como puedes imaginar, a medida que crezca el número de usuarios y productos, el número de elementos aumentará en n*2
Necesitará mucha memoria para trabajar con escala global... almacenar una matriz completa en la memoria sería un desafío. 
Una ventaja aquí es que la factorización matricial puede realizar la matriz de calificación implícitamente, por lo que no necesitamos todos los datos
      


In [10]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()


        # crear incrustaciones de user
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # Piense en esto como una tabla de búsqueda para la entrada.
        self.user_factors.weight.data.uniform_(0, 0.05)
        # crear incrustaciones de item

        self.item_factors = torch.nn.Embedding(n_items, n_factors) # Piense en esto como una tabla de búsqueda para la entrada.
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        # Toma un tensor data que contiene información sobre usuarios y ítems.
        # matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)
    # def forward(self, user, item):
    # 	# matrix multiplication
    #     return (self.user_factors(user)*self.item_factors(item)).sum(1)

    def predict(self, user, item):
      # Dado un usuario y un ítem, predice la puntuación de la interacción entre ellos utilizando el modelo.
        return self.forward(user, item)

In [11]:
# Importa las clases Dataset y DataLoader de PyTorch, que son esenciales para la manipulación y carga eficiente de datos en modelos de aprendizaje profundo.

from torch.utils.data.dataset import Dataset

from torch.utils.data import DataLoader # paquete que ayuda a transformar sus datos para que estén preparados para el aprendizaje automático

# Nota: Esta no es una "buena" práctica, en el sentido de MLops, pero continuaremos con esto ya que los datos ya están cargados en la memoria.
class Loader(Dataset):
    def __init__(self):
        # Hacemos una copia de dataframe original
        self.ratings = ratings_df.copy()

        # Extrae todos los IDs únicos de usuarios y películas del DataFrame.
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()

        #--- Producing new continuous IDs for users and movies ---

        # Crea diccionarios que mapean IDs únicos de usuarios y películas a índices continuos.
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}

        # Crea diccionarios inversos que mapean índices continuos a IDs únicos de usuarios y películas.
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}

        # Mapea los IDs de películas y usuarios en el DataFrame ratings a sus respectivos índices continuos.
        self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])

        # self.x contiene todas las columnas excepto 'rating' y 'timestamp'
        # self.y contiene la columna 'rating'.
        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        #Convierte las características y etiquetas a tensores de PyTorch.
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)


    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

In [12]:
torch.cuda.is_available()

True

In [13]:

cuda = torch.cuda.is_available()
cuda = True

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# Define la función de pérdida como el error cuadrático medio (MSELoss)
loss_fn = torch.nn.MSELoss()

#  optimizador como el optimizador Adam con una tasa de aprendizaje de 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0443, 0.0322, 0.0498,  ..., 0.0022, 0.0207, 0.0101],
        [0.0207, 0.0069, 0.0249,  ..., 0.0142, 0.0161, 0.0026],
        [0.0334, 0.0415, 0.0030,  ..., 0.0431, 0.0335, 0.0172],
        ...,
        [0.0188, 0.0368, 0.0491,  ..., 0.0175, 0.0004, 0.0162],
        [0.0431, 0.0163, 0.0066,  ..., 0.0428, 0.0340, 0.0033],
        [0.0045, 0.0411, 0.0004,  ..., 0.0272, 0.0255, 0.0237]])
item_factors.weight tensor([[0.0270, 0.0359, 0.0167,  ..., 0.0197, 0.0184, 0.0304],
        [0.0497, 0.0004, 0.0436,  ..., 0.0468, 0.0333, 0.0168],
        [0.0342, 0.0223, 0.0176,  ..., 0.0316, 0.0084, 0.0224],
        ...,
        [0.0472, 0.0448, 0.0225,  ..., 0.0019, 0.0004, 0.0062],
        [0.0487, 0.0036, 0.0263,  ..., 0.0285, 0.0270, 0.0259],
        [0.0431, 0.0258, 0.0320,  ..., 0.0484, 0.0323, 0.0331]])


In [14]:
from tqdm import tqdm_notebook as tqdm

num_epochs = 128

# tqdm visualiza el progreso del bucle.
for it in tqdm(range(num_epochs)):
    losses = []
    # x son las características (entrada) y y son las etiquetas (salida)
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            # Inicialización del Gradiente, Propagación Adelante y Cálculo de Pérdida:
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            # Cálculo de Gradientes y Actualización de Pesos:
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    # Verifica si hay pérdidas antes de imprimir

    if losses:
        average_loss = sum(losses) / len(losses)
        print("iter #{}".format(it), "Loss:", average_loss)
    else:
        print("iter #{}".format(it), "No data for computing loss.")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.075400052941996
iter #1 Loss: 4.746397714021847
iter #2 Loss: 2.472628945777864
iter #3 Loss: 1.719985492187103
iter #4 Loss: 1.3455705547393275
iter #5 Loss: 1.128437907184441
iter #6 Loss: 0.991310812948924
iter #7 Loss: 0.9004460595888535
iter #8 Loss: 0.8372818646848504
iter #9 Loss: 0.792234019012318
iter #10 Loss: 0.7595388162408383
iter #11 Loss: 0.7353618554960047
iter #12 Loss: 0.7162613731426031
iter #13 Loss: 0.7015610217367332
iter #14 Loss: 0.6904950463000288
iter #15 Loss: 0.681627619274074
iter #16 Loss: 0.6751321600324611
iter #17 Loss: 0.6696772380226155
iter #18 Loss: 0.6656377699396332
iter #19 Loss: 0.6630503520142609
iter #20 Loss: 0.6607554376427898
iter #21 Loss: 0.6586895007923775
iter #22 Loss: 0.6576607305991468
iter #23 Loss: 0.6565653477147751
iter #24 Loss: 0.6558209695111071
iter #25 Loss: 0.6548138886009376
iter #26 Loss: 0.6540766993755012
iter #27 Loss: 0.6531224778385332
iter #28 Loss: 0.652026730934678
iter #29 Loss: 0.651025378121639

In [15]:
c = 0

# usuarios
uw = 0
# items
iw = 0

# Itera sobre los parámetros del modelo
for name, param in model.named_parameters():

    # Verifica si el parámetro requiere gradientes
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

user_factors.weight tensor([[ 1.3285,  1.3629,  1.7520,  ...,  1.3042,  1.2849,  0.5552],
        [ 1.0820,  0.6502,  1.0494,  ...,  1.3099,  2.1068,  1.0343],
        [-1.8383,  1.5846, -1.5784,  ...,  2.3367,  0.7989,  0.4721],
        ...,
        [ 0.5156,  0.6930,  2.0743,  ...,  1.1045, -1.1341,  1.8804],
        [ 1.4261,  1.0993,  1.2326,  ...,  0.1788,  0.9403,  1.1811],
        [ 0.8831,  1.9629,  0.3672,  ...,  1.4508,  0.8691,  1.3575]],
       device='cuda:0')
item_factors.weight tensor([[0.6700, 0.4776, 0.1801,  ..., 0.2215, 0.8661, 0.6523],
        [0.5958, 0.1288, 0.4608,  ..., 0.7945, 0.8960, 0.2366],
        [0.4061, 0.6020, 0.2270,  ..., 0.5393, 0.2178, 0.7980],
        ...,
        [0.3528, 0.3489, 0.3530,  ..., 0.3064, 0.3053, 0.3108],
        [0.4145, 0.3674, 0.3966,  ..., 0.3933, 0.3877, 0.3889],
        [0.4211, 0.4062, 0.4074,  ..., 0.4275, 0.4121, 0.4144]],
       device='cuda:0')


In [16]:

# Accede a los pesos de las incrustaciones de elementos (item_factors)


trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()



In [17]:
len(trained_movie_embeddings) # valores unicos de movies

9724

In [18]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
 # 10 clústeres (n_clusters=10) y un estado aleatorio fijo (random_state=0) para reproducibilidad
 # ajusta el modelo a los pesos de las incrustaciones de elementos (trained_movie_embeddings) utilizando el método .fit().
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
'''Se puede ver aquí que las películas que están en el mismo grupo tienden a tener
géneros similares. También tenga en cuenta que el algoritmo no está familiarizado con el nombre de la película.
y sólo obtuvo las relaciones mirando los números que representan cómo
Los usuarios han respondido a las selecciones de películas..'''
for cluster in range(10):
  print("Cluster #{}".format(cluster))

  # np.where(kmeans.labels_ == cluster)[0] devuelve los índices de las películas en el clúster.
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    #Convierte el índice del conjunto de entrenamiento al ID de la película original y cuenta el número de calificaciones para esa película.
    #Obtención del ID de la Película y Recuento de Calificaciones:
    movid = train_set.idx2movieid[movidx]
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
    #Almacenamiento de la Información de la Película:
    movs.append((movie_names[movid], rat_count))
    #Impresión de las 10 Películas Más Relevantes en el Clúster:
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Seven (a.k.a. Se7en) (1995)
	 Blade Runner (1982)
	 Shining, The (1980)
	 Interview with the Vampire: The Vampire Chronicles (1994)
	 Big Lebowski, The (1998)
	 Taxi Driver (1976)
	 Trainspotting (1996)
	 Natural Born Killers (1994)
	 Ace Ventura: When Nature Calls (1995)
	 Mars Attacks! (1996)
Cluster #1


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Independence Day (a.k.a. ID4) (1996)
	 Apollo 13 (1995)
	 Batman (1989)
	 True Lies (1994)
	 Speed (1994)
	 Gladiator (2000)
	 Shrek (2001)
	 Men in Black (a.k.a. MIB) (1997)
	 Dances with Wolves (1990)
	 Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Cluster #2


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Forrest Gump (1994)
	 Silence of the Lambs, The (1991)
	 Fight Club (1999)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Sixth Sense, The (1999)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Lion King, The (1994)
	 Back to the Future (1985)
	 Dark Knight, The (2008)
Cluster #3


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Pulp Fiction (1994)
	 Star Wars: Episode IV - A New Hope (1977)
	 Schindler's List (1993)
	 Usual Suspects, The (1995)
	 American Beauty (1999)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
	 Godfather, The (1972)
	 Lord of the Rings: The Two Towers, The (2002)
	 Lord of the Rings: The Return of the King, The (2003)
	 Fargo (1996)
Cluster #4


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Shawshank Redemption, The (1994)
	 Matrix, The (1999)
	 Jurassic Park (1993)
	 Braveheart (1995)
	 Terminator 2: Judgment Day (1991)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Fugitive, The (1993)
	 Saving Private Ryan (1998)
	 Alien (1979)
	 Die Hard (1988)
Cluster #5


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Godzilla (1998)
	 Super Mario Bros. (1993)
	 Fantastic Four: Rise of the Silver Surfer (2007)
	 Honey, I Blew Up the Kid (1992)
	 Reign of Fire (2002)
	 Superman IV: The Quest for Peace (1987)
	 Karate Kid, Part III, The (1989)
	 Shark Tale (2004)
	 Rambo III (1988)
	 Dukes of Hazzard, The (2005)
Cluster #6


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Star Wars: Episode I - The Phantom Menace (1999)
	 Batman Forever (1995)
	 Twister (1996)
	 Net, The (1995)
	 Crimson Tide (1995)
	 Matrix Reloaded, The (2003)
	 Armageddon (1998)
	 Star Wars: Episode II - Attack of the Clones (2002)
	 Matrix Revolutions, The (2003)
	 Star Wars: Episode III - Revenge of the Sith (2005)
Cluster #7


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Aladdin (1992)
	 Mrs. Doubtfire (1993)
	 Titanic (1997)
	 Pretty Woman (1990)
	 Babe (1995)
	 E.T. the Extra-Terrestrial (1982)
	 Ghost (1990)
	 Jumanji (1995)
	 Sleepless in Seattle (1993)
	 Clueless (1995)
Cluster #8


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Mask, The (1994)
	 Home Alone (1990)
	 Broken Arrow (1996)
	 Demolition Man (1993)
	 Liar Liar (1997)
	 RoboCop (1987)
	 American President, The (1995)
	 Eraser (1996)
	 Coneheads (1993)
	 Judge Dredd (1995)
Cluster #9


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating

	 Toy Story (1995)
	 Mission: Impossible (1996)
	 Ace Ventura: Pet Detective (1994)
	 Stargate (1994)
	 GoldenEye (1995)
	 Waterworld (1995)
	 Indiana Jones and the Temple of Doom (1984)
	 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
	 Mummy, The (1999)
	 Birdcage, The (1996)


  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
  rat_count = rating