<a href="https://colab.research.google.com/github/Azmain173/Movie-Recommendation-System/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load ratings
ratings = pd.read_csv("ratings.csv")

# Load movies
movies = pd.read_csv("movies.csv")

# Load tags
tags = pd.read_csv("tags.csv")

# Load links (for IMDb/TMDb connection)
links = pd.read_csv("links.csv")


In [2]:
df = pd.merge(ratings, movies, on="movieId")
df.head()


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [3]:
user_movie_matrix = df.pivot_table(index="userId", columns="title", values="rating")
user_movie_matrix.head()


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [6]:
import tensorflow as tf

# Assuming user_movie_matrix is already created
Y = user_movie_matrix.values  # Get ratings matrix
R = user_movie_matrix.notna().astype(int).values  # Create a mask matrix for rated movies


In [10]:
num_movies, num_users = Y.shape
num_features = 10  # You can tune this parameter

# Randomly initialize matrices
X = tf.Variable(tf.random.normal([num_movies, num_features]))
W = tf.Variable(tf.random.normal([num_users, num_features]))
b = tf.Variable(tf.random.normal([1, num_users]))


In [11]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the collaborative filtering
    Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    prediction = tf.linalg.matmul(X, tf.transpose(W)) + b  # Predicted ratings
    error = (prediction - Y) * R  # Only consider the ratings for which R(i,j) == 1
    J = 0.5 * tf.reduce_sum(error**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))  # Regularized cost
    return J


In [12]:
import tensorflow as tf

# Regularization parameter
lambda_ = 1

# Learning rate for Adam optimizer
learning_rate = 0.1

# Number of iterations for training
iterations = 100

# Initialize parameters (X, W, b)
num_movies, num_users = Y.shape
num_features = 10  # Number of latent features
X = tf.Variable(tf.random.normal([num_movies, num_features]))
W = tf.Variable(tf.random.normal([num_users, num_features]))
b = tf.Variable(tf.random.normal([1, num_users]))

# Initialize Adam optimizer
optimizer = tf.optimizers.Adam(learning_rate)

# Training loop
for iter in range(iterations):
    # Use TensorFlow's GradientTape to track the operations
    with tf.GradientTape() as tape:
        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Y, R, lambda_)

    # Use the gradient tape to retrieve gradients of the cost wrt X, W, b
    grads = tape.gradient(cost_value, [X, W, b])

    # Apply gradients to minimize the cost by updating the values of X, W, and b
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Print the cost every 100 iterations
    if iter % 20 == 0:
        print(f"Iteration {iter}, Cost: {cost_value.numpy()}")


Iteration 0, Cost: nan
Iteration 20, Cost: nan
Iteration 40, Cost: nan
Iteration 60, Cost: nan
Iteration 80, Cost: nan


In [15]:
import tensorflow as tf

# Regularization parameter
lambda_ = 0.1

# Learning rate for Adam optimizer
learning_rate = 0.001  # Try a smaller learning rate

# Number of iterations for training
iterations = 100

# Initialize parameters (X, W, b)
num_movies, num_users = Y.shape
num_features = 10  # Number of latent features
X = tf.Variable(tf.random.normal([num_movies, num_features], mean=0.0, stddev=0.01))
W = tf.Variable(tf.random.normal([num_users, num_features], mean=0.0, stddev=0.01))
b = tf.Variable(tf.zeros([1, num_users]))

# Initialize Adam optimizer
optimizer = tf.optimizers.Adam(learning_rate)

# Training loop
for iter in range(iterations):
    with tf.GradientTape() as tape:
        cost_value = cofi_cost_func_v(X, W, b, Y, R, lambda_)

    grads = tape.gradient(cost_value, [X, W, b])

    # Check for NaN in gradients
    for grad in grads:
        if tf.reduce_any(tf.math.is_nan(grad)):
            print(f"NaN detected in gradients at iteration {iter}")
            break

    # Apply gradients to minimize the cost
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Print the cost every 100 iterations
    if iter % 20 == 0:
        print(f"Iteration {iter}, Cost: {cost_value.numpy()}")

    # If the cost becomes too large, break
    if cost_value.numpy() > 1e10:
        print(f"Cost is too large at iteration {iter}, stopping.")
        break


NaN detected in gradients at iteration 0
Iteration 0, Cost: nan
NaN detected in gradients at iteration 1
NaN detected in gradients at iteration 2
NaN detected in gradients at iteration 3
NaN detected in gradients at iteration 4
NaN detected in gradients at iteration 5
NaN detected in gradients at iteration 6
NaN detected in gradients at iteration 7
NaN detected in gradients at iteration 8
NaN detected in gradients at iteration 9
NaN detected in gradients at iteration 10
NaN detected in gradients at iteration 11
NaN detected in gradients at iteration 12
NaN detected in gradients at iteration 13
NaN detected in gradients at iteration 14
NaN detected in gradients at iteration 15
NaN detected in gradients at iteration 16
NaN detected in gradients at iteration 17
NaN detected in gradients at iteration 18
NaN detected in gradients at iteration 19
NaN detected in gradients at iteration 20
Iteration 20, Cost: nan
NaN detected in gradients at iteration 21
NaN detected in gradients at iteration 

In [16]:
import tensorflow as tf
import numpy as np

# Regularization parameter
lambda_ = 0.1

# Learning rate
learning_rate = 0.0005  # Lower to avoid instability

# Training iterations
iterations = 5000

# Initialize parameters
num_movies, num_users = Y.shape
num_features = 10  # Latent features
X = tf.Variable(tf.random.normal([num_movies, num_features], mean=0.0, stddev=0.01))
W = tf.Variable(tf.random.normal([num_users, num_features], mean=0.0, stddev=0.01))
b = tf.Variable(tf.zeros([1, num_users]))  # Bias initialized to 0

# Optimizer
optimizer = tf.optimizers.Adam(learning_rate)

# Training loop
for iter in range(iterations):
    with tf.GradientTape() as tape:
        cost_value = cofi_cost_func_v(X, W, b, Y, R, lambda_)

    grads = tape.gradient(cost_value, [X, W, b])

    # Check for NaN in gradients
    nan_found = False
    for grad in grads:
        if tf.reduce_any(tf.math.is_nan(grad)):
            print(f"NaN detected in gradients at iteration {iter}")
            nan_found = True
            break

    if nan_found:
        break  # Stop training if NaNs appear

    # Apply gradient clipping
    grads, _ = tf.clip_by_global_norm(grads, 5.0)

    # Apply gradients
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Print every 100 iterations
    if iter % 100 == 0:
        print(f"Iteration {iter}, Cost: {cost_value.numpy()}")

    # Stop training if cost explodes
    if cost_value.numpy() > 1e10:
        print(f"Cost too large at iteration {iter}, stopping training.")
        break


NaN detected in gradients at iteration 0


In [17]:
import numpy as np
print(np.isnan(X).any(), np.isinf(X).any())


False False


In [18]:
print(f"Iteration {iter}, Cost: {cost_value.numpy()}")


Iteration 0, Cost: nan


In [20]:
Ymean = tf.reduce_mean(Y, axis=1, keepdims=True)  # Compute mean rating per movie
Ynorm = Y - Ymean  # Normalize ratings


In [22]:
print("Ynorm contains NaN:", tf.reduce_any(tf.math.is_nan(Ynorm)).numpy())
Ynorm = tf.where(tf.math.is_nan(Ynorm), tf.zeros_like(Ynorm), Ynorm)


Ynorm contains NaN: True


In [24]:
print("Y contains NaN:", tf.reduce_any(tf.math.is_nan(Y)).numpy())


Y contains NaN: True


In [28]:
Ymean = tf.math.reduce_sum(tf.where(tf.math.is_nan(Y), 0.0, Y), axis=1, keepdims=True) / \
        tf.math.reduce_sum(tf.where(tf.math.is_nan(Y), 0.0, 1.0), axis=1, keepdims=True)


In [29]:
print("Ymean contains NaN:", tf.reduce_any(tf.math.is_nan(Ymean)).numpy())  # Should print False


Ymean contains NaN: False


In [26]:
Ynorm = tf.where(tf.math.is_nan(Ynorm), tf.zeros_like(Ynorm), Ynorm)


In [27]:
print("Ynorm contains NaN:", tf.reduce_any(tf.math.is_nan(Ynorm)).numpy())


Ynorm contains NaN: False


In [30]:
R = tf.where(tf.math.is_nan(Y), 0.0, 1.0)  # 1 for rated movies, 0 for missing
Y = tf.where(tf.math.is_nan(Y), 0.0, Y)  # Replace NaNs with 0 in Y


In [31]:
print("Y shape:", Y.shape)
print("Ynorm shape:", Ynorm.shape)
print("R shape:", R.shape)
print("Ymean shape:", Ymean.shape)
print("X shape:", X.shape)
print("W shape:", W.shape)
print("b shape:", b.shape)


Y shape: (610, 9719)
Ynorm shape: (610, 9719)
R shape: (610, 9719)
Ymean shape: (610, 1)
X shape: (610, 10)
W shape: (9719, 10)
b shape: (1, 9719)


In [33]:
Y = tf.cast(Y, tf.float32)


In [34]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Ensure all tensors are of type float32 for consistency.
    """
    X = tf.cast(X, tf.float32)
    W = tf.cast(W, tf.float32)
    b = tf.cast(b, tf.float32)
    Y = tf.cast(Y, tf.float32)
    R = tf.cast(R, tf.float32)

    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J


In [35]:
X = tf.Variable(tf.random.normal([num_movies, num_features], dtype=tf.float32))
W = tf.Variable(tf.random.normal([num_users, num_features], dtype=tf.float32))
b = tf.Variable(tf.zeros([1, num_users], dtype=tf.float32))


In [36]:
import tensorflow as tf

# Set optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# Training loop
iterations = 200
lambda_ = 0.1

for iter in range(iterations):
    with tf.GradientTape() as tape:
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    grads = tape.gradient(cost_value, [X, W, b])

    # Check for NaNs in gradients
    if any(tf.math.reduce_any(tf.math.is_nan(g)) for g in grads):
        print(f"NaN detected in gradients at iteration {iter}")
        break  # Stop training if NaNs appear

    optimizer.apply_gradients(zip(grads, [X, W, b]))

    if iter % 20 == 0:
        print(f"Iteration {iter}, Cost: {cost_value.numpy()}")


Iteration 0, Cost: 539676.4375
Iteration 20, Cost: 263402.0
Iteration 40, Cost: 141750.359375
Iteration 60, Cost: 85667.7421875
Iteration 80, Cost: 56360.87109375
Iteration 100, Cost: 39183.22265625
Iteration 120, Cost: 28256.765625
Iteration 140, Cost: 20921.640625
Iteration 160, Cost: 15821.591796875
Iteration 180, Cost: 12190.970703125


In [49]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Load ratings and movies
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

# Merge ratings and movies on movieId
df = pd.merge(ratings, movies, on="movieId")

# Calculate number of ratings and mean ratings for each movie
movie_stats = df.groupby('title').agg(
    number_of_ratings=('rating', 'count'),
    mean_rating=('rating', 'mean')
).reset_index()

# Assuming you have the X, W, b, and Ymean from the collaborative filtering model
# Make predictions using the trained model
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# Restore the mean (add the mean of ratings to the predictions)
pm = p + Ymean
my_predictions = pm[:, 0]

# Sort predictions (indices of the predicted ratings sorted in descending order)
ix = tf.argsort(my_predictions, direction='DESCENDING')

# Let's assume you want top N recommendations for a user
top_n = min(10, len(movies))  # Ensure we don't exceed the size of movieList

# Get the indices of the top N recommended movies
sorted_indices = np.argsort(my_predictions)[::-1]  # Sort in descending order

# Display the top N recommendations for a user
for i in range(top_n):
    idx = sorted_indices[i]
    if idx < len(movies):  # Check if the index is valid
        print(f"Recommended movie: {movies['title'].iloc[idx]} with predicted rating: {my_predictions[idx]:0.2f}")
    else:
        print(f"Index {idx} is out of range for the movie list.")





Recommended movie: Postman, The (Postino, Il) (1994) with predicted rating: 5.00
Recommended movie: Only You (1994) with predicted rating: 4.91
Recommended movie: Pretty Woman (1990) with predicted rating: 4.84
Recommended movie: Leaving Las Vegas (1995) with predicted rating: 4.80
Recommended movie: Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) with predicted rating: 4.72
Recommended movie: Heavy Metal (1981) with predicted rating: 4.71
Recommended movie: Faster Pussycat! Kill! Kill! (1965) with predicted rating: 4.68
Recommended movie: Total Eclipse (1995) with predicted rating: 4.62
Recommended movie: Body Snatchers (1993) with predicted rating: 4.57
Recommended movie: Awfully Big Adventure, An (1995) with predicted rating: 4.55


In [50]:
print(f"Length of my_predictions: {len(my_predictions)}")
print(f"Length of movieList_df: {len(movieList_df)}")


Length of my_predictions: 610
Length of movieList_df: 1235


In [51]:
# Ensure the length of movieList_df matches the length of my_predictions
movieList_df = movieList_df.iloc[:len(my_predictions)]  # Slice to match predictions length

# Add predictions to the movie stats DataFrame
movieList_df["pred"] = my_predictions

# Reindex the DataFrame columns
movieList_df = movieList_df.reindex(columns=["pred", "mean_rating", "number_of_ratings", "title"])

# Sort by mean rating and show recommendations
filtered_movies = movieList_df.sort_values("mean_rating", ascending=False)

# Print top 10 filtered and sorted recommendations
print("\nTop recommended movies based on mean rating and more than 20 ratings:")
print(filtered_movies.head(10))



Top recommended movies based on mean rating and more than 20 ratings:
          pred  mean_rating  number_of_ratings  \
4313  3.624386     4.300000                 25   
4018  3.606284     4.293103                 29   
3499  2.853266     4.289062                192   
3782  3.040849     4.288462                 26   
3011  3.845543     4.272936                218   
1961  3.970910     4.271930                 57   
2531  3.686392     4.268041                 97   
3500  3.319184     4.259690                129   
2334  2.381076     4.252336                107   
3564  3.411285     4.250000                126   

                                                  title  
4313                   In the Name of the Father (1993)  
4018                                 Hoop Dreams (1994)  
3499                              Godfather, The (1972)  
3782                            Harold and Maude (1971)  
3011                                  Fight Club (1999)  
1961                          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movieList_df["pred"] = my_predictions


In [52]:
# Ensure the length of movieList_df matches the length of my_predictions
movieList_df = movieList_df.iloc[:len(my_predictions)].copy()  # Create a copy to avoid SettingWithCopyWarning

# Add predictions to the movie stats DataFrame
movieList_df["pred"] = my_predictions

# Reindex the DataFrame columns
movieList_df = movieList_df.reindex(columns=["pred", "mean_rating", "number_of_ratings", "title"])

# Sort by mean rating and show recommendations
filtered_movies = movieList_df.sort_values("mean_rating", ascending=False)

# Print top 10 filtered and sorted recommendations
print("\nTop recommended movies based on mean rating and more than 20 ratings:")
print(filtered_movies.head(10))  # Adjust based on how many you want to show



Top recommended movies based on mean rating and more than 20 ratings:
          pred  mean_rating  number_of_ratings  \
4313  3.624386     4.300000                 25   
4018  3.606284     4.293103                 29   
3499  2.853266     4.289062                192   
3782  3.040849     4.288462                 26   
3011  3.845543     4.272936                218   
1961  3.970910     4.271930                 57   
2531  3.686392     4.268041                 97   
3500  3.319184     4.259690                129   
2334  2.381076     4.252336                107   
3564  3.411285     4.250000                126   

                                                  title  
4313                   In the Name of the Father (1993)  
4018                                 Hoop Dreams (1994)  
3499                              Godfather, The (1972)  
3782                            Harold and Maude (1971)  
3011                                  Fight Club (1999)  
1961                          