In [1]:
# Importing necessary libraries
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Locate the folder holding the MovieLens "ml-latest-small" CSV files.
# An explicit MOVIELENS_DIR environment variable always wins; otherwise the
# historical absolute location is tried first (so existing setups keep working),
# followed by an "ml-latest-small" folder relative to the working directory.
_configured_dir = os.environ.get("MOVIELENS_DIR")
if _configured_dir:
    _candidate_dirs = [Path(_configured_dir).expanduser()]
else:
    _candidate_dirs = [
        Path("/Users/brijeenarana/My Projects/ml-latest-small"),
        Path("ml-latest-small"),
    ]

DATA_DIR = next((folder for folder in _candidate_dirs if folder.is_dir()), None)
if DATA_DIR is None:
    # Fail early with an actionable message instead of a bare read_csv error.
    raise FileNotFoundError(
        "MovieLens data folder not found; looked in: "
        + ", ".join(str(folder) for folder in _candidate_dirs)
        + ". Set the MOVIELENS_DIR environment variable to the folder "
        "containing ratings.csv, tags.csv, movies.csv and links.csv."
    )

ratings = pd.read_csv(DATA_DIR / "ratings.csv")
tags = pd.read_csv(DATA_DIR / "tags.csv")
movies = pd.read_csv(DATA_DIR / "movies.csv")
links = pd.read_csv(DATA_DIR / "links.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/brijeenarana/My Projects/ml-latest-small/ratings.csv'

In [None]:
# The MovieLens README documents `timestamp` as seconds since the Unix epoch.
# pd.to_datetime defaults to unit='ns' for numeric input, which would place
# every value within moments of 1970-01-01, so the unit must be given explicitly.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
tags['timestamp'] = pd.to_datetime(tags['timestamp'], unit='s')

In [None]:
# Preview the first rows of each table under its own heading.
# Headings after the first carry a leading newline to separate the sections.
dataset_previews = (
    ("Ratings Dataset:", ratings),
    ("\nTags Dataset:", tags),
    ("\nMovies Dataset:", movies),
    ("\nLinks Dataset:", links),
)
for heading, frame in dataset_previews:
    print(heading)
    print(frame.head())

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# wrapping it in print() added a stray "None" line after every table.
# Call it directly instead.
for frame in (ratings, tags, movies, links):
    frame.info()

In [None]:
# Histogram of the raw rating values (10 bins, no density curve), drawn on an
# explicitly created 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(ratings['rating'], bins=10, kde=False, ax=ax)
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
from collections import Counter

# Genre analysis
# `genres` is a '|'-separated string per movie; split it into a list column,
# then flatten all lists to count how often each genre label occurs.
movies['genre_list'] = movies['genres'].str.split('|')

all_genres = []
for genres_of_movie in movies['genre_list']:
    all_genres.extend(genres_of_movie)
genre_counts = Counter(all_genres)

# Bars appear in first-seen order (Counter preserves insertion order).
genre_names = list(genre_counts.keys())
genre_totals = list(genre_counts.values())

plt.figure(figsize=(12, 6))
sns.barplot(x=genre_names, y=genre_totals)
plt.title('Genre Frequency')
plt.xlabel('Genres')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Checking for missing values in the dataset:
# print the per-column null count of every table, in load order.
for table in (ratings, tags, movies, links):
    missing_per_column = table.isnull().sum()
    print(missing_per_column)

In [None]:
# Eliminating duplicates in the ratings table
# Rebind the name to the de-duplicated frame rather than mutating in place with
# inplace=True: the result is the same for this notebook, but no other reference
# to the original frame is silently altered.
# Only rows that are identical in every column are removed.
ratings = ratings.drop_duplicates()
ratings

In [None]:
# User x movie rating matrix; user/movie pairs without a rating are filled with 0.
# NOTE: DataFrame.pivot raises ValueError if any (userId, movieId) pair occurs
# more than once in `ratings`.
interaction_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# `head` must be *called*; printing the bare attribute only shows the
# bound-method repr instead of the first rows.
print(interaction_matrix.head())

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Distinct user and movie identifiers found in the ratings table.
user_ids, movie_ids = (ratings[column].unique() for column in ('userId', 'movieId'))

for id_array in (user_ids, movie_ids):
    print(id_array)

In [None]:
# Create mappings
# Each distinct id is assigned its position in user_ids / movie_ids, giving
# contiguous 0-based indices.
user_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_index = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

# Each mapping holds one entry per distinct id, so printing them whole floods
# the output; show the size and a small sample instead.
for mapping_name, mapping in (("user_to_index", user_to_index), ("movie_to_index", movie_to_index)):
    sample_entries = list(mapping.items())[:5]
    print(f"{mapping_name}: {len(mapping)} entries, first 5: {sample_entries}")

In [None]:
# Replace original IDs with indices
# NOTE(review): this overwrites the columns, so running the cell a second time
# would push the already-mapped indices through the mappings again.
for column, id_lookup in (('userId', user_to_index), ('movieId', movie_to_index)):
    ratings[column] = ratings[column].map(id_lookup)

In [None]:
# Show the ratings table as it stands at this point in the notebook
# (plain-text rendering via print).
print(ratings)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rating rows for evaluation; the fixed random_state keeps
# the split the same from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}
train, test = train_test_split(ratings, **split_options)

In [None]:
# Pull the user column, movie column and rating target out of each split as
# plain arrays (via .values).
model_columns = ('userId', 'movieId', 'rating')

train_user, train_movie, train_rating = (train[name].values for name in model_columns)
test_user, test_movie, test_rating = (test[name].values for name in model_columns)

In [None]:
class CollaborativeFilteringModel(tf.keras.Model):
    """Embedding-based recommender scoring a (user, movie) pair by a dot product.

    Every user index and every movie index gets its own learned vector of size
    ``embedding_dim``; the model output for a pair is the dot product of the
    two vectors. There are no bias terms and no output activation.

    Args:
        num_users: Number of rows in the user embedding table (valid user
            indices are ``0 .. num_users - 1``).
        num_movies: Number of rows in the movie embedding table.
        embedding_dim: Length of each embedding vector.
    """

    def __init__(self, num_users, num_movies, embedding_dim):
        super().__init__()
        self.user_embedding = tf.keras.layers.Embedding(num_users, embedding_dim, embeddings_initializer="he_normal")
        self.movie_embedding = tf.keras.layers.Embedding(num_movies, embedding_dim, embeddings_initializer="he_normal")

    def call(self, inputs):
        """Score each (user index, movie index) pair.

        Args:
            inputs: A pair ``(user_input, movie_input)`` of integer index
                tensors with matching shapes.

        Returns:
            A tensor with the same shape as the index inputs holding one score
            per pair.
        """
        user_input, movie_input = inputs
        user_vector = self.user_embedding(user_input)
        movie_vector = self.movie_embedding(movie_input)
        # Compute dot product over the embedding axis, which is always the LAST
        # axis. Summing over axis=1 is only right for rank-1 index inputs; when
        # the indices arrive as (batch, 1) -- as older Keras versions upranked
        # them -- axis=1 would collapse the wrong dimension and return
        # (batch, embedding_dim) instead of one score per pair.
        dot_product = tf.reduce_sum(user_vector * movie_vector, axis=-1)
        return dot_product

# Model parameters: one embedding row per distinct user / movie id,
# each embedding being a 50-dimensional vector.
num_users, num_movies = len(user_ids), len(movie_ids)
embedding_dim = 50

# Instantiate the model
model = CollaborativeFilteringModel(
    num_users=num_users,
    num_movies=num_movies,
    embedding_dim=embedding_dim,
)

In [None]:
# Compile the model: Adam optimiser, mean-squared-error loss, and mean absolute
# error reported as an extra metric.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam_optimizer, loss='mse', metrics=['mae'])

# Train the model for 10 epochs in batches of 64; the held-out split is passed
# as validation data so its loss is tracked after every epoch.
training_inputs = [train_user, train_movie]
validation_set = ([test_user, test_movie], test_rating)
history = model.fit(
    training_inputs,
    train_rating,
    validation_data=validation_set,
    batch_size=64,
    epochs=10,
)

In [None]:
# Evaluate the model on the held-out split; the result unpacks into the loss
# followed by the MAE metric.
evaluation_scores = model.evaluate([test_user, test_movie], test_rating)
test_loss, test_mae = evaluation_scores
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

# Predict ratings for a specific user
def recommend_movies(user_id, num_recommendations=10):
    """Return the movies with the highest predicted rating for one user.

    Every movie index known to the model is scored (movies the user has already
    rated are not filtered out), and the best ``num_recommendations`` are
    returned.

    Args:
        user_id: Original user id; it is translated through ``user_to_index``.
        num_recommendations: Number of movies to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of ``(movie_id, predicted_rating)`` tuples ordered from highest
        to lowest prediction. ``movie_id`` is the original id looked up in
        ``movie_ids`` and ``predicted_rating`` is a Python float.

    Raises:
        KeyError: If ``user_id`` is not a key of ``user_to_index``.
    """
    if num_recommendations <= 0:
        # A slice such as [-0:] would otherwise select *every* movie.
        return []

    user_idx = user_to_index[user_id]
    movie_idxs = np.arange(num_movies)

    # Predict ratings for all movies. The output is flattened because predict()
    # may hand back shape (n,) or (n, 1).
    predicted_ratings = np.asarray(
        model.predict([np.full(movie_idxs.shape, user_idx), movie_idxs])
    ).reshape(-1)

    # Get top N recommendations (indices of the highest scores, best first)
    top_movie_indices = predicted_ratings.argsort()[::-1][:num_recommendations]

    # Translate internal indices back to original movie ids and pair each with
    # ITS OWN score. Indexing the scores by enumerate position would return the
    # predictions of movies 0..N-1 instead.
    top_movie_ids = np.asarray(movie_ids)[top_movie_indices].tolist()
    return [
        (movie_id, float(predicted_ratings[movie_idx]))
        for movie_id, movie_idx in zip(top_movie_ids, top_movie_indices)
    ]

# Example: Get recommendations for user ID 1
# and list each suggested movie with its prediction rounded to two decimals.
recommendations = recommend_movies(user_id=1)
print("Top Recommendations for User 1:")
for recommendation in recommendations:
    movie_id, rating = recommendation
    print(f"Movie ID {movie_id}: Predicted Rating {rating:.2f}")

In [None]:
import matplotlib.pyplot as plt

# Plot training and validation loss
# One line per recorded series in the fit history, sharing a single set of axes.
loss_series = (
    ('loss', 'Training Loss'),
    ('val_loss', 'Validation Loss'),
)
for history_key, curve_label in loss_series:
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Loss Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
