# NLP Course Project

- Team Member 1: Abbinav Sankar Kailasam
- Team Member 2: Ishaan Reddy
- School: Computing and Data Science

In [48]:
!pip install node2vec



In [49]:
# Import the necessary packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from node2vec import Node2Vec

# Seed NumPy's global random number generator.
# NOTE(review): it is not evident from this notebook that this also seeds the
# Node2Vec / Word2Vec training further down — confirm whether they need their own seed.
np.random.seed(42)

## Preprocessing Pipeline

In [50]:
# Read the movie and rating CSV files, join them on movieId, drop the unused
# timestamp column and keep a fixed-size random subset of the rows.

DATA_DIR = "/content"   # folder that holds movies.csv and ratings.csv
SAMPLE_SIZE = 10000     # number of (movie, user, rating) rows to keep
SAMPLE_SEED = 42        # explicit seed so re-running this cell yields the same subset

movies = pd.read_csv(f"{DATA_DIR}/movies.csv")
ratings = pd.read_csv(f"{DATA_DIR}/ratings.csv")

data = (
    pd.merge(movies, ratings, on="movieId")
    .drop(columns=["timestamp"])
    .sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
)

data.head()

Unnamed: 0,movieId,title,genres,userId,rating
67037,5418,"Bourne Identity, The (2002)",Action|Mystery|Thriller,599,3.0
42175,2329,American History X (1998),Crime|Drama,282,4.5
93850,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,282,4.0
6187,230,Dolores Claiborne (1995),Drama|Thriller,414,3.0
12229,440,Dave (1993),Comedy|Romance,136,5.0


## Building the Graph Network for Movie Recommendation

In [51]:
# Directed multigraph with three kinds of nodes: users, movies and genres.
# Edges only run user -> movie and movie -> genre, so genre nodes have no
# outgoing edges.
G = nx.MultiDiGraph()

# Add nodes and edges from the merged dataframe.
# itertuples() reads each value from its own column (iterrows() packs every row
# into a single Series first) and avoids building a Series per row.
for row in data.itertuples(index=False):
    user_id = f"user_{row.userId}"     # Prefix to distinguish users
    movie_id = f"movie_{row.movieId}"  # Prefix to distinguish movies

    # Genre edges describe the movie, not the rating, so they are attached only
    # the first time a movie is seen; otherwise every additional rating of the
    # same movie would add another set of parallel movie -> genre edges.
    first_time_movie = movie_id not in G

    # Add user and movie nodes (adding an existing node just refreshes its attributes)
    G.add_node(user_id, label='User')
    G.add_node(movie_id, label='Movie', title=row.title)

    # Two parallel edges per rating row: one weighted by the rating value and
    # one unit-weight edge that records that the interaction happened.
    G.add_edge(user_id, movie_id, relation='rates', weight=row.rating)
    G.add_edge(user_id, movie_id, relation='interaction', weight=1)

    if first_time_movie:
        for genre in row.genres.split('|'):
            genre_node = f"genre_{genre}"
            G.add_node(genre_node, label='Genre')
            G.add_edge(movie_id, genre_node, relation='belongs_to', weight=1)

## Ranking Recommendations based on cosine similarity

In [52]:
# Train Node2Vec on the graph and collect one 64-dimensional vector per node.

node2vec = Node2Vec(
    G,
    dimensions=64,
    walk_length=10,
    num_walks=100,
    workers=4,
)
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Map every node name in G to the vector stored for it in the fitted model.
embeddings = dict((node, model.wv[node]) for node in G.nodes())

Computing transition probabilities:   0%|          | 0/4269 [00:00<?, ?it/s]

In [53]:
# Function to rank movies for a user using cosine similarity
def cosine_rank_movies(user_id, candidate_movies, embeddings, top_k=5):
    """Return the ``top_k`` candidate movies whose embeddings are closest to the user's.

    Parameters
    ----------
    user_id
        Raw user id; its vector is looked up under the key ``f"user_{user_id}"``.
    candidate_movies
        Iterable of raw movie ids; each is looked up under ``f"movie_{movie_id}"``.
        Candidates without an entry in ``embeddings`` are skipped.
    embeddings
        Mapping from node name to a 1-D numeric vector.
    top_k
        Maximum number of movie ids to return.

    Returns
    -------
    list
        Movie ids (the same objects that were passed in) ordered by decreasing
        cosine similarity to the user vector. Ties keep their input order.
        Empty if the user has no embedding or no candidate has one.
    """
    user_node = f"user_{user_id}"
    if user_node not in embeddings:
        return []  # Return empty if no embeddings for user

    # Keep only candidates that have an embedding, preserving input order.
    known_ids = [movie_id for movie_id in candidate_movies
                 if f"movie_{movie_id}" in embeddings]
    if not known_ids:
        return []

    user_vec = np.asarray(embeddings[user_node], dtype=float)
    movie_mat = np.asarray(
        [embeddings[f"movie_{movie_id}"] for movie_id in known_ids], dtype=float
    )

    # Score all candidates in one matrix-vector product instead of one
    # pairwise-similarity call per movie.
    denom = np.linalg.norm(movie_mat, axis=1) * np.linalg.norm(user_vec)
    denom[denom == 0] = 1.0  # an all-zero vector gets similarity 0 rather than NaN
    scores = (movie_mat @ user_vec) / denom

    # Stable sort on the negated scores: highest similarity first, ties in input order.
    order = np.argsort(-scores, kind="stable")[:top_k]
    return [known_ids[i] for i in order]

In [54]:
# Candidate pool: every distinct movieId that occurs in the sampled frame.
unique_movies = data['movieId'].unique()
# movieId -> title lookup built from the movies table.
movie_title = dict(zip(movies['movieId'], movies['title']))

In [55]:
# Generate cosine-similarity recommendations for the first 20 distinct users in `data`
all_user_ids = data['userId'].unique()[:20]

# For each user: rank the candidate movies, then translate the ids into titles.
user_recommendations = dict(
    (
        user_id,
        [
            movie_title[int(movie_id)]
            for movie_id in cosine_rank_movies(user_id, unique_movies, embeddings, top_k=5)
        ],
    )
    for user_id in all_user_ids
)

for user_id in user_recommendations:
    movie_titles = user_recommendations[user_id]
    print(f"User {user_id}: Recommended Movies: {movie_titles}")

User 599: Recommended Movies: ['Rocketeer, The (1991)', 'X-Men: Days of Future Past (2014)', 'Mad Max (1979)', 'Valerian and the City of a Thousand Planets (2017)', 'Star Trek Beyond (2016)']
User 282: Recommended Movies: ['Avatar (2009)', 'Oblivion (2013)', 'Gravity (2013)', "Ender's Game (2013)", 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)']
User 414: Recommended Movies: ['Bad Girls (1994)', 'Pale Rider (1985)', 'Rio Bravo (1959)', 'Little Big Man (1970)', 'Open Range (2003)']
User 136: Recommended Movies: ['Aladdin (1992)', 'Ice Age (2002)', 'Home Alone (1990)', 'Wallace & Gromit: The Wrong Trousers (1993)', 'Finding Nemo (2003)']
User 58: Recommended Movies: ['Wyatt Earp (1994)', 'In Crowd, The (2000)', 'Rio Bravo (1959)', 'Maverick (1994)', 'Road Trip (2000)']
User 448: Recommended Movies: ['Battlefield Earth (2000)', 'Highlander II: The Quickening (1991)', 'Beneath the Planet of the Apes (1970)', 'Mission to Mars (2000)', 'When Worlds Collide (

## Ranking Recommendations based on PageRank

In [56]:
# Function to rank movies for a user using pagerank
def pagerank_rank_movies(G, user_id, top_k=5, exclude_rated=False):
    """Rank movie nodes by PageRank personalised on a single user node.

    Parameters
    ----------
    G
        Graph whose user nodes are named ``f"user_{id}"`` and whose movie nodes
        are named ``f"movie_{id}"``.
    user_id
        Raw user id; the personalisation weight is placed entirely on
        ``f"user_{user_id}"``.
    top_k
        Maximum number of movie ids to return.
    exclude_rated
        When True, movie nodes that are direct successors of the user node are
        left out of the ranking. The default (False) keeps the previous
        behaviour, in which such movies can appear in the result.

    Returns
    -------
    list of int
        Movie ids ordered by decreasing PageRank score; empty if the user node
        is not in ``G``.
    """
    user_node = f"user_{user_id}"

    # Check first so no work is done for a user that is not in the graph.
    if user_node not in G:
        return []  # Return empty if the user is not a node of the graph

    # All restart weight on the current user, zero everywhere else.
    personalization = {node: 0 for node in G.nodes()}
    personalization[user_node] = 1

    pagerank_scores = nx.pagerank(G, personalization=personalization, alpha=0.85)

    skipped = set(G.successors(user_node)) if exclude_rated else set()
    movie_scores = [
        (node, score)
        for node, score in pagerank_scores.items()
        if node.startswith("movie_") and node not in skipped
    ]
    # list.sort is stable, so equal scores keep the order nx.pagerank returned them in.
    movie_scores.sort(key=lambda item: item[1], reverse=True)

    return [int(node.replace("movie_", "")) for node, _ in movie_scores[:top_k]]

In [57]:
# Generate PageRank recommendations for every user in all_user_ids
# (this replaces the contents of user_recommendations).
user_recommendations = dict(
    (
        user_id,
        [movie_title[movie_id] for movie_id in pagerank_rank_movies(G, user_id, top_k=5)],
    )
    for user_id in all_user_ids
)

for user_id in user_recommendations:
    movie_titles = user_recommendations[user_id]
    print(f"User {user_id}: Recommended Movies: {movie_titles}")

User 599: Recommended Movies: ['Star Wars: Episode IV - A New Hope (1977)', 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Terminator 2: Judgment Day (1991)']
User 282: Recommended Movies: ['Back to the Future (1985)', 'Fear and Loathing in Las Vegas (1998)', 'American History X (1998)', 'Terminator 2: Judgment Day (1991)', 'Apollo 13 (1995)']
User 414: Recommended Movies: ['Volunteers (1985)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)', 'Great Escape, The (1963)', 'Chinatown (1974)', 'Terminator 2: Judgment Day (1991)']
User 136: Recommended Movies: ['Dave (1993)', "What's Eating Gilbert Grape (1993)", 'Broken Arrow (1996)', 'Aladdin (1992)', 'Batman Forever (1995)']
User 58: Recommended Movies: ['Murder in the First (1995)', 'Speed (1994)', 'Tommy Boy (1995)', 'Much Ado About Nothing (1993)', 'Dave (1993)']
User 448: Recommend

In [58]:
def precision_recall_k(ranked_movies, ground_truth, k=5):
    """Compute precision@k and recall@k for one ranked recommendation list.

    Parameters
    ----------
    ranked_movies
        Recommended items, best first. Only the first ``k`` are considered.
    ground_truth
        Iterable of relevant items; duplicates are ignored.
    k
        Cut-off; must be a positive integer.

    Returns
    -------
    tuple of float
        ``(precision, recall)``, where precision is hits / k (the denominator is
        ``k`` even when fewer than ``k`` items were recommended) and recall is
        hits / number of distinct relevant items. ``(0.0, 0.0)`` when
        ``ground_truth`` is empty.

    Raises
    ------
    ValueError
        If ``ground_truth`` is non-empty and ``k`` is zero or negative. Without
        this check ``k == 0`` divides by zero and a negative ``k`` yields a
        negative precision.
    """
    relevant_movies = set(ground_truth)
    if not relevant_movies:
        return 0.0, 0.0  # Avoid division by zero if no relevant movies

    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Items that are both in the top-k recommendations and relevant.
    hits = len(set(ranked_movies[:k]) & relevant_movies)

    return hits / k, hits / len(relevant_movies)

## Results

In [59]:
# Precision@5 and recall@5 of the PageRank and cosine-similarity rankers,
# averaged over the users in all_user_ids.
# NOTE(review): the ground truth below is read from the same `data` frame that
# the graph was built from, so these numbers measure how well each ranker
# recovers interactions it has already seen, not held-out ones — confirm
# whether a train/test split is intended.

pagerank_precision, pagerank_recall = [], []
cosine_precision, cosine_recall = [], []

for user_id in all_user_ids:
    # Every movie this user has a row for in the sampled data counts as relevant.
    ground_truth = data.loc[data['userId'] == user_id, 'movieId'].tolist()

    # PageRank ranker
    pagerank_recommendations = pagerank_rank_movies(G, user_id, top_k=5)
    pr_prec, pr_rec = precision_recall_k(pagerank_recommendations, ground_truth, k=5)
    pagerank_precision.append(pr_prec)
    pagerank_recall.append(pr_rec)

    # Cosine-similarity ranker
    cosine_recommendations = cosine_rank_movies(user_id, unique_movies, embeddings, top_k=5)
    cs_prec, cs_rec = precision_recall_k(cosine_recommendations, ground_truth, k=5)
    cosine_precision.append(cs_prec)
    cosine_recall.append(cs_rec)

# Plain arithmetic means over the evaluated users.
avg_pagerank_precision = sum(pagerank_precision) / len(pagerank_precision)
avg_pagerank_recall = sum(pagerank_recall) / len(pagerank_recall)
avg_cosine_precision = sum(cosine_precision) / len(cosine_precision)
avg_cosine_recall = sum(cosine_recall) / len(cosine_recall)

print(f"PageRank - Precision: {avg_pagerank_precision:.2f}, Recall: {avg_pagerank_recall:.2f}")
print(f"Cosine Similarity - Precision: {avg_cosine_precision:.2f}, Recall: {avg_cosine_recall:.2f}")

PageRank - Precision: 1.00, Recall: 0.32
Cosine Similarity - Precision: 0.14, Recall: 0.05
