# **LambdaMART**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

In [None]:
# Size of the working subset and the seed that makes the draw repeatable.
N_SAMPLES = 10_000
SAMPLE_SEED = 42

movies = pd.read_csv("/content/movies.csv")
ratings = pd.read_csv("/content/ratings.csv")

# One row per rating, joined with the movie's title and genres on movieId;
# the timestamp column is dropped.
data = pd.merge(movies, ratings, on="movieId").drop(columns=["timestamp"])

# Down-sample to a fixed-size subset. Without random_state a different subset is
# drawn on every run, so no downstream number could be reproduced; min() keeps
# the call from raising when the merged frame has fewer rows than N_SAMPLES.
data = data.sample(n=min(N_SAMPLES, len(data)), random_state=SAMPLE_SEED)

data.head()

Unnamed: 0,movieId,title,genres,userId,rating
87159,59315,Iron Man (2008),Action|Adventure|Sci-Fi,78,3.0
42861,2379,Police Academy 2: Their First Assignment (1985),Comedy|Crime,217,3.0
51855,3146,Deuce Bigalow: Male Gigolo (1999),Comedy,68,3.0
68960,5944,Star Trek: Nemesis (2002),Action|Drama|Sci-Fi|Thriller,274,1.0
45331,2572,10 Things I Hate About You (1999),Comedy|Romance,500,3.0


In [None]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Installing collected packages: node2vec
Successfully installed node2vec-0.5.0


In [None]:
import networkx as nx
import pandas as pd

# Heterogeneous graph: user --rates--> movie --belongs_to--> genre.
G = nx.MultiDiGraph()

# itertuples reads each field from its own column, whereas iterrows packs a row
# into a single Series with one common dtype; ids therefore stay integers when
# they are formatted into node names.
for row in data.itertuples(index=False):
    user_id = f"user_{row.userId}"
    movie_id = f"movie_{row.movieId}"

    G.add_node(user_id, label='User')
    G.add_node(movie_id, label='Movie', title=row.title)

    # The rating is stored as the edge weight.
    G.add_edge(user_id, movie_id, relation='rates', weight=row.rating)

    # A movie appears once per rating row. On a multigraph every add_edge call
    # creates another parallel edge, so the genre links are added only the first
    # time the movie is seen.
    for genre in row.genres.split('|'):
        genre_node = f"genre_{genre}"
        if not G.has_edge(movie_id, genre_node):
            G.add_node(genre_node, label='Genre')
            G.add_edge(movie_id, genre_node, relation='belongs_to', weight=1)

# NOTE(review): all edges point user -> movie -> genre, so genre nodes have no
# outgoing edges and user nodes no incoming ones — confirm this direction is what
# the random-walk embedding step is meant to see.
print("Knowledge Graph constructed with nodes and edges!")

Knowledge Graph constructed with nodes and edges!


In [None]:
from node2vec import Node2Vec

# Single source of truth for the embedding width: used for the model and for the
# zero-vector fallback, so the two cannot drift apart.
EMBEDDING_DIM = 64

node2vec = Node2Vec(G, dimensions=EMBEDDING_DIM, walk_length=30, num_walks=200, workers=4)
node2vec_model = node2vec.fit(window=10, min_count=1, batch_words=4)
# NOTE(review): no seed is passed to the walk generation or to fit(), so the
# embeddings presumably differ between runs — confirm whether that is acceptable.


def _node_embedding(node_key):
    """Return the learned vector for ``node_key``, or a zero vector if it has none.

    The fallback is an array of length ``EMBEDDING_DIM`` so that every cell of the
    embedding columns holds the same kind of container with the same length.
    """
    vectors = node2vec_model.wv
    if node_key in vectors:
        return vectors.get_vector(node_key)
    return np.zeros(EMBEDDING_DIM)


# Node names follow the "user_<id>" / "movie_<id>" scheme used when building G.
data["user_embedding"] = data["userId"].apply(lambda uid: _node_embedding(f"user_{uid}"))
data["movie_embedding"] = data["movieId"].apply(lambda mid: _node_embedding(f"movie_{mid}"))

print("Node2Vec embeddings generated for users and movies!")

Computing transition probabilities:   0%|          | 0/4289 [00:00<?, ?it/s]

Node2Vec embeddings generated for users and movies!


In [None]:
# Expand the per-row embedding vectors into one numeric column per dimension,
# aligned to the rows of `data` through its index.
user_embeddings = pd.DataFrame(data["user_embedding"].tolist(), index=data.index).add_prefix("user_emb_")
movie_embeddings = pd.DataFrame(data["movie_embedding"].tolist(), index=data.index).add_prefix("movie_emb_")

# Remove expanded columns left over from an earlier execution of this cell, so
# re-running it does not append duplicate user_emb_*/movie_emb_* columns.
stale_columns = [*user_embeddings.columns, *movie_embeddings.columns]
data = pd.concat(
    [data.drop(columns=stale_columns, errors="ignore"), user_embeddings, movie_embeddings],
    axis=1,
)

# NOTE(review): both averages are taken over the whole sample, so each row's own
# rating (the prediction target) contributes to its features, as do rows of users
# later held out for testing — confirm whether this leakage is acceptable.
data["user_avg_rating"] = data.groupby("userId")["rating"].transform("mean")
data["movie_avg_rating"] = data.groupby("movieId")["rating"].transform("mean")

# Later cells select a "genre_count" column that no cell creates, so a fresh
# "Restart & Run All" stops with a KeyError. Presumably it is the number of
# pipe-separated genres of the movie — TODO confirm against the original intent.
data["genre_count"] = data["genres"].str.split("|").str.len()

# Binary relevance label: 1 for ratings of 4.0 or higher, otherwise 0.
data["relevance"] = (data["rating"] >= 4.0).astype(int)

print("Feature matrix prepared with graph embeddings and additional features!")

Feature matrix prepared with graph embeddings and additional features!


In [None]:
from sklearn.model_selection import train_test_split

# Columns fed to the ranker, listed once so train and test cannot drift apart.
FEATURE_COLUMNS = ["user_avg_rating", "movie_avg_rating", "genre_count"]

group = data.groupby("userId").size().tolist()  # Total reviews per user (not used in this cell)

unique_users = data["userId"].unique()

# Split by user rather than by row, so all of a user's rows land on one side.
train_users, test_users = train_test_split(unique_users, test_size=0.2, random_state=42)

# The group lists below hold per-user row counts in ascending userId order, and a
# ranker reads them as sizes of consecutive row blocks. `data` is a shuffled
# sample, so the rows are sorted by userId here; otherwise the counts would be
# applied to blocks that mix rows of unrelated users.
train_data = data[data["userId"].isin(train_users)].sort_values("userId", kind="stable")
test_data = data[data["userId"].isin(test_users)].sort_values("userId", kind="stable")

X_train = train_data[FEATURE_COLUMNS]
y_train = (train_data["rating"] >= 4.0).astype(int)  # Relevance in Binary format
group_train = train_data.groupby("userId").size().tolist()  # Interaction counts

X_test = test_data[FEATURE_COLUMNS]
y_test = (test_data["rating"] >= 4.0).astype(int)
group_test = test_data.groupby("userId").size().tolist()

# Every row must belong to exactly one query group.
assert sum(group_train) == len(X_train), "train group sizes do not cover all rows"
assert sum(group_test) == len(X_test), "test group sizes do not cover all rows"

print(f"Train: X={X_train.shape}, y={len(y_train)}, group={len(group_train)}")
print(f"Test: X={X_test.shape}, y={len(y_test)}, group={len(group_test)}")

Train: X=(7948, 3), y=7948, group=477
Test: X=(2052, 3), y=2052, group=120


In [None]:
import lightgbm as lgb
from sklearn.metrics import precision_score

# LambdaMART model
model = lgb.LGBMRanker(
    boosting_type="gbdt",
    objective="lambdarank",
    metric="ndcg",
    n_estimators=100,
    learning_rate=0.05
)

model.fit(X_train, y_train, group=group_train)

# Score every row using exactly the columns (and column order) the model was
# fitted on, instead of re-typing the feature list by hand.
data["predicted_score"] = model.predict(data[list(X_train.columns)])

K = 3  # Number of recommendations

# Inspect held-out users only: taking the first users of `data` mostly picks
# users the model was trained on, which says nothing about generalisation.
sample_users = test_users[:5]

# For each sampled user keep the K highest-scored movies among the ones they rated.
recommendations = {}
for user_id in sample_users:
    user_data = data[data["userId"] == user_id]
    top_rows = user_data.sort_values(by="predicted_score", ascending=False).head(K)
    recommendations[user_id] = top_rows[['movieId', 'title', 'predicted_score']]

print("\nTop-K Recommendations for Users:")
for user_id, recs in recommendations.items():
    print(f"User {user_id}:")
    for title, score in zip(recs['title'], recs['predicted_score']):
        print(f"  Movie: {title}, Predicted Score: {score:.4f}")

# Precision@K
def precision_at_k(y_true, y_pred, k):
    """Fraction of the ``k`` highest-scored items that are relevant.

    Args:
        y_true: Sequence of relevance labels; an item counts as relevant when its
            label equals 1 (boolean ``True`` qualifies).
        y_pred: Sequence of predicted scores, positionally aligned with ``y_true``.
        k: Number of top-scored positions to inspect. The result is always
            divided by ``k``, even when fewer than ``k`` items are supplied.

    Returns:
        Number of relevant items among the top ``k`` positions, divided by ``k``.
    """
    # Coerce to arrays first: with a plain list, `y_true == 1` is a single False
    # rather than an element-wise mask, and the function would report 0.0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    relevant_items = set(np.where(y_true == 1)[0])
    # argsort is ascending, so reverse it to get positions by descending score.
    top_k_preds = set(np.argsort(y_pred)[::-1][:k])
    return len(relevant_items.intersection(top_k_preds)) / k

# Precision@K per sampled user: ratings of 4 or more are treated as relevant and
# compared against the model's scores for that user's rows.
precision_scores = {}
for user_id in sample_users:
    user_rows = data.loc[data["userId"] == user_id]
    is_relevant = user_rows["rating"].values >= 4
    user_scores = user_rows["predicted_score"].values
    precision_scores[user_id] = precision_at_k(is_relevant, user_scores, K)

print("\nPrecision@K for selected users:")
for user_id in precision_scores:
    print(f"  User {user_id}: {precision_scores[user_id]:.2f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 435
[LightGBM] [Info] Number of data points in the train set: 7948, number of used features: 3

Top-K Recommendations for Users:
User 78:
  Movie: Bill & Ted's Excellent Adventure (1989), Predicted Score: -1.5850
  Movie: Apollo 13 (1995), Predicted Score: -1.5850
  Movie: Iron Man (2008), Predicted Score: -1.6097
User 217:
  Movie: Auntie Mame (1958), Predicted Score: 0.6559
  Movie: As Good as It Gets (1997), Predicted Score: 0.6104
  Movie: Swiss Family Robinson (1960), Predicted Score: 0.5433
User 68:
  Movie: Fired Up (2009), Predicted Score: 4.6713
  Movie: Wimbledon (2004), Predicted Score: 1.6015
  Movie: Ever After: A Cinderella Story (1998), Predicted Score: 1.5345
User 274:
  Movie: Thing, The (1982), Predicted Score: 2.1531

In [None]:
def precision_recall_k(ranked_movies, ground_truth, k=5):
    """Compute precision@k and recall@k for one ranked list.

    Args:
        ranked_movies: Sliceable sequence of item ids, best first.
        ground_truth: Iterable of relevant item ids.
        k: Cut-off; only the first ``k`` ranked items are considered.

    Returns:
        ``(precision, recall)``. Precision is hits divided by ``k``; recall is
        hits divided by the number of distinct relevant items. Both are 0.0 when
        ``ground_truth`` is empty.
    """
    candidates = set(ranked_movies[:k])
    relevant = set(ground_truth)

    # Nothing relevant to find: report zeros instead of dividing by zero.
    if len(relevant) == 0:
        return 0.0, 0.0

    hits = len(candidates.intersection(relevant))
    return hits / k, hits / len(relevant)


all_user_ids = data['userId'].unique()

# Score held-out users only; including training users would inflate the metric.
eval_rows = data[data['userId'].isin(test_users)]

lamdamart_precision, lamdamart_recall = [], []

# One pass over the per-user groups instead of re-filtering the frame per user.
for user_id, user_data in eval_rows.groupby('userId', sort=False):
    # Ground truth is the set of movies the user actually liked (rating >= 4.0).
    # Using every movie the user rated would make each recommendation a hit by
    # construction, because recommendations are drawn from those same rows.
    ground_truth = user_data.loc[user_data['rating'] >= 4.0, 'movieId'].tolist()
    if not ground_truth:
        # Recall is undefined without any relevant item; skip rather than
        # counting the user as a zero.
        continue

    ranked = user_data.sort_values(by="predicted_score", ascending=False)
    lamdamart_recommendations = ranked['movieId'].head(5).tolist()

    pr_prec, pr_rec = precision_recall_k(lamdamart_recommendations, ground_truth, k=5)
    lamdamart_precision.append(pr_prec)
    lamdamart_recall.append(pr_rec)

# Guard the averages in case no evaluated user has a relevant movie.
avg_lamdamart_precision = sum(lamdamart_precision) / len(lamdamart_precision) if lamdamart_precision else 0.0
avg_lamdamart_recall = sum(lamdamart_recall) / len(lamdamart_recall) if lamdamart_recall else 0.0

print(f"LambdaMART - Precision: {avg_lamdamart_precision:.2f}, Recall: {avg_lamdamart_recall:.2f}")

LambdaMART - Precision: 0.84, Recall: 0.64
