# Similarity-Based Link Prediction on Yelp Recommender

In [1]:
from random import choices, sample
from itertools import combinations
from collections import defaultdict
from tqdm import tqdm
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import networkx as nx

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
t0 = datetime.now()

## Construct full network graph

Vertices = businesses (approximately 27K businesses)

Edges = There's an edge between $i$ and $j$ if the number of users are more than xx.

Note: See https://networkx.org/documentation/latest/reference/introduction.html#networkx-basics for `nx` package documentation

To determine an edge, look at the distribution of `num_users`.

Couldn't plot a histogram, probably because it's highly skewed

In [3]:
edge_df = pd.read_feather("data/business_edge_user_count.feather")

In [4]:
edge_df.head()

Unnamed: 0,b1,b2,num_users
0,0ZsqqzHu1HHkDdIKoivi5g,1An4DxtMmvvSe0HX4viRCA,4
1,0ZsqqzHu1HHkDdIKoivi5g,3YqUe2FTCQr0pPVK8oCv6Q,105
2,0ZsqqzHu1HHkDdIKoivi5g,3gXgILE2YWVidJDvVWBT6Q,6
3,0ZsqqzHu1HHkDdIKoivi5g,HpWi2CRJlxVCYKd8kS0X-A,4
4,0ZsqqzHu1HHkDdIKoivi5g,KP5OncF2jhT7_J1phHPPww,69


In [5]:
edge_df.describe()

Unnamed: 0,num_users
count,20653320.0
mean,2.321564
std,3.913244
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,1446.0


In [6]:
# sns.displot(data = edge_df, x = "num_users")
# plt.yscale('log')

In [7]:
for cutoff in [1, 5, 10, 20, 50, 100]:
    print(f"If cutoff = {cutoff}, there will be {len(edge_df[edge_df['num_users'] >= cutoff])} edges")

If cutoff = 1, there will be 20653315 edges
If cutoff = 5, there will be 1974099 edges
If cutoff = 10, there will be 588147 edges
If cutoff = 20, there will be 151273 edges
If cutoff = 50, there will be 18145 edges
If cutoff = 100, there will be 2471 edges


Since there's about 27K businesses (vertices), probably should take the cutoff = 5 for the existence of an edge

In [8]:
cutoff = 5
edge_subset = edge_df[edge_df['num_users'] >= cutoff]
edge_list = list(zip(*map(edge_subset.get, ['b1', 'b2', 'num_users'])))

In [9]:
G = nx.Graph()
G.add_weighted_edges_from(edge_list) # or use G.add_edges_from(edge_list) for unweifhted graph

In [10]:
deg_assort = nx.degree_assortativity_coefficient(G, x = 'out', y = 'out')
print(f"The network has degree assortativity: {deg_assort}")

The network has degree assortativity: -0.21032761437631706


In [11]:
# nx.degree_histogram(G)

This graph will be referred to as "fully observed graph"

## Train-test split

The following is how we approached train-test split. The process commenced with randomly dividing the existing edges (positive samples) of the graph into training and test sets, allocating 75% to the training set and 25% to the testing set.

To enhance the model's predictive power, negative sampling was employed. This involved generating non-existent edges (negative samples) between random pairs of nodes. The key modification here was the efficient generation of these negative edges: instead of checking all possible node pairs, we randomly selected node pairs and verified the absence of an edge between them. The number of negative edges generated matched the number of positive edges in the test set. These negative samples were then divided using the same 75-25 split as the positive samples.

The training graph was carefully constructed using only the positive edges from the training set. This step is critical to prevent data leakage and ensures that the training data reflects realistic scenarios for link prediction. The final training and test datasets were then prepared by combining their respective positive and negative edges. This balanced approach in dataset composition is essential for providing a comprehensive and unbiased evaluation of the link prediction model, thereby enhancing its applicability and accuracy in real-world recommender system scenarios.

In [12]:
# ensure edge_list is a list of tuples (u, v, weight)
edge_list = [(u, v, d['weight']) for u, v, d in G.edges(data=True)]

In [13]:
# splitting positive edges
train_edges, test_edges = train_test_split(edge_list, test_size=0.25, random_state=42)

In [14]:
%%time
# function to generate a single negative edge
def generate_neg_edge(G):
    nodes_list = list(G.nodes)  # convert nodes to a list
    while True:
        u, v = sample(nodes_list, 2)
        if not G.has_edge(u, v):
            return u, v

# number of negative edges needed
num_neg_edges_needed = len(test_edges)

# generate negative edges
neg_edges = set()
while len(neg_edges) < num_neg_edges_needed:
    neg_edges.add(generate_neg_edge(G))

# convert to list and sample if necessary
neg_edges_sample = list(neg_edges)
if len(neg_edges_sample) > num_neg_edges_needed:
    neg_edges_sample = sample(neg_edges_sample, num_neg_edges_needed)

CPU times: user 45.5 s, sys: 159 ms, total: 45.6 s
Wall time: 45.7 s


In [15]:
# split negative edges into train and test
train_neg_edges, test_neg_edges = train_test_split(neg_edges_sample, test_size=0.25, random_state=42)

In [16]:
# creating a new graph for training (useful for feature extraction)
g_train = nx.Graph()
g_train.add_nodes_from(G.nodes)
g_train.add_weighted_edges_from(train_edges)  # Add only training positive edges

In [17]:
# combine positive and negative edges for train and test sets (what we use for actual training and testing)
train_set = train_edges + [(u, v, 0) for u, v in train_neg_edges]  # 0 weight for negative edges
test_set = test_edges + [(u, v, 0) for u, v in test_neg_edges]  # 0 weight for negative edges

## Compute similarity scores with different measures

In [18]:
def calculate_similarity_scores(G, edge_list):
    # precompute neighbors for each node
    neighbors = {node: set(G.neighbors(node)) for node in G.nodes()}

    # precompute Adamic-Adar contributions for each node
    adamic_adar_contrib = {node: 1 / np.log(G.degree(node)) if G.degree(node) > 1 else 0 for node in G.nodes()}

    common_neighbors = []
    jaccard_coefficients = []
    adamic_adar_indices = []

    # calculate similarity measures
    for u, v, _ in tqdm(edge_list):
        # Common Neighbors
        common_neighbors_count = len(neighbors[u] & neighbors[v])
        common_neighbors.append(common_neighbors_count)

        # Jaccard Coefficient
        union_size = len(neighbors[u] | neighbors[v])
        jaccard_coeff = common_neighbors_count / union_size if union_size else 0
        jaccard_coefficients.append(jaccard_coeff)

        # Adamic-Adar Index
        adamic_adar_index = sum(adamic_adar_contrib[w] for w in neighbors[u] & neighbors[v])
        adamic_adar_indices.append(adamic_adar_index)

    return common_neighbors, jaccard_coefficients, adamic_adar_indices

In [19]:
%%time
# calculate for train and test sets
train_common_neighbors, train_jaccard_coefficients, train_adamic_adar_indices = calculate_similarity_scores(g_train, train_set)
test_common_neighbors, test_jaccard_coefficients, test_adamic_adar_indices = calculate_similarity_scores(g_train, test_set)

100%|████████████████████████████████████████| 1850717/1850717 [01:57<00:00, 15761.27it/s]
100%|██████████████████████████████████████████| 616907/616907 [00:37<00:00, 16374.16it/s]

CPU times: user 2min 31s, sys: 2.99 s, total: 2min 34s
Wall time: 2min 35s





## Binary classification with link prediction

We're trying to predict whether a link should exist between two nodes (AKA how confident... binary classification). Each model is an XGBoost and uses a single similarity measure as its feature.

In [20]:
# function to train and evaluate a model
def train_evaluate_model(train_features, train_labels, test_features, test_labels):
    # training the model
    model = xgb.XGBClassifier(eval_metric='logloss')
    model.fit(train_features, train_labels)

    # predicting on the test set
    predictions = model.predict(test_features)

    # evaluating the model
    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions)
    recall = recall_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions)
    roc_auc = roc_auc_score(test_labels, predictions)

    return accuracy, precision, recall, f1, roc_auc

In [21]:
# preparing labels
train_labels = [1 if w > 0 else 0 for _, _, w in train_set]
test_labels = [1 if w > 0 else 0 for _, _, w in test_set]

In [22]:
%%time
# training and evaluating models for each similarity measure
results = []
for feature_name, train_feature, test_feature in tqdm(zip(
    ['Common Neighbors', 'Jaccard Coefficient', 'Adamic-Adar Index'],
    [train_common_neighbors, train_jaccard_coefficients, train_adamic_adar_indices],
    [test_common_neighbors, test_jaccard_coefficients, test_adamic_adar_indices]
)):
    curr_eval = train_evaluate_model(
        np.array(train_feature).reshape(-1, 1), train_labels,
        np.array(test_feature).reshape(-1, 1), test_labels
    )
    results.append([feature_name] + list(curr_eval))

3it [01:01, 20.43s/it]

CPU times: user 2min 16s, sys: 1.72 s, total: 2min 18s
Wall time: 1min 1s





In [23]:
res = pd.DataFrame(results, columns=['similarity_measure', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
res

Unnamed: 0,similarity_measure,accuracy,precision,recall,f1,roc_auc
0,Common Neighbors,0.967699,0.971997,0.98809,0.979977,0.937112
1,Jaccard Coefficient,0.956963,0.95652,0.991263,0.973581,0.905513
2,Adamic-Adar Index,0.971234,0.974529,0.989915,0.982162,0.943212


In [None]:
# TODO more similarity measures

## End

In [24]:
t1 = datetime.now()
print(f'Time elapsed to run entire notebook: {t1 - t0}')

Time elapsed to run entire notebook: 0:04:33.146260
