Step 1: Load and Prepare Data

In [2]:
import networkx as nx
import pandas as pd
from geopy.distance import geodesic

# Load the graph and data
G = nx.read_gml("GraphMissingEdges.gml")
df_edges = pd.read_csv("edgesToEvaluate.csv")
df_categories = pd.read_csv("categories.csv")

# Create a category mapping from the CSV file
category_map = dict(zip(df_categories["CategoryId"], df_categories["names"]))


Step 2: Feature Engineering
We’ll generate several features, including:

- Common categories between two venues.
- Jaccard similarity and Adamic-Adar index for node similarity.
- Distance between the venues based on latitude and longitude.
- Stars difference and review count difference between the venues.

In [17]:
from networkx.algorithms import link_prediction
from geopy.distance import geodesic
import numpy as np

def calculate_features(row):
    venue1, venue2 = row['venue1'], row['venue2']
    
    # Categories-based feature
    cat1 = set(G.nodes[venue1]["categories"].split(","))
    cat2 = set(G.nodes[venue2]["categories"].split(","))
    common_categories = len(cat1.intersection(cat2))
    
    # Node similarity metrics
    jaccard = list(link_prediction.jaccard_coefficient(G, [(venue1, venue2)]))[0][2]

    # Handle nodes with zero-degree by filtering in adamic_adar calculation
    adamic_adar = 0
    for u, v, adamic_adar_score in link_prediction.adamic_adar_index(G, [(venue1, venue2)]):
        # Filter nodes with non-zero degree
        valid_neighbors = [
            w for w in nx.common_neighbors(G, u, v)
            if G.degree(w) > 0
        ]
        # Calculate adamic_adar_score only for valid neighbors
        adamic_adar = sum(1 / np.log(G.degree(w)) for w in valid_neighbors)
    
    # Distance-based feature
    loc1 = (G.nodes[venue1]["latitude"], G.nodes[venue1]["longitude"])
    loc2 = (G.nodes[venue2]["latitude"], G.nodes[venue2]["longitude"])
    distance = geodesic(loc1, loc2).kilometers
    
    # Difference in stars and review count
    # Convert stars and review count to numeric values
    stars1 = float(G.nodes[venue1]["stars"])
    stars2 = float(G.nodes[venue2]["stars"])
    review_count1 = int(G.nodes[venue1]["reviewCount"])
    review_count2 = int(G.nodes[venue2]["reviewCount"])
    
    stars_diff = abs(stars1 - stars2)
    review_count_diff = abs(review_count1 - review_count2)
    
    return pd.Series({
        "common_categories": common_categories,
        "jaccard": jaccard,
        # "adamic_adar": adamic_adar,
        "distance": distance,
        "stars_diff": stars_diff,
        "review_count_diff": review_count_diff
    })

# Apply feature calculation to each row in edges dataset
df_features = df_edges.apply(calculate_features, axis=1)
df_edges = pd.concat([df_edges, df_features], axis=1)


Step 3: Prepare Labels
Load the ground truth labels from edgesToEvaluate.csv. Make sure to have a column named link that indicates the existence of the link.

In [18]:
import random

# Positive samples (existing edges)
positive_edges = list(G.edges)
df_positive = pd.DataFrame(positive_edges, columns=["venue1", "venue2"])
df_positive['link'] = 1  # Label as positive

# Negative samples (non-existent edges)
nodes = list(G.nodes)
negative_edges = []
while len(negative_edges) < len(positive_edges):  # Match number of positive examples
    u, v = random.sample(nodes, 2)
    if not G.has_edge(u, v):
        negative_edges.append((u, v))

df_negative = pd.DataFrame(negative_edges, columns=["venue1", "venue2"])
df_negative['link'] = 0  # Label as negative

# Combine positive and negative samples
df_train = pd.concat([df_positive, df_negative], ignore_index=True)

# Use the calculate_features function defined earlier to create features for the training set
df_train_features = df_train.apply(calculate_features, axis=1)
df_train = pd.concat([df_train, df_train_features], axis=1)

# Define features and labels
X_train = df_train.drop(columns=['venue1', 'venue2', 'link'])
y_train = df_train['link']


ZeroDivisionError: float division by zero

Step 4: Train the Model
We’ll use a classifier like Random Forest for simplicity. You can also try other classifiers such as logistic regression or XGBoost.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Step 5: Save Predictions
Finally, save the predictions on edgesToEvaluate.csv.

In [None]:
df_edges['link'] = clf.predict(X)
df_edges[['linkID', 'link']].to_csv("predicted_links.csv", index=False)
