In [6]:
import networkx as nx

### GRAPH TRAVERSAL ALGORITHMS

In [None]:
# BREAD FIRST SEARCH

# Create a simple graph
G = nx.Graph()
G.add_edges_from([(1, 2), (1, 3), (2, 4), (3, 5)])
'''
  1
 / \
2   3
|   |
4   5
'''

# BFS from node 1
# BFS visits nodes level by level.
# It starts at the root (node 1), then visits all neighbors before going deeper
bfs_nodes = list(nx.bfs_tree(G, source=1))
print("BFS:", bfs_nodes)

# DEPTH FIRST SEARCH
# DFS from node 1
# DFS goes as deep as possible along each branch before backtracking.
# It starts at node 1 and explores one neighbor fully before moving to another.
dfs_nodes = list(nx.dfs_tree(G, source=1))
print("DFS:", dfs_nodes)

BFS: [1, 2, 3, 4, 5]
DFS: [1, 2, 4, 3, 5]


### LINK PREDICTION ALGORITHMS


#### Common Neighbors

In [None]:

# 1️ Build a toy social graph -----------------------------------
G = nx.Graph()
G.add_edges_from([
    ("Alice", "Bob"), ("Alice", "Claire"), ("Bob", "Dennis"),
    ("Claire", "Dennis"), ("Claire", "Eva"), ("Dennis", "Frank"),
    ("Eva", "Frank"),  # no edge yet between Alice and Dennis, etc.
])

# 2️ Pick the target user for recommendations -------------------
target = "Alice"
print("Gnodes",G.nodes)
# Pairs to score: (target, other) where no edge exists yet
pairs = [(target, v) for v in G.nodes
         if v != target and not G.has_edge(target, v)]
print("pairs",pairs)
# 3️Compute common-neighbor scores -----------------------------
## takes every node pair you listed in pairs, counts how many friends they have in common inside graph G, and gives you a generator that yields (u, v, count) for each pair.

scores = nx.common_neighbor_centrality(G, pairs)
print("scores",scores)
# 44 Rank and show the top suggestions --------------------------
top_k = sorted(scores, key=lambda x: x[2], reverse=True)[:3]
# 
print(f"Friend suggestions for {target}:")
for u, v, score in top_k:
    print(f"  • {v}  (shared friends = {score})")

Gnodes ['Alice', 'Bob', 'Claire', 'Dennis', 'Eva', 'Frank']
pairs [('Alice', 'Dennis'), ('Alice', 'Eva'), ('Alice', 'Frank')]
scores <generator object _apply_prediction.<locals>.<genexpr> at 0x1105e2240>
Friend suggestions for Alice:
  • Dennis  (shared friends = 2.2)
  • Eva  (shared friends = 1.4)
  • Frank  (shared friends = 0.3999999999999999)


#### ADAMIC Adar-Index 

In [None]:
import networkx as nx

# 1️  Build a tiny co-authorship graph
G = nx.Graph()
G.add_edges_from([
    ("Alice", "Prof Hub"), ("Bob", "Prof Hub"),         # hub collaborator
    ("Alice", "Eve"), ("Bob", "Eve"),                   # niche collaborator
    ("Charlie", "Prof Hub"), ("Charlie", "Dana")        # another cluster
])

# 2️  Candidate pairs with no existing edge
pairs = [(u, v) for u in G for v in G
         if u < v and not G.has_edge(u, v)]

# 3️  Compute Adamic–Adar
aa = nx.adamic_adar_index(G, pairs)

# 4️  Show ranked suggestions
for u, v, score in sorted(aa, key=lambda t: t[2], reverse=True):
    print(f"{u:6} ↔ {v:6}  AA = {score:.3f}")

'''

Eve    ↔ Prof Hub  AA = 2.885 is the strongest candidate
Although “Prof Hub” has many connections, Eve is low-degree and therefore rare in the network; their shared neighbours are mostly Eve’s other collaborators, so the pair still scores highly.
 If you want to suppress hubs even further, try Resource Allocation or apply a post-filter that discards any pair involving nodes above a chosen degree threshold.
'''

Eve    ↔ Prof Hub  AA = 2.885
Alice  ↔ Bob     AA = 2.353
Dana   ↔ Prof Hub  AA = 1.443
Alice  ↔ Charlie  AA = 0.910
Bob    ↔ Charlie  AA = 0.910
Alice  ↔ Dana    AA = 0.000
Bob    ↔ Dana    AA = 0.000
Charlie ↔ Eve     AA = 0.000
Dana   ↔ Eve     AA = 0.000


#### Resource Allocation Index*

In [20]:
import networkx as nx

# 1️⃣  Build a toy co-authorship graph
G = nx.Graph()
G.add_edges_from([
    ("Alice",   "Prof Hub"),
    ("Bob",     "Prof Hub"),
    ("Charlie", "Prof Hub"),
    ("Alice",   "Eve"),
    ("Bob",     "Eve"),
    ("Charlie", "Dana"),
])

# 2️⃣  Candidate pairs = no existing edge
pairs = [(u, v) for u in G for v in G
         if u < v and not G.has_edge(u, v)]

# 3️⃣  Compute Resource Allocation Index
ra = nx.resource_allocation_index(G, pairs)

# 4️⃣  Rank and display top suggestions
print("Top RA scores (higher ⇒ stronger recommendation):")
for u, v, score in sorted(ra, key=lambda t: t[2], reverse=True):
    print(f"{u:7} ↔ {v:9}  RA = {score:.3f}")

Top RA scores (higher ⇒ stronger recommendation):
Eve     ↔ Prof Hub   RA = 1.000
Alice   ↔ Bob        RA = 0.833
Dana    ↔ Prof Hub   RA = 0.500
Alice   ↔ Charlie    RA = 0.333
Bob     ↔ Charlie    RA = 0.333
Alice   ↔ Dana       RA = 0.000
Bob     ↔ Dana       RA = 0.000
Charlie ↔ Eve        RA = 0.000
Dana    ↔ Eve        RA = 0.000


#### Preferential attachement


In [21]:
import networkx as nx

# -- same graph as before ------------------------------------
G = nx.Graph()
G.add_edges_from([
    ("Alice",   "Prof Hub"),
    ("Bob",     "Prof Hub"),
    ("Charlie", "Prof Hub"),
    ("Alice",   "Eve"),
    ("Bob",     "Eve"),
    ("Charlie", "Dana")
])

# candidate pairs without an existing edge
pairs = [(u, v) for u in G for v in G
         if u < v and not G.has_edge(u, v)]

# preferential-attachment generator
pa = nx.preferential_attachment(G, pairs)   # yields (u, v, score)

# sort & show
print("Pairs ranked by Preferential Attachment (highest first)")
for u, v, score in sorted(pa, key=lambda t: t[2], reverse=True):
    print(f"{u:<7} ↔ {v:<9}  PA = {score}")

Pairs ranked by Preferential Attachment (highest first)
Eve     ↔ Prof Hub   PA = 6
Alice   ↔ Bob        PA = 4
Alice   ↔ Charlie    PA = 4
Bob     ↔ Charlie    PA = 4
Charlie ↔ Eve        PA = 4
Dana    ↔ Prof Hub   PA = 3
Alice   ↔ Dana       PA = 2
Bob     ↔ Dana       PA = 2
Dana    ↔ Eve        PA = 2


#### SALTON

In [None]:
import networkx as nx
import math

# --- build the same graph -----------------------------------
G = nx.Graph()
G.add_edges_from([
    ("Alice",   "Prof Hub"),
    ("Bob",     "Prof Hub"),
    ("Charlie", "Prof Hub"),
    ("Alice",   "Eve"),
    ("Bob",     "Eve"),
    ("Charlie", "Dana")
])

# candidate pairs with no current edge
pairs = [(u, v) for u in G for v in G
         if u < v and not G.has_edge(u, v)]

# --- Salton / Cosine index generator ------------------------
def salton_index(G, ebunch):
    for u, v in ebunch:
        cn = len(set(G[u]) & set(G[v]))
        denom = math.sqrt(G.degree(u) * G.degree(v))
        yield (u, v, 0 if denom == 0 else cn / denom)

# rank and display
print("Pairs ranked by Salton / Cosine (highest first)")
for u, v, score in sorted(salton_index(G, pairs),
                          key=lambda t: t[2], reverse=True):
    print(f"{u:<7} ↔ {v:<9}  Salton = {score:.3f}")

##### Salton / cosine index

In [None]:
import networkx as nx

# 1️⃣  Toy membership data  -----------------------------------
subs = {
    "r/Python":   {"alice", "bob", "claire", "dennis"},
    "r/DataSci":  {"alice", "bob", "eva", "frank"},
    "r/AI":       {"alice", "bob", "claire", "eva", "frank", "gina"},
    "r/Funny":    {"alice", "bob", "claire", "dennis", "eva",
                   "frank", "gina", "henry", "ida", "john"},
}

# 2️⃣  Project down to a graph whose edge weight is SD --------
G = nx.Graph()
for a, users_a in subs.items():
    for b, users_b in subs.items():
        if a < b:                          # one direction only
            overlap = len(users_a & users_b)
            sd = 2 * overlap / (len(users_a) + len(users_b))
            if sd > 0:                     # keep only pairs with some overlap
                G.add_edge(a, b, weight=sd)

# 3️⃣  Recommend similar communities for r/Python -------------
target = "r/Python"
recommend = sorted(
    ((nbr, G[target][nbr]["weight"]) for nbr in G.neighbors(target)),
    key=lambda t: t[1],
    reverse=True
)

print("Suggested subreddits for r/Python")
for sub, score in recommend:
    print(f"  • {sub:<10}  (Sorensen–Dice = {score:.2f})")

In [None]:

# 💼 Compliance Project: Suspicious Transaction Prediction

##### 🗺️ **Context**
- **Graph**: Transaction network
  - **Nodes** = Bank accounts or customers
  - **Edges** = Money transfers

##### 🎯 **Goal**
Predict **suspicious or hidden links** (potential illicit transactions).

---

##### ⚡ **Why not only classic ML?**
- Flat tables miss **graph structure** (e.g., indirect paths).
- Need to capture hidden connections and network behavior.

---

##### 🔗 **Graph approach: node2vec**

###### ✅ **What it does**
1. Runs random walks to explore graph neighborhoods.
2. Learns a **vector (embedding)** for each account.
3. Similar graph contexts → similar vectors.

---

##### ⚖️ **How to use embeddings**

###### ➕ **Combine with features**
- Transaction amounts
- KYC scores
- Transfer frequency

###### ⚙️ **Train ML model**
- Input: [tabular features + embeddings]
- Target: suspicious or normal label

---

##### ✅ **Benefits**
- Finds hidden risky accounts with no direct links.
- Highlights possible collusion paths.
- Helps compliance teams proactively investigate.

---

##### 💬 **Summary**
> *We build a transaction graph, generate node2vec embeddings to capture hidden relationships, and combine them with classical features to train a model for suspicious link prediction.*

#### Banking usecase : node2vec

In [2]:
! pip install node2vec

Collecting node2vec
  Using cached node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Collecting gensim<5.0.0,>=4.3.0 (from node2vec)
  Using cached gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting joblib<2.0.0,>=1.4.0 (from node2vec)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.0->node2vec)
  Using cached scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim<5.0.0,>=4.3.0->node2vec)
  Downloading smart_open-7.3.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.0->node2vec)
  Using cached wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.4 kB)
Using cached node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Using cached gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl (24.0 MB)
Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl (30.

In [13]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp312-cp312-macosx_12_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed scikit-learn-1.7.0 threadpoolctl-3.6.0


In [None]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from sklearn.ensemble import RandomForestClassifier

# ---------------------------------
# Step 1: Create transactions data
# ---------------------------------
df = pd.DataFrame({
    'from_account': ["A", "A", "B", "C"],
    'to_account':   ["B", "C", "D", "D"],
    'amount': [500, 200, 700, 300]
})

print("🔎 Transactions table:")
print(df, '\n')

# Collect all unique accounts
all_accounts = pd.DataFrame({'account': pd.unique(df[['from_account', 'to_account']].values.ravel())})

# ---------------------------------
# Step 2: Build transaction graph
# ---------------------------------
G = nx.DiGraph()
G.add_edges_from(zip(df['from_account'], df['to_account']))

# ---------------------------------
# Step 3: Generate node2vec embeddings
# ---------------------------------
'''
✔️ num_walks controls how many “sentences” each node contributes.	•	The number of walks you start from each node.If you set 10, each node generates 10 random walk sequences.
✔️ walk_length controls how long each “sentence” is.
✔️ dimensions controls how much “space” each node has to encode its context.

Example: num_walks = 2, walk_length = 3 
Walk 1: A → B → D → C
Walk 2: A → C → D → B
if more walk length than node then go back 
	•	Low num_walks (e.g., 2): less context variety, fewer samples.
	•	High num_walks (e.g., 10 or 20): more robust learning, better node context representation.
'''
node2vec = Node2Vec(G, dimensions=4, walk_length=5, num_walks=10, workers=1, seed=42)
model = node2vec.fit(window=3, min_count=1)

embeddings = pd.DataFrame(
    [model.wv[str(node)] for node in G.nodes()],
    index=G.nodes()
)
embeddings.columns = [f'emb_{i+1}' for i in range(embeddings.shape[1])]
embeddings.reset_index(inplace=True)
embeddings.rename(columns={'index': 'account'}, inplace=True)

print("🧩 Embeddings table:")
print(embeddings, '\n')

# ---------------------------------
# Step 4: Compute features
# ---------------------------------
avg_amount = df.groupby('from_account')['amount'].mean().reset_index()
avg_amount.rename(columns={'from_account': 'account', 'amount': 'avg_amount_out'}, inplace=True)

num_tx = df.groupby('from_account').size().reset_index(name='num_transfers')
num_tx.rename(columns={'from_account': 'account'}, inplace=True)

# Merge to all accounts to avoid dropping receivers
features_df = all_accounts.merge(avg_amount, on='account', how='left')
features_df = features_df.merge(num_tx, on='account', how='left')

# Fill missing values (accounts with only incoming transfers)
features_df['avg_amount_out'] = features_df['avg_amount_out'].fillna(0)
features_df['num_transfers'] = features_df['num_transfers'].fillna(0)

# Dummy risk score
features_df['risk_score'] = [0.2, 0.3, 0.3, 0.9]  # Example for A, B, C, D

print("📄 Features table before embeddings:")
print(features_df, '\n')

# Merge embeddings
full_df = features_df.merge(embeddings, on='account', how='left')

print("🧾 Full table with features + embeddings:")
print(full_df, '\n')

# ---------------------------------
# Step 5: Add labels
# ---------------------------------
labels = pd.DataFrame({
    'account': ['A', 'B', 'C', 'D'],
    'label': [0, 0, 0, 1]  # D marked as suspicious
})
final_df = full_df.merge(labels, on='account', how='left')

print("🏷️ Table with labels:")
print(final_df, '\n')

# ---------------------------------
# Step 6: Prepare for ML
# ---------------------------------
X = final_df.drop(columns=['account', 'label'])
y = final_df['label']

model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Predict suspicion scores
pred_probs = model.predict_proba(X)[:, 1]
final_df['suspicion_score'] = pred_probs

# ---------------------------------
# Step 7: Output final results
# ---------------------------------
print("✅ Final results with suspicion scores:")
print(final_df[['account', 'suspicion_score']])

🔎 Transactions table:
  from_account to_account  amount
0            A          B     500
1            A          C     200
2            B          D     700
3            C          D     300 



Computing transition probabilities: 100%|██████████| 4/4 [00:00<00:00, 16400.02it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 54050.31it/s]

🧩 Embeddings table:
  account     emb_1     emb_2     emb_3     emb_4
0       A -0.113415  0.163851 -0.121504 -0.045400
1       B -0.232574 -0.177920  0.161472  0.224325
2       C -0.125386 -0.094084  0.184513 -0.038337
3       D -0.013406  0.005911  0.127584  0.225232 

📄 Features table before embeddings:
  account  avg_amount_out  num_transfers  risk_score
0       A           350.0            2.0         0.2
1       B           700.0            1.0         0.3
2       C           300.0            1.0         0.3
3       D             0.0            0.0         0.9 

🧾 Full table with features + embeddings:
  account  avg_amount_out  num_transfers  risk_score     emb_1     emb_2  \
0       A           350.0            2.0         0.2 -0.113415  0.163851   
1       B           700.0            1.0         0.3 -0.232574 -0.177920   
2       C           300.0            1.0         0.3 -0.125386 -0.094084   
3       D             0.0            0.0         0.9 -0.013406  0.005911   

   


