#### Banking use case: node2vec

In [1]:
! pip install node2vec



In [2]:
!pip install scikit-learn



In [3]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from sklearn.ensemble import RandomForestClassifier

# ---------------------------------
# Step 1: Create transactions data
# ---------------------------------
# Toy ledger: one (sender, receiver, amount) record per transfer.
transfers = [
    ("A", "B", 500),
    ("A", "C", 200),
    ("B", "D", 700),
    ("C", "D", 300),
]
df = pd.DataFrame(transfers, columns=['from_account', 'to_account', 'amount'])

print("🔎 Transactions table:")
print(df, '\n')

# Every account seen as sender or receiver, de-duplicated in first-seen order
# (reading the two account columns row by row).
endpoints = df[['from_account', 'to_account']].to_numpy().ravel()
all_accounts = pd.DataFrame({'account': pd.unique(endpoints)})

# ---------------------------------
# Step 2: Build transaction graph
# ---------------------------------
# Directed graph with one sender -> receiver edge per transfer.
# Amounts are not attached to the edges, so the graph is unweighted.
G = nx.DiGraph()
for sender, receiver in df[['from_account', 'to_account']].itertuples(index=False, name=None):
    G.add_edge(sender, receiver)

# ---------------------------------
# Step 3: Generate node2vec embeddings
# ---------------------------------
# node2vec samples random walks over the graph, treats each walk as a
# "sentence" of node ids, and trains a skip-gram model on those sentences.
#
#   • num_walks   – how many walks are started from each node. With 10, every
#                   node seeds 10 walk sequences.
#                   Low values (e.g. 2): less context variety, fewer samples.
#                   High values (e.g. 10 or 20): more samples per node, which
#                   generally gives a more robust context representation.
#   • walk_length – how long each walk ("sentence") is.
#   • dimensions  – size of each node vector. With dimensions=4 every account
#                   becomes a point in 4-dimensional space (set dimensions=2
#                   if you want to scatter-plot the vectors directly).
#   • window / min_count – keyword arguments given to fit() for the skip-gram
#                   training step.
#
# Example on this graph (edges A→B, A→C, B→D, C→D), starting at A:
#   Walk 1: A → B → D
#   Walk 2: A → C → D
# G is directed and D has no outgoing transfers, so a walk cannot continue
# past D or step "backwards" along an edge.
# NOTE(review): walks reaching D are therefore expected to be shorter than
# walk_length — confirm against node2vec's walk generator.
node2vec = Node2Vec(G, dimensions=4, walk_length=5, num_walks=10, workers=1, seed=42)

# Named `embedding_model` so it is not confused with (or silently replaced by)
# the classifier that is later bound to `model`.
embedding_model = node2vec.fit(window=3, min_count=1)

# One row per node, in graph order. The model's vocabulary is looked up by
# the string form of the node id.
node_ids = list(G.nodes())
node_vectors = [embedding_model.wv[str(node)] for node in node_ids]

embeddings = pd.DataFrame(node_vectors, index=pd.Index(node_ids, name='account'))
embeddings.columns = [f'emb_{i+1}' for i in range(embeddings.shape[1])]
# Turn the account index into a regular 'account' column for merging.
embeddings = embeddings.reset_index()

print("🧩 Embeddings table:")
print(embeddings, '\n')

# ---------------------------------
# Step 4: Compute features
# ---------------------------------
# Mean outgoing amount per sending account.
avg_amount = (
    df.groupby('from_account')['amount'].mean().reset_index()
      .rename(columns={'from_account': 'account', 'amount': 'avg_amount_out'})
)

# Number of outgoing transfers per sending account.
num_tx = (
    df.groupby('from_account').size().reset_index(name='num_transfers')
      .rename(columns={'from_account': 'account'})
)

# Left-merge onto all accounts so receive-only accounts are kept; they have
# no outgoing activity, so their missing aggregates are filled with 0.
features_df = (
    all_accounts
    .merge(avg_amount, on='account', how='left')
    .merge(num_tx, on='account', how='left')
    .fillna({'avg_amount_out': 0, 'num_transfers': 0})
)

# Dummy risk score, keyed by account id so the assignment does not depend on
# the row order of `all_accounts`.
risk_by_account = {'A': 0.2, 'B': 0.3, 'C': 0.3, 'D': 0.9}
features_df['risk_score'] = features_df['account'].map(risk_by_account)

# Fail loudly rather than carry NaN risk scores into the model.
accounts_without_risk = features_df.loc[features_df['risk_score'].isna(), 'account'].tolist()
if accounts_without_risk:
    raise ValueError(f"No risk score defined for accounts: {accounts_without_risk}")

print("📄 Features table before embeddings:")
print(features_df, '\n')

# Merge embeddings
full_df = features_df.merge(embeddings, on='account', how='left')

print("🧾 Full table with features + embeddings:")
print(full_df, '\n')

# ---------------------------------
# Step 5: Add labels
# ---------------------------------
# Ground-truth flags per account: 1 = suspicious, 0 = normal (only D is flagged).
label_by_account = [('A', 0), ('B', 0), ('C', 0), ('D', 1)]
labels = pd.DataFrame(label_by_account, columns=['account', 'label'])

# Left join keeps every row of the feature table and attaches its label.
final_df = pd.merge(full_df, labels, how='left', on='account')

print("🏷️ Table with labels:")
print(final_df, '\n')

# ---------------------------------
# Step 6: Prepare for ML
# ---------------------------------
# Features are every column except the account id and the target label.
X = final_df.drop(columns=['account', 'label'])
y = final_df['label']

model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Predict suspicion scores.
# predict_proba returns one column per entry of model.classes_, so locate the
# column for the "suspicious" class (label 1) instead of assuming it is
# column 1. This also gives a clear error if no suspicious account was seen.
fitted_classes = list(model.classes_)
if 1 not in fitted_classes:
    raise ValueError("Training labels contain no suspicious (label == 1) accounts")
suspicious_col = fitted_classes.index(1)

# NOTE: these are in-sample scores — the model is scored on the same rows it
# was trained on, so they illustrate the pipeline rather than measure
# predictive performance.
pred_probs = model.predict_proba(X)[:, suspicious_col]
final_df['suspicion_score'] = pred_probs

# ---------------------------------
# Step 7: Output final results
# ---------------------------------
print("✅ Final results with suspicion scores:")
print(final_df[['account', 'suspicion_score']])

  from .autonotebook import tqdm as notebook_tqdm


🔎 Transactions table:
  from_account to_account  amount
0            A          B     500
1            A          C     200
2            B          D     700
3            C          D     300 



Computing transition probabilities: 100%|██████████| 4/4 [00:00<00:00, 12309.04it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 64527.75it/s]

🧩 Embeddings table:
  account     emb_1     emb_2     emb_3     emb_4
0       A -0.113415  0.163851 -0.121504 -0.045400
1       B -0.232574 -0.177920  0.161472  0.224325
2       C -0.125386 -0.094084  0.184513 -0.038337
3       D -0.013406  0.005911  0.127584  0.225232 

📄 Features table before embeddings:
  account  avg_amount_out  num_transfers  risk_score
0       A           350.0            2.0         0.2
1       B           700.0            1.0         0.3
2       C           300.0            1.0         0.3
3       D             0.0            0.0         0.9 

🧾 Full table with features + embeddings:
  account  avg_amount_out  num_transfers  risk_score     emb_1     emb_2  \
0       A           350.0            2.0         0.2 -0.113415  0.163851   
1       B           700.0            1.0         0.3 -0.232574 -0.177920   
2       C           300.0            1.0         0.3 -0.125386 -0.094084   
3       D             0.0            0.0         0.9 -0.013406  0.005911   

   


