# CAPSTONE PROJECT 2023-2024
### Financial Graph Mining For Customers & Supply Chains Assessment
To improve the Bank’s CIB activities, special attention is given to its clients' transactions. Mapping out the supply chain of corporate customers is a strategic move that can enhance the bank’s risk management, client relationships and competitiveness in the market.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
import numpy as np
import plotly.graph_objects as go
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.nn import BatchNorm
from torch.nn import Linear
import warnings
warnings.filterwarnings('ignore')

#  PART 1: EXPLORATORY DATA ANALYSIS

### 1.1 Importing datasets and treatment of variables

In [None]:
# Both source files are semicolon-separated CSVs.
static_data = pd.read_csv(
    '/home/jovyan/hfactory_magic_folders/financial_graph_mining_for_customers___supply_chains_assessment/static_data_all_x.csv',
    sep=';',
)
transactions_data = pd.read_csv(
    '/home/jovyan/hfactory_magic_folders/financial_graph_mining_for_customers___supply_chains_assessment/transactions_x.csv',
    sep=';',
)

In [None]:
def convert_probability(value):
    """Convert a rating grade with an optional +/- notch into a float.

    A ``+`` anywhere in the string adds 0.25 to the grade and a ``-``
    subtracts 0.25 (``"5+"`` -> 5.25, ``"-6"`` -> 5.75). A grade without a
    notch is returned as its float value.

    Args:
        value: The grade, either a string such as ``"5"``, ``"5+"`` or
            ``"-6"``, or an already numeric value.

    Returns:
        The grade as a float.

    Raises:
        ValueError: If the text left after removing the notch is not a number.
        TypeError: If ``value`` is neither a string nor convertible to float.
    """
    # Numeric cells carry no notch marker. Their textual form may still
    # contain '+' or '-' (e.g. 1e+20 or -2.0), so they must not go through
    # the string branch, which would call .replace on a non-string.
    if not isinstance(value, str):
        return float(value)
    if '+' in value:
        return float(value.replace('+', '')) + 0.25
    if '-' in value:
        return float(value.replace('-', '')) - 0.25
    return float(value)
def from_pred_to_normal(number):
    """Map a numeric PD prediction to the rating notation used in the data.

    The fractional part of the prediction selects the notch:

    * below 0.1       -> ``"<base>"``
    * from 0.1 to 0.4 -> ``"<base>+"``
    * 0.4 and above   -> ``"-<base + 1>"``

    and a value of exactly 13 maps to ``"-13"``.

    Predictions outside the [0, 13] scale are clamped to the nearest bound
    instead of yielding ``None``, so every finite prediction gets a label.

    Args:
        number: Predicted grade; anything ``float()`` accepts (Python or
            NumPy scalar, single-element tensor).

    Returns:
        The rating label as a string, or ``None`` when the prediction is NaN.
    """
    import math

    value = float(number)
    if math.isnan(value):
        return None

    # A regression model is not bounded to the rating scale.
    value = min(max(value, 0.0), 13.0)
    if value == 13:
        return "-13"

    base = int(value)
    fraction = value - base
    if fraction < 0.1:
        return f"{base}"
    if fraction < 0.4:
        return f"{base}+"
    return f"-{base+1}"

# Turn the rating labels into numeric grades, element by element.
static_data['T_LOCAL_TX_PD'] = static_data['T_LOCAL_TX_PD'].map(convert_probability)

In [None]:
test_data_ = pd.read_csv('/home/jovyan/hfactory_magic_folders/financial_graph_mining_for_customers___supply_chains_assessment/static_data_x_PD.csv', sep=';')

# Keep only the q4 rows whose T_LOCAL_MT_ACTIF_SOCIAL is missing.
q4_missing_assets = test_data_['T_LOCAL_MT_ACTIF_SOCIAL'].isna() & (test_data_['QUARTER'] == "q4")

# Explicit copies: assigning a column on a .loc slice is chained assignment.
# test_data_ keeps the original rating labels; test_data gets numeric grades.
test_data_ = test_data_.loc[q4_missing_assets].copy()
test_data = test_data_.copy()
test_data['T_LOCAL_TX_PD'] = test_data['T_LOCAL_TX_PD'].apply(convert_probability)

Processing of df, df_train and df_test

In [None]:
df = static_data

# Replace the (missing) T_LOCAL_MT_ACTIF_SOCIAL of the test rows with the value
# found in static_data for the same ID.
# NOTE(review): if static_data holds several rows per ID, this left join fans
# out each test row; only exact duplicates are removed afterwards — confirm.
asset_lookup = df[['ID', 'T_LOCAL_MT_ACTIF_SOCIAL']]
df_merged = (
    test_data.merge(asset_lookup, on='ID', how='left', suffixes=('', '_from_static_data'))
    .drop_duplicates()
    .drop(columns=['T_LOCAL_MT_ACTIF_SOCIAL'])
    .rename(columns={'T_LOCAL_MT_ACTIF_SOCIAL_from_static_data': 'T_LOCAL_MT_ACTIF_SOCIAL'})
)
df_test = df_merged
df_test

In [None]:
# Anti-join: keep the rows of df whose (ID, QUARTER) pair is not in df_test.
flagged = df.merge(df_test[['ID', 'QUARTER']], on=['ID', 'QUARTER'], how='left', indicator=True)
df_train = flagged.loc[flagged['_merge'] == 'left_only'].drop(columns=['_merge'])
df_train

In [None]:
def onehotencoding(df, columns=None):
    """One-hot encode CATEGORY, REGION and QUARTER and cast boolean columns to int.

    CATEGORY dummies are prefixed with ``Cat``, REGION dummies with ``Reg``;
    QUARTER dummies keep the raw quarter value as column name. The three
    source columns are dropped. The input frame is left unmodified.

    Args:
        df: Frame containing at least CATEGORY, REGION and QUARTER.
        columns: Columns to check for a bool dtype; all columns when None.

    Returns:
        A new DataFrame with the encoded columns appended.
    """
    dummy_frames = [
        pd.get_dummies(df['CATEGORY'], prefix='Cat'),
        pd.get_dummies(df['REGION'], prefix='Reg'),
        pd.get_dummies(df['QUARTER']),
    ]
    encoded = pd.concat([df, *dummy_frames], axis=1).drop(columns=['CATEGORY', 'REGION', 'QUARTER'])

    candidates = encoded.columns if columns is None else columns
    for name in candidates:
        if encoded[name].dtype == bool:
            encoded[name] = encoded[name].astype(int)
    return encoded
# Encode both splits, then give the test frame the train column layout:
# columns absent from the test split are filled with 0.
df_train_encoded = onehotencoding(df_train)
df_test_encoded = onehotencoding(df_test).reindex(columns=df_train_encoded.columns, fill_value=0)

### 1.2 Basics analysis on features (we will use static data and transactions data)

In this part we analyse the company-level data only, and then study the features by quarter.

In [None]:
# One row per company: keep the first occurrence of each ID.
static_data_unique = static_data.drop_duplicates(subset=['ID'])
summary_stats = static_data_unique.describe()
print(summary_stats)

In total we have 1129 companies in our dataset. The default probability across them is around 5.6 (on a scale from 0 to 13).

In [None]:
categories_counts = static_data_unique['CATEGORY'].value_counts()
region_counts = static_data_unique['REGION'].value_counts()
total_categories = categories_counts.sum()
percentages_categories = 100 * categories_counts / total_categories

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# Categories: plot the computed shares directly so the y axis shows the real
# percentages rather than relabelled count ticks.
sns.barplot(x=percentages_categories.index, y=percentages_categories.values, ax=ax1)
ax1.set_title('Distribution of Companies Across Categories')
ax1.set_xlabel('Category')
ax1.set_ylabel('Share of companies (%)')
ax1.tick_params(axis='x', rotation=45)

# Regions
ax2.pie(region_counts, labels=region_counts.index, autopct='%1.1f%%', startangle=140)
ax2.set_title('Distribution of Companies Across Regions')

plt.tight_layout()
plt.show()


More than 55% of the companies in our database are in raw materials. OEM companies are the least represented (less than 10%).
Around 60% of the companies are in the APAC region. The EMEA region follows with 15.8% of the companies.

In [None]:
# Average numeric PD grade per quarter, drawn as a line.
mean_by_quarter = static_data.groupby('QUARTER')['T_LOCAL_TX_PD'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.lineplot(x='QUARTER', y='T_LOCAL_TX_PD', data=mean_by_quarter, marker='o')
plt.title('Evolution of the Mean of T_LOCAL_TX_PD by Quarter')
plt.xlabel('Quarter')
plt.ylabel('Mean of T_LOCAL_TX_PD')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

The mean of our target variable was quite high in Q2 compared with the other quarters. This mean was lower in Q4.

In [None]:
# Average ESG score per quarter, drawn as a line.
mean_by_quarter_ = static_data.groupby('QUARTER')['ESG_SCORE'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.lineplot(x='QUARTER', y='ESG_SCORE', data=mean_by_quarter_, marker='o')
plt.title('Evolution of the Mean of ESG_SCORE by Quarter')
plt.xlabel('Quarter')
plt.ylabel('Mean of ESG_SCORE')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

During all quarters the mean of the ESG score stayed at the same level which is around 3.87.

In [None]:
# Mean PD per category and per region, put into a common
# (Group, T_LOCAL_TX_PD, Type) layout so both can share one bar chart.
mean_by_cat = (
    static_data.groupby('CATEGORY')['T_LOCAL_TX_PD'].mean().reset_index()
    .rename(columns={'CATEGORY': 'Group'})
    .assign(Type='Category')
)
mean_by_reg = (
    static_data.groupby('REGION')['T_LOCAL_TX_PD'].mean().reset_index()
    .rename(columns={'REGION': 'Group'})
    .assign(Type='Region')
)
combined_data = pd.concat([mean_by_cat, mean_by_reg])

# Narrow the y range to 5% around the extremes so small differences are visible.
group_means = combined_data['T_LOCAL_TX_PD']
min_val = group_means.min() * 0.95
max_val = group_means.max() * 1.05

plt.figure(figsize=(12, 6))
sns.barplot(data=combined_data, x='Group', y='T_LOCAL_TX_PD', hue='Type', dodge=False)
plt.title('Mean of T_LOCAL_TX_PD by Category and Region')
plt.xlabel('Groups')
plt.ylabel('Mean of T_LOCAL_TX_PD')
plt.xticks(rotation=45)
plt.ylim(min_val, max_val)
plt.tight_layout()
plt.show()


Companies in the dealers category had a higher mean PD than the other categories, followed by tier2 and tier1 companies.
We also notice that companies in the EMEA region have the highest mean PD across this period.
Region and category may therefore be somewhat relevant for predicting PD, even if their contributions are likely to be small.

In [None]:
# Pairwise correlations between the two numeric features and the target.
corr_columns = ['ESG_SCORE', 'T_LOCAL_MT_ACTIF_SOCIAL', 'T_LOCAL_TX_PD']
corr_matrix = static_data[corr_columns].corr()

plt.figure(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Features')
plt.show()

There is a positive correlation between T_LOCAL_MT_ACTIF_SOCIAL and T_LOCAL_TX_PD but this one is very small (0.012)
We also have a very small negative correlation between our target and T_LOCAL_MT_ACTIF_SOCIAL(-0.018)

In [None]:
G_train = nx.DiGraph()

# Nodes: keyed by ID; every other column becomes a node attribute and the
# target is stored under 't_local_tx_pd'. Rows sharing an ID update one node.
for _, record in df_train_encoded.iterrows():
    attrs = record.drop(['ID', 'T_LOCAL_TX_PD']).to_dict()
    G_train.add_node(record['ID'], **attrs, t_local_tx_pd=record['T_LOCAL_TX_PD'])

# Edges: only transactions whose two endpoints are both nodes of the graph.
G_train.add_edges_from(
    (tx['ID'], tx['COUNTERPARTY'], {'date': tx['DATE'], 'tx_amount': tx['TX_AMOUNT']})
    for _, tx in transactions_data.iterrows()
    if G_train.has_node(tx['ID']) and G_train.has_node(tx['COUNTERPARTY'])
)

print(f"Train Graph has {len(G_train.nodes)} nodes and {len(G_train.edges)} edges.")


In [None]:
G_test = nx.DiGraph()

# Nodes: keyed by ID; every other column becomes a node attribute and the
# target is stored under 't_local_tx_pd'. Rows sharing an ID update one node.
for _, record in df_test_encoded.iterrows():
    attrs = record.drop(['ID', 'T_LOCAL_TX_PD']).to_dict()
    G_test.add_node(record['ID'], **attrs, t_local_tx_pd=record['T_LOCAL_TX_PD'])

# Edges: only transactions whose two endpoints are both nodes of the graph.
G_test.add_edges_from(
    (tx['ID'], tx['COUNTERPARTY'], {'date': tx['DATE'], 'tx_amount': tx['TX_AMOUNT']})
    for _, tx in transactions_data.iterrows()
    if G_test.has_node(tx['ID']) and G_test.has_node(tx['COUNTERPARTY'])
)

print(f"Test Graph has {len(G_test.nodes)} nodes and {len(G_test.edges)} edges.")


In [None]:
def plot_neighborhood(depth, G=G_train, node_id=373869):
    """Plot the nodes reachable from ``node_id`` within ``depth`` hops.

    Nodes are laid out with a seeded spring layout and coloured by their hop
    distance from ``node_id``; distances beyond the palette are drawn black.

    Args:
        depth: Maximum number of hops to follow from ``node_id``.
        G: Graph to explore. The default is bound when this function is
            defined, so pass the graph explicitly after rebuilding ``G_train``.
        node_id: Node at the centre of the neighbourhood.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # The BFS result already holds each node's hop distance; it is reused for
    # both the subgraph selection and the colouring, which avoids one
    # shortest-path search per node.
    hop_distance = nx.single_source_shortest_path_length(G, node_id, cutoff=depth)
    subgraph = G.subgraph(hop_distance.keys())
    pos = nx.spring_layout(subgraph, seed=42)

    # Plotly draws all edges as one trace; None separates the segments.
    edge_x, edge_y = [], []
    for source, target in subgraph.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=2, color='#888'),
                            hoverinfo='none', mode='lines')

    depth_colors = ['red', 'green', 'blue', 'purple', 'orange', 'yellow', 'pink', 'brown', 'grey', 'cyan']
    node_x, node_y, node_colors = [], [], []
    for node in subgraph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_depth = hop_distance[node]
        node_colors.append(depth_colors[node_depth] if node_depth < len(depth_colors) else 'black')

    node_trace = go.Scatter(x=node_x, y=node_y, mode='markers', hoverinfo='text',
                            marker=dict(showscale=False, colorscale='Viridis', size=10, color=node_colors, line_width=2))
    node_trace.text = [f'ID: {node}' for node in subgraph.nodes()]

    # title.font replaces the deprecated 'titlefont' layout attribute, which
    # newer plotly releases reject.
    fig = go.Figure(data=[edge_trace, node_trace], layout=go.Layout(
                title=dict(text=f'{depth}-Deep Neighborhood of Node {node_id}', font=dict(size=16)),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    fig.show()


# Draw the 1-, 2- and 3-hop neighbourhoods of the default node.
for i in range(1, 4):
    plot_neighborhood(i)


# PART 2: MODELING

Let's try some simple models on our train set, without using the transaction information, to see how they perform.

In [None]:
# Features: every encoded column except the target and the identifier.
X_train = df_train_encoded.drop(["T_LOCAL_TX_PD", "ID"], axis=1)
X_test = df_test_encoded.drop(["T_LOCAL_TX_PD", "ID"], axis=1)

# Targets are kept as single-column frames.
y_train = df_train_encoded[["T_LOCAL_TX_PD"]]
y_test = df_test_encoded[["T_LOCAL_TX_PD"]]

### 2.1: Using Random forest

In [None]:
# Hyper-parameter grid explored by 5-fold cross-validation.
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 4, 6],
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
# The estimator expects a 1-D target, hence the flattening of the target frame.
grid_search.fit(X_train, y_train.to_numpy().ravel())

best_model = grid_search.best_estimator_

y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_mse, test_mse = mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Training R2 score: {train_r2:.2f}")
print(f"Test R2 score: {test_r2:.2f}")
print(f"Training MSE: {train_mse:.2f}")
print(f"Test MSE: {test_mse:.2f}")

In [None]:
# Translate the numeric predictions back into rating labels.
# NOTE(review): assumes y_test_pred is row-aligned with test_data_ and has the
# same length; df_test went through a merge and drop_duplicates — confirm.
rf_labels = [from_pred_to_normal(pred) for pred in y_test_pred]
test_data_['PD random forest'] = np.array(rf_labels)
test_data_[["T_LOCAL_TX_PD", 'PD random forest']]

### 2.2: Let's try a GNN for predicting PD

In [None]:
# Each node's attribute dict holds the input features AND the regression
# target (stored under 't_local_tx_pd'). The target must be left out of x,
# otherwise the model receives the value it is asked to predict.
TARGET_ATTR = 't_local_tx_pd'

G_data_train = from_networkx(G_train)
G_data_train.x = torch.tensor(
    [[value for key, value in attrs.items() if key != TARGET_ATTR]
     for _, attrs in G_train.nodes(data=True)],
    dtype=torch.float,
)
G_data_train.y = torch.tensor(
    [attrs[TARGET_ATTR] for _, attrs in G_train.nodes(data=True)],
    dtype=torch.float,
)

G_data_test = from_networkx(G_test)
G_data_test.x = torch.tensor(
    [[value for key, value in attrs.items() if key != TARGET_ATTR]
     for _, attrs in G_test.nodes(data=True)],
    dtype=torch.float,
)
G_data_test.y = torch.tensor(
    [attrs[TARGET_ATTR] for _, attrs in G_test.nodes(data=True)],
    dtype=torch.float,
)

In [None]:
class EnhancedGCN(torch.nn.Module):
    """Three graph-convolution layers followed by a linear output layer.

    The first two convolutions are each followed by ReLU, batch normalisation
    and dropout; the third is followed by ReLU only.
    """

    def __init__(self, num_node_features, hidden_channels, output_channels=1, dropout_rate=0.5):
        super().__init__()
        self.dropout_rate = dropout_rate
        # Sub-modules are created in the same order as before so that
        # parameter registration is unchanged.
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.batch_norm1 = BatchNorm(hidden_channels)
        self.batch_norm2 = BatchNorm(hidden_channels)
        self.out = torch.nn.Linear(hidden_channels, output_channels)

    def forward(self, x, edge_index):
        """Return one output row per node for features ``x`` and ``edge_index``."""
        hidden_stages = (
            (self.conv1, self.batch_norm1),
            (self.conv2, self.batch_norm2),
        )
        # 1st and 2nd layers: convolution -> ReLU -> batch norm -> dropout
        for conv, norm in hidden_stages:
            x = norm(F.relu(conv(x, edge_index)))
            x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # 3rd layer: convolution -> ReLU, then the linear output layer
        x = F.relu(self.conv3(x, edge_index))
        return self.out(x)
 

In [None]:

# Model hyper-parameters
num_node_features = G_data_train.num_node_features
hidden_channels = 64
output_channels = 1
dropout_rate = 0.5

model = EnhancedGCN(
    num_node_features=num_node_features,
    hidden_channels=hidden_channels,
    output_channels=output_channels,
    dropout_rate=dropout_rate,
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = torch.nn.MSELoss()


def train():
    """Run one optimisation step on the whole train graph and return its loss."""
    model.train()
    optimizer.zero_grad()
    node_outputs = model(G_data_train.x, G_data_train.edge_index).squeeze()
    step_loss = loss_func(node_outputs, G_data_train.y)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()


for epoch in range(2000):
    loss = train()
    print(f'Epoch {epoch+1}: Loss: {loss}')

In [None]:
def evaluate(data):
    """Return the loss of the global ``model`` on ``data``, computed without gradients."""
    model.eval()
    with torch.no_grad():
        node_outputs = model(data.x, data.edge_index).squeeze()
        return loss_func(node_outputs, data.y).item()


test_loss = evaluate(G_data_test)
print(f'Test Loss: {test_loss}')

In [None]:
def predict(data):
    """Return the predictions of the global ``model`` for every node of ``data``.

    Args:
        data: Graph object exposing ``x`` and ``edge_index``.

    Returns:
        The model output with its trailing size-1 dimension removed.
    """
    model.eval()
    with torch.no_grad():
        predictions = model(data.x, data.edge_index)
    # Squeeze only the last dimension: a bare squeeze() would also drop the
    # node dimension of a one-node graph and return a 0-d tensor that cannot
    # be iterated over.
    return predictions.squeeze(-1)

predictions = predict(G_data_test)

In [None]:
# Translate the GNN outputs back into rating labels.
# NOTE(review): assumes predictions follow the row order of test_data_ — confirm.
gnn_labels = [from_pred_to_normal(pred) for pred in predictions]
test_data_['PD GNN'] = np.array(gnn_labels)
test_data_[["T_LOCAL_TX_PD", 'PD random forest', 'PD GNN']]

### 2.3: LET'S RESCALE NUMERICAL VARIABLES AND RETRY THE GNN

In [None]:
num_features_static = ["ESG_SCORE", "T_LOCAL_MT_ACTIF_SOCIAL"]
num_features_transac = ["TX_AMOUNT"]

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows: fitting again on the test set would put the two splits on
# different scales and use test-set statistics during preprocessing.
scaler = StandardScaler()
df_train_encoded[num_features_static] = scaler.fit_transform(df_train_encoded[num_features_static])
df_test_encoded[num_features_static] = scaler.transform(df_test_encoded[num_features_static])

# Transaction amounts get their own scaler so `scaler` keeps the statistics
# of the static features.
transaction_scaler = StandardScaler()
transactions_data[num_features_transac] = transaction_scaler.fit_transform(transactions_data[num_features_transac])


In [None]:
def _build_transaction_graph(node_frame):
    """Build a directed graph from ``node_frame`` rows and ``transactions_data``.

    Nodes are keyed by ID; every other column becomes a node attribute and the
    target is stored under 't_local_tx_pd'. Only transactions whose two
    endpoints are both nodes of the graph become edges.
    """
    graph = nx.DiGraph()
    for _, record in node_frame.iterrows():
        attrs = record.drop(['ID', 'T_LOCAL_TX_PD']).to_dict()
        graph.add_node(record['ID'], **attrs, t_local_tx_pd=record['T_LOCAL_TX_PD'])
    graph.add_edges_from(
        (tx['ID'], tx['COUNTERPARTY'], {'date': tx['DATE'], 'tx_amount': tx['TX_AMOUNT']})
        for _, tx in transactions_data.iterrows()
        if graph.has_node(tx['ID']) and graph.has_node(tx['COUNTERPARTY'])
    )
    return graph


G_train = _build_transaction_graph(df_train_encoded)
print(f"Train Graph has {len(G_train.nodes)} nodes and {len(G_train.edges)} edges.")

G_test = _build_transaction_graph(df_test_encoded)
print(f"Test Graph has {len(G_test.nodes)} nodes and {len(G_test.edges)} edges.")

In [None]:
# Each node's attribute dict holds the input features AND the regression
# target (stored under 't_local_tx_pd'). The target must be left out of x,
# otherwise the model receives the value it is asked to predict.
TARGET_ATTR = 't_local_tx_pd'

G_data_train = from_networkx(G_train)
G_data_train.x = torch.tensor(
    [[value for key, value in attrs.items() if key != TARGET_ATTR]
     for _, attrs in G_train.nodes(data=True)],
    dtype=torch.float,
)
G_data_train.y = torch.tensor(
    [attrs[TARGET_ATTR] for _, attrs in G_train.nodes(data=True)],
    dtype=torch.float,
)

G_data_test = from_networkx(G_test)
G_data_test.x = torch.tensor(
    [[value for key, value in attrs.items() if key != TARGET_ATTR]
     for _, attrs in G_test.nodes(data=True)],
    dtype=torch.float,
)
G_data_test.y = torch.tensor(
    [attrs[TARGET_ATTR] for _, attrs in G_test.nodes(data=True)],
    dtype=torch.float,
)

In [None]:
class EnhancedGCN(torch.nn.Module):
    """Three graph-convolution layers followed by a linear output layer.

    The first two convolutions are each followed by ReLU, batch normalisation
    and dropout; the third is followed by ReLU only.
    """

    def __init__(self, num_node_features, hidden_channels, output_channels=1, dropout_rate=0.5):
        super().__init__()
        self.dropout_rate = dropout_rate
        # Sub-modules are created in the same order as before so that
        # parameter registration is unchanged.
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.batch_norm1 = BatchNorm(hidden_channels)
        self.batch_norm2 = BatchNorm(hidden_channels)
        self.out = torch.nn.Linear(hidden_channels, output_channels)

    def forward(self, x, edge_index):
        """Return one output row per node for features ``x`` and ``edge_index``."""
        hidden_stages = (
            (self.conv1, self.batch_norm1),
            (self.conv2, self.batch_norm2),
        )
        # 1st and 2nd layers: convolution -> ReLU -> batch norm -> dropout
        for conv, norm in hidden_stages:
            x = norm(F.relu(conv(x, edge_index)))
            x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # 3rd layer: convolution -> ReLU, then the linear output layer
        x = F.relu(self.conv3(x, edge_index))
        return self.out(x)

In [None]:

# Model hyper-parameters for the run on rescaled features
num_node_features = G_data_train.num_node_features
hidden_channels = 70
output_channels = 1
dropout_rate = 0.2

model = EnhancedGCN(
    num_node_features=num_node_features,
    hidden_channels=hidden_channels,
    output_channels=output_channels,
    dropout_rate=dropout_rate,
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = torch.nn.MSELoss()


def train():
    """Run one optimisation step on the whole train graph and return its loss."""
    model.train()
    optimizer.zero_grad()
    node_outputs = model(G_data_train.x, G_data_train.edge_index).squeeze()
    step_loss = loss_func(node_outputs, G_data_train.y)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()


for epoch in range(2000):
    loss = train()
    print(f'Epoch {epoch+1}: Loss: {loss}')

In [None]:
def evaluate(data):
    """Return the loss of the global ``model`` on ``data``, computed without gradients."""
    model.eval()
    with torch.no_grad():
        node_outputs = model(data.x, data.edge_index).squeeze()
        return loss_func(node_outputs, data.y).item()


test_loss = evaluate(G_data_test)
print(f'Test Loss: {test_loss}')

In [None]:
def predict(data):
    """Return the predictions of the global ``model`` for every node of ``data``.

    Args:
        data: Graph object exposing ``x`` and ``edge_index``.

    Returns:
        The model output with its trailing size-1 dimension removed.
    """
    model.eval()
    with torch.no_grad():
        predictions = model(data.x, data.edge_index)
    # Squeeze only the last dimension: a bare squeeze() would also drop the
    # node dimension of a one-node graph and return a 0-d tensor that cannot
    # be iterated over.
    return predictions.squeeze(-1)

predictions = predict(G_data_test)

In [None]:
# Translate the outputs of the GNN trained on rescaled data into rating labels.
# NOTE(review): assumes predictions follow the row order of test_data_ — confirm.
gnn2_labels = [from_pred_to_normal(pred) for pred in predictions]
test_data_['PD GNN 2'] = np.array(gnn2_labels)
test_data_[["T_LOCAL_TX_PD", 'PD random forest', 'PD GNN', 'PD GNN 2']]

# PART 3: ANALYSIS AND CONCLUSIONS

The random forest is the best estimator we found for predicting PD in q4. Here the GNN mostly predicts values between 4 and 5. It is less accurate even though this model takes transactions into account.

After scaling, the new GNN performs better than the previous GNN.