In [1]:
import os
from collections import Counter

import gravis as gv
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.colors as pc
import plotly.graph_objects as go
from IPython.display import HTML, display

In [2]:
# Load the FBW table: semicolon-separated, read without a header row so the
# first line of the file is treated as data.
data_FBW = pd.read_csv('./data/FBW.csv', header=None, sep=';')
# show the first 5 rows
data_FBW.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Wet clay,Modelling,Pressure,Wet smoothing,Leather-hard,Dry,Open firing,,,,,,,,
1,Wet clay,Modelling,Coiling,Pressure,Application separate element,Wet smoothing,Excisions,Leather-hard,Burnishing,Dry,Open firing,Incrustation,,,
2,Wet clay,Modelling,Coiling,Pressure,Application separate element,Wet smoothing,Simple incision,Simple impression,Leather-hard,Burnishing,Dry,Open firing,,,
3,Wet clay,Modelling,Coiling,Pressure,Application separate element,Scraping,Wet smoothing,Simple incision,Simple impression,Leather-hard,Burnishing,Dry,Open firing,,
4,Wet clay,Modelling,Pressure,Wet smoothing,Leather-hard,Dry,Open firing,,,,,,,,


In [6]:
# Create a graph from the data, excluding NaN values. The data are not in edge-list form: each row is a path in the graph and each element of the row is a node.

def graph_from_data(data):
    """Build a weighted directed graph from a table of paths.

    Every row of ``data`` is read as one path: its non-NaN entries, taken
    left to right, are the nodes visited in order. Each pair of consecutive
    entries contributes one directed edge whose ``weight`` counts how many
    times that transition occurs over all rows. Afterwards each weight is
    divided by the total outgoing weight of its source node.

    Parameters
    ----------
    data : pandas.DataFrame
        One path per row; missing entries (NaN) are dropped.

    Returns
    -------
    networkx.DiGraph
        Graph whose edges carry the normalised ``weight`` attribute.
    """
    graph = nx.DiGraph()

    # Pass 1: count every observed transition.
    for row_idx in range(len(data)):
        steps = data.iloc[row_idx].dropna().tolist()
        for src, dst in zip(steps, steps[1:]):
            graph.add_edge(src, dst)
            graph[src][dst]['weight'] = graph[src][dst].get('weight', 0) + 1

    # Pass 2: snapshot every node's outgoing strength *before* rescaling any
    # edge, so the divisor is always the raw transition count.
    out_strength = {}
    for node in graph.nodes():
        out_strength[node] = graph.out_degree(node, weight='weight')

    for src, dst in graph.edges():
        # Fall back to 1 to avoid dividing by zero.
        divisor = out_strength[src] if out_strength[src] > 0 else 1
        graph[src][dst]['weight'] /= divisor

    return graph

# Build the transition graph from the FBW table
G_FBW = graph_from_data(data_FBW)

# print the weights of the edges
for u, v, data in G_FBW.edges(data=True):
    print(f"Edge {u} -> {v} has weight {data['weight']}")

# create a layout for the graph (seeded so node positions are reproducible)
pos_FBW = nx.spring_layout(G_FBW, seed=42)

# Colour each node by its degree relative to the largest degree in the graph
node_degrees = dict(G_FBW.degree())
max_degree = max(node_degrees.values())  # hoisted: constant across the loop
node_colors = {}
for node, degree in node_degrees.items():
    node_colors[node] = pc.sample_colorscale('Viridis', degree / max_degree)[0]
nx.set_node_attributes(G_FBW, node_colors, 'color')

# Create dictionaries for edge colors and widths
edge_weights = nx.get_edge_attributes(G_FBW, 'weight')
max_weight = max(edge_weights.values())
edge_colors = {}
edge_widths = {}

# Assign colors and widths to each edge based on weight relative to the
# heaviest edge
for u, v, weight in G_FBW.edges(data='weight'):
    normalized_weight = weight / max_weight
    edge_colors[(u, v)] = pc.sample_colorscale('Viridis', normalized_weight)[0]
    edge_widths[(u, v)] = 2 * normalized_weight

# Store the styling on the graph so later plotting code can read it back
nx.set_edge_attributes(G_FBW, edge_colors, 'color')
nx.set_edge_attributes(G_FBW, edge_widths, 'width')

fig_FBW = go.Figure()
# add the nodes to the graph: one labelled marker trace per node
for node, data in G_FBW.nodes(data=True):
    fig_FBW.add_trace(go.Scatter(
        x=[pos_FBW[node][0]],
        y=[pos_FBW[node][1]],
        mode='markers+text',
        text=[node],
        textposition='top center',
        marker=dict(color=data['color']),
        name=node
    ))
# add the edges to the graph: one line trace per edge.
# NOTE(review): the arrow `marker` below has no visible effect while
# mode='lines' (markers are only drawn when the mode includes 'markers'),
# so this figure does not show edge direction.
for u, v, data in G_FBW.edges(data=True):
    fig_FBW.add_trace(go.Scatter(
        x=[pos_FBW[u][0], pos_FBW[v][0]],
        y=[pos_FBW[u][1], pos_FBW[v][1]],
        mode='lines',
        line=dict(color=data['color'], width=data['width'], shape='spline', dash='solid'),
        name=f"{u} -> {v}",
        marker=dict(symbol='arrow', size=10, angleref='previous', color=data['color'])
    ))
# set the layout of the plotly graph
fig_FBW.update_layout(
    title='FBW Graph',
    showlegend=False,
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    height=800,
    width=800
)
# write the plotly graph to a standalone HTML file
fig_FBW.write_html(
     './graph_test.html', include_plotlyjs='cdn', full_html=True)

# Read the file content first
with open("./graph_test.html", "r", encoding="utf-8") as f:
    html_content = f.read()
# A bare `HTML(...)` expression is only rendered when it is the last
# statement of a cell; here more code follows, so display it explicitly.
display(HTML(html_content))

# Second rendering of the same graph with gravis, exported to its own file.
# NOTE(review): confirm whether export_html refuses to overwrite an existing
# file when this cell is re-run.
fig1 = gv.d3(G_FBW)

fig1.export_html(
    './graph_test_gravis.html'
)





Edge Wet clay -> Modelling has weight 0.9679144385026738
Edge Wet clay -> Coiling has weight 0.03208556149732621
Edge Modelling -> Pressure has weight 0.08839779005524862
Edge Modelling -> Coiling has weight 0.9116022099447514
Edge Pressure -> Wet smoothing has weight 0.48128342245989303
Edge Pressure -> Application separate element has weight 0.39037433155080214
Edge Pressure -> Scraping has weight 0.12299465240641712
Edge Pressure -> Leather-hard has weight 0.0053475935828877
Edge Wet smoothing -> Leather-hard has weight 0.3837837837837838
Edge Wet smoothing -> Excisions has weight 0.06486486486486487
Edge Wet smoothing -> Simple incision has weight 0.16756756756756758
Edge Wet smoothing -> Simple impression has weight 0.3621621621621622
Edge Wet smoothing -> Application has weight 0.010810810810810811
Edge Wet smoothing -> Tilted impression has weight 0.010810810810810811
Edge Leather-hard -> Dry has weight 0.10160427807486631
Edge Leather-hard -> Burnishing has weight 0.83957219251

In [26]:
# Create a layout for the graph. The seed pins the node positions so the
# figure (and every later plot that reuses pos_FBW) is reproducible.
pos_FBW = nx.spring_layout(G_FBW, seed=42)

# Create a color map for the nodes based on their degree
node_degrees = dict(G_FBW.degree())
max_degree = max(node_degrees.values())  # hoisted: constant across the loop
node_colors = {}
for node, degree in node_degrees.items():
    node_colors[node] = pc.sample_colorscale('Viridis', degree / max_degree)[0]
nx.set_node_attributes(G_FBW, node_colors, 'color')

# Create dictionaries for edge colors and widths
edge_weights = nx.get_edge_attributes(G_FBW, 'weight')
max_weight = max(edge_weights.values())
edge_colors = {}
edge_widths = {}

# Assign colors and widths to each edge based on weight
for u, v, weight in G_FBW.edges(data='weight'):
    normalized_weight = weight / max_weight
    edge_colors[(u, v)] = pc.sample_colorscale('Viridis', normalized_weight)[0]
    edge_widths[(u, v)] = 1 + 3 * normalized_weight  # Scale width between 1-4

# Set the edge attributes (read back below and by later plotting code)
nx.set_edge_attributes(G_FBW, edge_colors, 'color')
nx.set_edge_attributes(G_FBW, edge_widths, 'width')

fig_FBW = go.Figure()

# Add the nodes to the graph: one labelled marker trace per node
for node, data in G_FBW.nodes(data=True):
    fig_FBW.add_trace(go.Scatter(
        x=[pos_FBW[node][0]],
        y=[pos_FBW[node][1]],
        mode='markers+text',
        text=[node],
        textposition='top center',
        marker=dict(color=data['color'], size=10),
        name=node,
        hoverinfo='text',
        hovertext=f"Node: {node}<br>Degree: {G_FBW.degree(node)}"
    ))

# Add the edges as lines (without arrows); direction is added further down
# with annotations
for u, v, data in G_FBW.edges(data=True):
    fig_FBW.add_trace(go.Scatter(
        x=[pos_FBW[u][0], pos_FBW[v][0]],
        y=[pos_FBW[u][1], pos_FBW[v][1]],
        mode='lines',
        line=dict(color=data['color'], width=data['width']),
        showlegend=False,
        hoverinfo='text',
        hovertext=f"Edge: {u} → {v}<br>Weight: {data['weight']}"
    ))

# Legend header entry: a point-less trace that only contributes a legend row
fig_FBW.add_trace(go.Scatter(
    x=[None],
    y=[None],
    mode='lines',
    line=dict(color='black', width=2),
    name='Edge Legend',
    hoverinfo='none'
))
# Add a legend row per edge (again point-less traces, styled like the edge)
for u, v, data in G_FBW.edges(data=True):
    fig_FBW.add_trace(go.Scatter(
        x=[None],
        y=[None],
        mode='lines',
        line=dict(color=data['color'], width=data['width']),
        name=f"{u} → {v} (Weight: {data['weight']})",
        hoverinfo='none',
        showlegend=True
    ))

# Add arrow annotations to show direction
for u, v, data in G_FBW.edges(data=True):
    # Get the positions
    x0, y0 = pos_FBW[u]
    x1, y1 = pos_FBW[v]

    # Calculate the midpoint - this is where we'll place our arrow
    xmid = (x0 + x1) / 2
    ymid = (y0 + y1) / 2

    # Calculate the direction vector
    dx = x1 - x0
    dy = y1 - y0

    length = np.sqrt(dx**2 + dy**2)
    if length == 0:
        # Self-loops (or coincident nodes) have no direction to draw, and
        # normalising by a zero length would produce NaN coordinates.
        continue

    # Normalize and scale to create a shorter arrow at the midpoint
    arrow_length = length / 8  # Adjust this factor to control arrow length
    dx = dx / length * arrow_length
    dy = dy / length * arrow_length

    # Add the arrow annotation, centred on the edge midpoint
    fig_FBW.add_annotation(
        x=xmid + dx/2,  # End point of arrow
        y=ymid + dy/2,
        ax=xmid - dx/2,  # Start point of arrow
        ay=ymid - dy/2,
        xref='x',
        yref='y',
        axref='x',
        ayref='y',
        showarrow=True,
        arrowhead=2,  # Arrow style
        arrowsize=1.5,  # Arrow size
        arrowwidth=data['width'],  # Match edge width
        arrowcolor=data['color']   # Match edge color
    )


# Set the layout of the plotly graph
fig_FBW.update_layout(
    title='FBW Directed Graph',
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    height=800,
    width=800,
    legend=dict(
        orientation='h',  # Horizontal legend
        yanchor='bottom',
        y=1.02,  # Position above the graph
        xanchor='center',
        x=0.5,  # Centered horizontally
        title_text='Edge Weights',
        title_font=dict(size=14),
        font=dict(size=10)
    )

)
# print the weights of the edges
print("Edge Weights:")
for u, v, data in G_FBW.edges(data=True):
    print(f"{u} → {v}: {data['weight']}")

# Write the plotly graph to a standalone HTML file
fig_FBW.write_html('./graph_test.html', include_plotlyjs='cdn', full_html=True)

# Read the file content and display (last expression of the cell, so it renders)
with open("./graph_test.html", "r", encoding="utf-8") as f:
    html_content = f.read()
HTML(html_content)

Edge Weights:
Wet clay → Modelling: 0.9679144385026738
Wet clay → Coiling: 0.03208556149732621
Modelling → Pressure: 0.08839779005524862
Modelling → Coiling: 0.9116022099447514
Pressure → Wet smoothing: 0.48128342245989303
Pressure → Application separate element: 0.39037433155080214
Pressure → Scraping: 0.12299465240641712
Pressure → Leather-hard: 0.0053475935828877
Wet smoothing → Leather-hard: 0.3837837837837838
Wet smoothing → Excisions: 0.06486486486486487
Wet smoothing → Simple incision: 0.16756756756756758
Wet smoothing → Simple impression: 0.3621621621621622
Wet smoothing → Application: 0.010810810810810811
Wet smoothing → Tilted impression: 0.010810810810810811
Leather-hard → Dry: 0.10160427807486631
Leather-hard → Burnishing: 0.839572192513369
Leather-hard → Shining: 0.0213903743315508
Leather-hard → Excisions: 0.0213903743315508
Leather-hard → Simple incision: 0.0106951871657754
Leather-hard → Leather hard smoothing: 0.0053475935828877
Dry → Open firing: 1.0
Open firing → Inc

In [27]:
# Starting from the start node, walk towards the end node by following the edge weights: at each node the next step is drawn at random using the probabilities attached to its outgoing edges, which produces a sampled path.

def find_most_probable_path(G, start_node, end_node, rng=None, max_steps=None):
    """Sample a weighted random walk from ``start_node`` towards ``end_node``.

    Despite the name, this does not return the single most likely path: at
    every node the next node is drawn at random, with probability
    proportional to the ``weight`` of each outgoing edge. Repeated calls can
    therefore return different paths.

    Parameters
    ----------
    G : graph
        Directed graph exposing ``successors(node)`` and
        ``G[u][v]['weight']``.
    start_node, end_node : node
        The walk starts at ``start_node`` and stops once ``end_node`` is the
        current node.
    rng : optional
        Object with a NumPy-style ``choice(n, p=...)`` method (for example
        ``np.random.default_rng(seed)``). Defaults to the global
        ``np.random`` state.
    max_steps : int, optional
        Upper bound on the number of transitions. ``None`` (default) means
        unbounded, which may not terminate on a cyclic graph from which
        ``end_node`` cannot be reached.

    Returns
    -------
    list
        Visited nodes in order, beginning with ``start_node``. The walk is
        truncated (and does not end in ``end_node``) when it reaches a node
        without successors or exhausts ``max_steps``.
    """
    sampler = np.random if rng is None else rng
    path = [start_node]
    current_node = start_node

    while current_node != end_node:
        if max_steps is not None and len(path) - 1 >= max_steps:
            break  # Step budget exhausted

        neighbors = list(G.successors(current_node))
        if not neighbors:
            break  # No more neighbors to explore

        # Turn the outgoing edge weights into a probability vector
        weights = [G[current_node][neighbor]['weight'] for neighbor in neighbors]
        total_weight = sum(weights)
        probabilities = [weight / total_weight for weight in weights]

        # Draw an *index* rather than the node itself: passing the neighbour
        # list to `choice` converts it to a NumPy array, which returns NumPy
        # scalars (np.str_) instead of the original node objects and fails
        # for node labels that are not scalars (e.g. tuples).
        next_index = sampler.choice(len(neighbors), p=probabilities)
        current_node = neighbors[next_index]
        path.append(current_node)

    return path

# Example usage
# NOTE: find_most_probable_path draws the next node at random at every step,
# so the path printed here can differ from run to run.
start_node = 'Wet clay'  # Replace with your actual start node
end_node = 'Dry'  # Replace with your actual end node
most_probable_path = find_most_probable_path(G_FBW, start_node, end_node)
print("Most probable path from {} to {}: {}".format(start_node, end_node, most_probable_path))
# To visualize the most probable path, we can highlight it in the graph


Most probable path from Wet clay to Dry: ['Wet clay', np.str_('Modelling'), np.str_('Coiling'), np.str_('Pressure'), np.str_('Application separate element'), np.str_('Wet smoothing'), np.str_('Tilted impression'), np.str_('Leather-hard'), np.str_('Shining'), np.str_('Dry')]


In [28]:
# plot the most probable path in the graph
def plot_most_probable_path(G, path, pos):
    """Draw the whole graph and overlay ``path`` as dashed red segments.

    Parameters
    ----------
    G : graph
        Nodes must carry a ``color`` attribute; edges must carry ``color``
        and ``width`` attributes.
    path : sequence of nodes
        Consecutive entries are joined by a highlighted segment.
    pos : mapping
        Node -> (x, y) coordinates.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    figure = go.Figure()

    # Layer 1: one labelled marker per node.
    for label, attrs in G.nodes(data=True):
        node_xy = pos[label]
        figure.add_trace(
            go.Scatter(
                x=[node_xy[0]],
                y=[node_xy[1]],
                mode='markers+text',
                text=[label],
                textposition='top center',
                marker=dict(color=attrs['color'], size=10),
                name=label,
            )
        )

    # Layer 2: every edge, styled from its stored attributes.
    for src, dst, attrs in G.edges(data=True):
        figure.add_trace(
            go.Scatter(
                x=[pos[src][0], pos[dst][0]],
                y=[pos[src][1], pos[dst][1]],
                mode='lines',
                line=dict(color=attrs['color'], width=attrs['width']),
                showlegend=False,
            )
        )

    # Layer 3: the path itself, drawn last so it sits on top.
    for step in range(len(path) - 1):
        src, dst = path[step], path[step + 1]
        figure.add_trace(
            go.Scatter(
                x=[pos[src][0], pos[dst][0]],
                y=[pos[src][1], pos[dst][1]],
                mode='lines',
                line=dict(color='red', width=4, dash='dash'),
                name=f"{src} → {dst} (Path)",
                showlegend=False,
            )
        )

    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    figure.update_layout(
        title='Most Probable Path Visualization',
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        height=800,
        width=800,
    )

    return figure

# Plot the most probable path.
# Relies on notebook state from earlier cells: G_FBW must still carry the
# 'color' / 'width' attributes, and pos_FBW must hold the node layout.
most_probable_path_fig = plot_most_probable_path(G_FBW, most_probable_path, pos_FBW)
# Write the plotly graph for the most probable path to a standalone HTML file
most_probable_path_fig.write_html('./most_probable_path.html', include_plotlyjs='cdn', full_html=True)
# Read the file content and display (last expression of the cell, so it renders)
with open("./most_probable_path.html", "r", encoding="utf-8") as f:
    html_content = f.read()
HTML(html_content)


In [29]:
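# Probability of a path under the empirical transition weights. The general chain rule is P(node_1, ..., node_n) = P(node_1) * P(node_2 | node_1) * ... * P(node_n | node_{n-1}, ..., node_1); the function below uses the first-order (Markov) form and multiplies only the edge probabilities P(node_{i+1} | node_i).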
def empirical_path_distribution(G, path):
    """Return the probability of ``path`` as a product of edge probabilities.

    For every consecutive pair ``(u, v)`` in ``path`` the factor is the
    weight of edge ``u -> v`` divided by the summed weight of all edges
    leaving ``u``. A missing edge, or a source with non-positive outgoing
    weight, contributes a factor of 0. A path with fewer than two nodes
    yields 1.0.

    Parameters
    ----------
    G : graph
        Directed graph with a ``weight`` attribute on every edge.
    path : sequence of nodes

    Returns
    -------
    float
    """
    probability = 1.0
    for src, dst in zip(path, path[1:]):
        if not G.has_edge(src, dst):
            probability *= 0  # transition absent from the graph
            continue

        edge_weight = G[src][dst]['weight']
        out_total = sum(G[src][succ]['weight'] for succ in G.successors(src))
        if out_total > 0:
            probability *= edge_weight / out_total
        else:
            probability *= 0  # nothing to normalise against
    return probability
# Example usage
start_node = 'Wet clay'  # Replace with your actual start node
end_node = 'Dry'  # Replace with your actual end node
path = find_most_probable_path(G_FBW, start_node, end_node)  # Samples a new random path; it is not the path found in an earlier cell
empirical_prob = empirical_path_distribution(G_FBW, path)
print("Empirical probability of the path {}: {:.4f}".format(path, empirical_prob))
# To visualize the empirical path distribution, we can create a bar chart showing the probabilities of each edge in the path
def plot_empirical_path_distribution(G, path):
    """Bar chart of the edge weights along ``path``.

    One bar per consecutive pair of nodes in ``path``; the bar height is the
    ``weight`` stored on that edge of ``G`` (used as-is, without further
    normalisation).

    Returns
    -------
    plotly.graph_objects.Figure
    """
    hops = list(zip(path, path[1:]))
    bar_heights = [G[src][dst]['weight'] for src, dst in hops]
    bar_labels = [f"{src} → {dst}" for src, dst in hops]

    bars = go.Bar(x=bar_labels, y=bar_heights, marker_color='blue')
    chart = go.Figure(data=[bars])
    chart.update_layout(
        title='Empirical Path Distribution',
        xaxis_title='Edges',
        yaxis_title='Probability',
        height=400,
        width=800,
    )
    return chart
# Plot the empirical path distribution for the path sampled above
empirical_path_fig = plot_empirical_path_distribution(G_FBW, path)
# Write the plotly graph for the empirical path distribution to a standalone HTML file
empirical_path_fig.write_html('./empirical_path_distribution.html', include_plotlyjs='cdn', full_html=True)
# Read the file content and display (last expression of the cell, so it renders)
with open("./empirical_path_distribution.html", "r", encoding="utf-8") as f:
    html_content = f.read()
HTML(html_content)


Empirical probability of the path ['Wet clay', np.str_('Modelling'), np.str_('Coiling'), np.str_('Pressure'), np.str_('Application separate element'), np.str_('Wet smoothing'), np.str_('Leather-hard'), np.str_('Dry')]: 0.0113


In [30]:
# Given a new path determine if it is most likely to come from the empirical distribution or from the random distribution
def is_most_likely_path(G, path, empirical_prob):
    """Compare a path probability against a uniform-over-edges baseline.

    The baseline assumes each of the ``len(path) - 1`` transitions picks one
    of the graph's edges uniformly at random, i.e.
    ``1 / number_of_edges ** (len(path) - 1)``.

    Returns
    -------
    bool
        True when ``empirical_prob`` is strictly greater than the baseline.
    """
    n_transitions = len(path) - 1
    n_edges = len(G.edges())
    # Assuming uniform distribution over edges
    random_prob = 1.0 / (n_edges ** n_transitions)
    return empirical_prob > random_prob
# Example usage
start_node = 'Wet clay'  # Replace with your actual start node
end_node = 'Open firing'  # Replace with your actual end node
new_path = find_most_probable_path(G_FBW, start_node, end_node)  # Samples a new random path ending at 'Open firing' (not the path used earlier)
empirical_prob_new = empirical_path_distribution(G_FBW, new_path)
is_most_likely = is_most_likely_path(G_FBW, new_path, empirical_prob_new)
print("Is the new path more likely than random?", is_most_likely)


Is the new path more likely than random? True


In [None]:
# the chain of steps to make a pot (all the elements in a row of the data) is a path. I would like to find a pdf of the paths, so I can sample from it and generate new paths

def path_pdf(data_FBW):
    """Empirical distribution of whole paths in a table.

    Each row, with its NaN entries dropped, is one path. Rows that reduce to
    fewer than two nodes are ignored. The probability of a path is the
    number of rows equal to it divided by the number of rows kept.

    Parameters
    ----------
    data_FBW : pandas.DataFrame
        One path per row.

    Returns
    -------
    dict
        Maps each distinct path (as a tuple of nodes) to its relative
        frequency, in order of first appearance.
    """
    row_paths = (
        data_FBW.iloc[row_idx].dropna().tolist()
        for row_idx in range(len(data_FBW))
    )
    tally = Counter(tuple(steps) for steps in row_paths if len(steps) > 1)

    n_paths = sum(tally.values())
    return {steps: occurrences / n_paths for steps, occurrences in tally.items()}
def path_pdf_weights(G, data=None):
    """Score the paths of a table with the edge weights of ``G``.

    Each row of ``data`` (NaN entries dropped) is one path. Its score is the
    product of ``G[u][v]['weight']`` over consecutive nodes, or 0 as soon as
    a transition is not an edge of ``G``. Scores of identical rows are added
    together, so a path that occurs several times is weighted by its
    multiplicity. Finally all scores are divided by their total so they sum
    to 1.

    Parameters
    ----------
    G : graph
        Directed graph with a ``weight`` attribute on every edge.
    data : pandas.DataFrame, optional
        Table the paths are read from; pass the table ``G`` was built from.
        When omitted, the module-level ``data_FBW`` is used. That fallback
        exists only for backward compatibility: with a graph built from
        another dataset it scores the wrong rows.

    Returns
    -------
    dict
        Maps each distinct path (tuple of nodes) with at least two nodes to
        its normalised score; every value is 0 when all scores are 0.
    """
    if data is None:
        data = data_FBW  # legacy behaviour: implicit global dataset

    path_probabilities = {}
    for i in range(len(data)):
        path = tuple(data.iloc[i].dropna().tolist())
        if len(path) <= 1:
            continue  # a single node has no transition to score

        prob = 1.0
        for u, v in zip(path, path[1:]):
            prob *= G[u][v]['weight'] if G.has_edge(u, v) else 0

        # Accumulate: repeated rows add their score again.
        path_probabilities[path] = path_probabilities.get(path, 0) + prob

    # Normalize the path probabilities
    total_prob = sum(path_probabilities.values())
    if total_prob > 0:
        return {path: prob / total_prob for path, prob in path_probabilities.items()}
    return {path: 0 for path in path_probabilities}
# Calculate the path probabilities using the empirical distribution


# Example usage: score the observed paths with the FBW edge weights
path_probabilities = path_pdf_weights(G_FBW)
# Print the path probabilities (one line per distinct path)
for path, prob in path_probabilities.items():
    print(f"Path: {path}, Probability: {prob:.4f}")
# To visualize the path probabilities, we can create a bar chart showing the probabilities of each unique path
# To visualize the path probabilities, we can create a bar chart showing the probabilities of each unique path
def plot_path_probabilities(path_probabilities, height=400, width=800):
    """Bar chart of path probabilities, tallest bar first.

    Parameters
    ----------
    path_probabilities : dict
        Maps a path (sequence of strings) to its probability.
    height, width : int
        Figure size in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
        Bars are labelled with the path's nodes joined by ``' -> '``.
    """
    all_paths = list(path_probabilities.keys())
    all_probs = list(path_probabilities.values())

    # Descending order, obtained by reversing an ascending argsort.
    ranking = np.argsort(all_probs)[::-1]
    bar_labels = [' -> '.join(all_paths[idx]) for idx in ranking]
    bar_heights = [all_probs[idx] for idx in ranking]

    chart = go.Figure(data=[go.Bar(x=bar_labels, y=bar_heights, marker_color='green')])
    chart.update_layout(
        title='Path Probabilities',
        xaxis_title='Paths',
        yaxis_title='Probability',
        height=height,
        width=width,
    )
    return chart
# Plot the path probabilities computed above
path_prob_fig = plot_path_probabilities(path_probabilities)
# Write the plotly graph for the path probabilities to a standalone HTML file
path_prob_fig.write_html('./path_probabilities.html', include_plotlyjs='cdn', full_html=True)
# Read the file content and display (last expression of the cell, so it renders)
with open("./path_probabilities.html", "r", encoding="utf-8") as f:
    html_content = f.read()
HTML(html_content, metadata=dict(width=800, height=1600))

Path: ('Wet clay', 'Modelling', 'Pressure', 'Wet smoothing', 'Leather-hard', 'Dry', 'Open firing'), Probability: 0.0016
Path: ('Wet clay', 'Modelling', 'Coiling', 'Pressure', 'Application separate element', 'Wet smoothing', 'Excisions', 'Leather-hard', 'Burnishing', 'Dry', 'Open firing', 'Incrustation'), Probability: 0.0002
Path: ('Wet clay', 'Modelling', 'Coiling', 'Pressure', 'Application separate element', 'Wet smoothing', 'Simple incision', 'Simple impression', 'Leather-hard', 'Burnishing', 'Dry', 'Open firing'), Probability: 0.0131
Path: ('Wet clay', 'Modelling', 'Coiling', 'Pressure', 'Application separate element', 'Scraping', 'Wet smoothing', 'Simple incision', 'Simple impression', 'Leather-hard', 'Burnishing', 'Dry', 'Open firing'), Probability: 0.0003
Path: ('Wet clay', 'Modelling', 'Coiling', 'Pressure', 'Application separate element', 'Wet smoothing', 'Simple incision', 'Simple impression', 'Excisions', 'Leather-hard', 'Burnishing', 'Dry', 'Open firing'), Probability: 0.000

In [32]:
# Keeping the same path ordering, overlay the path probabilities of the two datasets to check how similar the pot abundance distributions are.

def plot_path_abundance_comparison(path_probabilities1, path_probabilities2, name1 = "Dataset 1", name2 = "Dataset 2"):
    """Overlay the path probabilities of two datasets on one bar chart.

    Both datasets are drawn against a shared x axis. Paths of the first
    dataset come first, ordered by decreasing probability in that dataset;
    paths that only occur in the second dataset follow, ordered by
    decreasing probability there. Probabilities are looked up by path, so a
    path missing from one dataset is drawn with height 0 for it and the two
    dictionaries may differ in keys, key order and size.

    Parameters
    ----------
    path_probabilities1, path_probabilities2 : dict
        Map a path (sequence of strings) to its probability.
    name1, name2 : str
        Legend labels of the two datasets.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    paths1 = list(path_probabilities1.keys())
    probabilities1 = list(path_probabilities1.values())

    # Reference ordering: dataset 1, most probable path first.
    sorted_indices = np.argsort(probabilities1)[::-1]
    ordered_paths = [paths1[i] for i in sorted_indices]

    # Paths unknown to dataset 1 are appended instead of being dropped or
    # mis-indexed.
    only_in_second = [path for path in path_probabilities2 if path not in path_probabilities1]
    only_in_second.sort(key=path_probabilities2.get, reverse=True)
    ordered_paths.extend(only_in_second)

    # Align both datasets on the shared ordering by key, not by position.
    labels = [' -> '.join(path) for path in ordered_paths]
    aligned1 = [path_probabilities1.get(path, 0) for path in ordered_paths]
    aligned2 = [path_probabilities2.get(path, 0) for path in ordered_paths]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=labels,
        y=aligned1,
        name= name1,
        marker_color='blue',
        opacity=0.6
    ))
    fig.add_trace(go.Bar(
        x=labels,
        y=aligned2,
        name= name2,
        marker_color='red',
        opacity=0.6
    ))
    fig.update_layout(
        title='Path Abundance Comparison',
        xaxis_title='Paths',
        yaxis_title='Probability',
        barmode='overlay',  # Overlay the bars
        height=1600,
        width=800,
        legend=dict(
            orientation='h',  # Horizontal legend
            yanchor='bottom',
            y=1.02,  # Position above the graph
            xanchor='center',
            x=0.5,  # Centered horizontally
            title_text='Datasets',
            title_font=dict(size=14),
            font=dict(size=10)
        )
    )
    return fig

# Example usage with two datasets
data_FBW = pd.read_csv('./data/FBW.csv', header=None, sep=';')
# Preview of the first 5 rows (not rendered: only the last expression of a cell is displayed)
data_FBW.head()

data_CW = pd.read_csv('./data/CW.csv', header=None, sep=';')
# Preview of the first 5 rows (not rendered, see above)
data_CW.head()
# Create graphs from the data (fresh graphs: only the 'weight' edge attribute is set)
G_FBW = graph_from_data(data_FBW)
G_CW = graph_from_data(data_CW)
# Calculate the path probabilities for both datasets
# NOTE(review): path_pdf_weights is called with the graph only, and in that
# case it reads its paths from the module-level data_FBW. The "CW" result is
# therefore the FBW rows scored with the CW edge weights, not the CW rows.
path_probabilities_FBW = path_pdf_weights(G_FBW)
path_probabilities_CW = path_pdf_weights(G_CW)
# Plot the path abundance comparison
path_abundance_comparison_fig = plot_path_abundance_comparison(path_probabilities_FBW, path_probabilities_CW, name1="FBW Dataset", name2="CW Dataset")
# Write the plotly graph for the path abundance comparison to a standalone HTML file
path_abundance_comparison_fig.write_html('./path_abundance_comparison.html', include_plotlyjs='cdn', full_html=True)
# Read the file content and display (last expression of the cell, so it renders)
with open("./path_abundance_comparison.html", "r", encoding="utf-8") as f:
    html_content = f.read()

HTML(html_content, metadata=dict(width=800, height=1600))



In [33]:
# Compute the Wasserstein distance between the two path distributions
from scipy.stats import wasserstein_distance

def compute_wasserstein_distance(path_probabilities1, path_probabilities2):
    """Wasserstein distance between the probability values of two path dicts.

    Both dictionaries are expanded over the union of their paths (a missing
    path counts as probability 0) and the two resulting lists are handed to
    ``scipy.stats.wasserstein_distance``.

    NOTE(review): SciPy interprets its first two arguments as observed
    sample values, so this compares how the probability magnitudes are
    distributed in each dataset; the result does not depend on which path
    carries which probability. Confirm this is the intended comparison.

    Returns
    -------
    float
    """
    # Shared support so both vectors have one entry per known path.
    support = set(path_probabilities1.keys()) | set(path_probabilities2.keys())

    values_first = [path_probabilities1.get(path, 0) for path in support]
    values_second = [path_probabilities2.get(path, 0) for path in support]

    return wasserstein_distance(values_first, values_second)
# Example usage: compare the FBW and CW path dictionaries computed in the previous cell
wasserstein_distance_value = compute_wasserstein_distance(path_probabilities_FBW, path_probabilities_CW)
print(f"Wasserstein distance between the two path distributions: {wasserstein_distance_value:.4f}")
# To visualize the Wasserstein distance, we can create a bar chart showing the probabilities of each unique path in both datasets

Wasserstein distance between the two path distributions: 0.0034


In [34]:
# Extract weights from the networkx graph.
# The FBW table is re-read and the graph rebuilt here, so this cell does not
# depend on the G_FBW left behind by earlier cells.
data_FBW = pd.read_csv('./data/FBW.csv', header=None, sep=';')
G_FBW = graph_from_data(data_FBW)
def extract_weights(G):
    """Print and return the ``weight`` of every edge of ``G``.

    Parameters
    ----------
    G : graph
        Graph whose ``edges(data=True)`` yields ``(u, v, attrs)`` triples
        with a ``weight`` entry in ``attrs``.

    Returns
    -------
    list
        Edge weights in the graph's edge iteration order.
    """
    weights = []
    for u, v, data in G.edges(data=True):
        weights.append(data['weight'])
        print(f"Edge {u} -> {v} has weight {data['weight']}")
    # The list was previously built and then discarded.
    return weights

# Print every edge weight of the freshly rebuilt FBW graph
extract_weights(G_FBW)

Edge Wet clay -> Modelling has weight 0.9679144385026738
Edge Wet clay -> Coiling has weight 0.03208556149732621
Edge Modelling -> Pressure has weight 0.08839779005524862
Edge Modelling -> Coiling has weight 0.9116022099447514
Edge Pressure -> Wet smoothing has weight 0.48128342245989303
Edge Pressure -> Application separate element has weight 0.39037433155080214
Edge Pressure -> Scraping has weight 0.12299465240641712
Edge Pressure -> Leather-hard has weight 0.0053475935828877
Edge Wet smoothing -> Leather-hard has weight 0.3837837837837838
Edge Wet smoothing -> Excisions has weight 0.06486486486486487
Edge Wet smoothing -> Simple incision has weight 0.16756756756756758
Edge Wet smoothing -> Simple impression has weight 0.3621621621621622
Edge Wet smoothing -> Application has weight 0.010810810810810811
Edge Wet smoothing -> Tilted impression has weight 0.010810810810810811
Edge Leather-hard -> Dry has weight 0.10160427807486631
Edge Leather-hard -> Burnishing has weight 0.83957219251