<a href="https://colab.research.google.com/github/AeroEng16/GNN_learning/blob/main/AhmedData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import os

from google.colab import files
try:
  import pyvista as pv
except:
  !pip install pyvista
  import pyvista as pv
from scipy.spatial import cKDTree

try:
  from stl import mesh  # numpy-stl
except:
  !pip install numpy-stl
  from stl import mesh  # numpy-stl
import numpy as np
import xml.etree.ElementTree as ET
#try:
#  import earcut
#except:
!pip install earcut-py
from earcut import earcut
import itertools as it

import more_itertools as mit

import torch

try:
  from torch_geometric.data import Data
  from torch_geometric.data import Data, DataLoader
  from torch_geometric.utils import subgraph

except:
  !pip install torch_geometric
  from torch_geometric.data import Data
  from torch_geometric.data import Data, DataLoader
  from torch_geometric.utils import subgraph

try:
  import vtk
except:
  !pip install vtk
  import vtk
from vtk.util.numpy_support import vtk_to_numpy
from os import read
from collections import defaultdict
from scipy.interpolate import griddata

import glob



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# To Do


* What to download from there and how many examples do we need?
*   https://huggingface.co/datasets/neashton/ahmedml#cfd-solver




## Loading Ahmed Body Research Data

In [None]:
%%shell
# Set the paths
HF_OWNER="neashton"
HF_PREFIX="ahmedml"

# Set the local directory to download the files
LOCAL_DIR="./ahmed_data"

# Create the local directory if it doesn't exist
mkdir -p "$LOCAL_DIR"

# Loop through the run folders from 1 to 500
for i in $(seq 1 200); do
    RUN_DIR="run_$i"
    RUN_LOCAL_DIR="$LOCAL_DIR/$RUN_DIR"

    # Create the run directory if it doesn't exist
    mkdir -p "$RUN_LOCAL_DIR"

    # Download the ahmed_i.stl file
    wget "https://huggingface.co/datasets/${HF_OWNER}/${HF_PREFIX}/resolve/main/$RUN_DIR/ahmed_$i.stl" -O "$RUN_LOCAL_DIR/ahmed_$i.stl"

    # Download the force_mom_i.csv file
    wget "https://huggingface.co/datasets/${HF_OWNER}/${HF_PREFIX}/resolve/main/$RUN_DIR/force_mom_$i.csv" -O "$RUN_LOCAL_DIR/force_mom_$i.csv"

    # Download the force_mom_i.csv file
    wget "https://huggingface.co/datasets/${HF_OWNER}/${HF_PREFIX}/resolve/main/$RUN_DIR/boundary_$i.vtp" -O "$RUN_LOCAL_DIR/boundary_$i.vtp"

done

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2025-10-26 12:47:32 (153 KB/s) - ‘./ahmed_data/run_94/force_mom_94.csv’ saved [42/42]

--2025-10-26 12:47:32--  https://huggingface.co/datasets/neashton/ahmedml/resolve/main/run_94/boundary_94.vtp
Resolving huggingface.co (huggingface.co)... 13.35.202.97, 13.35.202.34, 13.35.202.121, ...
Connecting to huggingface.co (huggingface.co)|13.35.202.97|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/67a9e0bcca8e1649fc10745c/f5fa26ad74338efcd6dc94105ebe074fe51173f6d3a7622df297a5c379adc170?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251026%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251026T124732Z&X-Amz-Expires=3600&X-Amz-Signature=753bab252625188ff7db313eb503871b88d081c9bffc40c9ea5b19fe21858d29&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27b



## Functions to help with the Data processing

In [3]:
def readVTK(filename):
  '''
  Function that takes in a vtp file from openfoam and
  reads it into various arrays for point coordinates, cell connectivity,
  cell nodes and cell breakpoints
  '''
  reader = vtk.vtkXMLPolyDataReader()
  reader.SetFileName(filename)
  reader.Update()
  polyDataOutput = reader.GetOutput()


  polydata = reader.GetOutput()
  points = polydata.GetPoints()
  array = points.GetData()
  point_coordinates = vtk_to_numpy(array)
  cellData = polydata.GetCellData()
  numArrays = cellData.GetNumberOfArrays()
  #for i in range(numArrays):
  #  print(cellData.GetArrayName(i))
  #
  pressureData = vtk_to_numpy(cellData.GetArray('pMean'))
  meshData = polydata.GetPolys()
  # connectivity Array is a single 1D array that lists all node indices in a given cell
  connectivityArray = vtk_to_numpy(meshData.GetConnectivityArray())
  # Offsets are the index where each cell connectivity array starts - https://vtk.org/doc/nightly/html/classvtkCellArray.html#details
  offsetArray = vtk_to_numpy(meshData.GetOffsetsArray())
  return pressureData, point_coordinates, connectivityArray, offsetArray

In [4]:
def calculateEdges(offsetArray, connectivityArray):
  ''' For a vtp file, create a list of the nodes for each cell and
   then convert this to a list of edges in both directions (i.e. two entries
   per mesh edge)
  '''
  cells = []
  for index,i in enumerate(offsetArray[:-1]):
    cellVerts = connectivityArray[i:offsetArray[index+1]]
    cells.append(cellVerts.tolist())
  edges = []
  for cell in cells:
    n = len(cell)
    currentEdges = list(it.islice(mit.windowed(it.cycle(cell), 2), n-1, 2*n-1))
    currentEdges = [list(edge) for edge in currentEdges]
    currentEdges.extend([x[::-1] for x in currentEdges])
    edges.extend(currentEdges)
  return edges

def calcEdgeVectors(edges,coords):
  '''
  From a list of edges and a list of point coordinates, calculate
  the edge vector.
  '''
  edgeDf=  pd.DataFrame(edges,columns=['node1','node2'])
  vectors = coords[edgeDf.node2]-coords[edgeDf.node1]
  vectors = vectors.tolist()
  return vectors

In [5]:
def createNodeDicts(connectivityArray,offsetArray,pressureData):

  # Function below is an efficient means of identifying all the instances of each node number, can then link those indices to a cell number via the offset array (somehow)
  def list_duplicates(seq):
      tally = defaultdict(list)
      for i,item in enumerate(seq):
          tally[item].append(i)
      return ((key,locs) for key,locs in tally.items()
                              if len(locs)>=1)
  # Function below finds the indices of list that match the values in the 2nd list
  def findMatching(lst, cellNums):
  #    return [i for i, x in enumerate(lst) if x in cellNums]
      return [[i for i, x in enumerate(lst) if x == k] for k in cellNums]

  # Node list is a list of tuples where the first entry is the node number and the second is a list of its locations in the connectivityArray
  nodeList = sorted(list_duplicates(connectivityArray))

  # Node cells is a list where the number indicates the cell that each index belongs to
  nodeCells = [np.nan]*len(connectivityArray-1)

  for counter,i in enumerate(range(len(offsetArray)-1)):
    nodeCells[int(offsetArray[counter]):int(offsetArray[counter+1])] = [counter]*len(nodeCells[int(offsetArray[counter]):int(offsetArray[counter+1])])

  # Node list is a dictionary where the keys are nodes and the value is the average
  # pressure calculated from the cells that node is part of.
  nodePressuresDict = {}
  nodeCellsDict = {}
  for i in nodeList:
    nodeCellsDict[int(i[0])] = [nodeCells[j] for j in i[1]]
    nodePressuresDict[int(i[0])] = np.mean([pressureData[k] for k in [nodeCells[j] for j in i[1]]])
  return nodeCellsDict, nodePressuresDict

## Convert Data to Pygeometric Data Obkect
Requires the following
* data.x -  node feature matrix list of lists (tensor) where each list is a set of node features, in this case where just looking at surface data then no node features are required.
* data.edge_index - Edge connectivity matrix i.e. list of lists where each list is two nodes that are connected by an edge (given this is undirected there needs to be an edge defined in each direction)
* data.edge_attr - edge feature matrix, list of lists where each sublist is an edge feature matrix, in this case vectors for each edge
* data.y - node level data to train against, in this case it is the surface pressure that should be predicted at each node

To do the above we need to convert the cell centered values to node values for pressures, to do this we need to create a data stucture (using Pandas here) that has an entry for each node where there is a list of connected nodes (or edges), the cells it is part of and their associated pressures.


In [6]:
vtpFileList =  glob.glob('/content/ahmed_data/*/boundary_*.vtp')
try:
  os.mkdir('DataPoints')
except OSError as e:
    print("Path already exists, no new directory made")
for counter,filename in enumerate(vtpFileList):

  # Read the VTP file into memory
  pressureData, point_coordinates, connectivityArray, offsetArray = readVTK(filename)

  # Calculate edge array from offset array and connectivity array
  edges = calculateEdges(offsetArray,connectivityArray)

  # Calculate edge vectors from edges and point coordinates
  vectors = calcEdgeVectors(edges,point_coordinates)

  # Calculate dicts containing links between node/cell numbers and nodes/pressures
  nodeCellsDict, nodePressuresDict = createNodeDicts(connectivityArray,offsetArray,pressureData)

  # Create a pandas array where first column is the node number and the second is pressures
  df = pd.DataFrame(nodePressuresDict.items(),columns=['node','nodePressure'])
  df = df.assign(cells=pd.Series(nodeCellsDict.values()).values)

  # Create variables for each of the components that will form the data point

  edgeIndex = torch.tensor(edges,dtype=torch.long)
  edgeFeatures = torch.tensor(vectors,dtype=torch.float)
  targets = torch.tensor(df.nodePressure.values,dtype=torch.float)

  dataPoint = Data(edge_index = torch.transpose(edgeIndex,0,1),
                  edge_attr = edgeFeatures,
                  y= targets,
                  num_nodes = len(targets),
                  coordinates = point_coordinates)
  torch.save(dataPoint,'DataPoints/ahmedBodyData'+str(counter)+".pt")
  print(counter)

In [None]:
!zip -r /content/DataPoints.zip /content/DataPoints
files.download("/content/DataPoints.zip")

In [None]:
reloadData = torch.load('ahmedBodyData0.pt',weights_only = False)

1215780

## Pre Processing

In [7]:
reloadData = torch.load('/content/drive/MyDrive/Gnns/AhmedGNNDatapoints/ahmedBodyData0.pt',weights_only = False)

In [14]:
data = reloadData
num_nodes = reloadData.num_nodes
# -----------------------------
# 2. Split into subgraphs using random node subsets
# -----------------------------
num_subgraphs = 50
nodes_per_subgraph = num_nodes // num_subgraphs
subgraphs = []

all_nodes = torch.randperm(num_nodes)

for i in range(num_subgraphs):
    start = i * nodes_per_subgraph
    end = min((i + 1) * nodes_per_subgraph, num_nodes)
    node_subset = all_nodes[start:end]

    # Extract subgraph
    edge_idx, edge_attr = subgraph(node_subset, data.edge_index, relabel_nodes=True)
    sub_x = data.x[node_subset]
    sub_y = data.y[node_subset]

    sub_data(edge_index = torch.transpose(edgeIndex,0,1),
                  edge_attr = edgeFeatures,
                  y= targets,
                  num_nodes = len(targets),
                  coordinates = point_coordinates)
    #sub_data = Data(x=sub_x, edge_index=edge_idx, y=sub_y)
    subgraphs.append(sub_data)

# -----------------------------
# 3. Save subgraphs to disk
# -----------------------------
os.makedirs("subgraphs", exist_ok=True)
for idx, sg in enumerate(subgraphs):
    torch.save(sg, f"subgraphs/subgraph_{idx}.pt")

# -----------------------------
# 4. Load subgraphs back and create DataLoader
# -----------------------------
loaded_subgraphs = []
for file in os.listdir("subgraphs"):
    if file.endswith(".pt"):
        loaded_subgraphs.append(torch.load(os.path.join("subgraphs", file)))

train_loader = DataLoader(loaded_subgraphs, batch_size=8, shuffle=True)

# -----------------------------
# 5. Print summary
# -----------------------------
print(f"Original graph: {data}")
print(f"Number of subgraphs created: {len(subgraphs)}")
print(f"Example subgraph: {subgraphs[0]}")
print(f"DataLoader batches: {len(train_loader)}")


TypeError: 'NoneType' object is not subscriptable

### Plotting Single Case as a Surface
* First triangulate the points using delauney
* Then create a LUT using cdktree
  * This enables fast look up of the triangulated points to the nearest true CFD point.
  * The index of the nearest point is then used to find the pressure at that point and is what is then plotted.

In [8]:

# Your CFD data
x = np.array(reloadData.coordinates[0:-1:100,0])  # your x coordinates
y = np.array(reloadData.coordinates[0:-1:100,1])  # your y coordinates
z = np.array(reloadData.coordinates[0:-1:100,2])  # your z coordinates
pressure = np.array(reloadData.y[0:-1:100])  # your pressure values

# Create point cloud
points = np.column_stack((x, y, z))
# This creates a pv point cloud object
cloud = pv.PolyData(points)

# Surface reconstruction using Delaunay 3D (this create pyramids in 3D space filling the volume)
# The "extract_geometry()" extracts just the outer surface by finding the faces that aren't shared by tetrahedra
try:
    # Try with alpha parameter (helps with concave shapes), alpha = 2
    surf = cloud.delaunay_3d(alpha=2.0).extract_geometry()
except:
    # Fallback: no alpha (convex hull), this only works for convex shapes
    surf = cloud.delaunay_3d().extract_geometry()

# Extract vertices and faces for Plotly
vertices = surf.points
faces = surf.faces.reshape(-1, 4)[:, 1:]

# Map pressure to surface vertices using nearest neighbor
tree = cKDTree(points)
distances, indices = tree.query(vertices)
pressure_surf = pressure[indices]

# Create Plotly figure
fig = go.Figure(data=[go.Mesh3d(
    x=vertices[:, 0],
    y=vertices[:, 1],
    z=vertices[:, 2],
    i=faces[:, 0],
    j=faces[:, 1],
    k=faces[:, 2],
    intensity=pressure_surf,
    colorscale='RdBu_r',
    colorbar=dict(title='Pressure'),
    opacity=1.0,
    flatshading=False
)])

fig.update_layout(
    scene=dict(
        aspectmode='data',
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Z'
    ),
    title='Ahmed Body Surface Pressure'
)

fig.show()