From 7715461e3b3445564a4745bade895f4766c44cac Mon Sep 17 00:00:00 2001 From: "linhaiwei.lhw" Date: Thu, 10 Jul 2025 15:17:48 +0800 Subject: [PATCH 1/2] Support localERIDiscovery mode to not use access key and enable to expose specific eri to pod Signed-off-by: linhaiwei.lhw --- cmd/agent/main.go | 14 ++++- deploy/helm/templates/configmap.yaml | 1 + deploy/helm/templates/daemonset.yaml | 8 ++- deploy/helm/templates/deployment.yaml | 3 ++ deploy/helm/values.yaml | 5 ++ internal/agent/agent.go | 47 +++++++++++++++-- internal/drivers/utils_linux.go | 76 ++++++++++++++++++++++++++- internal/types/config.go | 1 + 8 files changed, 146 insertions(+), 9 deletions(-) diff --git a/cmd/agent/main.go b/cmd/agent/main.go index c96cf9a..0361041 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -19,15 +19,27 @@ func main() { preferDriver string allocAllDevices bool devicepluginPreStart bool + localERIDiscovery bool + exposedLocalERIs string ) flag.StringVar(&preferDriver, "prefer-driver", "", "prefer driver") flag.BoolVar(&allocAllDevices, "allocate-all-devices", false, "allocate all erdma devices for resource request, true => alloc all, false => alloc devices based on numa") flag.BoolVar(&devicepluginPreStart, "deviceplugin-prestart-container", false, "use device plugin prestart container to config smc-r, enable it if not use webhook to inject initContainers") + flag.BoolVar(&localERIDiscovery, "local-eri-discovery", false, + "Only manager on-node eri resources without using OpenAPI and access key") + flag.StringVar(&exposedLocalERIs, "exposed-local-eris", "", + "allocate specific ERI from existing ERI to pods for each instance") flag.Parse() - eriAgent, err := agent.NewAgent(preferDriver, allocAllDevices, devicepluginPreStart) + eriAgent, err := agent.NewAgent( + preferDriver, + allocAllDevices, + devicepluginPreStart, + localERIDiscovery, + exposedLocalERIs, + ) if err != nil { panic(err) } diff --git a/deploy/helm/templates/configmap.yaml 
b/deploy/helm/templates/configmap.yaml index 49948d4..e47fafb 100644 --- a/deploy/helm/templates/configmap.yaml +++ b/deploy/helm/templates/configmap.yaml @@ -17,5 +17,6 @@ data: "enableWebhook": {{ .Values.config.enableWebhook }}, "smcInitImage": "{{ .Values.config.smcInitImage }}", "enableInitContainerInject": {{ .Values.config.enableInitContainerInject }}, + "localERIDiscovery": {{ .Values.config.localERIDiscovery }}, "nodeSelector": {{ .Values.nodeSelector | toJson }} } diff --git a/deploy/helm/templates/daemonset.yaml b/deploy/helm/templates/daemonset.yaml index c53ef70..2b5aa83 100644 --- a/deploy/helm/templates/daemonset.yaml +++ b/deploy/helm/templates/daemonset.yaml @@ -21,7 +21,7 @@ spec: spec: hostPID: true hostNetwork: true - {{- with .Values.imagePullSecrets }} + {{- with .Values.agent.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} {{- end }} @@ -44,6 +44,12 @@ spec: {{ if .Values.agent.allocateAllDevices }} - --allocate-all-devices {{ end }} + {{ if .Values.config.localERIDiscovery }} + - --local-eri-discovery + {{ end }} + {{ if .Values.agent.exposedLocalERIs }} + - --exposed-local-eris={{ join "," .Values.agent.exposedLocalERIs }} + {{ end }} {{ if not .Values.config.enableWebhook }} - --deviceplugin-prestart-container {{ end }} diff --git a/deploy/helm/templates/deployment.yaml b/deploy/helm/templates/deployment.yaml index 944dda9..63e0d86 100644 --- a/deploy/helm/templates/deployment.yaml +++ b/deploy/helm/templates/deployment.yaml @@ -1,3 +1,5 @@ +{{- if not .Values.config.localERIDiscovery }} +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -64,3 +66,4 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} +{{- end }} diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index f976dd4..d010c32 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -2,6 +2,7 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. 
+# controller will not be deployed if localERIDiscovery is set controller: replicaCount: 2 image: @@ -24,6 +25,9 @@ agent: tag: "latest" preferDriver: "" allocateAllDevices: false + # format: //... + exposedLocalERIs: + - i-xxx erdma_0/erdma_1 imagePullSecrets: [] nameOverride: "" fullnameOverride: "" @@ -57,6 +61,7 @@ config: enableWebhook: false enableInitContainerInject: true smcInitImage: "" + localERIDiscovery: false credentials: type: "" diff --git a/internal/agent/agent.go b/internal/agent/agent.go index 6548e17..a6d1da5 100644 --- a/internal/agent/agent.go +++ b/internal/agent/agent.go @@ -5,13 +5,17 @@ import ( "os" "os/signal" "runtime" + "strings" "syscall" "github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/deviceplugin" "github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/drivers" "github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/k8s" "github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/types" + "github.com/samber/lo" ctrl "sigs.k8s.io/controller-runtime" + + networkv1 "github.com/AliyunContainerService/alibabacloud-erdma-controller/api/v1" ) var ( @@ -23,6 +27,8 @@ type Agent struct { driver drivers.ERdmaDriver allocAllDevices bool devicepluginPreStart bool + localERIDiscovery bool + exposedLocalERIs []string } func stackTriger() { @@ -48,25 +54,56 @@ func stackTriger() { signal.Notify(sigchain, syscall.SIGUSR1) } -func NewAgent(preferDriver string, allocAllDevice bool, devicepluginPreStart bool) (*Agent, error) { +func NewAgent(preferDriver string, allocAllDevice bool, devicepluginPreStart bool, localERIDiscovery bool, exposedLocalERIs string) (*Agent, error) { kubernetes, err := k8s.NewKubernetes() if err != nil { return nil, err } + agentLog.Info("NewAgent: ", "localERIDiscovery", localERIDiscovery) return &Agent{ kubernetes: kubernetes, driver: drivers.GetDriver(preferDriver), allocAllDevices: allocAllDevice, devicepluginPreStart: devicepluginPreStart, + 
localERIDiscovery: localERIDiscovery, + exposedLocalERIs: strings.Split(exposedLocalERIs, ","), }, nil } func (a *Agent) Run() error { go stackTriger() - // 1. wait related eri device - eriInfos, err := a.kubernetes.WaitEriInfo() - if err != nil { - return err + var err error + var eriInfos *networkv1.ERdmaDevice + var eri []*types.ERI + if !a.localERIDiscovery { + // 1. wait related eri device + eriInfos, err = a.kubernetes.WaitEriInfo() + if err != nil { + return err + } + } else { + if !(len(a.exposedLocalERIs) == 1 && a.exposedLocalERIs[0] == "") { + a.allocAllDevices = true + agentLog.Info("LocalERIDiscovery: enable expose ERIs, set allocAllDevices to true") + } + eri, err = drivers.SelectERIs(a.exposedLocalERIs) + if err != nil { + return fmt.Errorf("LocalERIDiscovery: select eri failed: %v", err) + } + eriInfos = &networkv1.ERdmaDevice{ + Spec: networkv1.ERdmaDeviceSpec{ + Devices: lo.Map(eri, func(item *types.ERI, index int) networkv1.DeviceInfo { + return networkv1.DeviceInfo{ + InstanceID: item.InstanceID, + MAC: item.MAC, + IsPrimaryENI: item.IsPrimaryENI, + ID: item.ID, + NetworkCardIndex: item.CardIndex, + QueuePair: item.QueuePair, + } + }), + }, + } } agentLog.Info("eri info", "eriInfo", eriInfos, "driver", a.driver.Name()) // 2. 
install eri driver diff --git a/internal/drivers/utils_linux.go b/internal/drivers/utils_linux.go index fab5c0b..6e7261d 100644 --- a/internal/drivers/utils_linux.go +++ b/internal/drivers/utils_linux.go @@ -11,14 +11,38 @@ import ( "os/exec" "path" "path/filepath" + "regexp" "strconv" "strings" "github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/types" + "github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/utils" "github.com/samber/lo" "github.com/vishvananda/netlink" ) +func checkExpose(instanceID string, exposedLocalERIs []string, rdmaDevice string) (bool, error) { + if len(exposedLocalERIs) == 1 && exposedLocalERIs[0] == "" { + return true, nil + } + pattern := `^i-\w+\s+(\w+(?:/\w+)*)$` + re := regexp.MustCompile(pattern) + for _, exposeInfo := range exposedLocalERIs { + if !re.MatchString(exposeInfo) { + return false, fmt.Errorf("invalid format %s. Expected format: \"instanceID: interface1 interface2 ...\"", exposeInfo) + } + id := strings.SplitN(exposeInfo, " ", 2)[0] + if instanceID == id { + exposeERIs := strings.Split(strings.TrimSpace(strings.SplitN(exposeInfo, " ", 2)[1]), "/") + for _, dev := range exposeERIs { + if dev == rdmaDevice { + return true, nil + } + } + } + } + return false, nil +} func driverExists() bool { if isContainerOS() { _, err := containerExec("modinfo erdma") @@ -104,14 +128,16 @@ func GetERdmaFromLink(link netlink.Link) (*netlink.RdmaLink, error) { } linkHwAddr := link.Attrs().HardwareAddr // erdma guid first byte is ^= 0x2 - linkHwAddr[0] ^= 0x2 + new_linkHwAddr := make(net.HardwareAddr, len(linkHwAddr)) + copy(new_linkHwAddr, linkHwAddr) + new_linkHwAddr[0] ^= 0x2 for _, rl := range rdmaLinks { rdmaHwAddr, err := parseERdmaLinkHwAddr(rl.Attrs.NodeGuid) if err != nil { return nil, err } driverLog.Info("check rdma link", "rdmaLink", rl.Attrs.Name, "rdmaHwAddr", rdmaHwAddr.String(), "linkHwAddr", linkHwAddr.String()) - if rdmaHwAddr.String() == linkHwAddr.String() { + if 
rdmaHwAddr.String() == new_linkHwAddr.String() { return rl, nil } } @@ -203,3 +229,49 @@ func GetERDMANumaNode(info *netlink.RdmaLink) (int64, error) { } return int64(numa), nil } + +const ( + instanceIDAddr = "http://100.100.100.200/latest/meta-data/instance-id" +) + +func SelectERIs(exposedLocalERIs []string) ([]*types.ERI, error) { + var selectEriList []*types.ERI + var isExposed bool + instanceID, _ := utils.GetStrFromMetadata(instanceIDAddr) + links, err := netlink.LinkList() + if err != nil { + return nil, fmt.Errorf("list link failed: %v", err) + } + + for _, link := range links { + if _, ok := link.(*netlink.Device); !ok { + continue + } + if link.Attrs().HardwareAddr != nil { + rdmaLink, _ := GetERdmaFromLink(link) + if rdmaLink != nil { + rdmadevice := rdmaLink.Attrs.Name + isExposed, err = checkExpose(instanceID, exposedLocalERIs, rdmadevice) + if isExposed { + driverLog.Info("LocalERIDiscovery: expose eri", "rdmadevice", rdmadevice, "link name", link.Attrs().Name) + eri := &types.ERI{ + ID: rdmadevice, + IsPrimaryENI: link.Attrs().Name == "eth0", + MAC: link.Attrs().HardwareAddr.String(), + InstanceID: instanceID, + CardIndex: -1, + QueuePair: -1, + } + selectEriList = append(selectEriList, eri) + driverLog.Info("Simple mode SelectERIs: eri", "eri", eri) + } else if err != nil { + return nil, err + } + } else { + driverLog.Info("LocalERIDiscovery: link is not rdma device, skip", "link_name", link.Attrs().Name) + } + } + } + + return selectEriList, nil +} diff --git a/internal/types/config.go b/internal/types/config.go index 2f49b00..1695162 100644 --- a/internal/types/config.go +++ b/internal/types/config.go @@ -12,6 +12,7 @@ type Config struct { SMCInitImage string `json:"smcInitImage"` EnableInitContainerInject *bool `json:"enableInitContainerInject"` NodeSelector map[string]string `json:"nodeSelector"` + LocalERIDiscovery bool `json:"localERIDiscovery"` } type Sensitive string From a890d6f3a2ebff6c889b48054f4427d20d60240e Mon Sep 17 00:00:00 2001 
From: "linhaiwei.lhw" Date: Fri, 11 Jul 2025 17:47:47 +0800 Subject: [PATCH 2/2] support exposing eris for unmatched instance Signed-off-by: linhaiwei.lhw --- deploy/helm/values.yaml | 7 +++++-- internal/drivers/utils_linux.go | 18 ++++++++++++++++-- internal/types/config.go | 1 - 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index d010c32..2b53270 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -25,9 +25,12 @@ agent: tag: "latest" preferDriver: "" allocateAllDevices: false - # format: //... + # format: + # expose specific eris for matched node: - <instance-id> <eri>/<eri>/... + # expose specific eris for unmatched node: - i-* <eri>/<eri>/... + # expose all eris for unmatched node: - i-* erdma_* exposedLocalERIs: - - i-xxx erdma_0/erdma_1 + - i-XXX erdma_0/erdma_1 imagePullSecrets: [] nameOverride: "" fullnameOverride: "" diff --git a/internal/drivers/utils_linux.go b/internal/drivers/utils_linux.go index 6e7261d..325152d 100644 --- a/internal/drivers/utils_linux.go +++ b/internal/drivers/utils_linux.go @@ -22,17 +22,20 @@ import ( ) func checkExpose(instanceID string, exposedLocalERIs []string, rdmaDevice string) (bool, error) { + var unMatchExposeERIs []string + isMatched := false if len(exposedLocalERIs) == 1 && exposedLocalERIs[0] == "" { return true, nil } - pattern := `^i-\w+\s+(\w+(?:/\w+)*)$` + pattern := `^(i-(?:\w+|\*))\s+((?:(?:\w+)(?:\/\w+)*))$` re := regexp.MustCompile(pattern) for _, exposeInfo := range exposedLocalERIs { if !re.MatchString(exposeInfo) { - return false, fmt.Errorf("invalid format %s. 
Expected format: \"instanceID: interface1 interface2 ...\"", exposeInfo) + return false, fmt.Errorf("invalid format %s", exposeInfo) } id := strings.SplitN(exposeInfo, " ", 2)[0] if instanceID == id { + isMatched = true exposeERIs := strings.Split(strings.TrimSpace(strings.SplitN(exposeInfo, " ", 2)[1]), "/") for _, dev := range exposeERIs { if dev == rdmaDevice { @@ -40,6 +43,17 @@ func checkExpose(instanceID string, exposedLocalERIs []string, rdmaDevice string } } } + if id == "i-*" { + unMatchExposeERIs = strings.Split(strings.TrimSpace(strings.SplitN(exposeInfo, " ", 2)[1]), "/") + } + } + if !isMatched { + driverLog.Info("no matched instanceID found, using unMatchExposeERIs", "instanceID", instanceID) + for _, dev := range unMatchExposeERIs { + if dev == "erdma_*" || dev == rdmaDevice { + return true, nil + } + } } return false, nil } diff --git a/internal/types/config.go b/internal/types/config.go index 1695162..2f49b00 100644 --- a/internal/types/config.go +++ b/internal/types/config.go @@ -12,7 +12,6 @@ type Config struct { SMCInitImage string `json:"smcInitImage"` EnableInitContainerInject *bool `json:"enableInitContainerInject"` NodeSelector map[string]string `json:"nodeSelector"` - LocalERIDiscovery bool `json:"localERIDiscovery"` } type Sensitive string