use GiB as default unit instead of MiB #3

Merged
merged 13 commits on Mar 1, 2019
2 changes: 1 addition & 1 deletion cmd/inspect/display.go
@@ -144,7 +144,7 @@ func displaySummary(nodeInfos []*NodeInfo) {
     if hasPendingGPU {
         buffer.WriteString("PENDING(Allocated)\t")
     }
-    buffer.WriteString("GPU Memory(MiB)\n")
+    buffer.WriteString(fmt.Sprintf("GPU Memory(%s)\n", memoryUnit))
 
     // fmt.Fprintf(w, "NAME\tIPADDRESS\tROLE\tGPU(Allocated/Total)\tPENDING(Allocated)\n")
     fmt.Fprintf(w, buffer.String())
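With the header built from memoryUnit, the summary now prints GPU Memory(GiB) or GPU Memory(MiB), matching whichever unit is inferred in nodeinfo.go below.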
27 changes: 25 additions & 2 deletions cmd/inspect/nodeinfo.go
@@ -46,6 +46,7 @@ func buildAllNodeInfos(allPods []v1.Pod, nodes []v1.Node) ([]*NodeInfo, error) {
     nodeInfos := buildNodeInfoWithPods(allPods, nodes)
     for _, info := range nodeInfos {
         if info.gpuTotalMemory > 0 {
+            setUnit(info.gpuTotalMemory, info.gpuCount)
             err := info.buildDeviceInfo()
             if err != nil {
                 log.Warningf("Failed due to %v", err)
@@ -71,7 +72,7 @@ func (n *NodeInfo) acquirePluginPod() v1.Pod {
 }
 
 func getTotalGPUMemory(node v1.Node) int {
-    val, ok := node.Status.Capacity[resourceName]
+    val, ok := node.Status.Allocatable[resourceName]
 
     if !ok {
         return 0
@@ -81,7 +82,7 @@ func getTotalGPUMemory(node v1.Node) int {
 }
 
 func getGPUCountInNode(node v1.Node) int {
-    val, ok := node.Status.Capacity[countName]
+    val, ok := node.Status.Allocatable[countName]
 
     if !ok {
         return int(0)
@@ -221,3 +222,25 @@ func isGPUSharingNode(node v1.Node) bool {

     return ok
 }
+
+var (
+    memoryUnit = ""
+)
+
+func setUnit(gpuMemory, gpuCount int) {
+    if memoryUnit != "" {
+        return
+    }
+
+    if gpuCount == 0 {
+        return
+    }
+
+    gpuMemoryByDev := gpuMemory / gpuCount
+
+    if gpuMemoryByDev > 100 {
+        memoryUnit = "MiB"
+    } else {
+        memoryUnit = "GiB"
+    }
+}
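The inference above relies on real GPUs never offering more than 100 GiB of memory: a per-device total expressed in MiB is always in the thousands, while in GiB it stays at two digits. A standalone sketch of the same logic (the helper name inferUnit and the 16-GiB-card figures are illustrative, not part of this PR):

package main

import "fmt"

// inferUnit mirrors setUnit's heuristic: a per-GPU total above 100
// can only be a MiB count.
func inferUnit(gpuMemory, gpuCount int) string {
    if gpuCount == 0 {
        return ""
    }
    if gpuMemory/gpuCount > 100 {
        return "MiB"
    }
    return "GiB"
}

func main() {
    fmt.Println(inferUnit(16276, 1)) // node reporting MiB -> "MiB"
    fmt.Println(inferUnit(15, 1))    // node reporting GiB -> "GiB"
}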
17 changes: 15 additions & 2 deletions cmd/nvidia/main.go
@@ -10,15 +10,28 @@ import (
 var (
     mps         = flag.Bool("mps", false, "Enable or disable MPS")
     healthCheck = flag.Bool("health-check", false, "Enable or disable health check")
+    memoryUnit  = flag.String("memory-unit", "GiB", "Set the unit of GPU memory; supports 'GiB' and 'MiB'")
 )
 
 func main() {
     flag.Parse()
     log.V(1).Infoln("Start gpushare device plugin")
 
-    ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck)
+    ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, translateMemoryUnits(*memoryUnit))
     err := ngm.Run()
     if err != nil {
         log.Fatalf("Failed due to %v", err)
     }
 }
+
+func translateMemoryUnits(value string) nvidia.MemoryUnit {
+    memoryUnit := nvidia.MemoryUnit(value)
+    switch memoryUnit {
+    case nvidia.MiBPrefix:
+    case nvidia.GiBPrefix:
+    default:
+        log.Warningf("Unsupported memory unit: %s, falling back to GiB", value)
+        memoryUnit = nvidia.GiBPrefix
+    }
+
+    return memoryUnit
+}
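Within the same package the fallback behaves as follows (illustrative calls; note the match is case-sensitive, so a lowercase "gib" also triggers the fallback):

translateMemoryUnits("MiB") // nvidia.MiBPrefix
translateMemoryUnits("GiB") // nvidia.GiBPrefix
translateMemoryUnits("KiB") // logs a warning, returns nvidia.GiBPrefix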
4 changes: 2 additions & 2 deletions demo/binpack-1/binpack-1.yaml
@@ -39,5 +39,5 @@ spec:
         image: cheyang/gpu-player:v2
         resources:
           limits:
-            # MiB
-            aliyun.com/gpu-mem: 8076
+            # GiB
+            aliyun.com/gpu-mem: 2
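Note that this changes the quantity as well as the unit: 8076 MiB is roughly 7.9 GiB, so the demo now requests 2 GiB rather than an equivalent amount.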
4 changes: 2 additions & 2 deletions demo/binpack-1/job.yaml
@@ -11,7 +11,7 @@ spec:
         image: alpine:3.6
         resources:
           limits:
-            # MiB
-            aliyun.com/gpu-mem: 8076
+            # GiB
+            aliyun.com/gpu-mem: 2
         command: ["sleep","30s"]
       restartPolicy: Never
3 changes: 2 additions & 1 deletion device-plugin-ds.yaml
@@ -18,13 +18,14 @@ spec:
       nodeSelector:
         gpushare: "true"
       containers:
-      - image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-35eccab
+      - image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-aff8a23
         name: gpushare
         # Make this pod a Guaranteed pod, which will never be evicted because of the node's resource consumption.
         command:
           - gpushare-device-plugin-v2
           - -logtostderr
           - --v=5
+          - --memory-unit=GiB
         resources:
           limits:
             memory: "300Mi"
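Since GiB is also the flag's default in cmd/nvidia/main.go, the explicit --memory-unit=GiB argument is redundant, but it keeps the manifest self-documenting.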
6 changes: 6 additions & 0 deletions pkg/gpu/nvidia/const.go
@@ -4,6 +4,9 @@ import (
     pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
 )
 
+// MemoryUnit describes the unit of GPU memory; only GiB and MiB are supported for now.
+type MemoryUnit string
+
 const (
     resourceName  = "aliyun.com/gpu-mem"
     resourceCount = "aliyun.com/gpu-count"
@@ -26,4 +29,7 @@ const (
     EnvAssignedFlag       = "ALIYUN_COM_GPU_MEM_ASSIGNED"
     EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
     EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
+
+    GiBPrefix = MemoryUnit("GiB")
+    MiBPrefix = MemoryUnit("MiB")
 )
3 changes: 2 additions & 1 deletion pkg/gpu/nvidia/gpumanager.go
@@ -16,7 +16,8 @@ type sharedGPUManager struct {
     healthCheck bool
 }
 
-func NewSharedGPUManager(enableMPS, healthCheck bool) *sharedGPUManager {
+func NewSharedGPUManager(enableMPS, healthCheck bool, bp MemoryUnit) *sharedGPUManager {
+    metric = bp
     return &sharedGPUManager{
         enableMPS:   enableMPS,
         healthCheck: healthCheck,
16 changes: 14 additions & 2 deletions pkg/gpu/nvidia/nvidia.go
@@ -12,7 +12,10 @@ import (
     pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
 )
 
-var gpuMemory uint
+var (
+    gpuMemory uint
+    metric    MemoryUnit
+)
 
 func check(err error) {
     if err != nil {
@@ -28,6 +31,15 @@ func extractRealDeviceID(fakeDeviceID string) string {
     return strings.Split(fakeDeviceID, "-_-")[0]
 }
 
+func setGPUMemory(raw uint) {
+    v := raw
+    if metric == GiBPrefix {
+        v = raw / 1024
+    }
+    gpuMemory = v
+    log.Infof("set gpu memory: %d", gpuMemory)
+}
+
 func getGPUMemory() uint {
     return gpuMemory
 }
@@ -56,7 +68,7 @@ func getDevices() ([]*pluginapi.Device, map[string]uint) {
         // var KiB uint64 = 1024
         log.Infof("# device Memory: %d", uint(*d.Memory))
         if getGPUMemory() == uint(0) {
-            gpuMemory = uint(*d.Memory)
+            setGPUMemory(uint(*d.Memory))
         }
         for j := uint(0); j < getGPUMemory(); j++ {
             fakeID := generateFakeDeviceID(d.UUID, j)
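One subtlety in setGPUMemory: the GiB conversion uses integer division, so any remainder below 1 GiB is dropped and never becomes schedulable. A minimal worked example, assuming the NVML binding reports device memory in MiB as the surrounding log lines suggest (the 16276 figure is illustrative):

package main

import "fmt"

func main() {
    var raw uint = 16276    // e.g. a 16 GiB card reported in MiB
    fmt.Println(raw / 1024) // 15: truncation discards the remaining 916 MiB
}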