Merge pull request #6794 from yujuhong/restart_counts
Kubelet: persist restart count of a container
vmarmol committed Apr 14, 2015
2 parents 2c3b9c8 + fd34441 · commit 54406a5
Showing 2 changed files with 168 additions and 44 deletions.
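Before this change, GetPodStatus derived a container's RestartCount only from the dead docker containers still present on the node, so the count dropped whenever those containers were garbage-collected. The diff below instead seeds the count (and the last observed termination time) from the previously reported pod status and only adds terminations it has not yet observed. A minimal standalone sketch of that idea, using simplified stand-in types (observedStatus, deadContainer, nextStatus) rather than the kubelet's actual API:

package main

import (
	"fmt"
	"time"
)

// observedStatus stands in for the ContainerStatus persisted in pod.Status.
type observedStatus struct {
	restartCount     int
	lastTerminatedAt time.Time // zero value: no termination observed yet
}

// deadContainer stands in for an exited docker container still on the host.
type deadContainer struct {
	finishedAt time.Time
}

// nextStatus recomputes the status for one container name from the previously
// reported status plus the exited containers (older than the current instance)
// found on the host. The old count is carried forward, and only terminations
// newer than the last observed one add to it, so the count survives after the
// exited containers are garbage-collected.
func nextStatus(prev observedStatus, olderDead []deadContainer) observedStatus {
	next := prev
	for _, c := range olderDead {
		if c.finishedAt.After(prev.lastTerminatedAt) {
			// A termination we have not accounted for yet.
			next.restartCount++
		}
		if c.finishedAt.After(next.lastTerminatedAt) {
			// Remember the newest termination seen so far.
			next.lastTerminatedAt = c.finishedAt
		}
	}
	return next
}

func main() {
	now := time.Now()
	firstDeath := deadContainer{finishedAt: now.Add(-60 * time.Second)}
	secondDeath := deadContainer{finishedAt: now.Add(-30 * time.Second)}

	// One prior run found on the host, nothing persisted yet.
	s := nextStatus(observedStatus{}, []deadContainer{firstDeath})
	fmt.Println(s.restartCount) // 1

	// A newer death appears; only the not-yet-counted one is added.
	s = nextStatus(s, []deadContainer{secondDeath, firstDeath})
	fmt.Println(s.restartCount) // 2

	// The exited containers have been garbage-collected; the count persists.
	s = nextStatus(s, nil)
	fmt.Println(s.restartCount) // 2
}

The actual change below additionally relies on docker returning containers sorted in reverse time order and uses a containerDone set to stop examining containers that have already been accounted for.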
125 changes: 81 additions & 44 deletions pkg/kubelet/dockertools/manager.go
@@ -231,8 +231,17 @@ func (self *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
uid := pod.UID
manifest := pod.Spec

oldStatuses := make(map[string]api.ContainerStatus, len(pod.Spec.Containers))
lastObservedTime := make(map[string]util.Time, len(pod.Spec.Containers))
for _, status := range pod.Status.ContainerStatuses {
oldStatuses[status.Name] = status
if status.LastTerminationState.Termination != nil {
lastObservedTime[status.Name] = status.LastTerminationState.Termination.FinishedAt
}
}

var podStatus api.PodStatus
statuses := make(map[string]api.ContainerStatus)
statuses := make(map[string]*api.ContainerStatus, len(pod.Spec.Containers))

expectedContainers := make(map[string]api.Container)
for _, container := range manifest.Containers {
@@ -245,6 +254,10 @@ func (self *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
return nil, err
}

containerDone := util.NewStringSet()
// Loop through list of running and exited docker containers to construct
// the statuses. We assume docker returns a list of containers sorted in
// reverse by time.
for _, value := range containers {
if len(value.Names) == 0 {
continue
@@ -261,30 +274,44 @@ func (self *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
}
dockerContainerName := dockerName.ContainerName
c, found := expectedContainers[dockerContainerName]
terminationMessagePath := ""
if !found {
// TODO(dchen1107): should figure out why not continue here
// continue
} else {
terminationMessagePath = c.TerminationMessagePath
continue
}
// We assume docker return us a list of containers in time order
if containerStatus, found := statuses[dockerContainerName]; found {
// Populate last termination state
if containerStatus.LastTerminationState.Termination == nil {
result := self.inspectContainer(value.ID, dockerContainerName, terminationMessagePath)
if result.err == nil && result.status.State.Termination != nil {
containerStatus.LastTerminationState = result.status.State
}
}
containerStatus.RestartCount += 1
statuses[dockerContainerName] = containerStatus
terminationMessagePath := c.TerminationMessagePath
if containerDone.Has(dockerContainerName) {
continue
}

var terminationState *api.ContainerState = nil
// Inspect the container.
result := self.inspectContainer(value.ID, dockerContainerName, terminationMessagePath)
if result.err != nil {
return nil, result.err
} else if result.status.State.Termination != nil {
terminationState = &result.status.State
}

if containerStatus, found := statuses[dockerContainerName]; found {
if containerStatus.LastTerminationState.Termination == nil && terminationState != nil {
// Populate the last termination state.
containerStatus.LastTerminationState = *terminationState
}
count := true
// Only count dead containers terminated after last time we observed,
if lastObservedTime, ok := lastObservedTime[dockerContainerName]; ok {
if terminationState != nil && terminationState.Termination.FinishedAt.After(lastObservedTime.Time) {
count = false
} else {
// The container finished before the last observation. No
// need to examine/count the older containers. Mark the
// container name as done.
containerDone.Insert(dockerContainerName)
}
}
if count {
containerStatus.RestartCount += 1
}
continue
}

if dockerContainerName == PodInfraContainerName {
@@ -294,44 +321,54 @@ func (self *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
}
} else {
// Add user container information.
statuses[dockerContainerName] = result.status
if oldStatus, found := oldStatuses[dockerContainerName]; found {
// Use the last observed restart count if it's available.
result.status.RestartCount = oldStatus.RestartCount
}
statuses[dockerContainerName] = &result.status
}
}

// Handle the containers for which we cannot find any associated active or
// dead docker containers.
for _, container := range manifest.Containers {
if _, found := statuses[container.Name]; found {
continue
}
var containerStatus api.ContainerStatus
if status, found := statuses[container.Name]; found {
containerStatus = status
} else {
// The container has not been created yet. Check image is ready on
// the node or not.
// TODO: If we integrate DockerPuller into DockerManager, we can
// record the pull failure and eliminate the image checking below.
image := container.Image
// TODO(dchen1107): docker/docker/issues/8365 to figure out if the image exists
_, err := self.client.InspectImage(image)
if err == nil {
containerStatus.State.Waiting = &api.ContainerStateWaiting{
Reason: fmt.Sprintf("Image: %s is ready, container is creating", image),
}
} else if err == docker.ErrNoSuchImage {
containerStatus.State.Waiting = &api.ContainerStateWaiting{
Reason: fmt.Sprintf("Image: %s is not ready on the node", image),
}
}
if oldStatus, found := oldStatuses[container.Name]; found {
// Some states may be lost due to GC; apply the last observed
// values if possible.
containerStatus.RestartCount = oldStatus.RestartCount
containerStatus.LastTerminationState = oldStatus.LastTerminationState
}
if containerStatus.State.Waiting != nil {
// For containers in the waiting state, fill in a specific reason if it is recorded.
if reason, ok := self.reasonCache.Get(uid, container.Name); ok {
containerStatus.State.Waiting.Reason = reason
//Check image is ready on the node or not.
// TODO: If we integrate DockerPuller into DockerManager, we can
// record the pull failure and eliminate the image checking below.
image := container.Image
// TODO(dchen1107): docker/docker/issues/8365 to figure out if the image exists
_, err := self.client.InspectImage(image)
if err == nil {
containerStatus.State.Waiting = &api.ContainerStateWaiting{
Reason: fmt.Sprintf("Image: %s is ready, container is creating", image),
}
} else if err == docker.ErrNoSuchImage {
containerStatus.State.Waiting = &api.ContainerStateWaiting{
Reason: fmt.Sprintf("Image: %s is not ready on the node", image),
}
}
statuses[container.Name] = containerStatus
statuses[container.Name] = &containerStatus
}

podStatus.ContainerStatuses = make([]api.ContainerStatus, 0)
for _, status := range statuses {
podStatus.ContainerStatuses = append(podStatus.ContainerStatuses, status)
for containerName, status := range statuses {
if status.State.Waiting != nil {
// For containers in the waiting state, fill in a specific reason if it is recorded.
if reason, ok := self.reasonCache.Get(uid, containerName); ok {
status.State.Waiting.Reason = reason
}
}
podStatus.ContainerStatuses = append(podStatus.ContainerStatuses, *status)
}

return &podStatus, nil
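The tail of this hunk handles containers that have no running or exited docker container at all: the Waiting reason is derived from whether the image is already present on the node, and a reason recorded in the reason cache (for example a pull failure) overrides the derived one. A small standalone sketch of that mapping, using a hypothetical waitingReason helper and a stand-in errNoSuchImage instead of the docker client API:

package main

import (
	"errors"
	"fmt"
)

// errNoSuchImage stands in for docker.ErrNoSuchImage in this sketch.
var errNoSuchImage = errors.New("no such image")

// waitingReason mirrors the branching above for containers that have not been
// created yet: a reason recorded earlier wins; otherwise the reason reflects
// whether the image is already available on the node.
func waitingReason(image string, inspectErr error, cachedReason string, hasCachedReason bool) string {
	if hasCachedReason {
		return cachedReason
	}
	switch {
	case inspectErr == nil:
		return fmt.Sprintf("Image: %s is ready, container is creating", image)
	case errors.Is(inspectErr, errNoSuchImage):
		return fmt.Sprintf("Image: %s is not ready on the node", image)
	default:
		return ""
	}
}

func main() {
	fmt.Println(waitingReason("nginx", nil, "", false))
	fmt.Println(waitingReason("nginx", errNoSuchImage, "", false))
	fmt.Println(waitingReason("nginx", errNoSuchImage, "image pull failed", true))
}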
87 changes: 87 additions & 0 deletions pkg/kubelet/kubelet_test.go
@@ -4117,3 +4117,90 @@ func TestGetPodCreationFailureReason(t *testing.T) {
}
}
}

func TestGetRestartCount(t *testing.T) {
testKubelet := newTestKubelet(t)
testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorApi.MachineInfo{}, nil)
kubelet := testKubelet.kubelet
fakeDocker := testKubelet.fakeDocker

containers := []api.Container{
{Name: "bar"},
}
pod := api.Pod{
ObjectMeta: api.ObjectMeta{
UID: "12345678",
Name: "foo",
Namespace: "new",
},
Spec: api.PodSpec{
Containers: containers,
},
}

// format is // k8s_<container-id>_<pod-fullname>_<pod-uid>
names := []string{"/k8s_bar." + strconv.FormatUint(dockertools.HashContainer(&containers[0]), 16) + "_foo_new_12345678_0"}
currTime := time.Now()
containerMap := map[string]*docker.Container{
"1234": {
ID: "1234",
Name: "bar",
Config: &docker.Config{},
State: docker.State{
ExitCode: 42,
StartedAt: currTime.Add(-60 * time.Second),
FinishedAt: currTime.Add(-60 * time.Second),
},
},
"5678": {
ID: "5678",
Name: "bar",
Config: &docker.Config{},
State: docker.State{
ExitCode: 42,
StartedAt: currTime.Add(-30 * time.Second),
FinishedAt: currTime.Add(-30 * time.Second),
},
},
"9101": {
ID: "9101",
Name: "bar",
Config: &docker.Config{},
State: docker.State{
ExitCode: 42,
StartedAt: currTime.Add(30 * time.Minute),
FinishedAt: currTime.Add(30 * time.Minute),
},
},
}
fakeDocker.ContainerMap = containerMap

// Helper function for verifying the restart count.
verifyRestartCount := func(pod *api.Pod, expectedCount int) api.PodStatus {
status, err := kubelet.generatePodStatus(pod)
if err != nil {
t.Errorf("unexpected error %v", err)
}
restartCount := status.ContainerStatuses[0].RestartCount
if restartCount != expectedCount {
t.Errorf("expected %d restart count, got %d", expectedCount, restartCount)
}
return status
}

// Container "bar" has failed twice; create two dead docker containers.
// TODO: container lists are expected to be sorted reversely by time.
// We should fix FakeDockerClient to sort the list before returning.
fakeDocker.ExitedContainerList = []docker.APIContainers{{Names: names, ID: "5678"}, {Names: names, ID: "1234"}}
pod.Status = verifyRestartCount(&pod, 1)

// Found a new dead container. The restart count should be incremented.
fakeDocker.ExitedContainerList = []docker.APIContainers{
{Names: names, ID: "9101"}, {Names: names, ID: "5678"}, {Names: names, ID: "1234"}}
pod.Status = verifyRestartCount(&pod, 2)

// All dead containers have been GC'd. The restart count should persist
// (i.e., remain the same).
fakeDocker.ExitedContainerList = []docker.APIContainers{}
verifyRestartCount(&pod, 2)
}
