Update expiration timeout based on observed latencies #7628

Merged: 1 commit (May 1, 2015)
pkg/client/cache/expiration_cache.go (2 additions, 0 deletions)

@@ -18,6 +18,7 @@ package cache
 import (
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/golang/glog"
 	"time"
 )

@@ -81,6 +82,7 @@ func (c *ExpirationCache) getOrExpire(key string) (interface{}, bool) {
 		return nil, false
 	}
 	if c.expirationPolicy.IsExpired(timestampedItem) {
+		glog.V(4).Infof("Entry %v: %+v has expired", key, timestampedItem.obj)
 		// Since expiration happens lazily on read, don't hold up
 		// the reader trying to acquire a write lock for the delete.
 		// The next reader will retry the delete even if this one
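The addition above logs entries as they lazily expire inside getOrExpire. For context, here is a minimal sketch of the read-time expiration pattern that comment describes; the names (ttlCache, timestampedEntry) and the locking-free map are illustrative assumptions, not the cache's real types:

```go
package main

import (
	"fmt"
	"time"
)

// Illustrative stand-ins; the real cache's types and locking are richer.
type timestampedEntry struct {
	obj       interface{}
	timestamp time.Time
}

type ttlCache struct {
	items map[string]timestampedEntry
	ttl   time.Duration
}

// getOrExpire mirrors the lazy pattern: expiry is checked on read, and the
// reader reports a miss without taking a write lock; a later writer deletes.
func (c *ttlCache) getOrExpire(key string) (interface{}, bool) {
	item, ok := c.items[key]
	if !ok {
		return nil, false
	}
	if time.Since(item.timestamp) > c.ttl {
		// Expired: treat as a miss, leave the actual delete to a writer.
		return nil, false
	}
	return item.obj, true
}

func main() {
	c := &ttlCache{
		items: map[string]timestampedEntry{
			"default/frontend": {obj: "expectations", timestamp: time.Now().Add(-10 * time.Minute)},
		},
		ttl: 6 * time.Minute,
	}
	_, ok := c.getOrExpire("default/frontend")
	fmt.Println(ok) // false: the entry outlived its ttl
}
```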
pkg/controller/controller_utils.go (7 additions, 2 deletions)

@@ -19,6 +19,8 @@ package controller
 import (
 	"encoding/json"
 	"fmt"
+	"time"
+
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/validation"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"

@@ -28,7 +30,6 @@ import (
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
 	"github.com/golang/glog"
 	"sync/atomic"
-	"time"
 )

 const CreatedByAnnotation = "kubernetes.io/created-by"

@@ -106,7 +107,9 @@ func (r *RCExpectations) setExpectations(rc *api.ReplicationController, add, del int) error {
 	if err != nil {
 		return err
 	}
-	return r.Add(&PodExpectations{add: int64(add), del: int64(del), key: rcKey})
+	podExp := &PodExpectations{add: int64(add), del: int64(del), key: rcKey}
+	glog.V(4).Infof("Setting expectations %+v", podExp)
+	return r.Add(podExp)
 }

 func (r *RCExpectations) ExpectCreations(rc *api.ReplicationController, adds int) error {

@@ -124,6 +127,8 @@ func (r *RCExpectations) lowerExpectations(rc *api.ReplicationController, add, del int) {
 		glog.V(2).Infof("Controller has both add and del expectations %+v", podExp)
 	}
 	podExp.Seen(int64(add), int64(del))
+	// The expectations might've been modified since the update on the previous line.
+	glog.V(4).Infof("Lowering expectations %+v", podExp)
 }
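The new logging traces PodExpectations through their lifecycle: set when the rc issues creates/deletes, lowered as watch events arrive. A rough sketch of that bookkeeping follows; the field names come from the diff, but the method bodies (Seen, Fulfilled) are assumed semantics since the PR doesn't show them:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// Field names follow the diff; the methods below are assumed semantics.
type PodExpectations struct {
	add int64
	del int64
	key string
}

// Seen records observed watch events by lowering the outstanding counts.
func (e *PodExpectations) Seen(add, del int64) {
	atomic.AddInt64(&e.add, -add)
	atomic.AddInt64(&e.del, -del)
}

// Fulfilled reports whether every expected create and delete was observed.
func (e *PodExpectations) Fulfilled() bool {
	return atomic.LoadInt64(&e.add) <= 0 && atomic.LoadInt64(&e.del) <= 0
}

func main() {
	exp := &PodExpectations{add: 2, key: "default/frontend"}
	exp.Seen(1, 0)               // first create observed via the pod watch
	fmt.Println(exp.Fulfilled()) // false: one create still outstanding
	exp.Seen(1, 0)               // second create observed
	fmt.Println(exp.Fulfilled()) // true: the rc may sync again
}
```

Until Fulfilled returns true the rc stays dormant, which is exactly why the expiration timeout below matters: a dropped watch event would otherwise leave the counts outstanding forever.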
pkg/controller/replication_controller.go (23 additions, 10 deletions)

@@ -42,20 +42,28 @@
 const (
 	// We'll attempt to recompute the required replicas of all replication controllers
-	// the have fulfilled their expectations at least this often.
+	// the have fulfilled their expectations at least this often. This recomputation

[review comment] typo: s/the/that

+	// happens based on contents in local pod storage.
 	FullControllerResyncPeriod = 30 * time.Second

-	// If a watch misdelivers info about a pod, it'll take this long
-	// to rectify the number of replicas.
+	// If a watch misdelivers info about a pod, it'll take at least this long
+	// to rectify the number of replicas. Note that dropped deletes are only
+	// rectified after the expectation times out because we don't know the
+	// final resting state of the pod.
 	PodRelistPeriod = 5 * time.Minute

-	// If a watch drops an (add, delete) event for a pod, it'll take this long
-	// before a dormant rc waiting for those packets is woken up anyway. This
-	// should typically be somewhere between the PodRelistPeriod and the
-	// FullControllerResyncPeriod. It is specifically targeted at the case
-	// where some problem prevents an update of expectations, without it the
-	// RC could stay asleep forever.
-	ExpectationsTimeout = 2 * time.Minute
+	// If a watch drops a delete event for a pod, it'll take this long
+	// before a dormant rc waiting for those packets is woken up anyway. It is
+	// specifically targeted at the case where some problem prevents an update
+	// of expectations, without it the RC could stay asleep forever. This should
+	// be set based on the expected latency of watch events.
+
+	// TODO: Set this per expectation, based on its size.
+	// Currently an rc can service (create *and* observe the watch events for said
+	// creation) about 10-20 pods a second, so it takes about 3.5 min to service
+	// 3000 pods. Just creation is limited to 30qps, and watching happens with
+	// ~10-30s latency/pod at scale.
+	ExpectationsTimeout = 6 * time.Minute
 )

 // ReplicationManager is responsible for synchronizing ReplicationController objects stored
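The new 6-minute figure follows from the throughput numbers quoted in the TODO above. A back-of-envelope check, taking 15 pods/sec as an assumed midpoint of the quoted 10-20 range:

```go
package main

import "fmt"

func main() {
	const pods = 3000.0
	const observedRate = 15.0 // assumed midpoint of the quoted 10-20 pods/sec
	const createQPS = 30.0    // quoted creation rate limit

	fmt.Printf("service time: %.1f min\n", pods/observedRate/60) // ~3.3 min
	fmt.Printf("create-only:  %.1f min\n", pods/createQPS/60)    // ~1.7 min
	// A 6 min timeout leaves roughly 2x headroom over the ~3.3 min worst case,
	// versus the old 2 min value, which an rc of this size would always exceed.
}
```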
@@ -220,6 +228,11 @@ func (rm *ReplicationManager) deletePod(obj interface{}) {
 		}
 		return
 	}
+	// When a delete is dropped, the relist will notice a pod in the store not
+	// in the list, leading to the insertion of a tombstone key. Since we don't
+	// know which rc to wake up/update expectations, we rely on the ttl on the
+	// expectation expiring. The rc syncs via the 30s periodic resync and notices
+	// fewer pods than its replica count.
 	podKey, err := framework.DeletionHandlingMetaNamespaceKeyFunc(obj)
 	if err != nil {
 		glog.Errorf("Couldn't get key for object %+v: %v", obj, err)
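The comment added in deletePod leans on the tombstone the relist inserts when a delete was missed. A self-contained sketch of the tombstone-unwrapping pattern a delete handler typically uses; the local Pod and DeletedFinalStateUnknown types are stand-ins for the real client cache types, not the project's actual API:

```go
package main

import "fmt"

// Stand-ins for the real client cache types; names and shape are assumptions
// for illustration only.
type Pod struct{ Name string }

type DeletedFinalStateUnknown struct {
	Key string
	Obj interface{} // last known state; nil if never observed
}

// onDelete shows the unwrap-the-tombstone pattern the comment describes: when
// the watch dropped the delete, the relist hands the handler a tombstone
// instead of the pod, and often only the key is recoverable.
func onDelete(obj interface{}) {
	pod, ok := obj.(*Pod)
	if !ok {
		tombstone, ok := obj.(DeletedFinalStateUnknown)
		if !ok {
			fmt.Printf("unexpected object: %+v\n", obj)
			return
		}
		pod, ok = tombstone.Obj.(*Pod)
		if !ok {
			// Final state never observed; only the key survives, so a
			// controller can't tell which expectations to lower. This is
			// the case the expiration ttl covers.
			fmt.Printf("tombstone %s carries no pod state\n", tombstone.Key)
			return
		}
	}
	fmt.Printf("observed delete for pod %s\n", pod.Name)
}

func main() {
	onDelete(&Pod{Name: "frontend-x1"})
	onDelete(DeletedFinalStateUnknown{Key: "default/frontend-x2"})
}
```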