Skip to content

Commit

Permalink
Make statics owned by the CustomResourceDefinition
Browse files Browse the repository at this point in the history
...so they get deleted automatically when the operator is uninstalled.

This still doesn't clean up the SCC, which may be an upstream bug.  Trying to figure that out in [slack](https://coreos.slack.com/archives/CB48XQ4KZ/p1593546841423000)

Jira: [OSD-4083](https://issues.redhat.com/browse/OSD-4083)
Fixes: openshift#15
  • Loading branch information
2uasimojo committed Jul 1, 2020
1 parent 3f38f8d commit 67911d3
Show file tree
Hide file tree
Showing 14 changed files with 337 additions and 139 deletions.
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ This operator is available via OperatorHub.
More detailed information can be found [here](https://access.redhat.com/articles/5025181).

## Uninstalling
TODO (don't forget about manual cleanup of statics)
1. Uninstall via OperatorHub
2. Manually delete the `efs-csi-scc` SecurityContextConstraints (see [below](#securitycontextconstraints-must-be-deleted-manually))

## Usage

Expand Down Expand Up @@ -187,6 +188,15 @@ can leave it in an unusable state, even if the operator is able to resurrect the
The only supported way to delete a `PersistentVolumeClaim` (or `PersistentVolume`) associated with a `SharedVolume`
is to delete the `SharedVolume` and let the operator do the rest.

### SecurityContextConstraints must be deleted manually
Per [issue #23](https://github.com/openshift/aws-efs-operator/issues/23), uninstalling the operator may not clean up the `efs-csi-scc` SecurityContextConstraints. To delete it manually:

```shell
$ oc delete scc efs-csi-scc
```

This may require special permissions.

## Under the hood

The operator has two controllers. One monitors the resources necessary to run the
Expand Down
7 changes: 7 additions & 0 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"runtime"
"strings"

apiextensions "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -143,6 +144,12 @@ func main() {
os.Exit(1)
}

// Need this for the CustomResourceDefinition Kind
if err := apiextensions.AddToScheme(mgr.GetScheme()); err != nil {
log.Error(err, "")
os.Exit(1)
}

// Setup all Controllers
if err := controller.AddToManager(mgr); err != nil {
log.Error(err, "")
Expand Down
7 changes: 7 additions & 0 deletions deploy/cluster_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,10 @@ rules:
- securitycontextconstraints
verbs:
- '*'
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
# TODO(efried): Why isn't `get` good enough?
- '*'
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/spf13/pflag v1.0.5
golang.org/x/net v0.0.0-20191028085509-fe3aa8a45271
k8s.io/api v0.0.0
k8s.io/apiextensions-apiserver v0.0.0
k8s.io/apimachinery v0.0.0
k8s.io/client-go v12.0.0+incompatible
k8s.io/kubernetes v1.16.2
Expand Down
66 changes: 4 additions & 62 deletions pkg/controller/sharedvolume/sharedvolume_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ func TestReconcile(t *testing.T) {
if err := r.client.Delete(ctx, pvMap[pvname]); err != nil {
t.Fatal(err)
}
delete(pvBySharedVolume, svKey(svMap[fmt.Sprintf("%s/%s", nsy, svb)]))
if res, err = r.Reconcile(req); res != test.NullResult || err != nil {
t.Fatalf("Expected no requeue, no error; got\nresult: %v\nerr: %v", res, err)
}
Expand Down Expand Up @@ -773,65 +774,6 @@ func TestEnsureFails(t *testing.T) {
}
}

// fakeClientWithCustomErrors overrides some of the fake client's methods, allowing them to (not
// actually run and) throw specific errors. Use it like this:
// realFakeClient := NewFakeClient(...)
// c := fakeClientWithCustomErrors{
// Client: realFakeClient,
// DeleteBehavior: []error{nil, fmt.Errorf("Error on the second call"), nil}
// UpdateBehavior: []error(fmt.Errorf("Error on the first call"))
// }
// c.Delete(...) // runs the real fake Delete
// c.Delete(...) // skips the real fake Delete and returns the first error
// c.Delete(...) // runs the real fake Delete
// c.Delete(...) // runs the real fake Delete, even though we ran off the end of the array
// c.Update(...) // returns the error
type fakeClientWithCustomErrors struct {
// The "Real" fake client
crclient.Client
// Entries in this list affect each successive call to Delete(). Using `nil` causes the "real"
// Delete to be called. Using a non-nil error causes the "real" Delete to be skipped, and that
// error to be returned instead.
DeleteBehavior []error
// Private tracker of calls to Delete, used to determine which DeleteBehavior to use.
numDeleteCalls int
// Ditto Update
UpdateBehavior []error
// Private tracker of calls to Update, used to determine which UpdateBehavior to use.
numUpdateCalls int
// TODO(efried): Add the other methods. Propose upstream.
}

func clientOverride(behavior []error, numCalls int) error {
if len(behavior) > numCalls {
return behavior[numCalls] // which might be nil
}
// If we ran off the end, assume default behavior
return nil
}

// Delete overrides the fake client's Delete, conditionally bypassing it and returning an error instead.
func (f *fakeClientWithCustomErrors) Delete(ctx context.Context, obj runtime.Object, opts ...crclient.DeleteOption) error {
// Always increment the call count, but not until we're done.
defer func() { f.numDeleteCalls++ }()
if err := clientOverride(f.DeleteBehavior, f.numDeleteCalls); err != nil {
return err
}
// Otherwise run the real Delete
return f.Client.Delete(ctx, obj, opts...)
}

// Update overrides the fake client's Update, conditionally bypassing it and returning an error instead.
func (f *fakeClientWithCustomErrors) Update(ctx context.Context, obj runtime.Object, opts ...crclient.UpdateOption) error {
// Always increment the call count, but not until we're done.
defer func() { f.numUpdateCalls++ }()
if err := clientOverride(f.UpdateBehavior, f.numUpdateCalls); err != nil {
return err
}
// Otherwise run the real Update
return f.Client.Update(ctx, obj, opts...)
}

// TestHandleDeleteFails hits unusual failure paths in `handleDelete`
func TestHandleDeleteFails(t *testing.T) {
// Make sure the caches are cleared from other tests
Expand Down Expand Up @@ -884,7 +826,7 @@ func TestHandleDeleteFails(t *testing.T) {
}

// 1) Make the PVC Ensurable's Delete fail
r.client = &fakeClientWithCustomErrors{
r.client = &test.FakeClientWithCustomErrors{
Client: realFakeClient,
DeleteBehavior: []error{
// This has to be an error other than NotFound to make EnsurableImpl.Delete return it.
Expand Down Expand Up @@ -914,7 +856,7 @@ func TestHandleDeleteFails(t *testing.T) {
}

// 2) Make the PV Ensurable's Delete fail (after the PVC Ensurable's Delete succeeds)
r.client = &fakeClientWithCustomErrors{
r.client = &test.FakeClientWithCustomErrors{
Client: realFakeClient,
DeleteBehavior: []error{
// The first Delete is the PVC's
Expand Down Expand Up @@ -947,7 +889,7 @@ func TestHandleDeleteFails(t *testing.T) {
// 3) Make the SV's Update (to clear the finalizer) fail.
// Note that when we start this sub-case, the PVC is already gone, so this also exercises
// the idempotency of that deletion.
r.client = &fakeClientWithCustomErrors{
r.client = &test.FakeClientWithCustomErrors{
Client: realFakeClient,
UpdateBehavior: []error{
// The SV's Update() to clear the finalizer is the first (and only) Update().
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/statics/statics.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ func init() {
loadDefTemplate(scDef, "storageclass.yaml")
StorageClassName = scDef.Name

// NOTE(efried): We can't SetOwner() yet because we don't have the CRD at this stage.
staticResources = []util.Ensurable{
&util.EnsurableImpl{
ObjType: &corev1.ServiceAccount{},
Expand Down
50 changes: 50 additions & 0 deletions pkg/controller/statics/statics_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,17 @@ This controller knows how to bootstrap these objects and watch them for changes
*/

import (
"context"
"openshift/aws-efs-operator/pkg/util"
"time"

apiextensions "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"

"sigs.k8s.io/controller-runtime/pkg/client"
crclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
logf "sigs.k8s.io/controller-runtime/pkg/log"
Expand All @@ -20,6 +26,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/source"
)

const svCRDName = "sharedvolumes.aws-efs.managed.openshift.io"

var log = logf.Log.WithName("controller_statics")

// Add creates a new Statics Controller and adds it to the Manager. The Manager will set fields on the Controller
Expand Down Expand Up @@ -70,6 +78,11 @@ type ReconcileStatics struct {
func (r *ReconcileStatics) Reconcile(request reconcile.Request) (reconcile.Result, error) {
reqLogger := log.WithValues("Request.Namespace", request.Namespace, "Request.Name", request.Name)

var (
crd *apiextensions.CustomResourceDefinition
err error
)

// We got this far, so it's a type we're watching that also passed our filter. That means it
// ought to be one of our statics.
s := findStatic(request.NamespacedName)
Expand All @@ -80,12 +93,49 @@ func (r *ReconcileStatics) Reconcile(request reconcile.Request) (reconcile.Resul
return reconcile.Result{}, nil
}

// Find the SharedVolume CRD, which will be set up to "own" the statics. This is the mechanism
// by which the statics get cleaned up when the operator is uninstalled.
// TODO(efried): Except for the SCC, which for some reason seems to ignore the OwnerReferences
// and not get deleted. This may be an upstream bug.
if crd, err = discoverCRD(r.client); err != nil {
if errors.IsNotFound(err) {
reqLogger.Info("SharedVolume CRD has already been deleted. Skipping reconcile, awaiting demise.")
return reconcile.Result{}, nil
}
reqLogger.Error(err, "Couldn't retrieve SharedVolume CRD")
// Not sure under what circumstances this could happen, but... requeue after one second
return reconcile.Result{Requeue: true, RequeueAfter: time.Millisecond * 1000}, nil
}
// If the CRD is being deleted, it means we're shutting down. Bail out to avoid thrashing (the
// deletion of the CRD triggers deletion of the statics, which we would otherwise try to
// restore below).
if crd.GetDeletionTimestamp() != nil {
reqLogger.Info("The SharedVolume CRD is being deleted, which means we're shutting down. Skipping reconcile.")
return reconcile.Result{}, nil
}

reqLogger.Info("Reconciling.", "request", request)

// Make sure the static is "owned" by the CRD.
// We need to do this here because we can't count on the CRD existing during static setup.
s.SetOwner(util.AsOwner(crd))

if err := s.Ensure(reqLogger, r.client); err != nil {
// TODO: Max retries so we don't get in a hard loop when the failure is something incurable?
return reconcile.Result{Requeue: true}, err
}

return reconcile.Result{}, nil
}

// discoverCRD finds our SharedVolume CustomResourceDefinition.
func discoverCRD(client crclient.Client) (*apiextensions.CustomResourceDefinition, error) {
crd := &apiextensions.CustomResourceDefinition{}
nsn := types.NamespacedName{
Name: svCRDName,
}
if err := client.Get(context.TODO(), nsn, crd); err != nil {
return nil, err
}
return crd, nil
}
Loading

0 comments on commit 67911d3

Please sign in to comment.