
Improve kubeadm reset #37831

Merged (3 commits) on Dec 7, 2016
208 changes: 135 additions & 73 deletions cmd/kubeadm/app/cmd/reset.go
@@ -1,5 +1,5 @@
/*
Copyright 2014 The Kubernetes Authors.
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -21,145 +21,207 @@ import (
"io"
"os"
"os/exec"
"path/filepath"
"path"
"strings"

"github.com/spf13/cobra"

kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
kubeadmutil "k8s.io/kubernetes/cmd/kubeadm/app/util"
"k8s.io/kubernetes/pkg/util/initsystem"
)

// NewCmdReset returns "kubeadm reset" command.
// NewCmdReset returns the "kubeadm reset" command
func NewCmdReset(out io.Writer) *cobra.Command {
var skipPreFlight bool
var skipPreFlight, removeNode bool
cmd := &cobra.Command{
Use: "reset",
Short: "Run this to revert any changes made to this host by 'kubeadm init' or 'kubeadm join'.",
Run: func(cmd *cobra.Command, args []string) {
r, err := NewReset(skipPreFlight)
r, err := NewReset(skipPreFlight, removeNode)
kubeadmutil.CheckErr(err)
kubeadmutil.CheckErr(r.Run(out))
},
}

cmd.PersistentFlags().BoolVar(
&skipPreFlight, "skip-preflight-checks", false,
"skip preflight checks normally run before modifying the system",
"Skip preflight checks normally run before modifying the system",
)

cmd.PersistentFlags().BoolVar(
&removeNode, "remove-node", true,
"Remove this node from the pool of nodes in this cluster",
)

return cmd
}

type Reset struct{}
type Reset struct {
removeNode bool
}

func NewReset(skipPreFlight bool) (*Reset, error) {
func NewReset(skipPreFlight, removeNode bool) (*Reset, error) {
if !skipPreFlight {
fmt.Println("Running pre-flight checks")
err := preflight.RunResetCheck()
if err != nil {
fmt.Println("[preflight] Running pre-flight checks...")

if err := preflight.RunResetCheck(); err != nil {
return nil, &preflight.PreFlightError{Msg: err.Error()}
}
} else {
fmt.Println("Skipping pre-flight checks")
fmt.Println("[preflight] Skipping pre-flight checks...")
}

return &Reset{}, nil
return &Reset{
removeNode: removeNode,
}, nil
}

// cleanDir removes everything in a directory, but not the directory itself:
func cleanDir(path string) {
// If the directory doesn't even exist there's nothing to do, and we do
// not consider this an error:
if _, err := os.Stat(path); os.IsNotExist(err) {
return
}

d, err := os.Open(path)
if err != nil {
fmt.Printf("failed to remove directory: [%v]\n", err)
}
defer d.Close()
names, err := d.Readdirnames(-1)
if err != nil {
fmt.Printf("failed to remove directory: [%v]\n", err)
}
for _, name := range names {
err = os.RemoveAll(filepath.Join(path, name))
if err != nil {
fmt.Printf("failed to remove directory: [%v]\n", err)
}
}
}

// resetConfigDir is used to cleanup the files kubeadm writes in /etc/kubernetes/.
func resetConfigDir(configDirPath string) {
dirsToClean := []string{
filepath.Join(configDirPath, "manifests"),
filepath.Join(configDirPath, "pki"),
}
fmt.Printf("Deleting contents of config directories: %v\n", dirsToClean)
for _, dir := range dirsToClean {
cleanDir(dir)
}

filesToClean := []string{
filepath.Join(configDirPath, "admin.conf"),
filepath.Join(configDirPath, "kubelet.conf"),
}
fmt.Printf("Deleting files: %v\n", filesToClean)
for _, path := range filesToClean {
err := os.RemoveAll(path)
if err != nil {
fmt.Printf("failed to remove file: [%v]\n", err)
}
}
}

// Run reverts any changes made to this host by "kubeadm init" or "kubeadm join".
func (r *Reset) Run(out io.Writer) error {

// Drain and maybe remove the node from the cluster
err := drainAndRemoveNode(r.removeNode)
if err != nil {
fmt.Printf("[reset] Failed to cleanup node: [%v]\n", err)
}
serviceToStop := "kubelet"
initSystem, err := initsystem.GetInitSystem()
if err != nil {
fmt.Printf("%v", err)
fmt.Printf("[reset] Failed to detect init system and stop the kubelet service: %v\n", err)
} else {
fmt.Printf("Stopping the %s service...\n", serviceToStop)
fmt.Printf("[reset] Stopping the %s service...\n", serviceToStop)
if err := initSystem.ServiceStop(serviceToStop); err != nil {
fmt.Printf("failed to stop the %s service", serviceToStop)
fmt.Printf("[reset] Failed to stop the %s service\n", serviceToStop)
}
}

fmt.Printf("Unmounting directories in /var/lib/kubelet...\n")
fmt.Println("[reset] Unmounting directories in /var/lib/kubelet...")
umountDirsCmd := "cat /proc/mounts | awk '{print $2}' | grep '/var/lib/kubelet' | xargs -r umount"

Review comment (Member): Maybe instead of embedding a "shell" script here, write a small function that calls umount on each path individually and reports the error for that particular path?
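
Such a helper might look like the following minimal sketch, which parses /proc/mounts directly and unmounts each matching path on its own. The function name and messages are hypothetical, not part of this PR, and it assumes Linux (syscall.Unmount):

package main

import (
	"fmt"
	"io/ioutil"
	"strings"
	"syscall"
)

// unmountKubeletDirs unmounts every mount point under kubeletDir one by one,
// reporting a per-path error instead of piping /proc/mounts through sh/awk/xargs.
func unmountKubeletDirs(kubeletDir string) {
	mounts, err := ioutil.ReadFile("/proc/mounts")
	if err != nil {
		fmt.Printf("[reset] Failed to read /proc/mounts: %v\n", err)
		return
	}
	for _, line := range strings.Split(string(mounts), "\n") {
		// /proc/mounts columns: device mountpoint fstype options dump pass
		fields := strings.Fields(line)
		if len(fields) < 2 || !strings.HasPrefix(fields[1], kubeletDir) {
			continue
		}
		if err := syscall.Unmount(fields[1], 0); err != nil {
			fmt.Printf("[reset] Failed to unmount %q: %v\n", fields[1], err)
		}
	}
}

func main() {
	unmountKubeletDirs("/var/lib/kubelet")
}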

umountOutputBytes, err := exec.Command("sh", "-c", umountDirsCmd).Output()
if err != nil {
fmt.Printf("failed to unmount directories in /var/lib/kubelet, %s", string(umountOutputBytes))
fmt.Printf("[reset] Failed to unmount directories in /var/lib/kubelet: %s\n", string(umountOutputBytes))
}

dirsToClean := []string{"/var/lib/kubelet"}
// Remove contents from the config and pki directories
resetConfigDir(kubeadmapi.GlobalEnvParams.KubernetesDir, kubeadmapi.GlobalEnvParams.HostPKIPath)

dirsToClean := []string{"/var/lib/kubelet", "/etc/cni/net.d"}

// Only clear etcd data when the etcd manifest is found. In case it is not found, we must assume that the user
// provided external etcd endpoints. In that case, it is the user's own responsibility to reset etcd
if _, err := os.Stat("/etc/kubernetes/manifests/etcd.json"); err == nil {
dirsToClean = append(dirsToClean, "/var/lib/etcd")
} else {
fmt.Printf("[reset] No etcd manifest found in %q, assuming external etcd.\n", "/etc/kubernetes/manifests/etcd.json")
}

resetConfigDir("/etc/kubernetes/")

fmt.Printf("Deleting contents of stateful directories: %v\n", dirsToClean)
fmt.Printf("[reset] Deleting contents of stateful directories: %v\n", dirsToClean)
for _, dir := range dirsToClean {
cleanDir(dir)
}

dockerCheck := preflight.ServiceCheck{Service: "docker"}
if warnings, errors := dockerCheck.Check(); len(warnings) == 0 && len(errors) == 0 {
fmt.Println("Stopping all running docker containers...")
if err := exec.Command("sh", "-c", "docker ps | grep 'k8s_' | awk '{print $1}' | xargs docker rm --force --volumes").Run(); err != nil {
fmt.Println("failed to stop the running containers")
fmt.Println("[reset] Stopping all running docker containers...")
if err := exec.Command("sh", "-c", "docker ps | grep 'k8s_' | awk '{print $1}' | xargs -r docker rm --force --volumes").Run(); err != nil {

Review comment (Member): What would happen if a non-docker container runtime were used?

Review comment (Member Author): We don't support runtimes other than docker for the moment, so that's not a problem for now. This code was just moved from below.

fmt.Println("[reset] Failed to stop the running containers")
}
} else {
fmt.Println("docker doesn't seem to be running, skipping the removal of kubernetes containers")
fmt.Println("[reset] docker doesn't seem to be running, skipping the removal of running kubernetes containers")
}

return nil
}

func drainAndRemoveNode(removeNode bool) error {

hostname, err := os.Hostname()
if err != nil {
return fmt.Errorf("failed to detect node hostname: %v", err)
}
hostname = strings.ToLower(hostname)

// TODO: Use the "native" k8s client for this once we're confident the versioned client is working
kubeConfigPath := path.Join(kubeadmapi.GlobalEnvParams.KubernetesDir, "kubelet.conf")

getNodesCmd := fmt.Sprintf("kubectl --kubeconfig %s get nodes | grep %s", kubeConfigPath, hostname)
output, err := exec.Command("sh", "-c", getNodesCmd).Output()
if err != nil || len(output) == 0 {
// kubeadm shouldn't drain and/or remove the node when it doesn't exist anymore
return nil
}

fmt.Printf("[reset] Draining node: %q\n", hostname)

output, err = exec.Command("kubectl", "--kubeconfig", kubeConfigPath, "drain", hostname, "--delete-local-data", "--force", "--ignore-daemonsets").Output()

Review comment (Member): We need to add HttpProxyCheck() in RunResetCheck(), but we somehow need to get the API URL that is written in kubelet.conf. Also, FileAvailableCheck{Path: "/etc/kubernetes/kubelet.conf"} is needed there if we want to use kubectl here. And InPathCheck{executable: "kubectl", mandatory: true}.
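
For illustration, here is a self-contained sketch of the precheck pattern being suggested. The Checker interface and the two check types reuse the names from the comment above but are reimplemented from scratch, so their fields and semantics may differ from kubeadm's actual preflight package; in particular, "available" is read here as "the file exists":

package main

import (
	"fmt"
	"os"
	"os/exec"
)

// Checker mirrors the general shape of a preflight check: it returns
// warnings and errors instead of failing immediately.
type Checker interface {
	Check() (warnings, errors []error)
}

// FileAvailableCheck verifies that the given file exists, since reset would
// need to read kubelet.conf before invoking kubectl.
type FileAvailableCheck struct{ Path string }

func (c FileAvailableCheck) Check() (warnings, errors []error) {
	if _, err := os.Stat(c.Path); err != nil {
		errors = append(errors, fmt.Errorf("%s is not available: %v", c.Path, err))
	}
	return nil, errors
}

// InPathCheck reports a missing executable as an error when mandatory,
// and as a warning otherwise.
type InPathCheck struct {
	executable string
	mandatory  bool
}

func (c InPathCheck) Check() (warnings, errors []error) {
	if _, err := exec.LookPath(c.executable); err != nil {
		e := fmt.Errorf("%s not found in $PATH", c.executable)
		if c.mandatory {
			return nil, []error{e}
		}
		return []error{e}, nil
	}
	return nil, nil
}

func main() {
	checks := []Checker{
		FileAvailableCheck{Path: "/etc/kubernetes/kubelet.conf"},
		InPathCheck{executable: "kubectl", mandatory: true},
	}
	for _, c := range checks {
		warnings, errs := c.Check()
		for _, w := range warnings {
			fmt.Printf("[preflight] WARNING: %v\n", w)
		}
		for _, e := range errs {
			fmt.Printf("[preflight] ERROR: %v\n", e)
		}
	}
}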

Review comment (Contributor): Would these be covered sufficiently by the err handling?

Review comment (Member Author): The reason I don't want to add those checks is that the drain/remove functions aren't critical in any way; if they don't work, reset just won't do the drain and remove steps.

Review comment (Member): The checks could be made optional based on the state of "remove-node". If the user specifies the flag, he/she assumes that it will be done gracefully.

Review comment (Member Author): Still, it will try to drain regardless of remove-node... At least, this is totally fine for the next stable release; it improves and fixes lots of things.

The TODO I added was to switch over to the Go native client, but we're going to do that in time for v1.6, when we have more time and are confident that the versioned Go client (client-go) works properly.
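
As a rough sketch of where that TODO could lead, the node removal below uses client-go instead of shelling out to kubectl. The package paths and signatures are taken from a much later client-go release than anything available when this PR was written, so this is an assumption about the eventual shape, not part of the PR:

package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// removeNodeViaAPI deletes the Node object through the API, using a client
// built from the kubelet's kubeconfig, replacing "kubectl delete node <name>".
func removeNodeViaAPI(kubeconfigPath, nodeName string) error {
	config, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
	if err != nil {
		return fmt.Errorf("failed to load kubeconfig %q: %v", kubeconfigPath, err)
	}
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		return fmt.Errorf("failed to create API client: %v", err)
	}
	return client.CoreV1().Nodes().Delete(context.TODO(), nodeName, metav1.DeleteOptions{})
}

func main() {
	if err := removeNodeViaAPI("/etc/kubernetes/kubelet.conf", "node-1"); err != nil {
		fmt.Printf("[reset] Failed to remove node: %v\n", err)
	}
}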

Review comment (Member): This comment applies to both invocations of kubectl, until client-go is used. It doesn't make sense to fail with a debug message when we could already warn the user about the potential issue at the very beginning. That's the whole reason for prechecks.

Review comment (Contributor): This situation is very different with reset, however; we expect that the system could easily have failed to cluster up, so the drain operation may well fail, and we intend to keep going. Provided the error messages resulting from attempting to drain are logical, it's probably OK to let whatever error happens bubble up, as we're going to proceed either way.

That said, @luxas, what happens on pure nodes here? Is kubectl configured there? Does draining only work on masters?

Review comment (Member Author): It works on nodes as well, otherwise I wouldn't have added it here... The /etc/kubernetes/kubelet.conf file exists everywhere when using kubeadm, and kubectl is installed everywhere, so when going the opinionated route with the debs and rpms, it just "should work (tm)" 😄

if err != nil {
return fmt.Errorf("failed to drain node %q [%s]", hostname, output)
}

if removeNode {
fmt.Printf("[reset] Removing node: %q\n", hostname)

output, err = exec.Command("kubectl", "--kubeconfig", kubeConfigPath, "delete", "node", hostname).Output()
if err != nil {
return fmt.Errorf("failed to remove node %q [%s]", hostname, output)
}
}

return nil
}

// cleanDir removes everything in a directory, but not the directory itself
func cleanDir(filepath string) error {
// If the directory doesn't even exist there's nothing to do, and we do
// not consider this an error
if _, err := os.Stat(filepath); os.IsNotExist(err) {
return nil
}

d, err := os.Open(filepath)
if err != nil {
return err
}
defer d.Close()
names, err := d.Readdirnames(-1)
if err != nil {
return err
}
for _, name := range names {
err = os.RemoveAll(path.Join(filepath, name))
if err != nil {
return err
}
}
return nil
}

// resetConfigDir is used to cleanup the files kubeadm writes in /etc/kubernetes/.
func resetConfigDir(configPathDir, pkiPathDir string) {
dirsToClean := []string{
path.Join(configPathDir, "manifests"),
pkiPathDir,
}
fmt.Printf("[reset] Deleting contents of config directories: %v\n", dirsToClean)
for _, dir := range dirsToClean {
err := cleanDir(dir)
if err != nil {
fmt.Printf("[reset] Failed to remove directory: %q [%v]\n", dir, err)
}
}

filesToClean := []string{
path.Join(configPathDir, "admin.conf"),
path.Join(configPathDir, "kubelet.conf"),
}
fmt.Printf("[reset] Deleting files: %v\n", filesToClean)
for _, path := range filesToClean {
err := os.RemoveAll(path)
if err != nil {
fmt.Printf("[reset] Failed to remove file: %q [%v]\n", path, err)
}
}
}
2 changes: 1 addition & 1 deletion cmd/kubeadm/app/cmd/reset_test.go
@@ -162,7 +162,7 @@ func TestConfigDirCleaner(t *testing.T) {
}
}

resetConfigDir(tmpDir)
resetConfigDir(tmpDir, filepath.Join(tmpDir, "pki"))

// Verify the files we cleanup implicitly in every test:
assertExists(t, tmpDir)
1 change: 1 addition & 0 deletions hack/verify-flags/known-flags.txt
@@ -489,6 +489,7 @@ registry-burst
registry-qps
reject-methods
reject-paths
remove-node
repair-malformed-updates
replicaset-lookup-cache-size
replication-controller-lookup-cache-size