diff --git a/.gitignore b/.gitignore index 7463a91833..104d30164d 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,9 @@ ipam-*.xml controller-gen build/tools/bin npm/debug/http + +# certificates +*/**/certs/ +*.crt +*.pem +*.srl diff --git a/npm/Dockerfile b/npm/Dockerfile index 6184c0173e..85e27bf8c8 100644 --- a/npm/Dockerfile +++ b/npm/Dockerfile @@ -20,6 +20,9 @@ FROM docker.io/ubuntu:focal COPY --from=builder /usr/local/bin/azure-npm \ /usr/bin/azure-npm +COPY --from=builder /usr/local/src/npm/npm/scripts \ + /usr/local/npm + # Install dependencies. RUN apt-get update RUN apt-get install -y iptables @@ -29,5 +32,8 @@ RUN apt-get upgrade -y RUN chmod +x /usr/bin/azure-npm +WORKDIR /usr/local/npm +RUN ./generate_certs.sh + # Run the npm command by default when the container starts. ENTRYPOINT ["/usr/bin/azure-npm", "start"] diff --git a/npm/cmd/root.go b/npm/cmd/root.go index ea07743d70..c02f870d0b 100644 --- a/npm/cmd/root.go +++ b/npm/cmd/root.go @@ -1,7 +1,14 @@ package main import ( + "bytes" + "encoding/json" + "fmt" + + npmconfig "github.com/Azure/azure-container-networking/npm/config" "github.com/spf13/cobra" + "github.com/spf13/viper" + "k8s.io/klog" ) // NewRootCmd returns a root cobra command @@ -12,6 +19,27 @@ func NewRootCmd() *cobra.Command { CompletionOptions: cobra.CompletionOptions{ DisableDefaultCmd: true, }, + PersistentPreRunE: func(cmd *cobra.Command, args []string) error { + viper.AutomaticEnv() // read in environment variables that match + viper.SetDefault(npmconfig.ConfigEnvPath, npmconfig.GetConfigPath()) + cfgFile := viper.GetString(npmconfig.ConfigEnvPath) + viper.SetConfigFile(cfgFile) + + // If a config file is found, read it in. + // NOTE: there is no config merging with default, if config is loaded, options must be set + if err := viper.ReadInConfig(); err == nil { + klog.Infof("Using config file: %+v", viper.ConfigFileUsed()) + } else { + klog.Infof("Failed to load config from env %s: %v", npmconfig.ConfigEnvPath, err) + b, _ := json.Marshal(npmconfig.DefaultConfig) //nolint // skip checking error + err := viper.ReadConfig(bytes.NewBuffer(b)) + if err != nil { + return fmt.Errorf("failed to read in default with err %w", err) + } + } + + return nil + }, } startCmd := newStartNPMCmd() diff --git a/npm/cmd/start.go b/npm/cmd/start.go index 965aee3a31..974c30380a 100644 --- a/npm/cmd/start.go +++ b/npm/cmd/start.go @@ -3,8 +3,6 @@ package main import ( - "bytes" - "encoding/json" "fmt" "math/rand" "time" @@ -48,28 +46,6 @@ func newStartNPMCmd() *cobra.Command { startNPMCmd := &cobra.Command{ Use: "start", Short: "Starts the Azure NPM process", - PersistentPreRunE: func(cmd *cobra.Command, args []string) error { - viper.AutomaticEnv() // read in environment variables that match - viper.SetDefault(npmconfig.ConfigEnvPath, npmconfig.GetConfigPath()) - cfgFile := viper.GetString(npmconfig.ConfigEnvPath) - viper.SetConfigFile(cfgFile) - - // If a config file is found, read it in. - // NOTE: there is no config merging with default, if config is loaded, options must be set - if err := viper.ReadInConfig(); err == nil { - klog.Infof("Using config file: %+v", viper.ConfigFileUsed()) - } else { - klog.Infof("Failed to load config from env %s: %v", npmconfig.ConfigEnvPath, err) - b, _ := json.Marshal(npmconfig.DefaultConfig) - err := viper.ReadConfig(bytes.NewBuffer(b)) - if err != nil { - return fmt.Errorf("failed to read in default with err %w", err) - } - } - - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { config := &npmconfig.Config{} err := viper.Unmarshal(config) diff --git a/npm/deploy/kustomize/overlays/controller/deployment.yaml b/npm/deploy/kustomize/overlays/controller/deployment.yaml index f6294e3a56..ccf8d27e2a 100644 --- a/npm/deploy/kustomize/overlays/controller/deployment.yaml +++ b/npm/deploy/kustomize/overlays/controller/deployment.yaml @@ -38,7 +38,7 @@ spec: containerPort: 10092 image: azure-npm:v1.4.1 command: ["azure-npm"] - args: ["start", "controlplane"] + args: ["controlplane"] resources: limits: cpu: 250m diff --git a/npm/deploy/kustomize/overlays/daemon/deployment.yaml b/npm/deploy/kustomize/overlays/daemon/deployment.yaml index d445379873..bdeb3244d9 100644 --- a/npm/deploy/kustomize/overlays/daemon/deployment.yaml +++ b/npm/deploy/kustomize/overlays/daemon/deployment.yaml @@ -36,7 +36,7 @@ spec: containerPort: 10091 image: azure-npm:v1.4.1 command: ["azure-npm"] - args: ["start", "daemon"] + args: ["daemon"] resources: limits: cpu: 250m diff --git a/npm/pkg/transport/errors.go b/npm/pkg/transport/errors.go index 0d6a7909aa..dd40b957a9 100644 --- a/npm/pkg/transport/errors.go +++ b/npm/pkg/transport/errors.go @@ -2,5 +2,9 @@ package transport import "errors" -// ErrNoPeer is returned when no peer was found in the gRPC context. -var ErrNoPeer = errors.New("no peer found in gRPC context") +var ( + // ErrNoPeer is returned when no peer was found in the gRPC context. + ErrNoPeer = errors.New("no peer found in gRPC context") + // ErrTLSCerts is returned for any TLS certificate related issue + ErrTLSCerts = errors.New("tls certificate error") +) diff --git a/npm/pkg/transport/events_client.go b/npm/pkg/transport/events_client.go index 268d9992d4..f053662884 100644 --- a/npm/pkg/transport/events_client.go +++ b/npm/pkg/transport/events_client.go @@ -6,7 +6,7 @@ import ( "github.com/Azure/azure-container-networking/npm/pkg/protos" "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/credentials" "k8s.io/klog/v2" ) @@ -37,9 +37,18 @@ func NewEventsClient(ctx context.Context, pod, node, addr string) (*EventsClient } klog.Infof("Connecting to NPM controller gRPC server at address %s\n", addr) - // TODO Make this secure - // TODO Remove WithBlock option post testing - cc, err := grpc.DialContext(ctx, addr, grpc.WithTransportCredentials(insecure.NewCredentials())) + + config, err := clientTLSConfig() + if err != nil { + klog.Errorf("failed to load client tls config : %s", err) + return nil, fmt.Errorf("failed to load client tls config : %w", err) + } + + cc, err := grpc.DialContext( + ctx, + addr, + grpc.WithTransportCredentials(credentials.NewTLS(config)), + ) if err != nil { return nil, fmt.Errorf("failed to dial %s: %w", addr, err) } @@ -81,11 +90,12 @@ func (c *EventsClient) run(ctx context.Context, stopCh <-chan struct{}) error { default: if connectClient == nil { klog.Info("Reconnecting to gRPC server controller") - opts := []grpc.CallOption{grpc.WaitForReady(true)} + opts := []grpc.CallOption{grpc.WaitForReady(false)} connectClient, err = c.Connect(ctx, clientMetadata, opts...) if err != nil { return fmt.Errorf("failed to connect to dataplane events server: %w", err) } + klog.Info("Successfully connected to gRPC server controller") } event, err := connectClient.Recv() if err != nil { diff --git a/npm/pkg/transport/events_server.go b/npm/pkg/transport/events_server.go index 7e006a0161..e1188a33d2 100644 --- a/npm/pkg/transport/events_server.go +++ b/npm/pkg/transport/events_server.go @@ -157,7 +157,14 @@ func (m *EventsServer) handle() error { return fmt.Errorf("failed to handle server connections: %w", err) } + // load the server certificates + creds, err := serverTLSCreds() + if err != nil { + return fmt.Errorf("failed to load TLS certificates: %w", err) + } + var opts []grpc.ServerOption = []grpc.ServerOption{ + grpc.Creds(creds), grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams), grpc.StatsHandler(m.Watchdog), } diff --git a/npm/pkg/transport/tls.go b/npm/pkg/transport/tls.go new file mode 100644 index 0000000000..c14fc8e8bb --- /dev/null +++ b/npm/pkg/transport/tls.go @@ -0,0 +1,48 @@ +package transport + +import ( + "crypto/tls" + "crypto/x509" + "fmt" + "os" + + "google.golang.org/grpc/credentials" +) + +const ( + serverCertPEMFilename = "tls.crt" + serverKeyPEMFilename = "tls.key" + caCertPEMFilename = "ca.crt" + path = "/usr/local/npm" +) + +func serverTLSCreds() (credentials.TransportCredentials, error) { + certFilepath := path + "/" + serverCertPEMFilename + keyFilepath := path + "/" + serverKeyPEMFilename + + creds, err := credentials.NewServerTLSFromFile(certFilepath, keyFilepath) + if err != nil { + return nil, fmt.Errorf("failed to create creds from cert/key files : %w", err) + } + return creds, nil +} + +func clientTLSConfig() (*tls.Config, error) { + caCertFilepath := path + "/" + caCertPEMFilename + // Load certificate of the CA who signed server's certificate + pemServerCA, err := os.ReadFile(caCertFilepath) + if err != nil { + return nil, fmt.Errorf("Failed to read the CA cert : %w", err) + } + + certPool := x509.NewCertPool() + if !certPool.AppendCertsFromPEM(pemServerCA) { + return nil, fmt.Errorf("failed to append ca cert to cert pool : %w", ErrTLSCerts) + } + + // Create the credentials and return it + return &tls.Config{ //nolint // setting tls min version to 3 + RootCAs: certPool, + InsecureSkipVerify: false, + }, nil +} diff --git a/npm/scripts/generate_certs.sh b/npm/scripts/generate_certs.sh new file mode 100755 index 0000000000..22df96102c --- /dev/null +++ b/npm/scripts/generate_certs.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +CERTS_STAGING_DIR=. +SAN_CNF_FILE=san.cnf +CERTIFICATE_VALIDITY_DAYS=3650 +CERT_SUBJ="/C=US/ST=Washington/L=Redmond/O=Microsoft/OU=Azure/CN=azure-npm.kube-system.svc.cluster.local" + +# Check if openssl is installed +if ! command -v openssl &> /dev/null +then + echo "openssl could not be found" + exit +fi + +# Check if SAN_CNF_FILE exists +if [ ! -f "$SAN_CNF_FILE" ] +then + echo "SAN_CNF_FILE does not exist" + exit +fi + +if [ ! -d "$CERTS_STAGING_DIR" ] +then + echo "Creating $CERTS_STAGING_DIR" + mkdir -p $CERTS_STAGING_DIR +fi + +# Generate the ca certificate and key +openssl req -x509 -newkey rsa:4096 -days $CERTIFICATE_VALIDITY_DAYS -nodes -keyout $CERTS_STAGING_DIR/ca.key -out $CERTS_STAGING_DIR/ca.crt -subj $CERT_SUBJ + +# Create a certificate signing request for the server +openssl req -newkey rsa:4096 -nodes -keyout $CERTS_STAGING_DIR/tls.key -out $CERTS_STAGING_DIR/server-req.pem -config $SAN_CNF_FILE -extensions v3_req -subj $CERT_SUBJ + +# Sign the server certificate with the CA +openssl x509 -req -in $CERTS_STAGING_DIR/server-req.pem -CA $CERTS_STAGING_DIR/ca.crt -CAkey $CERTS_STAGING_DIR/ca.key -CAcreateserial -out $CERTS_STAGING_DIR/tls.crt --days $CERTIFICATE_VALIDITY_DAYS --extfile $SAN_CNF_FILE --extensions v3_req + +# Remove the secret CA key and signing request +rm -rf $CERTS_STAGING_DIR/ca.key $CERTS_STAGING_DIR/server-req.pem diff --git a/npm/scripts/san.cnf b/npm/scripts/san.cnf new file mode 100644 index 0000000000..c6b10bac91 --- /dev/null +++ b/npm/scripts/san.cnf @@ -0,0 +1,21 @@ +[ req ] +default_bits = 2048 +distinguished_name = req_distinguished_name +req_extensions = v3_req + +[ req_distinguished_name ] +countryName = Country Name (2 letter code) +stateOrProvinceName = State or Province Name (full name) +localityName = Locality Name (eg, city) +organizationName = Organization Name (eg, company) +commonName = Common Name (e.g. server FQDN or YOUR name) + +[ v3_req ] +keyUsage = digitalSignature, nonRepudiation, keyEncipherment +subjectAltName = @alt_names + +[alt_names] +DNS.1 = azure-npm.kube-system.svc.cluster.local +DNS.2 = azure-npm.kube-system +DNS.3 = azure-npm +DNS.4 = 0.0.0.0