Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,9 @@ ipam-*.xml
controller-gen
build/tools/bin
npm/debug/http

# certificates
*/**/certs/
*.crt
*.pem
*.srl
6 changes: 6 additions & 0 deletions npm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ FROM docker.io/ubuntu:focal
COPY --from=builder /usr/local/bin/azure-npm \
/usr/bin/azure-npm

COPY --from=builder /usr/local/src/npm/npm/scripts \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is npm/npm/ intentional?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. Since I didnt want to change the WORKDIR in the builder image (here) which copies the host's source to /usr/local/src/npm it has to be this way.

/usr/local/npm

# Install dependencies.
RUN apt-get update
RUN apt-get install -y iptables
Expand All @@ -29,5 +32,8 @@ RUN apt-get upgrade -y

RUN chmod +x /usr/bin/azure-npm

WORKDIR /usr/local/npm
RUN ./generate_certs.sh

# Run the npm command by default when the container starts.
ENTRYPOINT ["/usr/bin/azure-npm", "start"]
28 changes: 28 additions & 0 deletions npm/cmd/root.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
package main

import (
"bytes"
"encoding/json"
"fmt"

npmconfig "github.com/Azure/azure-container-networking/npm/config"
"github.com/spf13/cobra"
"github.com/spf13/viper"
"k8s.io/klog"
)

// NewRootCmd returns a root cobra command
Expand All @@ -12,6 +19,27 @@ func NewRootCmd() *cobra.Command {
CompletionOptions: cobra.CompletionOptions{
DisableDefaultCmd: true,
},
PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
viper.AutomaticEnv() // read in environment variables that match
viper.SetDefault(npmconfig.ConfigEnvPath, npmconfig.GetConfigPath())
cfgFile := viper.GetString(npmconfig.ConfigEnvPath)
viper.SetConfigFile(cfgFile)

// If a config file is found, read it in.
// NOTE: there is no config merging with default, if config is loaded, options must be set
if err := viper.ReadInConfig(); err == nil {
klog.Infof("Using config file: %+v", viper.ConfigFileUsed())
} else {
klog.Infof("Failed to load config from env %s: %v", npmconfig.ConfigEnvPath, err)
b, _ := json.Marshal(npmconfig.DefaultConfig) //nolint // skip checking error
err := viper.ReadConfig(bytes.NewBuffer(b))
if err != nil {
return fmt.Errorf("failed to read in default with err %w", err)
}
}

return nil
},
}

startCmd := newStartNPMCmd()
Expand Down
24 changes: 0 additions & 24 deletions npm/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
package main

import (
"bytes"
"encoding/json"
"fmt"
"math/rand"
"time"
Expand Down Expand Up @@ -48,28 +46,6 @@ func newStartNPMCmd() *cobra.Command {
startNPMCmd := &cobra.Command{
Use: "start",
Short: "Starts the Azure NPM process",
PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
viper.AutomaticEnv() // read in environment variables that match
viper.SetDefault(npmconfig.ConfigEnvPath, npmconfig.GetConfigPath())
cfgFile := viper.GetString(npmconfig.ConfigEnvPath)
viper.SetConfigFile(cfgFile)

// If a config file is found, read it in.
// NOTE: there is no config merging with default, if config is loaded, options must be set
if err := viper.ReadInConfig(); err == nil {
klog.Infof("Using config file: %+v", viper.ConfigFileUsed())
} else {
klog.Infof("Failed to load config from env %s: %v", npmconfig.ConfigEnvPath, err)
b, _ := json.Marshal(npmconfig.DefaultConfig)
err := viper.ReadConfig(bytes.NewBuffer(b))
if err != nil {
return fmt.Errorf("failed to read in default with err %w", err)
}
}

return nil
},

RunE: func(cmd *cobra.Command, args []string) error {
config := &npmconfig.Config{}
err := viper.Unmarshal(config)
Expand Down
2 changes: 1 addition & 1 deletion npm/deploy/kustomize/overlays/controller/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ spec:
containerPort: 10092
image: azure-npm:v1.4.1
command: ["azure-npm"]
args: ["start", "controlplane"]
args: ["controlplane"]
resources:
limits:
cpu: 250m
Expand Down
2 changes: 1 addition & 1 deletion npm/deploy/kustomize/overlays/daemon/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ spec:
containerPort: 10091
image: azure-npm:v1.4.1
command: ["azure-npm"]
args: ["start", "daemon"]
args: ["daemon"]
resources:
limits:
cpu: 250m
Expand Down
8 changes: 6 additions & 2 deletions npm/pkg/transport/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@ package transport

import "errors"

// ErrNoPeer is returned when no peer was found in the gRPC context.
var ErrNoPeer = errors.New("no peer found in gRPC context")
var (
// ErrNoPeer is returned when no peer was found in the gRPC context.
ErrNoPeer = errors.New("no peer found in gRPC context")
// ErrTLSCerts is returned for any TLS certificate related issue
ErrTLSCerts = errors.New("tls certificate error")
)
20 changes: 15 additions & 5 deletions npm/pkg/transport/events_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (

"github.com/Azure/azure-container-networking/npm/pkg/protos"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/credentials"
"k8s.io/klog/v2"
)

Expand Down Expand Up @@ -37,9 +37,18 @@ func NewEventsClient(ctx context.Context, pod, node, addr string) (*EventsClient
}

klog.Infof("Connecting to NPM controller gRPC server at address %s\n", addr)
// TODO Make this secure
// TODO Remove WithBlock option post testing
cc, err := grpc.DialContext(ctx, addr, grpc.WithTransportCredentials(insecure.NewCredentials()))

config, err := clientTLSConfig()
if err != nil {
klog.Errorf("failed to load client tls config : %s", err)
return nil, fmt.Errorf("failed to load client tls config : %w", err)
}

cc, err := grpc.DialContext(
ctx,
addr,
grpc.WithTransportCredentials(credentials.NewTLS(config)),
)
if err != nil {
return nil, fmt.Errorf("failed to dial %s: %w", addr, err)
}
Expand Down Expand Up @@ -81,11 +90,12 @@ func (c *EventsClient) run(ctx context.Context, stopCh <-chan struct{}) error {
default:
if connectClient == nil {
klog.Info("Reconnecting to gRPC server controller")
opts := []grpc.CallOption{grpc.WaitForReady(true)}
opts := []grpc.CallOption{grpc.WaitForReady(false)}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

curious what this represents and why we had it true before, false now

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True blocks the connection until the server is ready to accept the connection. We don't want to block our connection since the server pod could crash which we are connecting or in the load-balancing case reject the connection in which case the agent would need to block until timeout. Instead we try sending, error out and try reconnecting.

connectClient, err = c.Connect(ctx, clientMetadata, opts...)
if err != nil {
return fmt.Errorf("failed to connect to dataplane events server: %w", err)
}
klog.Info("Successfully connected to gRPC server controller")
}
event, err := connectClient.Recv()
if err != nil {
Expand Down
7 changes: 7 additions & 0 deletions npm/pkg/transport/events_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,14 @@ func (m *EventsServer) handle() error {
return fmt.Errorf("failed to handle server connections: %w", err)
}

// load the server certificates
creds, err := serverTLSCreds()
if err != nil {
return fmt.Errorf("failed to load TLS certificates: %w", err)
}

var opts []grpc.ServerOption = []grpc.ServerOption{
grpc.Creds(creds),
grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams),
grpc.StatsHandler(m.Watchdog),
}
Expand Down
48 changes: 48 additions & 0 deletions npm/pkg/transport/tls.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package transport

import (
"crypto/tls"
"crypto/x509"
"fmt"
"os"

"google.golang.org/grpc/credentials"
)

const (
serverCertPEMFilename = "tls.crt"
serverKeyPEMFilename = "tls.key"
caCertPEMFilename = "ca.crt"
path = "/usr/local/npm"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have this path be defined in azure-npm-config ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The certs are baked in through the Dockerfile. Not sure how to make that dynamic. Hence I made this static :/

Copy link
Collaborator

@rbtr rbtr Mar 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could make it dynamic using a volume mount at runtime for a BYO cert situation; it would be better for these to be configurable imo

)

func serverTLSCreds() (credentials.TransportCredentials, error) {
certFilepath := path + "/" + serverCertPEMFilename
keyFilepath := path + "/" + serverKeyPEMFilename

creds, err := credentials.NewServerTLSFromFile(certFilepath, keyFilepath)
if err != nil {
return nil, fmt.Errorf("failed to create creds from cert/key files : %w", err)
}
return creds, nil
}

func clientTLSConfig() (*tls.Config, error) {
caCertFilepath := path + "/" + caCertPEMFilename
// Load certificate of the CA who signed server's certificate
pemServerCA, err := os.ReadFile(caCertFilepath)
if err != nil {
return nil, fmt.Errorf("Failed to read the CA cert : %w", err)
}

certPool := x509.NewCertPool()
if !certPool.AppendCertsFromPEM(pemServerCA) {
return nil, fmt.Errorf("failed to append ca cert to cert pool : %w", ErrTLSCerts)
}

// Create the credentials and return it
return &tls.Config{ //nolint // setting tls min version to 3
RootCAs: certPool,
InsecureSkipVerify: false,
}, nil
}
38 changes: 38 additions & 0 deletions npm/scripts/generate_certs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash

CERTS_STAGING_DIR=.
SAN_CNF_FILE=san.cnf
CERTIFICATE_VALIDITY_DAYS=3650
CERT_SUBJ="/C=US/ST=Washington/L=Redmond/O=Microsoft/OU=Azure/CN=azure-npm.kube-system.svc.cluster.local"

# Check if openssl is installed
if ! command -v openssl &> /dev/null
then
echo "openssl could not be found"
exit
fi

# Check if SAN_CNF_FILE exists
if [ ! -f "$SAN_CNF_FILE" ]
then
echo "SAN_CNF_FILE does not exist"
exit
fi

if [ ! -d "$CERTS_STAGING_DIR" ]
then
echo "Creating $CERTS_STAGING_DIR"
mkdir -p $CERTS_STAGING_DIR
fi

# Generate the ca certificate and key
openssl req -x509 -newkey rsa:4096 -days $CERTIFICATE_VALIDITY_DAYS -nodes -keyout $CERTS_STAGING_DIR/ca.key -out $CERTS_STAGING_DIR/ca.crt -subj $CERT_SUBJ

# Create a certificate signing request for the server
openssl req -newkey rsa:4096 -nodes -keyout $CERTS_STAGING_DIR/tls.key -out $CERTS_STAGING_DIR/server-req.pem -config $SAN_CNF_FILE -extensions v3_req -subj $CERT_SUBJ

# Sign the server certificate with the CA
openssl x509 -req -in $CERTS_STAGING_DIR/server-req.pem -CA $CERTS_STAGING_DIR/ca.crt -CAkey $CERTS_STAGING_DIR/ca.key -CAcreateserial -out $CERTS_STAGING_DIR/tls.crt --days $CERTIFICATE_VALIDITY_DAYS --extfile $SAN_CNF_FILE --extensions v3_req

# Remove the secret CA key and signing request
rm -rf $CERTS_STAGING_DIR/ca.key $CERTS_STAGING_DIR/server-req.pem
21 changes: 21 additions & 0 deletions npm/scripts/san.cnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[ req ]
default_bits = 2048
distinguished_name = req_distinguished_name
req_extensions = v3_req

[ req_distinguished_name ]
countryName = Country Name (2 letter code)
stateOrProvinceName = State or Province Name (full name)
localityName = Locality Name (eg, city)
organizationName = Organization Name (eg, company)
commonName = Common Name (e.g. server FQDN or YOUR name)

[ v3_req ]
keyUsage = digitalSignature, nonRepudiation, keyEncipherment
subjectAltName = @alt_names

[alt_names]
DNS.1 = azure-npm.kube-system.svc.cluster.local
DNS.2 = azure-npm.kube-system
DNS.3 = azure-npm
DNS.4 = 0.0.0.0