diff --git a/.gitreview b/.gitreview index 1db08df202d3..98f6b2d8d67d 100644 --- a/.gitreview +++ b/.gitreview @@ -2,3 +2,4 @@ host=gerrit.fd.io port=29418 project=vpp +defaultbranch=stable/1810 diff --git a/Makefile b/Makefile index 36c7a5a741d0..692ea7bf8a1c 100644 --- a/Makefile +++ b/Makefile @@ -131,22 +131,21 @@ RPM_SUSE_PLATFORM_DEPS = distribution-release shadow rpm-build ifeq ($(OS_ID),opensuse) ifeq ($(SUSE_NAME),Tumbleweed) - RPM_SUSE_DEVEL_DEPS = libboost_headers-devel libboost_thread-devel gcc + RPM_SUSE_DEVEL_DEPS = libboost_headers1_68_0-devel-1.68.0 libboost_thread1_68_0-devel-1.68.0 gcc RPM_SUSE_PYTHON_DEPS += python2-ply python2-virtualenv endif ifeq ($(SUSE_ID),15.0) RPM_SUSE_DEVEL_DEPS = libboost_headers-devel libboost_thread-devel gcc6 - RPM_SUSE_PYTHON_DEPS += python2-ply python2-virtualenv else - RPM_SUSE_DEVEL_DEPS += boost_1_61-devel gcc6 + RPM_SUSE_DEVEL_DEPS += libboost_headers1_68_0-devel-1.68.0 gcc6 RPM_SUSE_PYTHON_DEPS += python-virtualenv endif endif ifeq ($(OS_ID),opensuse-leap) ifeq ($(SUSE_ID),15.0) - RPM_SUSE_DEVEL_DEPS = libboost_headers-devel libboost_thread-devel gcc6 - RPM_SUSE_PYTHON_DEPS += python2-ply python2-virtualenv + RPM_SUSE_DEVEL_DEPS = libboost_headers-devel libboost_thread-devel gcc + RPM_SUSE_PYTHON_DEPS += python3-ply python2-virtualenv endif endif @@ -296,8 +295,12 @@ endif @sudo -E apt-get update @sudo -E apt-get $(APT_ARGS) $(CONFIRM) $(FORCE) install $(DEB_DEPENDS) else ifneq ("$(wildcard /etc/redhat-release)","") - @sudo -E yum groupinstall $(CONFIRM) $(RPM_DEPENDS_GROUPS) +ifeq ($(OS_ID),rhel) + @sudo -E yum-config-manager --enable rhel-server-rhscl-7-rpms +else ifeq ($(OS_ID),centos) @sudo -E yum install $(CONFIRM) centos-release-scl-rh +endif + @sudo -E yum groupinstall $(CONFIRM) $(RPM_DEPENDS_GROUPS) @sudo -E yum install $(CONFIRM) $(RPM_DEPENDS) @sudo -E debuginfo-install $(CONFIRM) glibc openssl-libs mbedtls-devel zlib else ifeq ($(filter opensuse-tumbleweed,$(OS_ID)),$(OS_ID)) @@ -305,12 +308,12 @@ else ifeq ($(filter opensuse-tumbleweed,$(OS_ID)),$(OS_ID)) @sudo -E zypper install -y $(RPM_SUSE_DEPENDS) else ifeq ($(filter opensuse-leap,$(OS_ID)),$(OS_ID)) @sudo -E zypper refresh - @sudo -E zypper install -y $(RPM_SUSE_DEPENDS) + @sudo -E zypper install -y $(RPM_SUSE_DEPENDS) else ifeq ($(filter opensuse,$(OS_ID)),$(OS_ID)) @sudo -E zypper refresh @sudo -E zypper install -y $(RPM_SUSE_DEPENDS) else - $(error "This option currently works only on Ubuntu, Debian, Centos or openSUSE systems") + $(error "This option currently works only on Ubuntu, Debian, RHEL, CentOS or openSUSE systems") endif define make diff --git a/RELEASE.md b/RELEASE.md index 57ff828d13c7..c931d8aa11ea 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,5 +1,7 @@ # Release Notes {#release_notes} +* @subpage release_notes_1810 +* @subpage release_notes_1807 * @subpage release_notes_1804 * @subpage release_notes_18012 * @subpage release_notes_18011 @@ -12,6 +14,320 @@ * @subpage release_notes_1609 * @subpage release_notes_1606 +@page release_notes_1810 Release notes for VPP 18.10 + +More than 632 commits since the 18.07 release. 
+ +## Features + +### Infrastructure +- DPDK 18.08 integration +- New Stats infrastructure (interface, error, node performance counters) +- Add configurable "Doug Lea malloc" support + +### VNET & Plugins +- Load balancing: support per-port VIP and all-port VIP +- Port NSH plugin to VPP +- NAT + - Configurable port range + - Virtual Fragmentation Reassembly for endpoint-dependent mode + - Client-IP based session affinity for load-balancing + - TCP MSS clamping + - Session timeout + - Bug-fixing and performance optimizations + +### Host stack +- Support for applications with multiple workers +- Support for binds from multiple app workers to same ip:port +- Switched to a message queue for io and control event notifications +- Support for eventfd based notifications as alternative to mutext-condvar pair +- VCL refactor to support async event notifications and multiple workers +- TLS async support in client for HW accleration +- Performance optimizations and bug-fixing +- A number of binary APIs will be deprecated in favor of using the event + message queue. Details in the API section. + +## Known issues + +For the full list of issues please refer to fd.io [JIRA](https://jira.fd.io). + +## Issues fixed + +For the full list of fixed issues please refer to: +- fd.io [JIRA](https://jira.fd.io) +- git [commit log](https://git.fd.io/vpp/log/?h=stable/1810) + +## API changes + +Description of results: + +* _Definition changed_: indicates that the API file was modified between releases. +* _Only in image_: indicates the API is new for this release. +* _Only in file_: indicates the API has been removed in this release. + + Message Name Result +api_versions_reply definition changed +app_cut_through_registration_add definition changed +app_worker_add_del definition changed +application_attach_reply definition changed +bd_ip_mac_details only in image +bd_ip_mac_dump only in image +bfd_udp_get_echo_source definition changed +bier_imp_details definition changed +bier_route_details definition changed +bind_sock definition changed +bridge_domain_details definition changed +bridge_flags definition changed +classify_add_del_session definition changed +classify_add_del_table definition changed +connect_sock definition changed +create_vhost_user_if definition changed +get_first_msg_id_reply definition changed +gpe_add_del_fwd_entry_reply definition changed +gpe_fwd_entry_path_details definition changed +ip6_fib_details definition changed +ip6nd_proxy_details definition changed +ip_add_del_route_reply definition changed +ip_address_details definition changed +ip_details definition changed +ip_fib_details definition changed +ip_mfib_details definition changed +ip_mroute_add_del_reply definition changed +ip_neighbor_add_del_reply definition changed +ip_neighbor_details definition changed +ip_reassembly_get_reply definition changed +ip_unnumbered_details definition changed +ipip_6rd_add_tunnel definition changed +ipip_add_tunnel definition changed +ipsec_spds_details only in image +ipsec_spds_dump only in image +l2_interface_efp_filter definition changed +lisp_eid_table_vni_details definition changed +map_another_segment definition changed +mfib_signal_details definition changed +mpls_route_add_del_reply definition changed +mpls_tunnel_add_del definition changed +mpls_tunnel_add_del_reply definition changed +mpls_tunnel_details definition changed +mpls_tunnel_dump definition changed +one_eid_table_vni_details definition changed +qos_mark_enable_disable definition changed +qos_record_enable_disable definition changed 
+reset_session_reply definition changed +rpc_call definition changed +show_threads definition changed +sockclnt_create_reply definition changed +sockclnt_delete definition changed +sockclnt_delete_reply definition changed +sw_interface_rx_placement_details only in image +sw_interface_rx_placement_dump only in image +sw_interface_set_ip_directed_broadcast definition changed +sw_interface_set_l2_bridge definition changed +sw_interface_set_rx_placement definition changed +sw_interface_set_vxlan_gbp_bypass definition changed +udp_encap_add definition changed +udp_encap_add_del_reply only in file +udp_encap_add_reply only in image +udp_encap_del definition changed +udp_encap_details definition changed +unbind_sock definition changed +vxlan_gbp_tunnel_add_del definition changed +vxlan_gbp_tunnel_details only in image +vxlan_gbp_tunnel_dump only in image +Found 68 api message signature differences + +### Patches that changed API definitions + +| @c src/plugins/avf/avf.api || +| ------- | ------- | +| [149d0e28](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=149d0e28) | avf: RSS support | +| [4e6014fc](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=4e6014fc) | avf: api fix | + +| @c src/plugins/gbp/gbp.api || +| ------- | ------- | +| [c0a93143](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=c0a93143) | GBP Endpoint Updates | +| [61b94c6b](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=61b94c6b) | vxlan-gbp: Add support for vxlan gbp | + +| @c src/plugins/igmp/igmp.api || +| ------- | ------- | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. | + +| @c src/plugins/lb/lb.api || +| ------- | ------- | +| [6a4375e0](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6a4375e0) | LB: fix flush flow table issue | +| [49ca2601](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=49ca2601) | Add flush flag on del as command | +| [219cc90c](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=219cc90c) | Support lb on both vip and per-port-vip case | + +| @c src/plugins/nat/nat.api || +| ------- | ------- | +| [bb4e0225](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bb4e0225) | NAT: TCP MSS clamping | +| [5d28c7af](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=5d28c7af) | NAT: add support for configurable port range (VPP-1346) | +| [ea5b5be4](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=ea5b5be4) | NAT44: client-IP based session affinity for load-balancing (VPP-1297) | +| [878c646a](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=878c646a) | NAT44: add support for session timeout (VPP-1272) | +| [69ce30d6](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=69ce30d6) | NAT: update nat_show_config_reply API (VPP-1403) | +| [6bd197eb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6bd197eb) | Remove client_index field from replies in API | +| [c6c0d2a0](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=c6c0d2a0) | NAT44: LB NAT - local backends in multiple VRFs (VPP-1345) | + +| @c src/plugins/vmxnet3/vmxnet3.api || +| ------- | ------- | +| [df7f8e8c](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=df7f8e8c) | vmxnet3 device driver | + +| @c src/plugins/nsh/nsh.api || +| ------- | ------- | +| [d313f9e6](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=d313f9e6) | Port NSH plugin to VPP | + +| @c src/plugins/nsim/nsim.api || +| ------- | ------- | +| [9e3252b5](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=9e3252b5) | Network delay simulator plugin | + +| @c src/plugins/svs/svs.api || +| ------- | ------- 
| +| [d1e68ab7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=d1e68ab7) | Source VRF Select | + +| @c src/vlibmemory/memclnt.api || +| ------- | ------- | +| [94495f2a](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=94495f2a) | PAPI: Use UNIX domain sockets instead of shared memory | +| [6bd197eb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6bd197eb) | Remove client_index field from replies in API | +| [75282457](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=75282457) | Fix "Old Style VLA" build warnings | + +| @c src/vnet/interface.api || +| ------- | ------- | +| [f0b42f48](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=f0b42f48) | itf: dump interface rx-placement | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. | +| [54f7c51f](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=54f7c51f) | rx-placement: Add API call for interface rx-placement | +| [1855b8e4](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=1855b8e4) | IP directed broadcast | + +| @c src/vnet/bfd/bfd.api || +| ------- | ------- | +| [2d3c7b9c](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=2d3c7b9c) | BFD: add get echo source API (VPP-1367) | + +| @c src/vnet/bier/bier.api || +| ------- | ------- | +| [ef90ed08](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=ef90ed08) | BIER API and load-balancing fixes | +| [6bd197eb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6bd197eb) | Remove client_index field from replies in API | + +| @c src/vnet/classify/classify.api || +| ------- | ------- | +| [34eb5d42](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=34eb5d42) | classify_add_del_session API: Use more descriptive docstring (VPP-1385) | +| [75282457](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=75282457) | Fix "Old Style VLA" build warnings | + +| @c src/vnet/devices/pipe/pipe.api || +| ------- | ------- | +| [208c29aa](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=208c29aa) | VOM: support for pipes | + +| @c src/vnet/devices/virtio/vhost_user.api || +| ------- | ------- | +| [ee2e58f6](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=ee2e58f6) | vhost-user: Add disable feature support in api | + +| @c src/vnet/ethernet/ethernet_types.api || +| ------- | ------- | +| [de5b08fb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=de5b08fb) | Introduce a mac_address_t on the API and in VPP | + +| @c src/vnet/ip/ip_types.api || +| ------- | ------- | +| [d0df49f2](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=d0df49f2) | Use IP address types on UDP encap API | + +| @c src/vnet/ip/ip.api || +| ------- | ------- | +| [412ecd32](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=412ecd32) | Improve ip_mroute_add_del documentation | +| [14260393](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=14260393) | Add adjacency counters to the stats segment | +| [28c142e3](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=28c142e3) | mroute routers in the stats segment | +| [008dbe10](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=008dbe10) | Route counters in the stats segment | +| [de5b08fb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=de5b08fb) | Introduce a mac_address_t on the API and in VPP | +| [6bd197eb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6bd197eb) | Remove client_index field from replies in API | +| [b11f903a](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=b11f903a) | Fix context field position in API definition | + +| @c src/vnet/ipip/ipip.api || +| ------- | ------- | +| 
[61502115](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=61502115) | IPIP and SIXRD tunnels create API needs table-IDs not fib-indexes | + +| @c src/vnet/ipsec/ipsec.api || +| ------- | ------- | +| [a9a0b2ce](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=a9a0b2ce) | IPsec: add API for SPDs dump (VPP-1363) | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. | + +| @c src/vnet/l2/l2.api || +| ------- | ------- | +| [0a4e0063](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=0a4e0063) | Fix documentation about sw_interface_set_l2_bridge | +| [b474380f](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=b474380f) | L2 BD: introduce a BD interface on which to send UU packets | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. | +| [5c7c49d1](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=5c7c49d1) | Fix documentation for SHG in bridge domain | +| [5d82d2f1](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=5d82d2f1) | l2: arp termination dump | +| [6b9b41c8](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6b9b41c8) | L2 EFP: byteswap sw_if_index, enable flag can be u8 on .api | + +| @c src/vnet/lisp-cp/lisp.api || +| ------- | ------- | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. | +| [6bd197eb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6bd197eb) | Remove client_index field from replies in API | + +| @c src/vnet/lisp-cp/one.api || +| ------- | ------- | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. | +| [6bd197eb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6bd197eb) | Remove client_index field from replies in API | + +| @c src/vnet/lisp-gpe/lisp_gpe.api || +| ------- | ------- | +| [6bd197eb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6bd197eb) | Remove client_index field from replies in API | +| [b11f903a](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=b11f903a) | Fix context field position in API definition | + +| @c src/vnet/mpls/mpls.api || +| ------- | ------- | +| [f5fa5ae2](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=f5fa5ae2) | MPLS tunnel dump: use sw_if_index not tunnel_index | +| [6a30b5f9](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6a30b5f9) | MPLS tunnel dump fix | +| [008dbe10](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=008dbe10) | Route counters in the stats segment | +| [7c922dc4](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=7c922dc4) | SR-MPLS: fixes and tests | + +| @c src/vnet/qos/qos.api || +| ------- | ------- | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. 
| +| [ed234e7f](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=ed234e7f) | Enum type on the API for QoS sources | + +| @c src/vnet/session/session.api || +| ------- | ------- | +| [ab2f6dbf](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=ab2f6dbf) | session: support multiple worker binds | +| [134a996a](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=134a996a) | vcl: add support for multi-worker apps | +| [1553197f](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=1553197f) | session: add support for multiple app workers | +| [6bd197eb](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=6bd197eb) | Remove client_index field from replies in API | +| [99368315](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=99368315) | vcl: support for eventfd mq signaling | + +| @c src/vnet/span/span.api || +| ------- | ------- | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. | + +| @c src/vnet/udp/udp.api || +| ------- | ------- | +| [9c0a3c42](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=9c0a3c42) | UDP-Encap: name counters for the stats segment | +| [d0df49f2](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=d0df49f2) | Use IP address types on UDP encap API | + +| @c src/vnet/unix/tap.api || +| ------- | ------- | +| [bdc0e6b7](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=bdc0e6b7) | Trivial: Clean up some typos. | + +| @c src/vnet/vxlan-gbp/vxlan_gbp.api || +| ------- | ------- | +| [79a05f54](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=79a05f54) | VXLAN-GBP: use common types on the API | +| [61b94c6b](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=61b94c6b) | vxlan-gbp: Add support for vxlan gbp | + +| @c src/vpp/api/vpe.api || +| ------- | ------- | +| [5d64c786](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=5d64c786) | thread: Add show threads api | +| [ec11b13a](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=ec11b13a) | Trivial: Cleanup some typos. | + +| @c src/vpp/stats/stats.api || +| ------- | ------- | +| [ec11b13a](https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commit;h=ec11b13a) | Trivial: Cleanup some typos. | + +### Notice of future API deprecation +- bind_uri_reply +- accept_session +- accept_session_reply +- disconnect_session_reply +- reset_session +- reset_session_reply +- bind_sock_reply +- connect_session_reply + + @page release_notes_1807 Release notes for VPP 18.07 More than 533 commits since the 18.04 release. 
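The NAT feature list above mentions TCP MSS clamping. As a rough illustration of what that feature does (a standalone sketch in plain C; the struct, function, and constant names are invented for the example and this is not the NAT plugin's code), the option walk below rewrites the MSS option of a SYN segment when it exceeds a configured ceiling. A real implementation would also update the TCP checksum after the rewrite.

```c
#include <stdint.h>

/* Minimal TCP header layout; only the fields the sketch needs. */
typedef struct {
  uint16_t src_port, dst_port;
  uint32_t seq, ack;
  uint8_t  data_offset_rsvd;   /* upper 4 bits = header length in 32-bit words */
  uint8_t  flags;
  uint16_t window, checksum, urgent;
} tcp_hdr_t;

#define TCP_OPT_EOL  0
#define TCP_OPT_NOP  1
#define TCP_OPT_MSS  2

/* Clamp the MSS option (if present) to max_mss.  Returns 1 if the segment
 * was modified.  The caller must update the TCP checksum afterwards. */
static int clamp_tcp_mss(uint8_t *tcp, uint16_t max_mss)
{
  tcp_hdr_t *th = (tcp_hdr_t *) tcp;
  uint8_t hdr_len = (th->data_offset_rsvd >> 4) * 4;
  uint8_t *opt = tcp + sizeof(tcp_hdr_t);
  uint8_t *end = tcp + hdr_len;

  while (opt < end) {
    if (opt[0] == TCP_OPT_EOL)
      break;
    if (opt[0] == TCP_OPT_NOP) { opt++; continue; }
    if (opt + 1 >= end || opt[1] < 2 || opt + opt[1] > end)
      break;                                    /* malformed option list */
    if (opt[0] == TCP_OPT_MSS && opt[1] == 4) {
      uint16_t mss = (uint16_t)(opt[2] << 8 | opt[3]);   /* network order */
      if (mss > max_mss) {
        opt[2] = (uint8_t)(max_mss >> 8);
        opt[3] = (uint8_t)(max_mss & 0xff);
        return 1;
      }
      return 0;
    }
    opt += opt[1];
  }
  return 0;
}
```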
diff --git a/build-root/deb/debian/vpp.service b/build-root/deb/debian/vpp.service index aa1651c41197..2e86941de8b5 100644 --- a/build-root/deb/debian/vpp.service +++ b/build-root/deb/debian/vpp.service @@ -4,7 +4,6 @@ After=network.target [Service] Type=simple -ExecStartPre=-/bin/rm -f /dev/shm/db /dev/shm/global_vm /dev/shm/vpe-api ExecStartPre=-/sbin/modprobe uio_pci_generic ExecStart=/usr/bin/vpp -c /etc/vpp/startup.conf ExecStopPost=/bin/rm -f /dev/shm/db /dev/shm/global_vm /dev/shm/vpe-api diff --git a/build-root/deb/debian/vpp.upstart b/build-root/deb/debian/vpp.upstart index 62e1d2780e66..4a451dd45d01 100644 --- a/build-root/deb/debian/vpp.upstart +++ b/build-root/deb/debian/vpp.upstart @@ -1,12 +1,11 @@ description "vector packet processing engine" -author "Cisco Systems, Inc " +author "Cisco Systems, Inc " manual respawn pre-start script - rm -f /dev/shm/db /dev/shm/global_vm /dev/shm/vpe-api || true # should be there via dkms, but if not, start anyway modprobe uio_pci_generic || true end script diff --git a/build/external/Makefile b/build/external/Makefile index 8ac764caf25a..1082cfc1c397 100644 --- a/build/external/Makefile +++ b/build/external/Makefile @@ -20,7 +20,7 @@ MAKE_ARGS ?= -j BUILD_DIR ?= $(CURDIR)/_build INSTALL_DIR ?= $(CURDIR)/_install PKG_VERSION ?= $(shell git describe --abbrev=0 | cut -d- -f1 | cut -dv -f2) -PKG_SUFFIX ?= $(shell git log --oneline $$(git describe --abbrev=0).. . | wc -l) +PKG_SUFFIX ?= $(shell git log --oneline v$(PKG_VERSION)-rc0.. . | wc -l) JOBS := $(if $(shell [ -f /proc/cpuinfo ] && head /proc/cpuinfo),\ $(shell grep -c ^processor /proc/cpuinfo), 2) diff --git a/doxygen/test_framework_doc.md b/doxygen/test_framework_doc.md index cedd6d31ced3..3da29bbf1050 100644 --- a/doxygen/test_framework_doc.md +++ b/doxygen/test_framework_doc.md @@ -4,6 +4,8 @@ Test Framework Documentation {#test_framework_doc} PyDoc generated documentation for the "make test" framework is available for the following releases: +- [Test framework documentation for VPP 18.10](https://docs.fd.io/vpp/18.10/vpp_make_test/html) +- [Test framework documentation for VPP 18.07](https://docs.fd.io/vpp/18.07/vpp_make_test/html) - [Test framework documentation for VPP 18.04](https://docs.fd.io/vpp/18.04/vpp_make_test/html) - [Test framework documentation for VPP 18.01](https://docs.fd.io/vpp/18.01/vpp_make_test/html) - [Test framework documentation for VPP 17.10](https://docs.fd.io/vpp/17.10/vpp_make_test/html) diff --git a/extras/japi/java/jvpp-core/io/fd/vpp/jvpp/core/examples/L2AclExample.java b/extras/japi/java/jvpp-core/io/fd/vpp/jvpp/core/examples/L2AclExample.java index f89043a3b0a0..9a17136a6d95 100644 --- a/extras/japi/java/jvpp-core/io/fd/vpp/jvpp/core/examples/L2AclExample.java +++ b/extras/japi/java/jvpp-core/io/fd/vpp/jvpp/core/examples/L2AclExample.java @@ -34,7 +34,6 @@ import io.fd.vpp.jvpp.core.dto.InputAclSetInterface; import io.fd.vpp.jvpp.core.dto.InputAclSetInterfaceReply; import io.fd.vpp.jvpp.core.future.FutureJVppCoreFacade; -import javax.xml.bind.DatatypeConverter; /** *

 * Tests L2 ACL creation and read.
 * Equivalent to the following vppctl commands:
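The next hunk drops `javax.xml.bind.DatatypeConverter` (no longer shipped with the JDK as of Java 11) and prints the classifier mask/match bytes through a local nibble-lookup `bytesToHex` helper instead. For comparison only, the same technique in plain C (an illustrative sketch, not part of this patch):

```c
#include <stdio.h>
#include <stdlib.h>

/* Convert a byte array to an upper-case hex string using a nibble lookup
 * table, mirroring the Java bytesToHex helper added below.  The caller
 * owns (and must free) the returned buffer. */
static char *bytes_to_hex(const unsigned char *bytes, size_t len)
{
  static const char hex[] = "0123456789ABCDEF";
  char *out = malloc(len * 2 + 1);
  if (!out)
    return NULL;
  for (size_t i = 0; i < len; i++) {
    out[i * 2]     = hex[bytes[i] >> 4];
    out[i * 2 + 1] = hex[bytes[i] & 0x0F];
  }
  out[len * 2] = '\0';
  return out;
}

int main(void)
{
  unsigned char mask[] = { 0x00, 0xff, 0x1a };
  char *s = bytes_to_hex(mask, sizeof(mask));
  printf("Mask hex: %s\n", s);   /* prints 00FF1A */
  free(s);
  return 0;
}
```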
@@ -50,6 +49,8 @@ public class L2AclExample { private static final int LOCAL0_IFACE_ID = 0; + private static final char[] hexArray = "0123456789ABCDEF".toCharArray(); + private static ClassifyAddDelTable createClassifyTable() { ClassifyAddDelTable request = new ClassifyAddDelTable(); @@ -67,6 +68,16 @@ private static ClassifyAddDelTable createClassifyTable() { return request; } + private static String bytesToHex(byte[] bytes) { + char[] hexChars = new char[bytes.length * 2]; + for ( int j = 0; j < bytes.length; j++ ) { + int v = bytes[j] & 0xFF; + hexChars[j * 2] = hexArray[v >>> 4]; + hexChars[j * 2 + 1] = hexArray[v & 0x0F]; + } + return new String(hexChars); + } + private static ClassifyTableInfo createClassifyTableInfoRequest(final int tableId) { ClassifyTableInfo request = new ClassifyTableInfo(); request.tableId = tableId; @@ -120,7 +131,7 @@ private static void print(ClassifyTableIdsReply reply) { private static void print(final ClassifyTableInfoReply reply) { System.out.println(reply); if (reply != null) { - System.out.println("Mask hex: " + DatatypeConverter.printHexBinary(reply.mask)); + System.out.println("Mask hex: " + bytesToHex(reply.mask)); } } @@ -132,7 +143,7 @@ private static void print(final ClassifySessionDetailsReplyDump reply) { System.out.println(reply); reply.classifySessionDetails.forEach(detail -> { System.out.println(detail); - System.out.println("Match hex: " + DatatypeConverter.printHexBinary(detail.match)); + System.out.println("Match hex: " + bytesToHex(detail.match)); }); } diff --git a/extras/rpm/vpp.spec b/extras/rpm/vpp.spec index cd90a27494a2..bf2d83a84c8c 100644 --- a/extras/rpm/vpp.spec +++ b/extras/rpm/vpp.spec @@ -392,7 +392,7 @@ fi /usr/share/java/* %files api-python -%defattr(644,root,root) +%defattr(644,root,root,755) %{python2_sitelib}/vpp_* %files selinux-policy diff --git a/extras/scripts/list_api_changes.py b/extras/scripts/list_api_changes.py index a25edf2e8ebe..e56da0ace197 100755 --- a/extras/scripts/list_api_changes.py +++ b/extras/scripts/list_api_changes.py @@ -1,8 +1,8 @@ #!/usr/bin/env python import os, fnmatch, subprocess -starttag = 'v18.07-rc0' -endtag = 'v18.07' +starttag = 'v18.10-rc0' +endtag = 'v18.10' emit_md = True apifiles = [] diff --git a/src/plugins/acl/acl.c b/src/plugins/acl/acl.c index 866c6ffc5f09..d28144984165 100644 --- a/src/plugins/acl/acl.c +++ b/src/plugins/acl/acl.c @@ -4091,12 +4091,12 @@ acl_plugin_config (vlib_main_t * vm, unformat_input_t * input) { acl_main_t *am = &acl_main; u32 conn_table_hash_buckets; - u32 conn_table_hash_memory_size; + uword conn_table_hash_memory_size; u32 conn_table_max_entries; uword main_heap_size; uword hash_heap_size; u32 hash_lookup_hash_buckets; - u32 hash_lookup_hash_memory; + uword hash_lookup_hash_memory; u32 reclassify_sessions; u32 use_tuple_merge; u32 tuple_merge_split_threshold; @@ -4106,8 +4106,10 @@ acl_plugin_config (vlib_main_t * vm, unformat_input_t * input) if (unformat (input, "connection hash buckets %d", &conn_table_hash_buckets)) am->fa_conn_table_hash_num_buckets = conn_table_hash_buckets; - else if (unformat (input, "connection hash memory %d", - &conn_table_hash_memory_size)) + else + if (unformat + (input, "connection hash memory %U", unformat_memory_size, + &conn_table_hash_memory_size)) am->fa_conn_table_hash_memory_size = conn_table_hash_memory_size; else if (unformat (input, "connection count max %d", &conn_table_max_entries)) @@ -4125,8 +4127,10 @@ acl_plugin_config (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "hash lookup 
hash buckets %d", &hash_lookup_hash_buckets)) am->hash_lookup_hash_buckets = hash_lookup_hash_buckets; - else if (unformat (input, "hash lookup hash memory %d", - &hash_lookup_hash_memory)) + else + if (unformat + (input, "hash lookup hash memory %U", unformat_memory_size, + &hash_lookup_hash_memory)) am->hash_lookup_hash_memory = hash_lookup_hash_memory; else if (unformat (input, "use tuple merge %d", &use_tuple_merge)) am->use_tuple_merge = use_tuple_merge; diff --git a/src/plugins/acl/acl.h b/src/plugins/acl/acl.h index c17946a97195..13e15478512d 100644 --- a/src/plugins/acl/acl.h +++ b/src/plugins/acl/acl.h @@ -142,7 +142,7 @@ typedef struct { hash_acl_info_t *hash_acl_infos; /* corresponding hash matching housekeeping info */ clib_bihash_48_8_t acl_lookup_hash; /* ACL lookup hash table. */ u32 hash_lookup_hash_buckets; - u32 hash_lookup_hash_memory; + uword hash_lookup_hash_memory; /* mheap to hold all the miscellaneous allocations related to hash-based lookups */ void *hash_lookup_mheap; diff --git a/src/plugins/acl/hash_lookup.c b/src/plugins/acl/hash_lookup.c index c37aae44a988..495395443f1c 100644 --- a/src/plugins/acl/hash_lookup.c +++ b/src/plugins/acl/hash_lookup.c @@ -607,6 +607,17 @@ hash_acl_set_heap(acl_main_t *am) clib_error("ACL plugin failed to allocate lookup heap of %U bytes", format_memory_size, am->hash_lookup_mheap_size); } +#if USE_DLMALLOC != 0 + /* + * DLMALLOC is being "helpful" in that it ignores the heap size parameter + * by default and tries to allocate the larger amount of memory. + * + * Pin the heap so this does not happen and if we run out of memory + * in this heap, we will bail out with "out of memory", rather than + * an obscure error sometime later. + */ + mspace_disable_expand(am->hash_lookup_mheap); +#endif } void *oldheap = clib_mem_set_heap(am->hash_lookup_mheap); return oldheap; @@ -736,6 +747,14 @@ hash_acl_apply(acl_main_t *am, u32 lc_index, int acl_index, u32 acl_position) vec_validate(am->hash_applied_mask_info_vec_by_lc_index, lc_index); + + /* since we know (in case of no split) how much we expand, preallocate that space */ + if (vec_len(ha->rules) > 0) { + int old_vec_len = vec_len(*applied_hash_aces); + vec_validate((*applied_hash_aces), old_vec_len + vec_len(ha->rules) - 1); + _vec_len((*applied_hash_aces)) = old_vec_len; + } + /* add the rules from the ACL to the hash table for lookup and append to the vector*/ for(i=0; i < vec_len(ha->rules); i++) { /* @@ -1171,6 +1190,13 @@ void hash_acl_add(acl_main_t *am, int acl_index) /* walk the newly added ACL entries and ensure that for each of them there is a mask type, increment a reference count for that mask type */ + + /* avoid small requests by preallocating the entire vector before running the additions */ + if (a->count > 0) { + vec_validate(ha->rules, a->count-1); + vec_reset_length(ha->rules); + } + for(i=0; i < a->count; i++) { hash_ace_info_t ace_info; fa_5tuple_t mask; @@ -1487,6 +1513,8 @@ split_partition(acl_main_t *am, u32 first_index, int i=0; u64 collisions = vec_len(pae->colliding_rules); for(i=0; ihash_acl_infos, pae->acl_index); DBG( "TM-collision: base_ace:%d (ace_mask:%d, first_collision_mask:%d)", pae->ace_index, pae->mask_type_index, coll_mask_type_index); diff --git a/src/plugins/acl/sess_mgmt_node.c b/src/plugins/acl/sess_mgmt_node.c index f38677f8d5a6..a6c5e8049d6a 100644 --- a/src/plugins/acl/sess_mgmt_node.c +++ b/src/plugins/acl/sess_mgmt_node.c @@ -689,7 +689,7 @@ acl_fa_session_cleaner_process (vlib_main_t * vm, vlib_node_runtime_t * rt, } } } - acl_log_err + 
acl_log_info ("ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX bitmap: %U, clear_all: %u", format_bitmap_hex, clear_sw_if_index_bitmap, clear_all); vec_foreach (pw0, am->per_worker_data) @@ -727,7 +727,7 @@ acl_fa_session_cleaner_process (vlib_main_t * vm, vlib_node_runtime_t * rt, pw0->pending_clear_sw_if_index_bitmap = clib_bitmap_dup (clear_sw_if_index_bitmap); } - acl_log_err + acl_log_info ("ACL_FA_CLEANER: thread %u, pending clear bitmap: %U", (am->per_worker_data - pw0), format_bitmap_hex, pw0->pending_clear_sw_if_index_bitmap); @@ -738,8 +738,9 @@ acl_fa_session_cleaner_process (vlib_main_t * vm, vlib_node_runtime_t * rt, send_interrupts_to_workers (vm, am); /* now wait till they all complete */ - acl_log_err ("CLEANER mains len: %u per-worker len: %d", - vec_len (vlib_mains), vec_len (am->per_worker_data)); + acl_log_info ("CLEANER mains len: %u per-worker len: %d", + vec_len (vlib_mains), + vec_len (am->per_worker_data)); vec_foreach (pw0, am->per_worker_data) { CLIB_MEMORY_BARRIER (); @@ -758,7 +759,7 @@ acl_fa_session_cleaner_process (vlib_main_t * vm, vlib_node_runtime_t * rt, } } } - acl_log_err ("ACL_FA_NODE_CLEAN: cleaning done"); + acl_log_info ("ACL_FA_NODE_CLEAN: cleaning done"); clib_bitmap_free (clear_sw_if_index_bitmap); } am->fa_cleaner_cnt_delete_by_sw_index_ok++; diff --git a/src/plugins/ioam/udp-ping/udp_ping_node.c b/src/plugins/ioam/udp-ping/udp_ping_node.c index 7a725258d655..59e4511eb7db 100644 --- a/src/plugins/ioam/udp-ping/udp_ping_node.c +++ b/src/plugins/ioam/udp-ping/udp_ping_node.c @@ -38,6 +38,23 @@ typedef enum UDP_PING_N_NEXT, } udp_ping_next_t; +#define foreach_udp_ping_error \ +_(BADHBH, "Malformed hop-by-hop header") + +typedef enum +{ +#define _(sym,str) UDP_PING_ERROR_##sym, + foreach_udp_ping_error +#undef _ + UDP_PING_N_ERROR, +} udp_ping_error_t; + +static char *udp_ping_error_strings[] = { +#define _(sym,string) string, + foreach_udp_ping_error +#undef _ +}; + udp_ping_main_t udp_ping_main; uword @@ -502,15 +519,26 @@ udp_ping_analyse_hbh (vlib_buffer_t * b0, * */ void -udp_ping_local_analyse (vlib_buffer_t * b0, - ip6_header_t * ip0, - ip6_hop_by_hop_header_t * hbh0, u16 * next0) +udp_ping_local_analyse (vlib_node_runtime_t * node, vlib_buffer_t * b0, + ip6_header_t * ip0, ip6_hop_by_hop_header_t * hbh0, + u16 * next0) { ip6_main_t *im = &ip6_main; ip_lookup_main_t *lm = &im->lookup_main; *next0 = UDP_PING_NEXT_IP6_DROP; + /* + * Sanity check: hbh header length must be less than + * b0->current_length. + */ + if (PREDICT_FALSE ((hbh0->length + 1) << 3) >= b0->current_length) + { + *next0 = UDP_PING_NEXT_DROP; + b0->error = node->errors[UDP_PING_ERROR_BADHBH]; + return; + } + if (PREDICT_TRUE (hbh0->protocol == IP_PROTOCOL_UDP)) { ip6_hop_by_hop_option_t *opt0; @@ -600,7 +628,7 @@ udp_ping_local_analyse (vlib_buffer_t * b0, * @par Graph mechanics: buffer, next index usage * * Uses: - * - udp_ping_local_analyse(p0, ip0, hbh0, &next0) + * - udp_ping_local_analyse(node, p0, ip0, hbh0, &next0) * - Checks packet type - request/respnse and process them. 
* * Next Index: @@ -660,8 +688,8 @@ udp_ping_local_node_fn (vlib_main_t * vm, hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); hbh1 = (ip6_hop_by_hop_header_t *) (ip1 + 1); - udp_ping_local_analyse (p0, ip0, hbh0, &next0); - udp_ping_local_analyse (p1, ip1, hbh1, &next1); + udp_ping_local_analyse (node, p0, ip0, hbh0, &next0); + udp_ping_local_analyse (node, p1, ip1, hbh1, &next1); if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) { @@ -727,7 +755,7 @@ udp_ping_local_node_fn (vlib_main_t * vm, ip0 = vlib_buffer_get_current (p0); hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); - udp_ping_local_analyse (p0, ip0, hbh0, &next0); + udp_ping_local_analyse (node, p0, ip0, hbh0, &next0); if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) { @@ -774,6 +802,8 @@ VLIB_REGISTER_NODE (udp_ping_local, static) = .format_trace = format_udp_ping_trace, .type = VLIB_NODE_TYPE_INTERNAL, .n_next_nodes = UDP_PING_N_NEXT, + .n_errors = UDP_PING_N_ERROR, + .error_strings = udp_ping_error_strings, .next_nodes = { [UDP_PING_NEXT_DROP] = "error-drop", diff --git a/src/plugins/nat/in2out.c b/src/plugins/nat/in2out.c index b99aef3944d3..cb169814eac9 100755 --- a/src/plugins/nat/in2out.c +++ b/src/plugins/nat/in2out.c @@ -952,8 +952,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } else { - if (PREDICT_FALSE - (proto0 == ~0 || proto0 == SNAT_PROTOCOL_ICMP)) + if (PREDICT_FALSE (proto0 == ~0)) { next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; goto trace00; @@ -964,6 +963,12 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, next0 = SNAT_IN2OUT_NEXT_REASS; goto trace00; } + + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace00; + } } key0.addr = ip0->src_address; @@ -1131,8 +1136,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } else { - if (PREDICT_FALSE - (proto1 == ~0 || proto1 == SNAT_PROTOCOL_ICMP)) + if (PREDICT_FALSE (proto1 == ~0)) { next1 = SNAT_IN2OUT_NEXT_SLOW_PATH; goto trace01; @@ -1143,6 +1147,12 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, next1 = SNAT_IN2OUT_NEXT_REASS; goto trace01; } + + if (PREDICT_FALSE (proto1 == SNAT_PROTOCOL_ICMP)) + { + next1 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace01; + } } key1.addr = ip1->src_address; @@ -1346,8 +1356,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } else { - if (PREDICT_FALSE - (proto0 == ~0 || proto0 == SNAT_PROTOCOL_ICMP)) + if (PREDICT_FALSE (proto0 == ~0)) { next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; goto trace0; @@ -1358,6 +1367,12 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, next0 = SNAT_IN2OUT_NEXT_REASS; goto trace0; } + + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace0; + } } key0.addr = ip0->src_address; @@ -1672,6 +1687,7 @@ nat44_in2out_reass_node_fn (vlib_main_t * vm, nat_reass_ip4_t *reass0; udp_header_t *udp0; tcp_header_t *tcp0; + icmp46_header_t *icmp0; snat_session_key_t key0; clib_bihash_kv_8_8_t kv0, value0; snat_session_t *s0 = 0; @@ -1704,6 +1720,7 @@ nat44_in2out_reass_node_fn (vlib_main_t * vm, ip0 = (ip4_header_t *) vlib_buffer_get_current (b0); udp0 = ip4_next_header (ip0); tcp0 = (tcp_header_t *) udp0; + icmp0 = (icmp46_header_t *) udp0; proto0 = ip_proto_to_snat_proto (ip0->protocol); reass0 = nat_ip4_reass_find_or_create (ip0->src_address, @@ -1722,6 +1739,25 @@ nat44_in2out_reass_node_fn (vlib_main_t * vm, if (PREDICT_FALSE (ip4_is_first_fragment (ip0))) { + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = icmp_in2out_slow_path + (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, 
now, thread_index, &s0); + + if (PREDICT_TRUE (next0 != SNAT_IN2OUT_NEXT_DROP)) + { + if (s0) + reass0->sess_index = s0 - per_thread_data->sessions; + else + reass0->flags |= NAT_REASS_FLAG_ED_DONT_TRANSLATE; + nat_ip4_reass_get_frags (reass0, + &fragments_to_loopback); + } + + goto trace0; + } + key0.addr = ip0->src_address; key0.port = udp0->src_port; key0.protocol = proto0; diff --git a/src/plugins/nat/in2out_ed.c b/src/plugins/nat/in2out_ed.c index 8db53c081428..ea30035faf2f 100644 --- a/src/plugins/nat/in2out_ed.c +++ b/src/plugins/nat/in2out_ed.c @@ -37,7 +37,8 @@ _(BAD_ICMP_TYPE, "unsupported ICMP type") \ _(MAX_SESSIONS_EXCEEDED, "Maximum sessions exceeded") \ _(DROP_FRAGMENT, "Drop fragment") \ _(MAX_REASS, "Maximum reassemblies exceeded") \ -_(MAX_FRAG, "Maximum fragments per reassembly exceeded") +_(MAX_FRAG, "Maximum fragments per reassembly exceeded")\ +_(NON_SYN, "non-SYN packet try to create session") typedef enum { @@ -254,7 +255,8 @@ slow_path_ed (snat_main_t * sm, u32 rx_fib_index, clib_bihash_kv_16_8_t * kv, snat_session_t ** sessionp, - vlib_node_runtime_t * node, u32 next, u32 thread_index, f64 now) + vlib_node_runtime_t * node, u32 next, u32 thread_index, f64 now, + tcp_header_t * tcp) { snat_session_t *s = 0; snat_user_t *u; @@ -314,6 +316,15 @@ slow_path_ed (snat_main_t * sm, is_sm = 1; } + if (proto == SNAT_PROTOCOL_TCP) + { + if (!tcp_is_init (tcp)) + { + b->error = node->errors[NAT_IN2OUT_ED_ERROR_NON_SYN]; + return NAT_IN2OUT_ED_NEXT_DROP; + } + } + u = nat_user_get_or_create (sm, &key->l_addr, rx_fib_index, thread_index); if (!u) { @@ -513,7 +524,19 @@ nat44_ed_not_translate_output_feature (snat_main_t * sm, ip4_header_t * ip, make_ed_kv (&kv, &ip->src_address, &ip->dst_address, proto, tx_fib_index, src_port, dst_port); if (!clib_bihash_search_16_8 (&tsm->out2in_ed, &kv, &value)) - return 1; + { + s = pool_elt_at_index (tsm->sessions, value.value); + if (nat44_is_ses_closed (s)) + { + nat_log_debug ("TCP close connection %U", format_snat_session, + &sm->per_thread_data[thread_index], s); + nat_free_session_data (sm, s, thread_index); + nat44_delete_session (sm, s, thread_index); + } + else + s->flags |= SNAT_SESSION_FLAG_OUTPUT_FEATURE; + return 1; + } /* dst NAT check */ make_ed_kv (&kv, &ip->dst_address, &ip->src_address, proto, rx_fib_index, @@ -613,7 +636,7 @@ icmp_match_in2out_ed (snat_main_t * sm, vlib_node_runtime_t * node, } next = slow_path_ed (sm, b, rx_fib_index, &kv, &s, node, next, - thread_index, vlib_time_now (sm->vlib_main)); + thread_index, vlib_time_now (sm->vlib_main), 0); if (PREDICT_FALSE (next == NAT_IN2OUT_ED_NEXT_DROP)) goto out; @@ -1023,7 +1046,7 @@ nat44_ed_in2out_node_fn_inline (vlib_main_t * vm, next0 = slow_path_ed (sm, b0, rx_fib_index0, &kv0, &s0, node, - next0, thread_index, now); + next0, thread_index, now, tcp0); if (PREDICT_FALSE (next0 == NAT_IN2OUT_ED_NEXT_DROP)) goto trace00; @@ -1227,7 +1250,7 @@ nat44_ed_in2out_node_fn_inline (vlib_main_t * vm, next1 = slow_path_ed (sm, b1, rx_fib_index1, &kv1, &s1, node, - next1, thread_index, now); + next1, thread_index, now, tcp1); if (PREDICT_FALSE (next1 == NAT_IN2OUT_ED_NEXT_DROP)) goto trace01; @@ -1460,7 +1483,7 @@ nat44_ed_in2out_node_fn_inline (vlib_main_t * vm, next0 = slow_path_ed (sm, b0, rx_fib_index0, &kv0, &s0, node, - next0, thread_index, now); + next0, thread_index, now, tcp0); if (PREDICT_FALSE (next0 == NAT_IN2OUT_ED_NEXT_DROP)) goto trace0; @@ -1859,7 +1882,8 @@ nat44_ed_in2out_reass_node_fn_inline (vlib_main_t * vm, } next0 = slow_path_ed (sm, b0, rx_fib_index0, 
&kv0, - &s0, node, next0, thread_index, now); + &s0, node, next0, thread_index, now, + tcp0); if (PREDICT_FALSE (next0 == NAT_IN2OUT_ED_NEXT_DROP)) goto trace0; @@ -1960,11 +1984,8 @@ nat44_ed_in2out_reass_node_fn_inline (vlib_main_t * vm, } /* Hairpinning */ - if (PREDICT_TRUE (proto0 != SNAT_PROTOCOL_ICMP)) - nat44_reass_hairpinning (sm, b0, ip0, s0->out2in.port, - s0->ext_host_port, proto0, 1); - else - snat_icmp_hairpinning (sm, b0, ip0, icmp0, 1); + nat44_reass_hairpinning (sm, b0, ip0, s0->out2in.port, + s0->ext_host_port, proto0, 1); /* Accounting */ nat44_session_update_counters (s0, now, diff --git a/src/plugins/nat/nat.c b/src/plugins/nat/nat.c index ae2e64e1f82e..540d3bf8a0be 100755 --- a/src/plugins/nat/nat.c +++ b/src/plugins/nat/nat.c @@ -397,64 +397,67 @@ nat_ed_session_alloc (snat_main_t * sm, snat_user_t * u, u32 thread_index, u32 oldest_index; u64 sess_timeout_time; - if ((u->nsessions + u->nstaticsessions) >= sm->max_translations_per_user) + if (PREDICT_FALSE (!(u->nsessions) && !(u->nstaticsessions))) + goto alloc_new; + + oldest_index = + clib_dlist_remove_head (tsm->list_pool, + u->sessions_per_user_list_head_index); + oldest_elt = pool_elt_at_index (tsm->list_pool, oldest_index); + s = pool_elt_at_index (tsm->sessions, oldest_elt->value); + sess_timeout_time = s->last_heard + (f64) nat44_session_get_timeout (sm, s); + if (now >= sess_timeout_time) { - oldest_index = - clib_dlist_remove_head (tsm->list_pool, - u->sessions_per_user_list_head_index); - oldest_elt = pool_elt_at_index (tsm->list_pool, oldest_index); - s = pool_elt_at_index (tsm->sessions, oldest_elt->value); - sess_timeout_time = - s->last_heard + (f64) nat44_session_get_timeout (sm, s); - if (now >= sess_timeout_time) - { - clib_dlist_addtail (tsm->list_pool, - u->sessions_per_user_list_head_index, - oldest_index); - nat_free_session_data (sm, s, thread_index); - if (snat_is_session_static (s)) - u->nstaticsessions--; - else - u->nsessions--; - s->flags = 0; - s->total_bytes = 0; - s->total_pkts = 0; - s->state = 0; - s->ext_host_addr.as_u32 = 0; - s->ext_host_port = 0; - s->ext_host_nat_addr.as_u32 = 0; - s->ext_host_nat_port = 0; - } + clib_dlist_addtail (tsm->list_pool, + u->sessions_per_user_list_head_index, oldest_index); + nat_free_session_data (sm, s, thread_index); + if (snat_is_session_static (s)) + u->nstaticsessions--; else + u->nsessions--; + s->flags = 0; + s->total_bytes = 0; + s->total_pkts = 0; + s->state = 0; + s->ext_host_addr.as_u32 = 0; + s->ext_host_port = 0; + s->ext_host_nat_addr.as_u32 = 0; + s->ext_host_nat_port = 0; + } + else + { + clib_dlist_addhead (tsm->list_pool, + u->sessions_per_user_list_head_index, oldest_index); + if ((u->nsessions + u->nstaticsessions) >= + sm->max_translations_per_user) { - clib_dlist_addhead (tsm->list_pool, - u->sessions_per_user_list_head_index, - oldest_index); nat_log_warn ("max translations per user %U", format_ip4_address, &u->addr); snat_ipfix_logging_max_entries_per_user (sm->max_translations_per_user, u->addr.as_u32); return 0; } - } - else - { - pool_get (tsm->sessions, s); - memset (s, 0, sizeof (*s)); + else + { + alloc_new: + pool_get (tsm->sessions, s); + memset (s, 0, sizeof (*s)); - /* Create list elts */ - pool_get (tsm->list_pool, per_user_translation_list_elt); - clib_dlist_init (tsm->list_pool, - per_user_translation_list_elt - tsm->list_pool); + /* Create list elts */ + pool_get (tsm->list_pool, per_user_translation_list_elt); + clib_dlist_init (tsm->list_pool, + per_user_translation_list_elt - tsm->list_pool); - 
per_user_translation_list_elt->value = s - tsm->sessions; - s->per_user_index = per_user_translation_list_elt - tsm->list_pool; - s->per_user_list_head_index = u->sessions_per_user_list_head_index; + per_user_translation_list_elt->value = s - tsm->sessions; + s->per_user_index = per_user_translation_list_elt - tsm->list_pool; + s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (tsm->list_pool, - s->per_user_list_head_index, - per_user_translation_list_elt - tsm->list_pool); + clib_dlist_addtail (tsm->list_pool, + s->per_user_list_head_index, + per_user_translation_list_elt - tsm->list_pool); + } } + return s; } @@ -558,6 +561,10 @@ is_snat_address_used_in_static_mapping (snat_main_t * sm, ip4_address_t addr) /* *INDENT-OFF* */ pool_foreach (m, sm->static_mappings, ({ + if (is_addr_only_static_mapping (m) || + is_out2in_only_static_mapping (m) || + is_identity_static_mapping (m)) + continue; if (m->external_addr.as_u32 == addr.as_u32) return 1; })); @@ -631,7 +638,6 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, clib_bihash_kv_8_8_t kv, value; snat_address_t *a = 0; u32 fib_index = ~0; - uword *p; snat_interface_t *interface; int i; snat_main_per_thread_data_t *tsm; @@ -643,6 +649,8 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, u64 user_index; snat_session_t *s; snat_static_map_resolve_t *rp, *rp_match = 0; + nat44_lb_addr_port_t *local; + u8 find = 0; if (!sm->endpoint_dependent) { @@ -732,19 +740,42 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, if (is_add) { if (m) - return VNET_API_ERROR_VALUE_EXIST; + { + if (is_identity_static_mapping (m)) + { + /* *INDENT-OFF* */ + vec_foreach (local, m->locals) + { + if (local->vrf_id == vrf_id) + return VNET_API_ERROR_VALUE_EXIST; + } + /* *INDENT-ON* */ + vec_add2 (m->locals, local, 1); + local->vrf_id = vrf_id; + local->fib_index = + fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, vrf_id, + FIB_SOURCE_PLUGIN_LOW); + m_key.addr = m->local_addr; + m_key.port = m->local_port; + m_key.protocol = m->proto; + m_key.fib_index = local->fib_index; + kv.key = m_key.as_u64; + kv.value = m - sm->static_mappings; + clib_bihash_add_del_8_8 (&sm->static_mapping_by_local, &kv, 1); + return 0; + } + else + return VNET_API_ERROR_VALUE_EXIST; + } if (twice_nat && addr_only) return VNET_API_ERROR_UNSUPPORTED; /* Convert VRF id to FIB index */ if (vrf_id != ~0) - { - p = hash_get (sm->ip4_main->fib_index_by_table_id, vrf_id); - if (!p) - return VNET_API_ERROR_NO_SUCH_FIB; - fib_index = p[0]; - } + fib_index = + fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, vrf_id, + FIB_SOURCE_PLUGIN_LOW); /* If not specified use inside VRF id from SNAT plugin startup config */ else { @@ -752,7 +783,7 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, vrf_id = sm->inside_vrf_id; } - if (!out2in_only) + if (!(out2in_only || identity_nat)) { m_key.addr = l_addr; m_key.port = addr_only ? 
0 : l_port; @@ -825,15 +856,23 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, m->tag = vec_dup (tag); m->local_addr = l_addr; m->external_addr = e_addr; - m->vrf_id = vrf_id; - m->fib_index = fib_index; m->twice_nat = twice_nat; if (out2in_only) m->flags |= NAT_STATIC_MAPPING_FLAG_OUT2IN_ONLY; if (addr_only) m->flags |= NAT_STATIC_MAPPING_FLAG_ADDR_ONLY; if (identity_nat) - m->flags |= NAT_STATIC_MAPPING_FLAG_IDENTITY_NAT; + { + m->flags |= NAT_STATIC_MAPPING_FLAG_IDENTITY_NAT; + vec_add2 (m->locals, local, 1); + local->vrf_id = vrf_id; + local->fib_index = fib_index; + } + else + { + m->vrf_id = vrf_id; + m->fib_index = fib_index; + } if (!addr_only) { m->local_port = l_port; @@ -855,7 +894,7 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, m_key.addr = m->local_addr; m_key.port = m->local_port; m_key.protocol = m->proto; - m_key.fib_index = m->fib_index; + m_key.fib_index = fib_index; kv.key = m_key.as_u64; kv.value = m - sm->static_mappings; if (!out2in_only) @@ -920,6 +959,28 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, return VNET_API_ERROR_NO_SUCH_ENTRY; } + if (identity_nat) + { + if (vrf_id == ~0) + vrf_id = sm->inside_vrf_id; + + for (i = 0; i < vec_len (m->locals); i++) + { + if (m->locals[i].vrf_id == vrf_id) + { + find = 1; + break; + } + } + if (!find) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + fib_index = m->locals[i].fib_index; + vec_del1 (m->locals, i); + } + else + fib_index = m->fib_index; + /* Free external address port */ if (!(addr_only || sm->static_mapping_only || out2in_only)) { @@ -958,23 +1019,17 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, m_key.addr = m->local_addr; m_key.port = m->local_port; m_key.protocol = m->proto; - m_key.fib_index = m->fib_index; + m_key.fib_index = fib_index; kv.key = m_key.as_u64; if (!out2in_only) clib_bihash_add_del_8_8 (&sm->static_mapping_by_local, &kv, 0); - m_key.addr = m->external_addr; - m_key.port = m->external_port; - m_key.fib_index = 0; - kv.key = m_key.as_u64; - clib_bihash_add_del_8_8 (&sm->static_mapping_by_external, &kv, 0); - /* Delete session(s) for static mapping if exist */ if (!(sm->static_mapping_only) || (sm->static_mapping_only && sm->static_mapping_connection_tracking)) { u_key.addr = m->local_addr; - u_key.fib_index = m->fib_index; + u_key.fib_index = fib_index; kv.key = u_key.as_u64; if (!clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) { @@ -1018,6 +1073,16 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, } } + fib_table_unlock (fib_index, FIB_PROTOCOL_IP4, FIB_SOURCE_PLUGIN_LOW); + if (vec_len (m->locals)) + return 0; + + m_key.addr = m->external_addr; + m_key.port = m->external_port; + m_key.fib_index = 0; + kv.key = m_key.as_u64; + clib_bihash_add_del_8_8 (&sm->static_mapping_by_external, &kv, 0); + vec_free (m->tag); vec_free (m->workers); /* Delete static mapping from pool */ @@ -1137,6 +1202,7 @@ nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port, m->external_port = e_port; m->proto = proto; m->twice_nat = twice_nat; + m->flags |= NAT_STATIC_MAPPING_FLAG_LB; if (out2in_only) m->flags |= NAT_STATIC_MAPPING_FLAG_OUT2IN_ONLY; m->affinity = affinity; @@ -1205,6 +1271,9 @@ nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port, if (!m) return VNET_API_ERROR_NO_SUCH_ENTRY; + if (!is_lb_static_mapping (m)) + return VNET_API_ERROR_INVALID_VALUE; + /* Free external address port */ if (!(sm->static_mapping_only || out2in_only)) { @@ -2041,7 +2110,7 @@ 
snat_static_mapping_match (snat_main_t * sm, if (by_external) { - if (vec_len (m->locals)) + if (is_lb_static_mapping (m)) { if (PREDICT_FALSE (lb != 0)) *lb = m->affinity ? AFFINITY_LB_NAT : LB_NAT; @@ -2612,7 +2681,7 @@ nat44_ed_get_worker_out2in_cb (ip4_header_t * ip, u32 rx_fib_index) (&sm->static_mapping_by_external, &kv, &value)) { m = pool_elt_at_index (sm->static_mappings, value.value); - if (!vec_len (m->locals)) + if (!is_lb_static_mapping (m)) return m->workers[0]; hash = ip->src_address.as_u32 + (ip->src_address.as_u32 >> 8) + diff --git a/src/plugins/nat/nat.h b/src/plugins/nat/nat.h index 0549acdba3af..3162e41b696e 100644 --- a/src/plugins/nat/nat.h +++ b/src/plugins/nat/nat.h @@ -165,6 +165,7 @@ typedef enum #define NAT44_SES_O2I_FIN_ACK 8 #define NAT44_SES_I2O_SYN 16 #define NAT44_SES_O2I_SYN 32 +#define NAT44_SES_RST 64 /* Session flags */ #define SNAT_SESSION_FLAG_STATIC_MAPPING 1 @@ -174,6 +175,7 @@ typedef enum #define SNAT_SESSION_FLAG_ENDPOINT_DEPENDENT 16 #define SNAT_SESSION_FLAG_FWD_BYPASS 32 #define SNAT_SESSION_FLAG_AFFINITY 64 +#define SNAT_SESSION_FLAG_OUTPUT_FEATURE 128 /* NAT interface flags */ #define NAT_INTERFACE_FLAG_IS_INSIDE 1 @@ -183,6 +185,7 @@ typedef enum #define NAT_STATIC_MAPPING_FLAG_ADDR_ONLY 1 #define NAT_STATIC_MAPPING_FLAG_OUT2IN_ONLY 2 #define NAT_STATIC_MAPPING_FLAG_IDENTITY_NAT 4 +#define NAT_STATIC_MAPPING_FLAG_LB 8 /* *INDENT-OFF* */ typedef CLIB_PACKED(struct @@ -666,6 +669,18 @@ unformat_function_t unformat_snat_protocol; */ #define is_identity_static_mapping(sm) (sm->flags & NAT_STATIC_MAPPING_FLAG_IDENTITY_NAT) +/** \brief Check if NAT static mapping is load-balancing. + @param sm NAT static mapping + @return 1 if load-balancing +*/ +#define is_lb_static_mapping(sm) (sm->flags & NAT_STATIC_MAPPING_FLAG_LB) + +/** \brief Check if client initiating TCP connection (received SYN from client) + @param t TCP header + @return 1 if client initiating TCP connection +*/ +#define tcp_is_init(t) ((t->flags & TCP_FLAG_SYN) && !(t->flags & TCP_FLAG_ACK)) + /* logging */ #define nat_log_err(...) 
\ vlib_log(VLIB_LOG_LEVEL_ERR, snat_main.log_class, __VA_ARGS__) diff --git a/src/plugins/nat/nat44_hairpinning.c b/src/plugins/nat/nat44_hairpinning.c index c07427d6bcb7..09ea419e637c 100644 --- a/src/plugins/nat/nat44_hairpinning.c +++ b/src/plugins/nat/nat44_hairpinning.c @@ -286,39 +286,6 @@ snat_icmp_hairpinning (snat_main_t * sm, } else { - if (!is_ed) - { - icmp_echo_header_t *echo0 = (icmp_echo_header_t *) (icmp0 + 1); - u16 icmp_id0 = echo0->identifier; - key0.addr = ip0->dst_address; - key0.port = icmp_id0; - key0.protocol = SNAT_PROTOCOL_ICMP; - key0.fib_index = sm->outside_fib_index; - kv0.key = key0.as_u64; - if (sm->num_workers > 1) - ti = - (clib_net_to_host_u16 (icmp_id0) - 1024) / sm->port_per_thread; - else - ti = sm->num_workers; - int rv = - clib_bihash_search_8_8 (&sm->per_thread_data[ti].out2in, &kv0, - &value0); - if (!rv) - { - si = value0.value; - s0 = pool_elt_at_index (sm->per_thread_data[ti].sessions, si); - new_dst_addr0 = s0->in2out.addr.as_u32; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = s0->in2out.fib_index; - echo0->identifier = s0->in2out.port; - sum0 = icmp0->checksum; - sum0 = ip_csum_update (sum0, icmp_id0, s0->in2out.port, - icmp_echo_header_t, identifier); - icmp0->checksum = ip_csum_fold (sum0); - goto change_addr; - } - ti = 0; - } - key0.addr = ip0->dst_address; key0.port = 0; key0.protocol = 0; @@ -327,7 +294,44 @@ snat_icmp_hairpinning (snat_main_t * sm, if (clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv0, &value0)) - return 1; + { + if (!is_ed) + { + icmp_echo_header_t *echo0 = (icmp_echo_header_t *) (icmp0 + 1); + u16 icmp_id0 = echo0->identifier; + key0.addr = ip0->dst_address; + key0.port = icmp_id0; + key0.protocol = SNAT_PROTOCOL_ICMP; + key0.fib_index = sm->outside_fib_index; + kv0.key = key0.as_u64; + if (sm->num_workers > 1) + ti = + (clib_net_to_host_u16 (icmp_id0) - + 1024) / sm->port_per_thread; + else + ti = sm->num_workers; + int rv = + clib_bihash_search_8_8 (&sm->per_thread_data[ti].out2in, &kv0, + &value0); + if (!rv) + { + si = value0.value; + s0 = + pool_elt_at_index (sm->per_thread_data[ti].sessions, si); + new_dst_addr0 = s0->in2out.addr.as_u32; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = + s0->in2out.fib_index; + echo0->identifier = s0->in2out.port; + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, icmp_id0, s0->in2out.port, + icmp_echo_header_t, identifier); + icmp0->checksum = ip_csum_fold (sum0); + goto change_addr; + } + } + + return 1; + } m0 = pool_elt_at_index (sm->static_mappings, value0.value); diff --git a/src/plugins/nat/nat_api.c b/src/plugins/nat/nat_api.c index 8ad5c6652cad..4727826049bf 100644 --- a/src/plugins/nat/nat_api.c +++ b/src/plugins/nat/nat_api.c @@ -1100,7 +1100,7 @@ vl_api_nat44_static_mapping_dump_t_handler (vl_api_nat44_static_mapping_dump_t /* *INDENT-OFF* */ pool_foreach (m, sm->static_mappings, ({ - if (!is_identity_static_mapping(m) && !vec_len (m->locals)) + if (!is_identity_static_mapping(m) && !is_lb_static_mapping (m)) send_nat44_static_mapping_details (m, reg, mp->context); })); /* *INDENT-ON* */ @@ -1181,17 +1181,17 @@ static void *vl_api_nat44_add_del_identity_mapping_t_print if (mp->addr_only == 0) s = - format (s, "protocol %d port %d", mp->protocol, + format (s, " protocol %d port %d", mp->protocol, clib_net_to_host_u16 (mp->port)); if (mp->vrf_id != ~0) - s = format (s, "vrf %d", clib_net_to_host_u32 (mp->vrf_id)); + s = format (s, " vrf %d", clib_net_to_host_u32 (mp->vrf_id)); FINISH; } static void -send_nat44_identity_mapping_details (snat_static_mapping_t * m, 
+send_nat44_identity_mapping_details (snat_static_mapping_t * m, int index, vl_api_registration_t * reg, u32 context) { vl_api_nat44_identity_mapping_details_t *rmp; @@ -1205,7 +1205,7 @@ send_nat44_identity_mapping_details (snat_static_mapping_t * m, clib_memcpy (rmp->ip_address, &(m->local_addr), 4); rmp->port = htons (m->local_port); rmp->sw_if_index = ~0; - rmp->vrf_id = htonl (m->vrf_id); + rmp->vrf_id = htonl (m->locals[index].vrf_id); rmp->protocol = snat_proto_to_ip_proto (m->proto); rmp->context = context; if (m->tag) @@ -1258,8 +1258,11 @@ static void /* *INDENT-OFF* */ pool_foreach (m, sm->static_mappings, ({ - if (is_identity_static_mapping(m) && !vec_len (m->locals)) - send_nat44_identity_mapping_details (m, reg, mp->context); + if (is_identity_static_mapping(m) && !is_lb_static_mapping (m)) + { + for (j = 0; j < vec_len (m->locals); j++) + send_nat44_identity_mapping_details (m, j, reg, mp->context); + } })); /* *INDENT-ON* */ @@ -1689,7 +1692,7 @@ static void /* *INDENT-OFF* */ pool_foreach (m, sm->static_mappings, ({ - if (vec_len(m->locals)) + if (is_lb_static_mapping(m)) send_nat44_lb_static_mapping_details (m, reg, mp->context); })); /* *INDENT-ON* */ diff --git a/src/plugins/nat/nat_format.c b/src/plugins/nat/nat_format.c index a4b62b6e9773..8e5ac4cade44 100644 --- a/src/plugins/nat/nat_format.c +++ b/src/plugins/nat/nat_format.c @@ -220,6 +220,24 @@ format_snat_static_mapping (u8 * s, va_list * args) snat_static_mapping_t *m = va_arg (*args, snat_static_mapping_t *); nat44_lb_addr_port_t *local; + if (is_identity_static_mapping (m)) + { + if (is_addr_only_static_mapping (m)) + s = format (s, "identity mapping %U", + format_ip4_address, &m->local_addr); + else + s = format (s, "identity mapping %U %U:%d", + format_snat_protocol, m->proto, + format_ip4_address, &m->local_addr, m->local_port); + + /* *INDENT-OFF* */ + vec_foreach (local, m->locals) + s = format (s, " vrf %d", local->vrf_id); + /* *INDENT-ON* */ + + return s; + } + if (is_addr_only_static_mapping (m)) s = format (s, "local %U external %U vrf %d %s %s", format_ip4_address, &m->local_addr, @@ -230,7 +248,7 @@ format_snat_static_mapping (u8 * s, va_list * args) is_out2in_only_static_mapping (m) ? 
"out2in-only" : ""); else { - if (vec_len (m->locals)) + if (is_lb_static_mapping (m)) { s = format (s, "%U external %U:%d %s %s", format_snat_protocol, m->proto, diff --git a/src/plugins/nat/nat_inlines.h b/src/plugins/nat/nat_inlines.h index 4bdb2cb66d9f..9000a3ddcd35 100644 --- a/src/plugins/nat/nat_inlines.h +++ b/src/plugins/nat/nat_inlines.h @@ -200,6 +200,10 @@ always_inline int nat44_set_tcp_session_state_i2o (snat_main_t * sm, snat_session_t * ses, tcp_header_t * tcp, u32 thread_index) { + if ((ses->state == 0) && (tcp->flags & TCP_FLAG_RST)) + ses->state = NAT44_SES_RST; + if ((ses->state == NAT44_SES_RST) && !(tcp->flags & TCP_FLAG_RST)) + ses->state = 0; if ((tcp->flags & TCP_FLAG_ACK) && (ses->state & NAT44_SES_I2O_SYN) && (ses->state & NAT44_SES_O2I_SYN)) ses->state = 0; @@ -215,7 +219,8 @@ nat44_set_tcp_session_state_i2o (snat_main_t * sm, snat_session_t * ses, if (clib_net_to_host_u32 (tcp->ack_number) > ses->o2i_fin_seq) ses->state |= NAT44_SES_O2I_FIN_ACK; } - if (nat44_is_ses_closed (ses)) + if (nat44_is_ses_closed (ses) + && !(ses->flags & SNAT_SESSION_FLAG_OUTPUT_FEATURE)) { nat_log_debug ("TCP close connection %U", format_snat_session, &sm->per_thread_data[thread_index], ses); @@ -230,6 +235,10 @@ always_inline int nat44_set_tcp_session_state_o2i (snat_main_t * sm, snat_session_t * ses, tcp_header_t * tcp, u32 thread_index) { + if ((ses->state == 0) && (tcp->flags & TCP_FLAG_RST)) + ses->state = NAT44_SES_RST; + if ((ses->state == NAT44_SES_RST) && !(tcp->flags & TCP_FLAG_RST)) + ses->state = 0; if ((tcp->flags & TCP_FLAG_ACK) && (ses->state & NAT44_SES_I2O_SYN) && (ses->state & NAT44_SES_O2I_SYN)) ses->state = 0; diff --git a/src/plugins/nat/out2in.c b/src/plugins/nat/out2in.c index eeecf1652647..8c013d9b0749 100755 --- a/src/plugins/nat/out2in.c +++ b/src/plugins/nat/out2in.c @@ -775,17 +775,17 @@ snat_out2in_node_fn (vlib_main_t * vm, goto trace0; } - if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + if (PREDICT_FALSE (ip4_is_fragment (ip0))) { - next0 = icmp_out2in_slow_path - (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, thread_index, &s0); + next0 = SNAT_OUT2IN_NEXT_REASS; goto trace0; } - if (PREDICT_FALSE (ip4_is_fragment (ip0))) + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) { - next0 = SNAT_OUT2IN_NEXT_REASS; + next0 = icmp_out2in_slow_path + (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, now, thread_index, &s0); goto trace0; } @@ -936,17 +936,17 @@ snat_out2in_node_fn (vlib_main_t * vm, goto trace1; } - if (PREDICT_FALSE (proto1 == SNAT_PROTOCOL_ICMP)) + if (PREDICT_FALSE (ip4_is_fragment (ip1))) { - next1 = icmp_out2in_slow_path - (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, thread_index, &s1); + next1 = SNAT_OUT2IN_NEXT_REASS; goto trace1; } - if (PREDICT_FALSE (ip4_is_fragment (ip1))) + if (PREDICT_FALSE (proto1 == SNAT_PROTOCOL_ICMP)) { - next1 = SNAT_OUT2IN_NEXT_REASS; + next1 = icmp_out2in_slow_path + (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, + next1, now, thread_index, &s1); goto trace1; } @@ -1134,17 +1134,17 @@ snat_out2in_node_fn (vlib_main_t * vm, goto trace00; } - if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + if (PREDICT_FALSE (ip4_is_fragment (ip0))) { - next0 = icmp_out2in_slow_path - (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, thread_index, &s0); + next0 = SNAT_OUT2IN_NEXT_REASS; goto trace00; } - if (PREDICT_FALSE (ip4_is_fragment (ip0))) + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) { - next0 = 
SNAT_OUT2IN_NEXT_REASS; + next0 = icmp_out2in_slow_path + (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, now, thread_index, &s0); goto trace00; } @@ -1336,6 +1336,7 @@ nat44_out2in_reass_node_fn (vlib_main_t * vm, nat_reass_ip4_t *reass0; udp_header_t *udp0; tcp_header_t *tcp0; + icmp46_header_t *icmp0; snat_session_key_t key0, sm0; clib_bihash_kv_8_8_t kv0, value0; snat_session_t *s0 = 0; @@ -1369,6 +1370,7 @@ nat44_out2in_reass_node_fn (vlib_main_t * vm, ip0 = (ip4_header_t *) vlib_buffer_get_current (b0); udp0 = ip4_next_header (ip0); tcp0 = (tcp_header_t *) udp0; + icmp0 = (icmp46_header_t *) udp0; proto0 = ip_proto_to_snat_proto (ip0->protocol); reass0 = nat_ip4_reass_find_or_create (ip0->src_address, @@ -1387,6 +1389,26 @@ nat44_out2in_reass_node_fn (vlib_main_t * vm, if (PREDICT_FALSE (ip4_is_first_fragment (ip0))) { + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = icmp_out2in_slow_path + (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, now, thread_index, &s0); + + if (PREDICT_TRUE (next0 != SNAT_OUT2IN_NEXT_DROP)) + { + if (s0) + reass0->sess_index = s0 - per_thread_data->sessions; + else + reass0->flags |= NAT_REASS_FLAG_ED_DONT_TRANSLATE; + reass0->thread_index = thread_index; + nat_ip4_reass_get_frags (reass0, + &fragments_to_loopback); + } + + goto trace0; + } + key0.addr = ip0->dst_address; key0.port = udp0->dst_port; key0.protocol = proto0; @@ -1421,6 +1443,12 @@ nat44_out2in_reass_node_fn (vlib_main_t * vm, node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; next0 = SNAT_OUT2IN_NEXT_DROP; } + else + { + reass0->flags |= NAT_REASS_FLAG_ED_DONT_TRANSLATE; + nat_ip4_reass_get_frags (reass0, + &fragments_to_loopback); + } goto trace0; } @@ -1452,6 +1480,8 @@ nat44_out2in_reass_node_fn (vlib_main_t * vm, } else { + if (reass0->flags & NAT_REASS_FLAG_ED_DONT_TRANSLATE) + goto trace0; if (PREDICT_FALSE (reass0->sess_index == (u32) ~ 0)) { if (nat_ip4_reass_add_fragment diff --git a/src/plugins/nat/out2in_ed.c b/src/plugins/nat/out2in_ed.c index b2dbc513df6c..b4ae6502e0d2 100644 --- a/src/plugins/nat/out2in_ed.c +++ b/src/plugins/nat/out2in_ed.c @@ -39,7 +39,8 @@ _(NO_TRANSLATION, "No translation") \ _(MAX_SESSIONS_EXCEEDED, "Maximum sessions exceeded") \ _(DROP_FRAGMENT, "Drop fragment") \ _(MAX_REASS, "Maximum reassemblies exceeded") \ -_(MAX_FRAG, "Maximum fragments per reassembly exceeded") +_(MAX_FRAG, "Maximum fragments per reassembly exceeded")\ +_(NON_SYN, "non-SYN packet try to create session") typedef enum { @@ -875,6 +876,13 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, if (PREDICT_FALSE (identity_nat0)) goto trace00; + if ((proto0 == SNAT_PROTOCOL_TCP) && !tcp_is_init (tcp0)) + { + b0->error = node->errors[NAT_OUT2IN_ED_ERROR_NON_SYN]; + next0 = NAT44_ED_OUT2IN_NEXT_DROP; + goto trace00; + } + /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping_ed (sm, b0, l_key0, e_key0, node, @@ -1097,6 +1105,13 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, if (PREDICT_FALSE (identity_nat1)) goto trace01; + if ((proto1 == SNAT_PROTOCOL_TCP) && !tcp_is_init (tcp1)) + { + b1->error = node->errors[NAT_OUT2IN_ED_ERROR_NON_SYN]; + next1 = NAT44_ED_OUT2IN_NEXT_DROP; + goto trace01; + } + /* Create session initiated by host from external network */ s1 = create_session_for_static_mapping_ed (sm, b1, l_key1, e_key1, node, @@ -1353,6 +1368,13 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, if (PREDICT_FALSE (identity_nat0)) goto trace0; + if ((proto0 == SNAT_PROTOCOL_TCP) && 
!tcp_is_init (tcp0)) + { + b0->error = node->errors[NAT_OUT2IN_ED_ERROR_NON_SYN]; + next0 = NAT44_ED_OUT2IN_NEXT_DROP; + goto trace0; + } + /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping_ed (sm, b0, l_key0, e_key0, node, @@ -1702,6 +1724,13 @@ nat44_ed_out2in_reass_node_fn (vlib_main_t * vm, goto trace0; } + if ((proto0 == SNAT_PROTOCOL_TCP) && !tcp_is_init (tcp0)) + { + b0->error = node->errors[NAT_OUT2IN_ED_ERROR_NON_SYN]; + next0 = NAT44_ED_OUT2IN_NEXT_DROP; + goto trace0; + } + /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping_ed (sm, b0, l_key0, e_key0, node, diff --git a/src/plugins/unittest/tcp_test.c b/src/plugins/unittest/tcp_test.c index d26532d4596d..d06578771c0e 100644 --- a/src/plugins/unittest/tcp_test.c +++ b/src/plugins/unittest/tcp_test.c @@ -89,6 +89,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->snd_una_max = 1000; tc->snd_nxt = 1000; tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; + tc->snd_mss = 150; scoreboard_init (&tc->sack_sb); for (i = 0; i < 1000 / 100; i++) @@ -110,8 +111,8 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tcp_rcv_sacks (tc, 0); if (verbose) - vlib_cli_output (vm, "sb after even blocks:\n%U", format_tcp_scoreboard, - sb); + vlib_cli_output (vm, "sb after even blocks (mss %u):\n%U", + tc->snd_mss, format_tcp_scoreboard, sb, tc); TCP_TEST ((pool_elts (sb->holes) == 5), "scoreboard has %d elements", pool_elts (sb->holes)); @@ -127,7 +128,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); TCP_TEST ((sb->last_sacked_bytes == 400), "last sacked bytes %d", sb->last_sacked_bytes); - TCP_TEST ((sb->high_sacked == 900), "max byte sacked %u", sb->high_sacked); + TCP_TEST ((sb->high_sacked == 900), "high sacked %u", sb->high_sacked); + TCP_TEST ((sb->lost_bytes == 300), "lost bytes %u", sb->lost_bytes); + /* * Inject odd blocks */ @@ -141,8 +144,8 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tcp_rcv_sacks (tc, 0); if (verbose) - vlib_cli_output (vm, "sb after odd blocks:\n%U", format_tcp_scoreboard, - sb); + vlib_cli_output (vm, "\nsb after odd blocks:\n%U", format_tcp_scoreboard, + sb, tc); hole = scoreboard_first_hole (sb); TCP_TEST ((pool_elts (sb->holes) == 1), @@ -151,17 +154,18 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "first hole start %u end %u", hole->start, hole->end); TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); - TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); + TCP_TEST ((sb->high_sacked == 1000), "high sacked %u", sb->high_sacked); TCP_TEST ((sb->last_sacked_bytes == 500), "last sacked bytes %d", sb->last_sacked_bytes); + TCP_TEST ((sb->lost_bytes == 100), "lost bytes %u", sb->lost_bytes); /* * Ack until byte 100, all bytes are now acked + sacked */ tcp_rcv_sacks (tc, 100); if (verbose) - vlib_cli_output (vm, "ack until byte 100:\n%U", format_tcp_scoreboard, - sb); + vlib_cli_output (vm, "\nack until byte 100:\n%U", format_tcp_scoreboard, + sb, tc); TCP_TEST ((pool_elts (sb->holes) == 0), "scoreboard has %d elements", pool_elts (sb->holes)); @@ -171,6 +175,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", 
sb->last_sacked_bytes); + TCP_TEST ((sb->lost_bytes == 0), "lost bytes %u", sb->lost_bytes); /* * Add new block @@ -182,16 +187,14 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) block.end = 1300; vec_add1 (tc->rcv_opts.sacks, block); - if (verbose) - vlib_cli_output (vm, "add [1200, 1300]:\n%U", format_tcp_scoreboard, sb); tc->snd_una_max = 1500; tc->snd_una = 1000; tc->snd_nxt = 1500; tcp_rcv_sacks (tc, 1000); if (verbose) - vlib_cli_output (vm, "sb snd_una_max 1500, snd_una 1000:\n%U", - format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nadd [1200, 1300] snd_una_max 1500, snd_una 1000:" + " \n%U", format_tcp_scoreboard, sb, tc); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv after ack %u", sb->snd_una_adv); @@ -207,6 +210,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((hole->start == 1300 && hole->end == 1500), "last hole start %u end %u", hole->start, hole->end); TCP_TEST ((sb->sacked_bytes == 100), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->lost_bytes == 0), "lost bytes %u", sb->lost_bytes); /* * Ack first hole @@ -216,19 +220,19 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tcp_rcv_sacks (tc, 1200); if (verbose) - vlib_cli_output (vm, "sb ack up to byte 1200:\n%U", format_tcp_scoreboard, - sb); + vlib_cli_output (vm, "\nsb ack up to byte 1200:\n%U", + format_tcp_scoreboard, sb, tc); TCP_TEST ((sb->snd_una_adv == 100), "snd_una_adv after ack %u", sb->snd_una_adv); TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); - TCP_TEST ((pool_elts (sb->holes) == 1), + TCP_TEST ((pool_elts (sb->holes) == 0), "scoreboard has %d elements", pool_elts (sb->holes)); - hole = scoreboard_first_hole (sb); - TCP_TEST ((hole->prev == TCP_INVALID_SACK_HOLE_INDEX - && hole->next == TCP_INVALID_SACK_HOLE_INDEX), "hole is valid"); TCP_TEST ((sb->last_bytes_delivered == 100), "last bytes delivered %d", sb->last_bytes_delivered); + TCP_TEST ((sb->lost_bytes == 0), "lost bytes %u", sb->lost_bytes); + TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head); + TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail); /* * Add some more blocks and then remove all @@ -246,7 +250,8 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) scoreboard_clear (sb); if (verbose) - vlib_cli_output (vm, "sb cleared all:\n%U", format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nsb cleared all:\n%U", format_tcp_scoreboard, sb, + tc); TCP_TEST ((pool_elts (sb->holes) == 0), "number of holes %d", pool_elts (sb->holes)); @@ -267,14 +272,17 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) - vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U", - format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nsb added odd blocks snd_una 0 snd_una_max 1500:" + "\n%U", format_tcp_scoreboard, sb, tc); + TCP_TEST ((pool_elts (sb->holes) == 5), + "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->lost_bytes == 300), "lost bytes %u", sb->lost_bytes); tcp_rcv_sacks (tc, 950); if (verbose) - vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U", - format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nack [0, 950]:\n%U", format_tcp_scoreboard, sb, + tc); TCP_TEST ((pool_elts (sb->holes) == 0), "scoreboard has %d elements", pool_elts (sb->holes)); @@ -282,6 +290,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->sacked_bytes 
== 0), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", sb->last_sacked_bytes); + TCP_TEST ((sb->lost_bytes == 0), "lost bytes %u", sb->lost_bytes); /* * Inject one block, ack it and overlap hole @@ -299,22 +308,26 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tcp_rcv_sacks (tc, 0); if (verbose) - vlib_cli_output (vm, "sb added [100, 500]:\n%U", - format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nsb added [100, 500] snd_una 0 snd_una_max 1000:" + "\n%U", format_tcp_scoreboard, sb, tc); tcp_rcv_sacks (tc, 800); if (verbose) - vlib_cli_output (vm, "sb ack [0, 800]:\n%U", format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nsb ack [0, 800]:\n%U", format_tcp_scoreboard, sb, + tc); - TCP_TEST ((pool_elts (sb->holes) == 1), + TCP_TEST ((pool_elts (sb->holes) == 0), "scoreboard has %d elements", pool_elts (sb->holes)); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); - TCP_TEST ((sb->last_sacked_bytes == 0), - "last sacked bytes %d", sb->last_sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", + sb->last_sacked_bytes); TCP_TEST ((sb->last_bytes_delivered == 400), "last bytes delivered %d", sb->last_bytes_delivered); + TCP_TEST ((sb->lost_bytes == 0), "lost bytes %u", sb->lost_bytes); + TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head); + TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail); /* * One hole close to head, patch head, split in two and start acking @@ -332,8 +345,12 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tcp_rcv_sacks (tc, 0); if (verbose) - vlib_cli_output (vm, "sb added [500, 1000]:\n%U", - format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nsb added [500, 1000]:\n%U", + format_tcp_scoreboard, sb, tc); + TCP_TEST ((sb->sacked_bytes == 500), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 500), "last sacked bytes %d", + sb->last_sacked_bytes); + TCP_TEST ((sb->lost_bytes == 500), "lost bytes %u", sb->lost_bytes); vec_reset_length (tc->rcv_opts.sacks); block.start = 300; @@ -342,17 +359,82 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 100); if (verbose) - vlib_cli_output (vm, "sb added [0, 100] [300, 400]:\n%U", - format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nsb added [0, 100] [300, 400]:\n%U", + format_tcp_scoreboard, sb, tc); TCP_TEST ((pool_elts (sb->holes) == 2), "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->sacked_bytes == 600), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 100), "last sacked bytes %d", + sb->last_sacked_bytes); + TCP_TEST ((sb->last_bytes_delivered == 0), "last bytes delivered %d", + sb->last_bytes_delivered); + TCP_TEST ((sb->lost_bytes == 300), "lost bytes %u", sb->lost_bytes); tc->snd_una = 100; tcp_rcv_sacks (tc, 200); + tc->snd_una = 200; tcp_rcv_sacks (tc, 300); if (verbose) - vlib_cli_output (vm, "sb added [0, 300]:\n%U", format_tcp_scoreboard, sb); + vlib_cli_output (vm, "\nacked [0, 300] in two steps:\n%U", + format_tcp_scoreboard, sb, tc); TCP_TEST ((sb->sacked_bytes == 500), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->lost_bytes == 100), "lost bytes %u", sb->lost_bytes); + TCP_TEST ((sb->last_bytes_delivered == 100), "last bytes delivered %d", + sb->last_bytes_delivered); + + tc->snd_una 
= 400; + tcp_rcv_sacks (tc, 500); + if (verbose) + vlib_cli_output (vm, "\nacked [400, 500]:\n%U", format_tcp_scoreboard, sb, + tc); + TCP_TEST ((pool_elts (sb->holes) == 0), + "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", + sb->last_sacked_bytes); + TCP_TEST ((sb->last_bytes_delivered == 500), "last bytes delivered %d", + sb->last_bytes_delivered); + TCP_TEST ((sb->lost_bytes == 0), "lost bytes %u", sb->lost_bytes); + TCP_TEST ((sb->snd_una_adv == 500), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head); + TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail); + + /* + * Re-ack high sacked, to make sure last_bytes_delivered and + * snd_una_adv are 0-ed + */ + tcp_rcv_sacks (tc, 1000); + if (verbose) + vlib_cli_output (vm, "\nAck high sacked:\n%U", format_tcp_scoreboard, sb, + tc); + TCP_TEST ((sb->last_bytes_delivered == 0), "last bytes delivered %d", + sb->last_bytes_delivered); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + + /* + * Add [1200, 1500] and test that [1000, 1200] is lost (bytes condition) + * snd_una = 1000 and snd_una_max = 1600 + */ + tc->snd_una = 1000; + tc->snd_una_max = 1600; + vec_reset_length (tc->rcv_opts.sacks); + block.start = 1200; + block.end = 1500; + vec_add1 (tc->rcv_opts.sacks, block); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); + tcp_rcv_sacks (tc, 1000); + if (verbose) + vlib_cli_output (vm, "\nacked [1200, 1500] test first hole is lost:\n%U", + format_tcp_scoreboard, sb, tc); + TCP_TEST ((pool_elts (sb->holes) == 2), "scoreboard has %d elements", + pool_elts (sb->holes)); + TCP_TEST ((sb->sacked_bytes == 300), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 300), "last sacked bytes %d", + sb->last_sacked_bytes); + TCP_TEST ((sb->last_bytes_delivered == 0), "last bytes delivered %d", + sb->last_bytes_delivered); + TCP_TEST ((sb->lost_bytes == 200), "lost bytes %u", sb->lost_bytes); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); return 0; } @@ -1559,13 +1641,14 @@ tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) tcp_connection_t *tc; stream_session_t *s, *s1; u8 cmp = 0, is_filtered = 0; + u32 sidx; /* * Allocate fake session and connection 1 */ pool_get (smm->sessions[0], s); memset (s, 0, sizeof (*s)); - s->session_index = s - smm->sessions[0]; + s->session_index = sidx = s - smm->sessions[0]; pool_get (tm->connections[0], tc); memset (tc, 0, sizeof (*tc)); @@ -1580,7 +1663,6 @@ tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) tc->connection.proto = TRANSPORT_PROTO_TCP; tc->connection.is_ip4 = 1; clib_memcpy (tc1, &tc->connection, sizeof (*tc1)); - s1 = s; /* * Allocate fake session and connection 2 @@ -1607,6 +1689,7 @@ tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) * Confirm that connection lookup works */ + s1 = pool_elt_at_index (smm->sessions[0], sidx); session_lookup_add_connection (tc1, session_handle (s1)); tconn = session_lookup_connection_wt4 (0, &tc1->lcl_ip.ip4, &tc1->rmt_ip.ip4, diff --git a/src/plugins/vmxnet3/README.md b/src/plugins/vmxnet3/README.md index 4f03c1575f9a..031c5962ee9b 100644 --- a/src/plugins/vmxnet3/README.md +++ b/src/plugins/vmxnet3/README.md @@ -16,7 +16,8 @@ vfio driver can still be used with recent kernels which support no-iommu mode. 
##Known issues * NUMA support -* TSO +* TSO/LRO +* RSS/multiple queues * VLAN filter ## Usage diff --git a/src/plugins/vmxnet3/cli.c b/src/plugins/vmxnet3/cli.c index 40d379bb21a7..096791b10037 100644 --- a/src/plugins/vmxnet3/cli.c +++ b/src/plugins/vmxnet3/cli.c @@ -184,7 +184,8 @@ VLIB_CLI_COMMAND (vmxnet3_test_command, static) = { /* *INDENT-ON* */ static void -show_vmxnet3 (vlib_main_t * vm, u32 * hw_if_indices, u8 show_descr) +show_vmxnet3 (vlib_main_t * vm, u32 * hw_if_indices, u8 show_descr, + u8 show_one_table, u32 which, u8 show_one_slot, u32 slot) { u32 i, desc_idx; vmxnet3_device_t *vd; @@ -228,6 +229,8 @@ show_vmxnet3 (vlib_main_t * vm, u32 * hw_if_indices, u8 show_descr) rxq->rx_comp_ring.next); vlib_cli_output (vm, " RX completion generation flag 0x%x", rxq->rx_comp_ring.gen); + + /* RX descriptors tables */ for (rid = 0; rid < VMXNET3_RX_RING_SIZE; rid++) { vmxnet3_rx_ring *ring = &rxq->rx_ring[rid]; @@ -248,16 +251,70 @@ show_vmxnet3 (vlib_main_t * vm, u32 * hw_if_indices, u8 show_descr) vlib_cli_output (vm, " %5u 0x%016llx 0x%08x", desc_idx, rxd->address, rxd->flags); } + } + else if (show_one_table) + { + if (((which == VMXNET3_SHOW_RX_DESC0) && (rid == 0)) || + ((which == VMXNET3_SHOW_RX_DESC1) && (rid == 1))) + { + vlib_cli_output (vm, "RX descriptors table"); + vlib_cli_output (vm, " %5s %18s %10s", + "slot", "address", "flags"); + if (show_one_slot) + { + rxd = &rxq->rx_desc[rid][slot]; + vlib_cli_output (vm, " %5u 0x%016llx 0x%08x", + slot, rxd->address, rxd->flags); + } + else + for (desc_idx = 0; desc_idx < rxq->size; desc_idx++) + { + rxd = &rxq->rx_desc[rid][desc_idx]; + vlib_cli_output (vm, " %5u 0x%016llx 0x%08x", + desc_idx, rxd->address, + rxd->flags); + } + } + } + } + + /* RX completion table */ + if (show_descr) + { + vlib_cli_output (vm, "RX completion descriptors table"); + vlib_cli_output (vm, " %5s %10s %10s %10s %10s", + "slot", "index", "rss", "len", "flags"); + for (desc_idx = 0; desc_idx < rxq->size; desc_idx++) + { + rx_comp = &rxq->rx_comp[desc_idx]; + vlib_cli_output (vm, " %5u 0x%08x %10u %10u 0x%08x", + desc_idx, rx_comp->index, rx_comp->rss, + rx_comp->len, rx_comp->flags); + } + } + else if (show_one_table) + { + if (which == VMXNET3_SHOW_RX_COMP) + { vlib_cli_output (vm, "RX completion descriptors table"); vlib_cli_output (vm, " %5s %10s %10s %10s %10s", "slot", "index", "rss", "len", "flags"); - for (desc_idx = 0; desc_idx < rxq->size; desc_idx++) + if (show_one_slot) { - rx_comp = &rxq->rx_comp[desc_idx]; + rx_comp = &rxq->rx_comp[slot]; vlib_cli_output (vm, " %5u 0x%08x %10u %10u 0x%08x", - desc_idx, rx_comp->index, rx_comp->rss, + slot, rx_comp->index, rx_comp->rss, rx_comp->len, rx_comp->flags); } + else + for (desc_idx = 0; desc_idx < rxq->size; desc_idx++) + { + rx_comp = &rxq->rx_comp[desc_idx]; + vlib_cli_output (vm, + " %5u 0x%08x %10u %10u 0x%08x", + desc_idx, rx_comp->index, rx_comp->rss, + rx_comp->len, rx_comp->flags); + } } } } @@ -285,6 +342,7 @@ show_vmxnet3 (vlib_main_t * vm, u32 * hw_if_indices, u8 show_descr) desc_idx, txd->address, txd->flags[0], txd->flags[1]); } + vlib_cli_output (vm, "TX completion descriptors table"); vlib_cli_output (vm, " %5s %10s %10s", "slot", "index", "flags"); @@ -295,6 +353,50 @@ show_vmxnet3 (vlib_main_t * vm, u32 * hw_if_indices, u8 show_descr) desc_idx, tx_comp->index, tx_comp->flags); } } + else if (show_one_table) + { + if (which == VMXNET3_SHOW_TX_DESC) + { + vlib_cli_output (vm, "TX descriptors table"); + vlib_cli_output (vm, " %5s %18s %10s %10s", + "slot", "address", "flags0", 
"flags1"); + if (show_one_slot) + { + txd = &txq->tx_desc[slot]; + vlib_cli_output (vm, " %5u 0x%016llx 0x%08x 0x%08x", + slot, txd->address, txd->flags[0], + txd->flags[1]); + } + else + for (desc_idx = 0; desc_idx < txq->size; desc_idx++) + { + txd = &txq->tx_desc[desc_idx]; + vlib_cli_output (vm, " %5u 0x%016llx 0x%08x 0x%08x", + desc_idx, txd->address, txd->flags[0], + txd->flags[1]); + } + } + else if (which == VMXNET3_SHOW_TX_COMP) + { + vlib_cli_output (vm, "TX completion descriptors table"); + vlib_cli_output (vm, " %5s %10s %10s", + "slot", "index", "flags"); + if (show_one_slot) + { + tx_comp = &txq->tx_comp[slot]; + vlib_cli_output (vm, " %5u 0x%08x 0x%08x", + slot, tx_comp->index, tx_comp->flags); + } + else + for (desc_idx = 0; desc_idx < txq->size; desc_idx++) + { + tx_comp = &txq->tx_comp[desc_idx]; + vlib_cli_output (vm, " %5u 0x%08x 0x%08x", + desc_idx, tx_comp->index, + tx_comp->flags); + } + } + } } } } @@ -308,8 +410,9 @@ show_vmxnet3_fn (vlib_main_t * vm, unformat_input_t * input, vmxnet3_device_t *vd; clib_error_t *error = 0; u32 hw_if_index, *hw_if_indices = 0; - vnet_hw_interface_t *hi; - u8 show_descr = 0; + vnet_hw_interface_t *hi = 0; + u8 show_descr = 0, show_one_table = 0, show_one_slot = 0; + u32 which = ~0, slot; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -325,8 +428,110 @@ show_vmxnet3_fn (vlib_main_t * vm, unformat_input_t * input, } vec_add1 (hw_if_indices, hw_if_index); } - else if (unformat (input, "descriptors") || unformat (input, "desc")) + else if (unformat (input, "desc")) show_descr = 1; + else if (hi) + { + vmxnet3_device_t *vd = + vec_elt_at_index (vmxm->devices, hi->dev_instance); + + if (unformat (input, "rx-comp")) + { + show_one_table = 1; + which = VMXNET3_SHOW_RX_COMP; + if (unformat (input, "%u", &slot)) + { + vmxnet3_rxq_t *rxq = vec_elt_at_index (vd->rxqs, 0); + + if (slot >= rxq->size) + { + error = clib_error_return (0, + "slot size must be < rx queue " + "size %u", rxq->size); + goto done; + } + show_one_slot = 1; + } + } + else if (unformat (input, "rx-desc-0")) + { + show_one_table = 1; + which = VMXNET3_SHOW_RX_DESC0; + if (unformat (input, "%u", &slot)) + { + vmxnet3_rxq_t *rxq = vec_elt_at_index (vd->rxqs, 0); + + if (slot >= rxq->size) + { + error = clib_error_return (0, + "slot size must be < rx queue " + "size %u", rxq->size); + goto done; + } + show_one_slot = 1; + } + } + else if (unformat (input, "rx-desc-1")) + { + show_one_table = 1; + which = VMXNET3_SHOW_RX_DESC1; + if (unformat (input, "%u", &slot)) + { + vmxnet3_rxq_t *rxq = vec_elt_at_index (vd->rxqs, 0); + + if (slot >= rxq->size) + { + error = clib_error_return (0, + "slot size must be < rx queue " + "size %u", rxq->size); + goto done; + } + show_one_slot = 1; + } + } + else if (unformat (input, "tx-comp")) + { + show_one_table = 1; + which = VMXNET3_SHOW_TX_COMP; + if (unformat (input, "%u", &slot)) + { + vmxnet3_txq_t *txq = vec_elt_at_index (vd->txqs, 0); + + if (slot >= txq->size) + { + error = clib_error_return (0, + "slot size must be < tx queue " + "size %u", txq->size); + goto done; + } + show_one_slot = 1; + } + } + else if (unformat (input, "tx-desc")) + { + show_one_table = 1; + which = VMXNET3_SHOW_TX_DESC; + if (unformat (input, "%u", &slot)) + { + vmxnet3_txq_t *txq = vec_elt_at_index (vd->txqs, 0); + + if (slot >= txq->size) + { + error = clib_error_return (0, + "slot size must be < tx queue " + "size %u", txq->size); + goto done; + } + show_one_slot = 1; + } + } + else + { + error = clib_error_return (0, "unknown input `%U'", 
+ format_unformat_error, input); + goto done; + } + } else { error = clib_error_return (0, "unknown input `%U'", @@ -342,7 +547,8 @@ show_vmxnet3_fn (vlib_main_t * vm, unformat_input_t * input, ); } - show_vmxnet3 (vm, hw_if_indices, show_descr); + show_vmxnet3 (vm, hw_if_indices, show_descr, show_one_table, which, + show_one_slot, slot); done: vec_free (hw_if_indices); @@ -352,7 +558,8 @@ show_vmxnet3_fn (vlib_main_t * vm, unformat_input_t * input, /* *INDENT-OFF* */ VLIB_CLI_COMMAND (show_vmxnet3_command, static) = { .path = "show vmxnet3", - .short_help = "show vmxnet3 []", + .short_help = "show vmxnet3 [[] ([desc] | ([rx-comp] | " + "[rx-desc-0] | [rx-desc-1] | [tx-comp] | [tx-desc]) [])]", .function = show_vmxnet3_fn, }; /* *INDENT-ON* */ @@ -360,9 +567,12 @@ VLIB_CLI_COMMAND (show_vmxnet3_command, static) = { clib_error_t * vmxnet3_cli_init (vlib_main_t * vm) { + vmxnet3_main_t *vmxm = &vmxnet3_main; + /* initialize binary API */ vmxnet3_plugin_api_hookup (vm); + vmxm->log_default = vlib_log_register_class ("vmxnet3", 0); return 0; } diff --git a/src/plugins/vmxnet3/input.c b/src/plugins/vmxnet3/input.c index 4ff459a066ac..9392d57747d8 100644 --- a/src/plugins/vmxnet3/input.c +++ b/src/plugins/vmxnet3/input.c @@ -27,6 +27,7 @@ _(BUFFER_ALLOC, "buffer alloc error") \ _(RX_PACKET_NO_SOP, "Rx packet error - no SOP") \ _(RX_PACKET, "Rx packet error") \ + _(RX_PACKET_EOP, "Rx packet error found on EOP") \ _(NO_BUFFER, "Rx no buffer error") typedef enum @@ -79,7 +80,6 @@ vmxnet3_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, uword n_trace = vlib_get_trace_count (vm, node); u32 n_rx_packets = 0, n_rx_bytes = 0; vmxnet3_rx_comp *rx_comp; - u32 comp_idx; u32 desc_idx; vmxnet3_rxq_t *rxq; u32 thread_index = vm->thread_index; @@ -98,16 +98,14 @@ vmxnet3_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, comp_ring = &rxq->rx_comp_ring; bi = buffer_indices; next = nexts; + rx_comp = &rxq->rx_comp[comp_ring->next]; + while (PREDICT_TRUE (n_rx_packets < VLIB_FRAME_SIZE) && - (comp_ring->gen == - (rxq->rx_comp[comp_ring->next].flags & VMXNET3_RXCF_GEN))) + (comp_ring->gen == (rx_comp->flags & VMXNET3_RXCF_GEN))) { vlib_buffer_t *b0; u32 bi0; - comp_idx = comp_ring->next; - rx_comp = &rxq->rx_comp[comp_idx]; - rid = vmxnet3_find_rid (vd, rx_comp); ring = &rxq->rx_ring[rid]; @@ -117,10 +115,15 @@ vmxnet3_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { vlib_error_count (vm, node->node_index, VMXNET3_INPUT_ERROR_NO_BUFFER, 1); + if (hb) + { + vlib_buffer_free_one (vm, vlib_get_buffer_index (vm, hb)); + hb = 0; + } + prev_b0 = 0; break; } - vmxnet3_rx_comp_ring_advance_next (rxq); desc_idx = rx_comp->index & VMXNET3_RXC_INDEX; ring->consume = desc_idx; rxd = &rxq->rx_desc[rid][desc_idx]; @@ -146,14 +149,14 @@ vmxnet3_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { vlib_buffer_free_one (vm, bi0); vlib_error_count (vm, node->node_index, - VMXNET3_INPUT_ERROR_RX_PACKET, 1); + VMXNET3_INPUT_ERROR_RX_PACKET_EOP, 1); if (hb && vlib_get_buffer_index (vm, hb) != bi0) { vlib_buffer_free_one (vm, vlib_get_buffer_index (vm, hb)); hb = 0; } prev_b0 = 0; - continue; + goto next; } if (rx_comp->index & VMXNET3_RXCI_SOP) @@ -199,7 +202,7 @@ vmxnet3_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_free_one (vm, vlib_get_buffer_index (vm, hb)); hb = 0; } - continue; + goto next; } } else if (prev_b0) // !sop && !eop @@ -213,7 +216,15 @@ vmxnet3_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } 
else { - ASSERT (0); + vlib_error_count (vm, node->node_index, + VMXNET3_INPUT_ERROR_RX_PACKET, 1); + vlib_buffer_free_one (vm, bi0); + if (hb && vlib_get_buffer_index (vm, hb) != bi0) + { + vlib_buffer_free_one (vm, vlib_get_buffer_index (vm, hb)); + hb = 0; + } + goto next; } n_rx_bytes += b0->current_length; @@ -275,6 +286,10 @@ vmxnet3_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, hb = 0; got_packet = 0; } + + next: + vmxnet3_rx_comp_ring_advance_next (rxq); + rx_comp = &rxq->rx_comp[comp_ring->next]; } if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node)))) diff --git a/src/plugins/vmxnet3/output.c b/src/plugins/vmxnet3/output.c index bcb02949184e..1dc394a62ef0 100644 --- a/src/plugins/vmxnet3/output.c +++ b/src/plugins/vmxnet3/output.c @@ -143,15 +143,22 @@ VNET_DEVICE_CLASS_TX_FN (vmxnet3_device_class) (vlib_main_t * vm, } if (PREDICT_FALSE (space_left < space_needed)) { - vlib_buffer_free_one (vm, bi0); - vlib_error_count (vm, node->node_index, - VMXNET3_TX_ERROR_NO_FREE_SLOTS, 1); - buffers++; - n_left--; - /* - * Drop this packet. But we may have enough room for the next packet - */ - continue; + vmxnet3_txq_release (vm, vd, txq); + space_left = vmxnet3_tx_ring_space_left (txq); + + if (PREDICT_FALSE (space_left < space_needed)) + { + vlib_buffer_free_one (vm, bi0); + vlib_error_count (vm, node->node_index, + VMXNET3_TX_ERROR_NO_FREE_SLOTS, 1); + buffers++; + n_left--; + /* + * Drop this packet. But we may have enough room for the next + * packet + */ + continue; + } } /* @@ -190,7 +197,8 @@ VNET_DEVICE_CLASS_TX_FN (vmxnet3_device_class) (vlib_main_t * vm, * Device can start reading the packet */ txq->tx_desc[first_idx].flags[0] ^= VMXNET3_TXF_GEN; - vmxnet3_reg_write (vd, 0, VMXNET3_REG_TXPROD, txq->tx_ring.produce); + vmxnet3_reg_write_inline (vd, 0, VMXNET3_REG_TXPROD, + txq->tx_ring.produce); buffers++; n_left--; diff --git a/src/plugins/vmxnet3/vmxnet3.c b/src/plugins/vmxnet3/vmxnet3.c index ac99411a4e21..ccc76dae06f4 100644 --- a/src/plugins/vmxnet3/vmxnet3.c +++ b/src/plugins/vmxnet3/vmxnet3.c @@ -319,7 +319,7 @@ vmxnet3_device_init (vlib_main_t * vm, vmxnet3_device_t * vd, ret = vmxnet3_reg_read (vd, 1, VMXNET3_REG_CMD); if (ret != 0) { - error = clib_error_return (0, "error on quisecing device rc (%u)", ret); + error = clib_error_return (0, "error on quiescing device rc (%u)", ret); return error; } @@ -497,6 +497,9 @@ vmxnet3_create_if (vlib_main_t * vm, vmxnet3_create_if_args_t * args) clib_error_return (error, "queue size must be <= 4096, >= 64, " "and multiples of 64"); + vlib_log (VLIB_LOG_LEVEL_ERR, vmxm->log_default, "%U: %s", + format_vlib_pci_addr, &args->addr, + "queue size must be <= 4096, >= 64, and multiples of 64"); return; } @@ -507,6 +510,8 @@ vmxnet3_create_if (vlib_main_t * vm, vmxnet3_create_if_args_t * args) args->rv = VNET_API_ERROR_INVALID_VALUE; args->error = clib_error_return (error, "PCI address in use"); + vlib_log (VLIB_LOG_LEVEL_ERR, vmxm->log_default, "%U: %s", + format_vlib_pci_addr, &args->addr, "pci address in use"); return; } })); @@ -528,37 +533,70 @@ vmxnet3_create_if (vlib_main_t * vm, vmxnet3_create_if_args_t * args) args->error = clib_error_return (error, "pci-addr %U", format_vlib_pci_addr, &args->addr); + vlib_log (VLIB_LOG_LEVEL_ERR, vmxm->log_default, "%U: %s", + format_vlib_pci_addr, &args->addr, + "error encountered on pci device open"); return; } - vd->pci_dev_handle = h; + /* + * Do not use vmxnet3_log_error prior to this line since the macro + * references vd->pci_dev_handle + */ + 
vd->pci_dev_handle = h; vlib_pci_set_private_data (h, vd->dev_instance); if ((error = vlib_pci_bus_master_enable (h))) - goto error; + { + vmxnet3_log_error (vd, "error encountered on pci bus master enable"); + goto error; + } if ((error = vlib_pci_map_region (h, 0, (void **) &vd->bar[0]))) - goto error; + { + vmxnet3_log_error (vd, "error encountered on pci map region for bar 0"); + goto error; + } if ((error = vlib_pci_map_region (h, 1, (void **) &vd->bar[1]))) - goto error; + { + vmxnet3_log_error (vd, "error encountered on pci map region for bar 1"); + goto error; + } if ((error = vlib_pci_register_msix_handler (h, 0, 1, &vmxnet3_irq_0_handler))) - goto error; + { + vmxnet3_log_error (vd, + "error encountered on pci register msix handler 0"); + goto error; + } if ((error = vlib_pci_register_msix_handler (h, 1, 1, &vmxnet3_irq_1_handler))) - goto error; + { + vmxnet3_log_error (vd, + "error encountered on pci register msix handler 1"); + goto error; + } if ((error = vlib_pci_enable_msix_irq (h, 0, 2))) - goto error; + { + vmxnet3_log_error (vd, "error encountered on pci enable msix irq"); + goto error; + } if ((error = vlib_pci_intr_enable (h))) - goto error; + { + vmxnet3_log_error (vd, "error encountered on pci interrupt enable"); + goto error; + } if ((error = vmxnet3_device_init (vm, vd, args))) - goto error; + { + vmxnet3_log_error (vd, "error encountered on device init"); + goto error; + } /* create interface */ error = ethernet_register_interface (vnm, vmxnet3_device_class.index, @@ -566,7 +604,11 @@ vmxnet3_create_if (vlib_main_t * vm, vmxnet3_create_if_args_t * args) &vd->hw_if_index, vmxnet3_flag_change); if (error) - goto error; + { + vmxnet3_log_error (vd, + "error encountered on ethernet register interface"); + goto error; + } vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, vd->hw_if_index); vd->sw_if_index = sw->sw_if_index; diff --git a/src/plugins/vmxnet3/vmxnet3.h b/src/plugins/vmxnet3/vmxnet3.h index f3868a88ae3b..daf6275ec0f1 100644 --- a/src/plugins/vmxnet3/vmxnet3.h +++ b/src/plugins/vmxnet3/vmxnet3.h @@ -43,6 +43,20 @@ enum #undef _ }; +#define foreach_vmxnet3_show_entry \ + _(RX_COMP, "rx comp") \ + _(RX_DESC0, "rx desc 0") \ + _(RX_DESC1, "rx desc 1") \ + _(TX_COMP, "tx comp") \ + _(TX_DESC, "tx desc") + +enum +{ +#define _(a, b) VMXNET3_SHOW_##a, + foreach_vmxnet3_show_entry +#undef _ +}; + /* BAR 0 */ #define VMXNET3_REG_IMR 0x0000 /* Interrupt Mask Register */ #define VMXNET3_REG_TXPROD 0x0600 /* Tx Producer Index */ @@ -152,7 +166,7 @@ enum _(7, GET_DEV_EXTRA_INFO, "get dev extra info") \ _(8, GET_CONF_INTR, "get conf intr") \ _(9, GET_ADAPTIVE_RING_INFO, "get adaptive ring info") \ - _(10, GET_TXDATA_DESC_SIZE, "gte txdata desc size") \ + _(10, GET_TXDATA_DESC_SIZE, "get txdata desc size") \ _(11, RESERVED5, "reserved5") enum @@ -396,8 +410,8 @@ typedef struct typedef struct { CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); - u64 next; u32 gen; + u16 next; } vmxnet3_rx_comp_ring; typedef struct @@ -423,8 +437,8 @@ typedef struct typedef struct { CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); - u64 next; u32 gen; + u16 next; } vmxnet3_tx_comp_ring; typedef struct @@ -482,6 +496,7 @@ typedef struct vlib_physmem_region_index_t physmem_region; u32 physmem_region_alloc; u16 msg_id_base; + vlib_log_class_t log_default; } vmxnet3_main_t; extern vmxnet3_main_t vmxnet3_main; @@ -517,16 +532,39 @@ format_function_t format_vmxnet3_device; format_function_t format_vmxnet3_device_name; format_function_t format_vmxnet3_input_trace; +#define vmxnet3_log_debug(dev, f, ...) 
\ + vlib_log (VLIB_LOG_LEVEL_DEBUG, vmxnet3_main.log_default, "%U: " f, \ + format_vlib_pci_addr, vlib_pci_get_addr(dev->pci_dev_handle), \ + ## __VA_ARGS__) + +#define vmxnet3_log_error(dev, f, ...) \ + vlib_log (VLIB_LOG_LEVEL_ERR, vmxnet3_main.log_default, "%U: " f, \ + format_vlib_pci_addr, vlib_pci_get_addr(dev->pci_dev_handle), \ + ## __VA_ARGS__) + +/* no log version, called by data plane */ static_always_inline void -vmxnet3_reg_write (vmxnet3_device_t * vd, u8 bar, u32 addr, u32 val) +vmxnet3_reg_write_inline (vmxnet3_device_t * vd, u8 bar, u32 addr, u32 val) { *(volatile u32 *) ((u8 *) vd->bar[bar] + addr) = val; } +static_always_inline void +vmxnet3_reg_write (vmxnet3_device_t * vd, u8 bar, u32 addr, u32 val) +{ + vmxnet3_log_debug (vd, "reg wr bar %u addr 0x%x val 0x%x", bar, addr, val); + vmxnet3_reg_write_inline (vd, bar, addr, val); +} + static_always_inline u32 vmxnet3_reg_read (vmxnet3_device_t * vd, u8 bar, u32 addr) { - return *(volatile u32 *) (vd->bar[bar] + addr); + u32 val; + + val = *(volatile u32 *) (vd->bar[bar] + addr); + vmxnet3_log_debug (vd, "reg rd bar %u addr 0x%x val 0x%x", bar, addr, val); + + return val; } static_always_inline uword @@ -586,7 +624,7 @@ vmxnet3_rxq_refill_ring0 (vlib_main_t * vm, vmxnet3_device_t * vd, n_alloc--; } - vmxnet3_reg_write (vd, 0, VMXNET3_REG_RXPROD, ring->produce); + vmxnet3_reg_write_inline (vd, 0, VMXNET3_REG_RXPROD, ring->produce); return 0; } @@ -628,7 +666,7 @@ vmxnet3_rxq_refill_ring1 (vlib_main_t * vm, vmxnet3_device_t * vd, n_alloc--; } - vmxnet3_reg_write (vd, 0, VMXNET3_REG_RXPROD2, ring->produce); + vmxnet3_reg_write_inline (vd, 0, VMXNET3_REG_RXPROD2, ring->produce); return 0; } diff --git a/src/plugins/vmxnet3/vmxnet3_test.c b/src/plugins/vmxnet3/vmxnet3_test.c index b08f61b0bd99..1b5fd5b7c9a8 100644 --- a/src/plugins/vmxnet3/vmxnet3_test.c +++ b/src/plugins/vmxnet3/vmxnet3_test.c @@ -227,6 +227,14 @@ api_vmxnet3_dump (vat_main_t * vam) return ret; } +static u8 * +format_pci_addr (u8 * s, va_list * va) +{ + vlib_pci_addr_t *addr = va_arg (*va, vlib_pci_addr_t *); + return format (s, "%04x:%02x:%02x.%x", addr->domain, addr->bus, + addr->slot, addr->function); +} + static void vl_api_vmxnet3_details_t_handler (vl_api_vmxnet3_details_t * mp) { @@ -246,7 +254,7 @@ vl_api_vmxnet3_details_t_handler (vl_api_vmxnet3_details_t * mp) " state %s\n", mp->if_name, ntohl (mp->sw_if_index), format_ethernet_address, mp->hw_addr, mp->version, - format_vlib_pci_addr, &pci_addr, + format_pci_addr, &pci_addr, ntohs (mp->rx_next), ntohs (mp->rx_qid), ntohs (mp->rx_qsize), ntohs (mp->rx_fill[0]), diff --git a/src/vat/CMakeLists.txt b/src/vat/CMakeLists.txt index 2ff907026db3..0df3bb384c8a 100644 --- a/src/vat/CMakeLists.txt +++ b/src/vat/CMakeLists.txt @@ -29,6 +29,8 @@ add_vpp_executable(vpp_api_test ENABLE_EXPORTS plugin.c json_format.c + DEPENDS api_headers + LINK_LIBRARIES vlibmemoryclient svm diff --git a/src/vat/api_format.c b/src/vat/api_format.c index 0e53f56196ad..56ded020bca2 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -8315,7 +8315,7 @@ api_tap_create_v2 (vat_main_t * vam) clib_memcpy (mp->host_bridge, host_bridge, vec_len (host_bridge)); if (host_ip4_prefix_len) clib_memcpy (mp->host_ip4_addr, &host_ip4_addr, 4); - if (host_ip4_prefix_len) + if (host_ip6_prefix_len) clib_memcpy (mp->host_ip6_addr, &host_ip6_addr, 16); if (host_ip4_gw_set) clib_memcpy (mp->host_ip4_gw, &host_ip4_gw, 4); @@ -20428,14 +20428,14 @@ vl_api_mpls_fib_path_print (vat_main_t * vam, vl_api_fib_path_t * fp) print (vam->ofp, " 
weight %d, sw_if_index %d, is_local %d, is_drop %d, " "is_unreach %d, is_prohitbit %d, afi %d, next_hop %U", - ntohl (fp->weight), ntohl (fp->sw_if_index), fp->is_local, + fp->weight, ntohl (fp->sw_if_index), fp->is_local, fp->is_drop, fp->is_unreach, fp->is_prohibit, fp->afi, format_ip6_address, fp->next_hop); else if (fp->afi == IP46_TYPE_IP4) print (vam->ofp, " weight %d, sw_if_index %d, is_local %d, is_drop %d, " "is_unreach %d, is_prohitbit %d, afi %d, next_hop %U", - ntohl (fp->weight), ntohl (fp->sw_if_index), fp->is_local, + fp->weight, ntohl (fp->sw_if_index), fp->is_local, fp->is_drop, fp->is_unreach, fp->is_prohibit, fp->afi, format_ip4_address, fp->next_hop); } diff --git a/src/vcl/vcl_private.h b/src/vcl/vcl_private.h index 5975f15ac4f3..d1a40b933a7c 100644 --- a/src/vcl/vcl_private.h +++ b/src/vcl/vcl_private.h @@ -153,6 +153,7 @@ typedef struct /* Socket configuration state */ u8 is_vep; u8 is_vep_session; + u8 has_rx_evt; u32 attr; u32 wait_cont_idx; vppcom_epoll_t vep; diff --git a/src/vcl/vcl_test.h b/src/vcl/vcl_test.h index 927110f55d20..9d28b262e3ab 100644 --- a/src/vcl/vcl_test.h +++ b/src/vcl/vcl_test.h @@ -438,7 +438,7 @@ vcl_test_write (int fd, uint8_t * buf, uint32_t nbytes, { if (stats) stats->tx_eagain++; - continue; + break; } else break; diff --git a/src/vcl/vppcom.c b/src/vcl/vppcom.c index 3f12b86df429..fad2ac98538a 100644 --- a/src/vcl/vppcom.c +++ b/src/vcl/vppcom.c @@ -1293,13 +1293,14 @@ vppcom_session_read_internal (uint32_t session_handle, void *buf, int n, is_ct = vcl_session_is_ct (s); mq = is_ct ? s->our_evt_q : wrk->app_event_queue; rx_fifo = s->rx_fifo; + s->has_rx_evt = 0; if (svm_fifo_is_empty (rx_fifo)) { if (is_nonblocking) { svm_fifo_unset_event (rx_fifo); - return VPPCOM_OK; + return VPPCOM_EWOULDBLOCK; } while (svm_fifo_is_empty (rx_fifo)) { @@ -1385,13 +1386,14 @@ vppcom_session_read_segments (uint32_t session_handle, is_ct = vcl_session_is_ct (s); mq = is_ct ? 
s->our_evt_q : wrk->app_event_queue; rx_fifo = s->rx_fifo; + s->has_rx_evt = 0; if (svm_fifo_is_empty (rx_fifo)) { if (is_nonblocking) { svm_fifo_unset_event (rx_fifo); - return VPPCOM_OK; + return VPPCOM_EWOULDBLOCK; } while (svm_fifo_is_empty (rx_fifo)) { @@ -1551,7 +1553,8 @@ vppcom_session_write (uint32_t session_handle, void *buf, size_t n) { svm_fifo_set_want_tx_evt (tx_fifo, 1); svm_msg_q_lock (mq); - svm_msg_q_wait (mq); + if (svm_msg_q_is_empty (mq)) + svm_msg_q_wait (mq); svm_msg_q_sub_w_lock (mq, &msg); e = svm_msg_q_msg_data (mq, &msg); @@ -2303,11 +2306,12 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e, sid = e->fifo->client_session_index; session = vcl_session_get (wrk, sid); session_events = session->vep.ev.events; - if (!(EPOLLIN & session->vep.ev.events)) + if (!(EPOLLIN & session->vep.ev.events) || session->has_rx_evt) break; add_event = 1; events[*num_ev].events |= EPOLLIN; session_evt_data = session->vep.ev.data.u64; + session->has_rx_evt = 1; break; case FIFO_EVENT_APP_TX: sid = e->fifo->client_session_index; @@ -2324,11 +2328,12 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e, session = vcl_ct_session_get_from_fifo (wrk, e->fifo, 0); sid = session->session_index; session_events = session->vep.ev.events; - if (!(EPOLLIN & session->vep.ev.events)) + if (!(EPOLLIN & session->vep.ev.events) || session->has_rx_evt) break; add_event = 1; events[*num_ev].events |= EPOLLIN; session_evt_data = session->vep.ev.data.u64; + session->has_rx_evt = 1; break; case SESSION_IO_EVT_CT_RX: session = vcl_ct_session_get_from_fifo (wrk, e->fifo, 1); @@ -2452,15 +2457,13 @@ vcl_epoll_wait_handle_mq (vcl_worker_t * wrk, svm_msg_q_t * mq, { msg = vec_elt_at_index (wrk->mq_msg_vector, i); e = svm_msg_q_msg_data (mq, msg); - vcl_epoll_wait_handle_mq_event (wrk, e, events, num_ev); + if (*num_ev < maxevents) + vcl_epoll_wait_handle_mq_event (wrk, e, events, num_ev); + else + vec_add1 (wrk->unhandled_evts_vector, *e); svm_msg_q_free_msg (mq, msg); - if (*num_ev == maxevents) - { - i += 1; - break; - } } - vec_delete (wrk->mq_msg_vector, i, 0); + vec_reset_length (wrk->mq_msg_vector); return *num_ev; } @@ -2508,6 +2511,7 @@ vppcom_epoll_wait_eventfd (vcl_worker_t * wrk, struct epoll_event *events, u64 buf; vec_validate (wrk->mq_events, pool_elts (wrk->mq_evt_conns)); +again: n_mq_evts = epoll_wait (wrk->mqs_epfd, wrk->mq_events, vec_len (wrk->mq_events), wait_for_time); for (i = 0; i < n_mq_evts; i++) @@ -2516,6 +2520,8 @@ vppcom_epoll_wait_eventfd (vcl_worker_t * wrk, struct epoll_event *events, n_read = read (mqc->mq_fd, &buf, sizeof (buf)); vcl_epoll_wait_handle_mq (wrk, mqc->mq, events, maxevents, 0, &n_evts); } + if (!n_evts && n_mq_evts > 0) + goto again; return (int) n_evts; } diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 651e7f0dc8fb..9254cfb804f4 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -51,6 +51,14 @@ #define VLIB_BUFFER_DATA_SIZE (2048) #define VLIB_BUFFER_PRE_DATA_SIZE __PRE_DATA_SIZE +/* Minimum buffer chain segment size. Does not apply to last buffer in chain. 
+ Dataplane code can safely asume that specified amount of data is not split + into 2 chained buffers */ +#define VLIB_BUFFER_MIN_CHAIN_SEG_SIZE (128) + +/* Amount of head buffer data copied to each replica head buffer */ +#define VLIB_BUFFER_CLONE_HEAD_SIZE (256) + typedef u8 vlib_buffer_free_list_index_t; /** \file @@ -212,6 +220,9 @@ vlib_buffer_advance (vlib_buffer_t * b, word l) ASSERT (b->current_length >= l); b->current_data += l; b->current_length -= l; + + ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0 || + b->current_length >= VLIB_BUFFER_MIN_CHAIN_SEG_SIZE); } /** \brief Check if there is enough space in buffer to advance diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index d8abdf31d79b..7ab41567c4d7 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -1269,6 +1269,143 @@ vlib_buffer_chain_compress (vlib_main_t * vm, (first->flags & VLIB_BUFFER_NEXT_PRESENT)); } +always_inline u32 +vlib_buffer_space_left_at_end (vlib_main_t * vm, vlib_buffer_t * b) +{ + return b->data + VLIB_BUFFER_DATA_SIZE - + ((u8 *) vlib_buffer_get_current (b) + b->current_length); +} + +always_inline u32 +vlib_buffer_chain_linearize (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_buffer_t *db = b, *sb, *first = b; + int is_cloned = 0; + u32 bytes_left = 0, data_size; + u16 src_left, dst_left, n_buffers = 1; + u8 *dp, *sp; + u32 to_free = 0; + + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0)) + return 1; + + data_size = VLIB_BUFFER_DATA_SIZE; + + dst_left = vlib_buffer_space_left_at_end (vm, b); + + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + if (b->n_add_refs > 0) + is_cloned = 1; + bytes_left += b->current_length; + n_buffers++; + } + + /* if buffer is cloned, create completely new chain - unless everything fits + * into one buffer */ + if (is_cloned && bytes_left >= dst_left) + { + u32 len = 0; + u32 space_needed = bytes_left - dst_left; + u32 tail; + + if (vlib_buffer_alloc (vm, &tail, 1) == 0) + return 0; + + ++n_buffers; + len += data_size; + b = vlib_get_buffer (vm, tail); + + while (len < space_needed) + { + u32 bi; + if (vlib_buffer_alloc (vm, &bi, 1) == 0) + { + vlib_buffer_free_one (vm, tail); + return 0; + } + b->flags = VLIB_BUFFER_NEXT_PRESENT; + b->next_buffer = bi; + b = vlib_get_buffer (vm, bi); + len += data_size; + n_buffers++; + } + sb = vlib_get_buffer (vm, first->next_buffer); + to_free = first->next_buffer; + first->next_buffer = tail; + } + else + sb = vlib_get_buffer (vm, first->next_buffer); + + src_left = sb->current_length; + sp = vlib_buffer_get_current (sb); + dp = vlib_buffer_get_tail (db); + + while (bytes_left) + { + u16 bytes_to_copy; + + if (dst_left == 0) + { + if (db != first) + db->current_data = 0; + db->current_length = dp - (u8 *) vlib_buffer_get_current (db); + ASSERT (db->flags & VLIB_BUFFER_NEXT_PRESENT); + db = vlib_get_buffer (vm, db->next_buffer); + dst_left = data_size; + dp = db->data; + } + + while (src_left == 0) + { + ASSERT (sb->flags & VLIB_BUFFER_NEXT_PRESENT); + sb = vlib_get_buffer (vm, sb->next_buffer); + src_left = sb->current_length; + sp = vlib_buffer_get_current (sb); + } + + bytes_to_copy = clib_min (dst_left, src_left); + + if (dp != sp) + { + if (sb == db) + bytes_to_copy = clib_min (bytes_to_copy, sp - dp); + + clib_memcpy (dp, sp, bytes_to_copy); + } + + src_left -= bytes_to_copy; + dst_left -= bytes_to_copy; + dp += bytes_to_copy; + sp += bytes_to_copy; + bytes_left -= bytes_to_copy; + } + if (db != first) + db->current_data = 0; + 
db->current_length = dp - (u8 *) vlib_buffer_get_current (db); + + if (is_cloned && to_free) + vlib_buffer_free_one (vm, to_free); + else + { + if (db->flags & VLIB_BUFFER_NEXT_PRESENT) + vlib_buffer_free_one (vm, db->next_buffer); + db->flags &= ~VLIB_BUFFER_NEXT_PRESENT; + b = first; + n_buffers = 1; + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + ++n_buffers; + } + } + + first->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + + return n_buffers; +} + #endif /* included_vlib_buffer_funcs_h */ /* diff --git a/src/vlib/buffer_node.h b/src/vlib/buffer_node.h index 93ffb1e9dce8..35e15a5d9196 100644 --- a/src/vlib/buffer_node.h +++ b/src/vlib/buffer_node.h @@ -366,10 +366,15 @@ vlib_buffer_enqueue_to_next (vlib_main_t * vm, vlib_node_runtime_t * node, n_enqueued = count_trailing_zeros (~bitmap) / 2; #else u16 x = 0; - x |= next_index ^ nexts[1]; - x |= next_index ^ nexts[2]; - x |= next_index ^ nexts[3]; - n_enqueued = (x == 0) ? 4 : 1; + if (count + 3 < max) + { + x |= next_index ^ nexts[1]; + x |= next_index ^ nexts[2]; + x |= next_index ^ nexts[3]; + n_enqueued = (x == 0) ? 4 : 1; + } + else + n_enqueued = 1; #endif if (PREDICT_FALSE (n_enqueued > max)) diff --git a/src/vlibmemory/memory_api.c b/src/vlibmemory/memory_api.c index 1f5da4c786e9..5849d719faed 100644 --- a/src/vlibmemory/memory_api.c +++ b/src/vlibmemory/memory_api.c @@ -898,6 +898,28 @@ vlibmemory_init (vlib_main_t * vm) api_main_t *am = &api_main; svm_map_region_args_t _a, *a = &_a; clib_error_t *error; + u8 *remove_path1, *remove_path2; + + /* + * By popular request / to avoid support fires, remove any old api segment + * files Right Here. + */ + if (am->root_path == 0) + { + remove_path1 = format (0, "/dev/shm/global_vm%c", 0); + remove_path2 = format (0, "/dev/shm/vpe-api%c", 0); + } + else + { + remove_path1 = format (0, "/dev/shm/%s-global_vm%c", am->root_path, 0); + remove_path2 = format (0, "/dev/shm/%s-vpe-api%c", am->root_path, 0); + } + + (void) unlink ((char *) remove_path1); + (void) unlink ((char *) remove_path2); + + vec_free (remove_path1); + vec_free (remove_path2); memset (a, 0, sizeof (*a)); a->root_path = am->root_path; diff --git a/src/vnet/bfd/bfd_main.c b/src/vnet/bfd/bfd_main.c index 55ea23dea410..bd2addf3b0f3 100644 --- a/src/vnet/bfd/bfd_main.c +++ b/src/vnet/bfd/bfd_main.c @@ -1165,6 +1165,7 @@ bfd_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) } } now = clib_cpu_time_now (); + uword *session_index; switch (event_type) { case ~0: /* no events => timeout */ @@ -1180,35 +1181,41 @@ bfd_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) * each event or timeout */ break; case BFD_EVENT_NEW_SESSION: - bfd_lock (bm); - if (!pool_is_free_index (bm->sessions, *event_data)) - { - bfd_session_t *bs = - pool_elt_at_index (bm->sessions, *event_data); - bfd_send_periodic (vm, rt, bm, bs, now); - bfd_set_timer (bm, bs, now, 1); - } - else - { - BFD_DBG ("Ignoring event for non-existent session index %u", - (u32) * event_data); - } - bfd_unlock (bm); + vec_foreach (session_index, event_data) + { + bfd_lock (bm); + if (!pool_is_free_index (bm->sessions, *session_index)) + { + bfd_session_t *bs = + pool_elt_at_index (bm->sessions, *session_index); + bfd_send_periodic (vm, rt, bm, bs, now); + bfd_set_timer (bm, bs, now, 1); + } + else + { + BFD_DBG ("Ignoring event for non-existent session index %u", + (u32) * session_index); + } + bfd_unlock (bm); + } break; case BFD_EVENT_CONFIG_CHANGED: - bfd_lock (bm); - if (!pool_is_free_index 
(bm->sessions, *event_data)) - { - bfd_session_t *bs = - pool_elt_at_index (bm->sessions, *event_data); - bfd_on_config_change (vm, rt, bm, bs, now); - } - else - { - BFD_DBG ("Ignoring event for non-existent session index %u", - (u32) * event_data); - } - bfd_unlock (bm); + vec_foreach (session_index, event_data) + { + bfd_lock (bm); + if (!pool_is_free_index (bm->sessions, *session_index)) + { + bfd_session_t *bs = + pool_elt_at_index (bm->sessions, *session_index); + bfd_on_config_change (vm, rt, bm, bs, now); + } + else + { + BFD_DBG ("Ignoring event for non-existent session index %u", + (u32) * session_index); + } + bfd_unlock (bm); + } break; default: vlib_log_err (bm->log_class, "BUG: event type 0x%wx", event_type); diff --git a/src/vnet/bier/bier_lookup.c b/src/vnet/bier/bier_lookup.c index d4500823f13f..51011c980083 100644 --- a/src/vnet/bier/bier_lookup.c +++ b/src/vnet/bier/bier_lookup.c @@ -223,7 +223,7 @@ bier_lookup (vlib_main_t * vm, num_cloned = vlib_buffer_clone(vm, bi0, blm->blm_clones[thread_index], n_clones, - n_bytes + 8); + VLIB_BUFFER_CLONE_HEAD_SIZE); if (num_cloned != vec_len(blm->blm_fmasks[thread_index])) { diff --git a/src/vnet/bonding/cli.c b/src/vnet/bonding/cli.c index 846fbdb38c9e..26d9eefad96e 100644 --- a/src/vnet/bonding/cli.c +++ b/src/vnet/bonding/cli.c @@ -512,11 +512,13 @@ bond_enslave (vlib_main_t * vm, bond_enslave_args_t * args) ethernet_set_rx_redirect (vnm, sif_hw, 1); } - if ((bif->mode == BOND_MODE_LACP) && bm->lacp_enable_disable) + if (bif->mode == BOND_MODE_LACP) { - (*bm->lacp_enable_disable) (vm, bif, sif, 1); + if (bm->lacp_enable_disable) + (*bm->lacp_enable_disable) (vm, bif, sif, 1); } - else + else if (sif->port_enabled && + (sif_hw->flags & VNET_HW_INTERFACE_FLAG_LINK_UP)) { bond_enable_collecting_distributing (vm, sif); } diff --git a/src/vnet/bonding/node.c b/src/vnet/bonding/node.c index 361509c549dd..725fde02e6c7 100644 --- a/src/vnet/bonding/node.c +++ b/src/vnet/bonding/node.c @@ -400,19 +400,21 @@ bond_sw_interface_up_down (vnet_main_t * vnm, u32 sw_if_index, u32 flags) if (sif) { sif->port_enabled = flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP; + if (sif->lacp_enabled) + return 0; + if (sif->port_enabled == 0) { - if (sif->lacp_enabled == 0) - { - bond_disable_collecting_distributing (vm, sif); - } + bond_disable_collecting_distributing (vm, sif); } else { - if (sif->lacp_enabled == 0) - { - bond_enable_collecting_distributing (vm, sif); - } + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hw = + vnet_get_sup_hw_interface (vnm, sw_if_index); + + if (hw->flags & VNET_HW_INTERFACE_FLAG_LINK_UP) + bond_enable_collecting_distributing (vm, sif); } } @@ -433,19 +435,16 @@ bond_hw_interface_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) sif = bond_get_slave_by_sw_if_index (sw->sw_if_index); if (sif) { + if (sif->lacp_enabled) + return 0; + if (!(flags & VNET_HW_INTERFACE_FLAG_LINK_UP)) { - if (sif->lacp_enabled == 0) - { - bond_disable_collecting_distributing (vm, sif); - } + bond_disable_collecting_distributing (vm, sif); } - else + else if (sif->port_enabled) { - if (sif->lacp_enabled == 0) - { - bond_enable_collecting_distributing (vm, sif); - } + bond_enable_collecting_distributing (vm, sif); } } diff --git a/src/vnet/devices/virtio/device.c b/src/vnet/devices/virtio/device.c index c7efe6519cd2..d50ef88d3fbc 100644 --- a/src/vnet/devices/virtio/device.c +++ b/src/vnet/devices/virtio/device.c @@ -169,7 +169,6 @@ add_buffer_to_slot (vlib_main_t * vm, virtio_vring_t * vring, u32 bi, return n_added; } - 
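/*
 * A minimal standalone sketch (not VPP code) of the rule the bonding
 * cli.c/node.c hunks above converge on: a non-LACP slave is moved to
 * collecting/distributing only when it is both admin-up (port_enabled)
 * and carrier-up (hardware LINK_UP), while LACP-managed slaves are left
 * to the LACP state machine.  The type and flag names below are
 * simplified stand-ins for the VPP structures, not the real ones.
 */
#include <stdbool.h>

#define SKETCH_LINK_UP (1 << 0)	/* stand-in for VNET_HW_INTERFACE_FLAG_LINK_UP */

typedef struct
{
  bool lacp_enabled;		/* slave is managed by the LACP protocol */
  bool port_enabled;		/* admin state, set in bond_sw_interface_up_down */
  unsigned hw_flags;		/* hardware flags, seen in bond_hw_interface_up_down */
} sketch_slave_t;

/* true when the slave should be collecting/distributing */
bool
sketch_slave_should_forward (const sketch_slave_t * s)
{
  if (s->lacp_enabled)
    return false;		/* link-state hooks return early; LACP decides */
  return s->port_enabled && (s->hw_flags & SKETCH_LINK_UP);
}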
static_always_inline uword virtio_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, virtio_if_t * vif) @@ -184,6 +183,10 @@ virtio_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node, clib_spinlock_lock_if_init (&vif->lockp); + if ((vring->used->flags & VIRTIO_RING_FLAG_MASK_INT) == 0 && + vring->last_kick_avail_idx != vring->avail->idx) + virtio_kick (vring); + /* free consumed buffers */ virtio_free_used_desc (vm, vring); @@ -209,10 +212,7 @@ virtio_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vring->desc_next = next; vring->desc_in_use = used; if ((vring->used->flags & VIRTIO_RING_FLAG_MASK_INT) == 0) - { - u64 x = 1; - CLIB_UNUSED (int r) = write (vring->kick_fd, &x, sizeof (x)); - } + virtio_kick (vring); } diff --git a/src/vnet/devices/virtio/node.c b/src/vnet/devices/virtio/node.c index 339c48c93f56..419b025b3ae7 100644 --- a/src/vnet/devices/virtio/node.c +++ b/src/vnet/devices/virtio/node.c @@ -87,17 +87,23 @@ virtio_refill_vring (vlib_main_t * vm, virtio_vring_t * vring) u16 sz = vring->size; u16 mask = sz - 1; +more: used = vring->desc_in_use; if (sz - used < sz / 8) return; - n_slots = sz - used; + /* deliver free buffers in chunks of 64 */ + n_slots = clib_min (sz - used, 64); + next = vring->desc_next; avail = vring->avail->idx; n_slots = vlib_buffer_alloc_to_ring (vm, vring->buffers, next, vring->size, n_slots); + if (n_slots == 0) + return; + while (n_slots) { struct vring_desc *d = &vring->desc[next];; @@ -117,10 +123,8 @@ virtio_refill_vring (vlib_main_t * vm, virtio_vring_t * vring) vring->desc_in_use = used; if ((vring->used->flags & VIRTIO_RING_FLAG_MASK_INT) == 0) - { - u64 b = 1; - CLIB_UNUSED (int r) = write (vring->kick_fd, &b, sizeof (b)); - } + virtio_kick (vring); + goto more; } static_always_inline uword @@ -140,6 +144,10 @@ virtio_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u16 last = vring->last_used_idx; u16 n_left = vring->used->idx - last; + if ((vring->used->flags & VIRTIO_RING_FLAG_MASK_INT) == 0 && + vring->last_kick_avail_idx != vring->avail->idx) + virtio_kick (vring); + if (n_left == 0) goto refill; diff --git a/src/vnet/devices/virtio/vhost_user.c b/src/vnet/devices/virtio/vhost_user.c index 1342030a6518..48c5d1a5fbae 100644 --- a/src/vnet/devices/virtio/vhost_user.c +++ b/src/vnet/devices/virtio/vhost_user.c @@ -116,12 +116,13 @@ unmap_all_mem_regions (vhost_user_intf_t * vui) } } -static void +static_always_inline void vhost_user_tx_thread_placement (vhost_user_intf_t * vui) { //Let's try to assign one queue to each thread - u32 qid = 0; + u32 qid; u32 thread_index = 0; + vui->use_tx_spinlock = 0; while (1) { @@ -156,67 +157,27 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) * @brief Unassign existing interface/queue to thread mappings and re-assign * new interface/queue to thread mappings */ -static void -vhost_user_rx_thread_placement () +static_always_inline void +vhost_user_rx_thread_placement (vhost_user_intf_t * vui, u32 qid) { - vhost_user_main_t *vum = &vhost_user_main; - vhost_user_intf_t *vui; - vhost_user_vring_t *txvq; + vhost_user_vring_t *txvq = &vui->vrings[qid]; vnet_main_t *vnm = vnet_get_main (); - u32 qid; int rv; - u16 *queue; - - // Scrap all existing mappings for all interfaces/queues - /* *INDENT-OFF* */ - pool_foreach (vui, vum->vhost_user_interfaces, { - vec_foreach (queue, vui->rx_queues) - { - rv = vnet_hw_interface_unassign_rx_thread (vnm, vui->hw_if_index, - *queue); - if (rv) - vu_log_warn (vui, "unable to 
unassign interface %d, " - "queue %d: rc=%d", vui->hw_if_index, *queue, rv); - } - vec_reset_length (vui->rx_queues); - }); - /* *INDENT-ON* */ - - // Create the rx_queues for all interfaces - /* *INDENT-OFF* */ - pool_foreach (vui, vum->vhost_user_interfaces, { - for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++) - { - txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; - if (txvq->started) - { - if (txvq->mode == VNET_HW_INTERFACE_RX_MODE_UNKNOWN) - /* Set polling as the default */ - txvq->mode = VNET_HW_INTERFACE_RX_MODE_POLLING; - vec_add1 (vui->rx_queues, qid); - } - } - }); - /* *INDENT-ON* */ - - // Assign new mappings for all interfaces/queues - /* *INDENT-OFF* */ - pool_foreach (vui, vum->vhost_user_interfaces, { - vnet_hw_interface_set_input_node (vnm, vui->hw_if_index, - vhost_user_input_node.index); - vec_foreach (queue, vui->rx_queues) - { - vnet_hw_interface_assign_rx_thread (vnm, vui->hw_if_index, *queue, - ~0); - txvq = &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; - rv = vnet_hw_interface_set_rx_mode (vnm, vui->hw_if_index, *queue, - txvq->mode); - if (rv) - vu_log_warn (vui, "unable to set rx mode for interface %d, " - "queue %d: rc=%d", vui->hw_if_index, *queue, rv); - } - }); - /* *INDENT-ON* */ + u32 q = qid >> 1; + + ASSERT ((qid & 1) == 1); // should be odd + // Assign new queue mappings for the interface + vnet_hw_interface_set_input_node (vnm, vui->hw_if_index, + vhost_user_input_node.index); + vnet_hw_interface_assign_rx_thread (vnm, vui->hw_if_index, q, ~0); + if (txvq->mode == VNET_HW_INTERFACE_RX_MODE_UNKNOWN) + /* Set polling as the default */ + txvq->mode = VNET_HW_INTERFACE_RX_MODE_POLLING; + txvq->qid = q; + rv = vnet_hw_interface_set_rx_mode (vnm, vui->hw_if_index, q, txvq->mode); + if (rv) + vu_log_warn (vui, "unable to set rx mode for interface %d, " + "queue %d: rc=%d", vui->hw_if_index, q, rv); } /** @brief Returns whether at least one TX and one RX vring are enabled */ @@ -232,7 +193,7 @@ vhost_user_intf_ready (vhost_user_intf_t * vui) return found[0] && found[1]; } -static void +static_always_inline void vhost_user_update_iface_state (vhost_user_intf_t * vui) { /* if we have pointers to descriptor table, go up */ @@ -247,8 +208,6 @@ vhost_user_update_iface_state (vhost_user_intf_t * vui) : 0); vui->is_ready = is_ready; } - vhost_user_rx_thread_placement (); - vhost_user_tx_thread_placement (vui); } static void @@ -278,6 +237,18 @@ vhost_user_callfd_read_ready (clib_file_t * uf) return 0; } +static_always_inline void +vhost_user_thread_placement (vhost_user_intf_t * vui, u32 qid) +{ + if (qid & 1) // RX is odd, TX is even + { + if (vui->vrings[qid].qid == -1) + vhost_user_rx_thread_placement (vui, qid); + } + else + vhost_user_tx_thread_placement (vui); +} + static clib_error_t * vhost_user_kickfd_read_ready (clib_file_t * uf) { @@ -293,10 +264,12 @@ vhost_user_kickfd_read_ready (clib_file_t * uf) if (!vui->vrings[qid].started || (vhost_user_intf_ready (vui) != vui->is_ready)) { - vlib_worker_thread_barrier_sync (vlib_get_main ()); - vui->vrings[qid].started = 1; - vhost_user_update_iface_state (vui); - vlib_worker_thread_barrier_release (vlib_get_main ()); + if (vui->vrings[qid].started == 0) + { + vui->vrings[qid].started = 1; + vhost_user_thread_placement (vui, qid); + vhost_user_update_iface_state (vui); + } } vhost_user_set_interrupt_pending (vui, uf->private_data); @@ -311,6 +284,7 @@ vhost_user_vring_init (vhost_user_intf_t * vui, u32 qid) vring->kickfd_idx = ~0; vring->callfd_idx = ~0; vring->errfd = -1; + vring->qid = -1; /* * We have a bug with some 
qemu 2.5, and this may be a fix. @@ -329,6 +303,7 @@ static_always_inline void vhost_user_vring_close (vhost_user_intf_t * vui, u32 qid) { vhost_user_vring_t *vring = &vui->vrings[qid]; + if (vring->kickfd_idx != ~0) { clib_file_t *uf = pool_elt_at_index (file_main.file_pool, @@ -348,7 +323,12 @@ vhost_user_vring_close (vhost_user_intf_t * vui, u32 qid) close (vring->errfd); vring->errfd = -1; } + + // save the qid so that we don't need to unassign and assign_rx_thread + // when the interface comes back up. They are expensive calls. + u16 q = vui->vrings[qid].qid; vhost_user_vring_init (vui, qid); + vui->vrings[qid].qid = q; } static_always_inline void @@ -377,7 +357,7 @@ vhost_user_if_disconnect (vhost_user_intf_t * vui) static clib_error_t * vhost_user_socket_read (clib_file_t * uf) { - int n, i; + int n, i, j; int fd, number_of_fds = 0; int fds[VHOST_MEMORY_MAX_NREGIONS]; vhost_user_msg_t msg; @@ -389,6 +369,7 @@ vhost_user_socket_read (clib_file_t * uf) u8 q; clib_file_t template = { 0 }; vnet_main_t *vnm = vnet_get_main (); + vlib_main_t *vm = vlib_get_main (); vui = pool_elt_at_index (vum->vhost_user_interfaces, uf->private_data); @@ -411,9 +392,6 @@ vhost_user_socket_read (clib_file_t * uf) n = recvmsg (uf->file_descriptor, &mh, 0); - /* Stop workers to avoid end of the world */ - vlib_worker_thread_barrier_sync (vlib_get_main ()); - if (n != VHOST_USER_MSG_HDR_SZ) { if (n == -1) @@ -488,6 +466,13 @@ vhost_user_socket_read (clib_file_t * uf) msg.size = sizeof (msg.u64); vu_log_debug (vui, "if %d msg VHOST_USER_GET_FEATURES - reply " "0x%016llx", vui->hw_if_index, msg.u64); + n = + send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); + if (n != (msg.size + VHOST_USER_MSG_HDR_SZ)) + { + vu_log_debug (vui, "could not send message response"); + goto close_socket; + } break; case VHOST_USER_SET_FEATURES: @@ -509,10 +494,6 @@ vhost_user_socket_read (clib_file_t * uf) ASSERT (vui->virtio_net_hdr_sz < VLIB_BUFFER_PRE_DATA_SIZE); vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0); vui->is_ready = 0; - - /*for (q = 0; q < VHOST_VRING_MAX_N; q++) - vhost_user_vring_close(&vui->vrings[q]); */ - break; case VHOST_USER_SET_MEM_TABLE: @@ -522,10 +503,8 @@ vhost_user_socket_read (clib_file_t * uf) if ((msg.memory.nregions < 1) || (msg.memory.nregions > VHOST_MEMORY_MAX_NREGIONS)) { - vu_log_debug (vui, "number of mem regions must be between 1 and %i", VHOST_MEMORY_MAX_NREGIONS); - goto close_socket; } @@ -534,39 +513,50 @@ vhost_user_socket_read (clib_file_t * uf) vu_log_debug (vui, "each memory region must have FD"); goto close_socket; } - unmap_all_mem_regions (vui); + + /* Do the mmap without barrier sync */ + void *region_mmap_addr[VHOST_MEMORY_MAX_NREGIONS]; for (i = 0; i < msg.memory.nregions; i++) { - clib_memcpy (&(vui->regions[i]), &msg.memory.regions[i], - sizeof (vhost_user_memory_region_t)); - long page_sz = get_huge_page_size (fds[i]); /* align size to page */ - ssize_t map_sz = (vui->regions[i].memory_size + - vui->regions[i].mmap_offset + + ssize_t map_sz = (msg.memory.regions[i].memory_size + + msg.memory.regions[i].mmap_offset + page_sz - 1) & ~(page_sz - 1); - vui->region_mmap_addr[i] = mmap (0, map_sz, PROT_READ | PROT_WRITE, - MAP_SHARED, fds[i], 0); - vui->region_guest_addr_lo[i] = vui->regions[i].guest_phys_addr; - vui->region_guest_addr_hi[i] = vui->regions[i].guest_phys_addr + - vui->regions[i].memory_size; - - vu_log_debug (vui, "map memory region %d addr 0 len 0x%lx fd %d " - "mapped 0x%lx page_sz 0x%x", i, map_sz, fds[i], - vui->region_mmap_addr[i], 
page_sz); - - if (vui->region_mmap_addr[i] == MAP_FAILED) + region_mmap_addr[i] = mmap (0, map_sz, PROT_READ | PROT_WRITE, + MAP_SHARED, fds[i], 0); + if (region_mmap_addr[i] == MAP_FAILED) { vu_log_err (vui, "failed to map memory. errno is %d", errno); + for (j = 0; j < i; j++) + munmap (region_mmap_addr[j], map_sz); goto close_socket; } + vu_log_debug (vui, "map memory region %d addr 0 len 0x%lx fd %d " + "mapped 0x%lx page_sz 0x%x", i, map_sz, fds[i], + region_mmap_addr[i], page_sz); + } + + vlib_worker_thread_barrier_sync (vm); + unmap_all_mem_regions (vui); + for (i = 0; i < msg.memory.nregions; i++) + { + clib_memcpy (&(vui->regions[i]), &msg.memory.regions[i], + sizeof (vhost_user_memory_region_t)); + + vui->region_mmap_addr[i] = region_mmap_addr[i]; + vui->region_guest_addr_lo[i] = vui->regions[i].guest_phys_addr; + vui->region_guest_addr_hi[i] = vui->regions[i].guest_phys_addr + + vui->regions[i].memory_size; + vui->region_mmap_addr[i] += vui->regions[i].mmap_offset; vui->region_mmap_fd[i] = fds[i]; vui->nregions++; } + vlib_worker_thread_barrier_release (vm); break; case VHOST_USER_SET_VRING_NUM: @@ -598,22 +588,22 @@ vhost_user_socket_read (clib_file_t * uf) goto close_socket; } - vui->vrings[msg.state.index].desc = (vring_desc_t *) - map_user_mem (vui, msg.addr.desc_user_addr); - vui->vrings[msg.state.index].used = (vring_used_t *) - map_user_mem (vui, msg.addr.used_user_addr); - vui->vrings[msg.state.index].avail = (vring_avail_t *) - map_user_mem (vui, msg.addr.avail_user_addr); + vring_desc_t *desc = map_user_mem (vui, msg.addr.desc_user_addr); + vring_used_t *used = map_user_mem (vui, msg.addr.used_user_addr); + vring_avail_t *avail = map_user_mem (vui, msg.addr.avail_user_addr); - if ((vui->vrings[msg.state.index].desc == NULL) || - (vui->vrings[msg.state.index].used == NULL) || - (vui->vrings[msg.state.index].avail == NULL)) + if ((desc == NULL) || (used == NULL) || (avail == NULL)) { vu_log_debug (vui, "failed to map user memory for hw_if_index %d", vui->hw_if_index); goto close_socket; } + vlib_worker_thread_barrier_sync (vm); + vui->vrings[msg.state.index].desc = desc; + vui->vrings[msg.state.index].used = used; + vui->vrings[msg.state.index].avail = avail; + vui->vrings[msg.state.index].log_guest_addr = msg.addr.log_guest_addr; vui->vrings[msg.state.index].log_used = (msg.addr.flags & (1 << VHOST_VRING_F_LOG)) ? 1 : 0; @@ -621,9 +611,7 @@ vhost_user_socket_read (clib_file_t * uf) /* Spec says: If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized in an enabled state. 
*/ if (!(vui->features & (1 << FEAT_VHOST_USER_F_PROTOCOL_FEATURES))) - { - vui->vrings[msg.state.index].enabled = 1; - } + vui->vrings[msg.state.index].enabled = 1; vui->vrings[msg.state.index].last_used_idx = vui->vrings[msg.state.index].last_avail_idx = @@ -631,6 +619,8 @@ vhost_user_socket_read (clib_file_t * uf) /* tell driver that we don't want interrupts */ vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; + vlib_worker_thread_barrier_release (vm); + vhost_user_update_iface_state (vui); break; case VHOST_USER_SET_OWNER: @@ -709,8 +699,9 @@ vhost_user_socket_read (clib_file_t * uf) //When no kickfd is set, the queue is initialized as started vui->vrings[q].kickfd_idx = ~0; vui->vrings[q].started = 1; + vhost_user_thread_placement (vui, q); } - + vhost_user_update_iface_state (vui); break; case VHOST_USER_SET_VRING_ERR: @@ -731,14 +722,14 @@ vhost_user_socket_read (clib_file_t * uf) } else vui->vrings[q].errfd = -1; - break; case VHOST_USER_SET_VRING_BASE: vu_log_debug (vui, "if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d", vui->hw_if_index, msg.state.index, msg.state.num); - + vlib_worker_thread_barrier_sync (vm); vui->vrings[msg.state.index].last_avail_idx = msg.state.num; + vlib_worker_thread_barrier_release (vm); break; case VHOST_USER_GET_VRING_BASE: @@ -749,6 +740,8 @@ vhost_user_socket_read (clib_file_t * uf) goto close_socket; } + /* protection is needed to prevent rx/tx from changing last_avail_idx */ + vlib_worker_thread_barrier_sync (vm); /* * Copy last_avail_idx from the vring before closing it because * closing the vring also initializes the vring last_avail_idx @@ -757,68 +750,84 @@ vhost_user_socket_read (clib_file_t * uf) msg.flags |= 4; msg.size = sizeof (msg.state); - /* Spec says: Client must [...] stop ring upon receiving VHOST_USER_GET_VRING_BASE. */ + /* + * Spec says: Client must [...] stop ring upon receiving + * VHOST_USER_GET_VRING_BASE + */ vhost_user_vring_close (vui, msg.state.index); + vlib_worker_thread_barrier_release (vm); vu_log_debug (vui, "if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d", vui->hw_if_index, msg.state.index, msg.state.num); + n = + send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); + if (n != (msg.size + VHOST_USER_MSG_HDR_SZ)) + { + vu_log_debug (vui, "could not send message response"); + goto close_socket; + } + vhost_user_update_iface_state (vui); break; case VHOST_USER_NONE: vu_log_debug (vui, "if %d msg VHOST_USER_NONE", vui->hw_if_index); - break; case VHOST_USER_SET_LOG_BASE: - { - vu_log_debug (vui, "if %d msg VHOST_USER_SET_LOG_BASE", - vui->hw_if_index); - - if (msg.size != sizeof (msg.log)) - { - vu_log_debug (vui, "invalid msg size for VHOST_USER_SET_LOG_BASE:" - " %d instead of %d", msg.size, sizeof (msg.log)); - goto close_socket; - } + vu_log_debug (vui, "if %d msg VHOST_USER_SET_LOG_BASE", + vui->hw_if_index); - if (! 
- (vui->protocol_features & (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD))) - { - vu_log_debug (vui, "VHOST_USER_PROTOCOL_F_LOG_SHMFD not set but " - "VHOST_USER_SET_LOG_BASE received"); - goto close_socket; - } + if (msg.size != sizeof (msg.log)) + { + vu_log_debug (vui, "invalid msg size for VHOST_USER_SET_LOG_BASE:" + " %d instead of %d", msg.size, sizeof (msg.log)); + goto close_socket; + } - fd = fds[0]; - /* align size to page */ - long page_sz = get_huge_page_size (fd); - ssize_t map_sz = - (msg.log.size + msg.log.offset + page_sz - 1) & ~(page_sz - 1); + if (!(vui->protocol_features & (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD))) + { + vu_log_debug (vui, "VHOST_USER_PROTOCOL_F_LOG_SHMFD not set but " + "VHOST_USER_SET_LOG_BASE received"); + goto close_socket; + } - vui->log_base_addr = mmap (0, map_sz, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); + fd = fds[0]; + /* align size to page */ + long page_sz = get_huge_page_size (fd); + ssize_t map_sz = + (msg.log.size + msg.log.offset + page_sz - 1) & ~(page_sz - 1); - vu_log_debug (vui, "map log region addr 0 len 0x%lx off 0x%lx fd %d " - "mapped 0x%lx", map_sz, msg.log.offset, fd, - vui->log_base_addr); + void *log_base_addr = mmap (0, map_sz, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); - if (vui->log_base_addr == MAP_FAILED) - { - vu_log_err (vui, "failed to map memory. errno is %d", errno); - goto close_socket; - } + vu_log_debug (vui, "map log region addr 0 len 0x%lx off 0x%lx fd %d " + "mapped 0x%lx", map_sz, msg.log.offset, fd, + log_base_addr); - vui->log_base_addr += msg.log.offset; - vui->log_size = msg.log.size; + if (log_base_addr == MAP_FAILED) + { + vu_log_err (vui, "failed to map memory. errno is %d", errno); + goto close_socket; + } - msg.flags |= 4; - msg.size = sizeof (msg.u64); + vlib_worker_thread_barrier_sync (vm); + vui->log_base_addr = log_base_addr; + vui->log_base_addr += msg.log.offset; + vui->log_size = msg.log.size; + vlib_worker_thread_barrier_release (vm); - break; - } + msg.flags |= 4; + msg.size = sizeof (msg.u64); + n = + send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); + if (n != (msg.size + VHOST_USER_MSG_HDR_SZ)) + { + vu_log_debug (vui, "could not send message response"); + goto close_socket; + } + break; case VHOST_USER_SET_LOG_FD: vu_log_debug (vui, "if %d msg VHOST_USER_SET_LOG_FD", vui->hw_if_index); - break; case VHOST_USER_GET_PROTOCOL_FEATURES: @@ -828,14 +837,19 @@ vhost_user_socket_read (clib_file_t * uf) msg.size = sizeof (msg.u64); vu_log_debug (vui, "if %d msg VHOST_USER_GET_PROTOCOL_FEATURES - " "reply 0x%016llx", vui->hw_if_index, msg.u64); + n = + send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); + if (n != (msg.size + VHOST_USER_MSG_HDR_SZ)) + { + vu_log_debug (vui, "could not send message response"); + goto close_socket; + } break; case VHOST_USER_SET_PROTOCOL_FEATURES: vu_log_debug (vui, "if %d msg VHOST_USER_SET_PROTOCOL_FEATURES " "features 0x%016llx", vui->hw_if_index, msg.u64); - vui->protocol_features = msg.u64; - break; case VHOST_USER_GET_QUEUE_NUM: @@ -844,6 +858,13 @@ vhost_user_socket_read (clib_file_t * uf) msg.size = sizeof (msg.u64); vu_log_debug (vui, "if %d msg VHOST_USER_GET_QUEUE_NUM - reply %d", vui->hw_if_index, msg.u64); + n = + send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); + if (n != (msg.size + VHOST_USER_MSG_HDR_SZ)) + { + vu_log_debug (vui, "could not send message response"); + goto close_socket; + } break; case VHOST_USER_SET_VRING_ENABLE: @@ -858,6 +879,8 @@ vhost_user_socket_read (clib_file_t * uf) 
} vui->vrings[msg.state.index].enabled = msg.state.num; + vhost_user_thread_placement (vui, msg.state.index); + vhost_user_update_iface_state (vui); break; default: @@ -866,26 +889,13 @@ vhost_user_socket_read (clib_file_t * uf) goto close_socket; } - /* if we need to reply */ - if (msg.flags & 4) - { - n = - send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); - if (n != (msg.size + VHOST_USER_MSG_HDR_SZ)) - { - vu_log_debug (vui, "could not send message response"); - goto close_socket; - } - } - - vhost_user_update_iface_state (vui); - vlib_worker_thread_barrier_release (vlib_get_main ()); return 0; close_socket: + vlib_worker_thread_barrier_sync (vm); vhost_user_if_disconnect (vui); + vlib_worker_thread_barrier_release (vm); vhost_user_update_iface_state (vui); - vlib_worker_thread_barrier_release (vlib_get_main ()); return 0; } @@ -900,7 +910,6 @@ vhost_user_socket_error (clib_file_t * uf) vu_log_debug (vui, "socket error on if %d", vui->sw_if_index); vlib_worker_thread_barrier_sync (vm); vhost_user_if_disconnect (vui); - vhost_user_rx_thread_placement (); vlib_worker_thread_barrier_release (vm); return 0; } @@ -984,7 +993,7 @@ vhost_user_send_interrupt_process (vlib_main_t * vm, f64 timeout = 3153600000.0 /* 100 years */ ; uword event_type, *event_data = 0; vhost_user_main_t *vum = &vhost_user_main; - u16 *queue; + u16 qid; f64 now, poll_time_remaining; f64 next_timeout; u8 stop_timer = 0; @@ -1022,13 +1031,13 @@ vhost_user_send_interrupt_process (vlib_main_t * vm, /* *INDENT-OFF* */ pool_foreach (vui, vum->vhost_user_interfaces, { next_timeout = timeout; - vec_foreach (queue, vui->rx_queues) + for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid += 2) { - vhost_user_vring_t *rxvq = - &vui->vrings[VHOST_VRING_IDX_RX (*queue)]; - vhost_user_vring_t *txvq = - &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; + vhost_user_vring_t *rxvq = &vui->vrings[qid]; + vhost_user_vring_t *txvq = &vui->vrings[qid + 1]; + if (txvq->qid == -1) + continue; if (txvq->n_since_last_int) { if (now >= txvq->int_deadline) @@ -1196,6 +1205,24 @@ vhost_user_term_if (vhost_user_intf_t * vui) for (q = 0; q < VHOST_VRING_MAX_N; q++) { + // Remove existing queue mapping for the interface + if (q & 1) + { + int rv; + vnet_main_t *vnm = vnet_get_main (); + vhost_user_vring_t *txvq = &vui->vrings[q]; + + if (txvq->qid != -1) + { + rv = vnet_hw_interface_unassign_rx_thread (vnm, + vui->hw_if_index, + q >> 1); + if (rv) + vu_log_warn (vui, "unable to unassign interface %d, " + "queue %d: rc=%d", vui->hw_if_index, q >> 1, rv); + } + } + clib_mem_free ((void *) vui->vring_locks[q]); } @@ -1220,7 +1247,7 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) vhost_user_intf_t *vui; int rv = 0; vnet_hw_interface_t *hwif; - u16 *queue; + u16 qid; if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) || hwif->dev_class_index != vhost_user_device_class.index) @@ -1231,27 +1258,28 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) vu_log_debug (vui, "Deleting vhost-user interface %s (instance %d)", hwif->name, hwif->dev_instance); - vec_foreach (queue, vui->rx_queues) - { - vhost_user_vring_t *txvq; + for (qid = 1; qid < VHOST_VRING_MAX_N / 2; qid += 2) + { + vhost_user_vring_t *txvq = &vui->vrings[qid]; - txvq = &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; - if ((vum->ifq_count > 0) && - ((txvq->mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || - (txvq->mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE))) - { - vum->ifq_count--; - // Stop the timer if there is no more 
interrupt interface/queue - if ((vum->ifq_count == 0) && - (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) - { - vlib_process_signal_event (vm, - vhost_user_send_interrupt_node.index, - VHOST_USER_EVENT_STOP_TIMER, 0); - break; - } - } - } + if (txvq->qid == -1) + continue; + if ((vum->ifq_count > 0) && + ((txvq->mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || + (txvq->mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE))) + { + vum->ifq_count--; + // Stop the timer if there is no more interrupt interface/queue + if ((vum->ifq_count == 0) && + (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) + { + vlib_process_signal_event (vm, + vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_STOP_TIMER, 0); + break; + } + } + } // Disable and reset interface vhost_user_term_if (vui); @@ -1467,12 +1495,16 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, } } + /* Protect the uninitialized vui from being dispatched by rx/tx */ + vlib_worker_thread_barrier_sync (vm); pool_get (vhost_user_main.vhost_user_interfaces, vui); - vhost_user_create_ethernet (vnm, vm, vui, hwaddr); + vlib_worker_thread_barrier_release (vm); + vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename, feature_mask, &sw_if_idx); vnet_sw_interface_set_mtu (vnm, vui->sw_if_index, 9000); + vhost_user_rx_thread_placement (vui, 1); if (renumber) vnet_interface_name_renumber (sw_if_idx, custom_dev_instance); @@ -1711,7 +1743,7 @@ show_vhost_user_command_fn (vlib_main_t * vm, vhost_user_intf_t *vui; u32 hw_if_index, *hw_if_indices = 0; vnet_hw_interface_t *hi; - u16 *queue; + u16 qid; u32 ci; int i, j, q; int show_descr = 0; @@ -1818,20 +1850,24 @@ show_vhost_user_command_fn (vlib_main_t * vm, vlib_cli_output (vm, " rx placement: "); - vec_foreach (queue, vui->rx_queues) - { - vnet_main_t *vnm = vnet_get_main (); - uword thread_index; - vnet_hw_interface_rx_mode mode; - - thread_index = vnet_get_device_input_thread_index (vnm, - vui->hw_if_index, - *queue); - vnet_hw_interface_get_rx_mode (vnm, vui->hw_if_index, *queue, &mode); - vlib_cli_output (vm, " thread %d on vring %d, %U\n", - thread_index, VHOST_VRING_IDX_TX (*queue), - format_vnet_hw_interface_rx_mode, mode); - } + for (qid = 1; qid < VHOST_VRING_MAX_N / 2; qid += 2) + { + vnet_main_t *vnm = vnet_get_main (); + uword thread_index; + vnet_hw_interface_rx_mode mode; + vhost_user_vring_t *txvq = &vui->vrings[qid]; + + if (txvq->qid == -1) + continue; + thread_index = + vnet_get_device_input_thread_index (vnm, vui->hw_if_index, + qid >> 1); + vnet_hw_interface_get_rx_mode (vnm, vui->hw_if_index, qid >> 1, + &mode); + vlib_cli_output (vm, " thread %d on vring %d, %U\n", + thread_index, qid, + format_vnet_hw_interface_rx_mode, mode); + } vlib_cli_output (vm, " tx placement: %s\n", vui->use_tx_spinlock ? "spin-lock" : "lock-free"); @@ -1986,6 +2022,7 @@ VLIB_CLI_COMMAND (vhost_user_connect_command, static) = { .short_help = "create vhost-user socket [server] " "[feature-mask ] [hwaddr ] [renumber ] ", .function = vhost_user_connect_command_fn, + .is_mp_safe = 1, }; /* *INDENT-ON* */ diff --git a/src/vnet/devices/virtio/vhost_user.h b/src/vnet/devices/virtio/vhost_user.h index f2ed2dffd468..7dadfed23340 100644 --- a/src/vnet/devices/virtio/vhost_user.h +++ b/src/vnet/devices/virtio/vhost_user.h @@ -250,6 +250,14 @@ typedef struct /* The rx queue policy (interrupt/adaptive/polling) for this queue */ u32 mode; + + /* + * It contains the device queue number. -1 if it does not. 
The idea is + * to not invoke vnet_hw_interface_assign_rx_thread and + * vnet_hw_interface_unassign_rx_thread more than once for the duration of + * the interface even if it is disconnected and reconnected. + */ + i16 qid; } vhost_user_vring_t; #define VHOST_USER_EVENT_START_TIMER 1 @@ -293,9 +301,6 @@ typedef struct /* Whether to use spinlock or per_cpu_tx_qid assignment */ u8 use_tx_spinlock; u16 *per_cpu_tx_qid; - - /* Vector of active rx queues for this interface */ - u16 *rx_queues; } vhost_user_intf_t; typedef struct diff --git a/src/vnet/devices/virtio/vhost_user_api.c b/src/vnet/devices/virtio/vhost_user_api.c index 016ccbd26878..bd5f7e5773d9 100644 --- a/src/vnet/devices/virtio/vhost_user_api.c +++ b/src/vnet/devices/virtio/vhost_user_api.c @@ -244,6 +244,9 @@ vhost_user_api_hookup (vlib_main_t * vm) foreach_vpe_api_msg; #undef _ + /* Mark CREATE_VHOST_USER_IF as mp safe */ + am->is_mp_safe[VL_API_CREATE_VHOST_USER_IF] = 1; + /* * Set up the (msg_name, crc, message-id) table */ diff --git a/src/vnet/devices/virtio/virtio.h b/src/vnet/devices/virtio/virtio.h index 5fc521672d9a..8ac87c8ccfd6 100644 --- a/src/vnet/devices/virtio/virtio.h +++ b/src/vnet/devices/virtio/virtio.h @@ -86,6 +86,7 @@ typedef struct u32 call_file_index; u32 *buffers; u16 last_used_idx; + u16 last_kick_avail_idx; } virtio_vring_t; typedef struct @@ -136,6 +137,16 @@ extern void virtio_free_used_desc (vlib_main_t * vm, virtio_vring_t * vring); format_function_t format_virtio_device_name; +static_always_inline void +virtio_kick (virtio_vring_t * vring) +{ + u64 x = 1; + int __clib_unused r; + + r = write (vring->kick_fd, &x, sizeof (x)); + vring->last_kick_avail_idx = vring->avail->idx; +} + #endif /* _VNET_DEVICES_VIRTIO_VIRTIO_H_ */ /* diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index 6742bff41aa6..bc4db0cd6966 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -673,7 +673,8 @@ replicate_inline (vlib_main_t * vm, vec_validate (rm->clones[thread_index], rep0->rep_n_buckets - 1); num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[thread_index], - rep0->rep_n_buckets, 128); + rep0->rep_n_buckets, + VLIB_BUFFER_CLONE_HEAD_SIZE); if (num_cloned != rep0->rep_n_buckets) { diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index 0034577694c7..53d5b4eb02d9 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -657,9 +657,9 @@ ethernet_input_inline (vlib_main_t * vm, (hi->hw_address != 0) && !eth_mac_equal ((u8 *) e0, hi->hw_address)) error0 = ETHERNET_ERROR_L3_MAC_MISMATCH; + vlib_buffer_advance (b0, sizeof (ethernet_header_t)); determine_next_node (em, variant, 0, type0, b0, &error0, &next0); - vlib_buffer_advance (b0, sizeof (ethernet_header_t)); } goto ship_it0; } diff --git a/src/vnet/fib/fib_api.c b/src/vnet/fib/fib_api.c index 3c832eb01fbd..1a7d6fde5736 100644 --- a/src/vnet/fib/fib_api.c +++ b/src/vnet/fib/fib_api.c @@ -55,10 +55,20 @@ fib_path_api_parse (const vl_api_fib_path_t *in, out->frp_proto = in->afi; // .frp_addr = (NULL == next_hop ? 
zero_addr : *next_hop), out->frp_sw_if_index = ntohl(in->sw_if_index); - out->frp_fib_index = ntohl(in->table_id); out->frp_weight = in->weight; out->frp_preference = in->preference; + if (DPO_PROTO_IP4 == out->frp_proto || + DPO_PROTO_IP6 == out->frp_proto || + DPO_PROTO_MPLS == out->frp_proto) + { + out->frp_fib_index = fib_table_find (dpo_proto_to_fib(out->frp_proto), + ntohl (in->table_id)); + + if (~0 == out->frp_fib_index) + return (VNET_API_ERROR_NO_SUCH_FIB); + } + /* * the special INVALID label meams we are not recursing via a * label. Exp-null value is never a valid via-label so that diff --git a/src/vnet/fib/fib_walk.c b/src/vnet/fib/fib_walk.c index d0942401153f..3fe586e37d68 100644 --- a/src/vnet/fib/fib_walk.c +++ b/src/vnet/fib/fib_walk.c @@ -350,7 +350,9 @@ fib_walk_advance (fib_node_index_t fwi) while (ii < n_ctxs) { - wrc = fib_node_back_walk_one(&sibling, &fwalk->fw_ctx[ii]); + fib_node_back_walk_ctx_t ctx = fwalk->fw_ctx[ii]; + + wrc = fib_node_back_walk_one(&sibling, &ctx); ii++; fwalk = fib_walk_get(fwi); diff --git a/src/vnet/interface_api.c b/src/vnet/interface_api.c index 644babef894b..8713ae6e48b0 100644 --- a/src/vnet/interface_api.c +++ b/src/vnet/interface_api.c @@ -1129,6 +1129,7 @@ vl_api_create_vlan_subif_t_handler (vl_api_create_vlan_subif_t * mp) memset (&template, 0, sizeof (template)); template.type = VNET_SW_INTERFACE_TYPE_SUB; + template.flood_class = VNET_FLOOD_CLASS_NORMAL; template.sup_sw_if_index = hi->sw_if_index; template.sub.id = id; template.sub.eth.raw_flags = 0; @@ -1209,6 +1210,7 @@ vl_api_create_subif_t_handler (vl_api_create_subif_t * mp) memset (&template, 0, sizeof (template)); template.type = VNET_SW_INTERFACE_TYPE_SUB; + template.flood_class = VNET_FLOOD_CLASS_NORMAL; template.sup_sw_if_index = sw_if_index; template.sub.id = sub_id; template.sub.eth.flags.no_tags = mp->no_tags; @@ -1376,6 +1378,11 @@ interface_api_hookup (vlib_main_t * vm) foreach_vpe_api_msg; #undef _ + /* Mark these APIs as mp safe */ + am->is_mp_safe[VL_API_SW_INTERFACE_DUMP] = 1; + am->is_mp_safe[VL_API_SW_INTERFACE_DETAILS] = 1; + am->is_mp_safe[VL_API_SW_INTERFACE_TAG_ADD_DEL] = 1; + /* * Set up the (msg_name, crc, message-id) table */ diff --git a/src/vnet/interface_cli.c b/src/vnet/interface_cli.c index 360898ea0fb1..fdbc1a82f6fa 100644 --- a/src/vnet/interface_cli.c +++ b/src/vnet/interface_cli.c @@ -469,6 +469,7 @@ VLIB_CLI_COMMAND (show_sw_interfaces_command, static) = { .path = "show interface", .short_help = "show interface [address|addr|features|feat] [ [ [..]]] [verbose]", .function = show_sw_interfaces, + .is_mp_safe = 1, }; /* *INDENT-ON* */ diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 69a8dbad8050..9dac828a77de 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1750,7 +1750,7 @@ ip4_arp_inline (vlib_main_t * vm, u32 *from, *to_next_drop; uword n_left_from, n_left_to_next_drop, next_index; u32 thread_index = vm->thread_index; - u32 seed; + u64 seed; if (node->flags & VLIB_NODE_FLAG_TRACE) ip4_forward_next_trace (vm, node, frame, VLIB_TX); @@ -1770,10 +1770,11 @@ ip4_arp_inline (vlib_main_t * vm, while (n_left_from > 0 && n_left_to_next_drop > 0) { - u32 pi0, adj_index0, r0, sw_if_index0, drop0; + u32 pi0, adj_index0, sw_if_index0, drop0; ip_adjacency_t *adj0; vlib_buffer_t *p0; ip4_header_t *ip0; + u64 r0; pi0 = from[0]; @@ -1798,6 +1799,9 @@ ip4_arp_inline (vlib_main_t * vm, { r0 = adj0->sub_type.nbr.next_hop.ip4.data_u32; } + /* combine the address and interface for the hash key */ + r0 = r0 << 32; 
+ r0 |= sw_if_index0; drop0 = throttle_check (&im->arp_throttle, thread_index, r0, seed); @@ -2344,7 +2348,7 @@ ip4_rewrite_inline (vlib_main_t * vm, adj0->sub_type.midchain.fixup_func (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data); adj1->sub_type.midchain.fixup_func - (vm, adj1, b[1], adj0->sub_type.midchain.fixup_data); + (vm, adj1, b[1], adj1->sub_type.midchain.fixup_data); } if (is_mcast) @@ -2356,7 +2360,7 @@ ip4_rewrite_inline (vlib_main_t * vm, adj0->rewrite_header.dst_mcast_offset, &ip0->dst_address.as_u32, (u8 *) ip0); vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, - adj0->rewrite_header.dst_mcast_offset, + adj1->rewrite_header.dst_mcast_offset, &ip1->dst_address.as_u32, (u8 *) ip1); } diff --git a/src/vnet/ip/ip4_mtrie.c b/src/vnet/ip/ip4_mtrie.c old mode 100644 new mode 100755 index 97c250746393..fbb8a7480742 --- a/src/vnet/ip/ip4_mtrie.c +++ b/src/vnet/ip/ip4_mtrie.c @@ -369,10 +369,10 @@ set_leaf (ip4_fib_mtrie_t * m, old_ply->n_non_empty_leafs -= ip4_fib_mtrie_leaf_is_non_empty (old_ply, dst_byte); - new_leaf = ply_create (m, old_leaf, - clib_max (old_ply->dst_address_bits_of_leaves - [dst_byte], ply_base_len), - ply_base_len); + new_leaf = + ply_create (m, old_leaf, + old_ply->dst_address_bits_of_leaves[dst_byte], + ply_base_len); new_ply = get_next_ply_for_leaf (m, new_leaf); /* Refetch since ply_create may move pool. */ @@ -492,10 +492,10 @@ set_root_leaf (ip4_fib_mtrie_t * m, if (ip4_fib_mtrie_leaf_is_terminal (old_leaf)) { /* There is a leaf occupying the slot. Replace it with a new ply */ - new_leaf = ply_create (m, old_leaf, - clib_max (old_ply->dst_address_bits_of_leaves - [dst_byte], ply_base_len), - ply_base_len); + new_leaf = + ply_create (m, old_leaf, + old_ply->dst_address_bits_of_leaves[dst_byte], + ply_base_len); new_ply = get_next_ply_for_leaf (m, new_leaf); __sync_val_compare_and_swap (&old_ply->leaves[dst_byte], old_leaf, @@ -551,9 +551,7 @@ unset_leaf (ip4_fib_mtrie_t * m, old_ply->leaves[i] = ip4_fib_mtrie_leaf_set_adj_index (a->cover_adj_index); - old_ply->dst_address_bits_of_leaves[i] = - clib_max (old_ply->dst_address_bits_base, - a->cover_address_length); + old_ply->dst_address_bits_of_leaves[i] = a->cover_address_length; old_ply->n_non_empty_leafs += ip4_fib_mtrie_leaf_is_non_empty (old_ply, i); @@ -714,24 +712,23 @@ format_ip4_fib_mtrie_leaf (u8 * s, va_list * va) return s; } -#define FORMAT_PLY(s, _p, _i, _base_address, _ply_max_len, _indent) \ +#define FORMAT_PLY(s, _p, _a, _i, _base_address, _ply_max_len, _indent) \ ({ \ u32 a, ia_length; \ ip4_address_t ia; \ ip4_fib_mtrie_leaf_t _l = p->leaves[(_i)]; \ \ - a = (_base_address) + ((_i) << (32 - (_ply_max_len))); \ + a = (_base_address) + ((_a) << (32 - (_ply_max_len))); \ ia.as_u32 = clib_host_to_net_u32 (a); \ ia_length = (_p)->dst_address_bits_of_leaves[(_i)]; \ - s = format (s, "\n%U%20U %U", \ - format_white_space, (_indent) + 2, \ + s = format (s, "\n%U%U %U", \ + format_white_space, (_indent) + 4, \ format_ip4_address_and_length, &ia, ia_length, \ format_ip4_fib_mtrie_leaf, _l); \ \ if (ip4_fib_mtrie_leaf_is_next_ply (_l)) \ - s = format (s, "\n%U%U", \ - format_white_space, (_indent) + 2, \ - format_ip4_fib_mtrie_ply, m, a, \ + s = format (s, "\n%U", \ + format_ip4_fib_mtrie_ply, m, a, (_indent) + 8, \ ip4_fib_mtrie_leaf_get_next_ply_index (_l)); \ s; \ }) @@ -741,21 +738,20 @@ format_ip4_fib_mtrie_ply (u8 * s, va_list * va) { ip4_fib_mtrie_t *m = va_arg (*va, ip4_fib_mtrie_t *); u32 base_address = va_arg (*va, u32); + u32 indent = va_arg (*va, u32); u32 ply_index = va_arg (*va, 
u32); ip4_fib_mtrie_8_ply_t *p; - u32 indent; int i; p = pool_elt_at_index (ip4_ply_pool, ply_index); - indent = format_get_indent (s); - s = format (s, "ply index %d, %d non-empty leaves", ply_index, - p->n_non_empty_leafs); + s = format (s, "%Uply index %d, %d non-empty leaves", + format_white_space, indent, ply_index, p->n_non_empty_leafs); for (i = 0; i < ARRAY_LEN (p->leaves); i++) { if (ip4_fib_mtrie_leaf_is_non_empty (p, i)) { - s = FORMAT_PLY (s, p, i, base_address, + s = FORMAT_PLY (s, p, i, i, base_address, p->dst_address_bits_base + 8, indent); } } @@ -791,7 +787,7 @@ format_ip4_fib_mtrie (u8 * s, va_list * va) if (p->dst_address_bits_of_leaves[slot] > 0) { - s = FORMAT_PLY (s, p, slot, base_address, 16, 2); + s = FORMAT_PLY (s, p, i, slot, base_address, 16, 0); } } } diff --git a/src/vnet/ip/ip6.h b/src/vnet/ip/ip6.h index bc89a0821ae4..e807886cd2af 100644 --- a/src/vnet/ip/ip6.h +++ b/src/vnet/ip/ip6.h @@ -49,6 +49,7 @@ #include #include #include +#include /* * Default size of the ip6 fib hash table @@ -220,10 +221,7 @@ typedef struct ip6_main_t u8 hbh_enabled; /** ND throttling */ - uword **nd_throttle_bitmaps; - u64 *nd_throttle_seeds; - f64 *nd_throttle_last_seed_change_time; - + throttle_t nd_throttle; } ip6_main_t; #define ND_THROTTLE_BITS 512 diff --git a/src/vnet/ip/ip6_input.c b/src/vnet/ip/ip6_input.c index 977d2703d191..a01920a7b487 100644 --- a/src/vnet/ip/ip6_input.c +++ b/src/vnet/ip/ip6_input.c @@ -277,16 +277,9 @@ ip6_main_loop_enter (vlib_main_t * vm) { ip6_main_t *im = &ip6_main; vlib_thread_main_t *tm = &vlib_thread_main; - u32 n_vlib_mains = tm->n_vlib_mains; - int i; - vec_validate (im->nd_throttle_bitmaps, n_vlib_mains); - vec_validate (im->nd_throttle_seeds, n_vlib_mains); - vec_validate (im->nd_throttle_last_seed_change_time, n_vlib_mains); + throttle_init (&im->nd_throttle, tm->n_vlib_mains, 1e-3); - for (i = 0; i < n_vlib_mains; i++) - vec_validate (im->nd_throttle_bitmaps[i], - (ND_THROTTLE_BITS / BITS (uword)) - 1); return 0; } diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 8466ba703135..b6889157cab6 100755 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -3163,7 +3163,6 @@ ip6_discover_neighbor_inline (vlib_main_t * vm, ip_lookup_main_t *lm = &im->lookup_main; u32 *from, *to_next_drop; uword n_left_from, n_left_to_next_drop; - f64 time_now; u64 seed; u32 thread_index = vm->thread_index; int bogus_length; @@ -3172,16 +3171,7 @@ ip6_discover_neighbor_inline (vlib_main_t * vm, if (node->flags & VLIB_NODE_FLAG_TRACE) ip6_forward_next_trace (vm, node, frame, VLIB_TX); - time_now = vlib_time_now (vm); - if (time_now - im->nd_throttle_last_seed_change_time[thread_index] > 1e-3) - { - (void) random_u64 (&im->nd_throttle_seeds[thread_index]); - memset (im->nd_throttle_bitmaps[thread_index], 0, - ND_THROTTLE_BITS / BITS (u8)); - - im->nd_throttle_last_seed_change_time[thread_index] = time_now; - } - seed = im->nd_throttle_seeds[thread_index]; + seed = throttle_seed (&im->nd_throttle, thread_index, vlib_time_now (vm)); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -3193,15 +3183,12 @@ ip6_discover_neighbor_inline (vlib_main_t * vm, while (n_left_from > 0 && n_left_to_next_drop > 0) { - vlib_buffer_t *p0; - ip6_header_t *ip0; - u32 pi0, adj_index0, w0, sw_if_index0, drop0; - u64 r0; - uword m0; - ip_adjacency_t *adj0; + u32 pi0, adj_index0, sw_if_index0, drop0, r0, next0; vnet_hw_interface_t *hw_if0; ip6_radv_t *radv_info; - u32 next0; + ip_adjacency_t *adj0; + vlib_buffer_t *p0; + 
ip6_header_t *ip0; pi0 = from[0]; @@ -3224,18 +3211,10 @@ ip6_discover_neighbor_inline (vlib_main_t * vm, sw_if_index0 = adj0->rewrite_header.sw_if_index; vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0; - /* Compute the ND throttle bitmap hash */ - r0 = ip0->dst_address.as_u64[0] ^ ip0->dst_address.as_u64[1] ^ seed; - - /* Find the word and bit */ - r0 &= ND_THROTTLE_BITS - 1; - w0 = r0 / BITS (uword); - m0 = (uword) 1 << (r0 % BITS (uword)); + /* combine the address and interface for a hash */ + r0 = ip6_address_hash_to_u64 (&ip0->dst_address) ^ sw_if_index0; - /* If the bit is set, drop the ND request */ - drop0 = (im->nd_throttle_bitmaps[thread_index][w0] & m0) != 0; - /* (unconditionally) mark the bit "inuse" */ - im->nd_throttle_bitmaps[thread_index][w0] |= m0; + drop0 = throttle_check (&im->nd_throttle, thread_index, r0, seed); from += 1; n_left_from -= 1; diff --git a/src/vnet/ip/ip6_packet.h b/src/vnet/ip/ip6_packet.h index ea2fa155b5ed..cdcffa59b8b4 100644 --- a/src/vnet/ip/ip6_packet.h +++ b/src/vnet/ip/ip6_packet.h @@ -347,6 +347,18 @@ ip6_is_solicited_node_multicast_address (const ip6_address_t * a) && a->as_u8[12] == 0xff); } +always_inline u32 +ip6_address_hash_to_u32 (const ip6_address_t * a) +{ + return (a->as_u32[0] ^ a->as_u32[1] ^ a->as_u32[2] ^ a->as_u32[3]); +} + +always_inline u64 +ip6_address_hash_to_u64 (const ip6_address_t * a) +{ + return (a->as_u64[0] ^ a->as_u64[1]); +} + typedef struct { /* 4 bit version, 8 bit traffic class and 20 bit flow label. */ diff --git a/src/vnet/ip/ip6_reassembly.c b/src/vnet/ip/ip6_reassembly.c index 50445f2a1816..0162ad98e635 100644 --- a/src/vnet/ip/ip6_reassembly.c +++ b/src/vnet/ip/ip6_reassembly.c @@ -32,7 +32,12 @@ #define IP6_REASS_MAX_REASSEMBLIES_DEFAULT 1024 #define IP6_REASS_HT_LOAD_FACTOR (0.75) -static vlib_node_registration_t ip6_reass_node; +typedef enum +{ + IP6_REASS_RC_OK, + IP6_REASS_RC_INTERNAL_ERROR, + IP6_REASS_RC_NO_BUF, +} ip6_reass_rc_t; typedef struct { @@ -51,37 +56,21 @@ typedef struct }; } ip6_reass_key_t; -always_inline u32 -ip6_reass_buffer_get_data_offset_no_check (vlib_buffer_t * b) -{ - vnet_buffer_opaque_t *vnb = vnet_buffer (b); - return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first; -} - always_inline u32 ip6_reass_buffer_get_data_offset (vlib_buffer_t * b) { vnet_buffer_opaque_t *vnb = vnet_buffer (b); - ASSERT (vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first); - return ip6_reass_buffer_get_data_offset_no_check (b); + return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first; } always_inline u16 -ip6_reass_buffer_get_data_len_no_check (vlib_buffer_t * b) +ip6_reass_buffer_get_data_len (vlib_buffer_t * b) { vnet_buffer_opaque_t *vnb = vnet_buffer (b); return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) - (vnb->ip.reass.fragment_first + ip6_reass_buffer_get_data_offset (b)) + 1; } -always_inline u16 -ip6_reass_buffer_get_data_len (vlib_buffer_t * b) -{ - vnet_buffer_opaque_t *vnb = vnet_buffer (b); - ASSERT (vnb->ip.reass.range_last > vnb->ip.reass.fragment_first); - return ip6_reass_buffer_get_data_len_no_check (b); -} - typedef struct { // hash table key @@ -108,7 +97,6 @@ typedef struct { ip6_reass_t *pool; u32 reass_n; - u32 buffers_n; u32 id_counter; clib_spinlock_t lock; } ip6_reass_per_thread_t; @@ -188,8 +176,8 @@ ip6_reass_trace_details (vlib_main_t * vm, u32 bi, vnet_buffer_opaque_t *vnb = vnet_buffer (b); trace->range_first = vnb->ip.reass.range_first; trace->range_last = vnb->ip.reass.range_last; - trace->data_offset = 
ip6_reass_buffer_get_data_offset_no_check (b); - trace->data_len = ip6_reass_buffer_get_data_len_no_check (b); + trace->data_offset = ip6_reass_buffer_get_data_offset (b); + trace->data_len = ip6_reass_buffer_get_data_len (b); trace->range_bi = bi; } @@ -297,11 +285,12 @@ ip6_reass_free (ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt, always_inline void ip6_reass_drop_all (vlib_main_t * vm, ip6_reass_main_t * rm, - ip6_reass_t * reass, u32 ** vec_drop_bi) + ip6_reass_t * reass) { u32 range_bi = reass->first_bi; vlib_buffer_t *range_b; vnet_buffer_opaque_t *range_vnb; + u32 *to_free = NULL; while (~0 != range_bi) { range_b = vlib_get_buffer (vm, range_bi); @@ -309,7 +298,7 @@ ip6_reass_drop_all (vlib_main_t * vm, ip6_reass_main_t * rm, u32 bi = range_bi; while (~0 != bi) { - vec_add1 (*vec_drop_bi, bi); + vec_add1 (to_free, bi); vlib_buffer_t *b = vlib_get_buffer (vm, bi); if (b->flags & VLIB_BUFFER_NEXT_PRESENT) { @@ -323,12 +312,14 @@ ip6_reass_drop_all (vlib_main_t * vm, ip6_reass_main_t * rm, } range_bi = range_vnb->ip.reass.next_range_bi; } + vlib_buffer_free (vm, to_free, vec_len (to_free)); + vec_free (to_free); } always_inline void ip6_reass_on_timeout (vlib_main_t * vm, vlib_node_runtime_t * node, ip6_reass_main_t * rm, ip6_reass_t * reass, - u32 * icmp_bi, u32 ** vec_timeout) + u32 * icmp_bi) { if (~0 == reass->first_bi) { @@ -358,14 +349,13 @@ ip6_reass_on_timeout (vlib_main_t * vm, vlib_node_runtime_t * node, ICMP6_time_exceeded_fragment_reassembly_time_exceeded, 0); } - ip6_reass_drop_all (vm, rm, reass, vec_timeout); + ip6_reass_drop_all (vm, rm, reass); } always_inline ip6_reass_t * ip6_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node, ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt, - ip6_reass_key_t * k, u32 * icmp_bi, - u32 ** vec_timeout) + ip6_reass_key_t * k, u32 * icmp_bi) { ip6_reass_t *reass = NULL; f64 now = vlib_time_now (rm->vlib_main); @@ -382,7 +372,7 @@ ip6_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node, reass = pool_elt_at_index (rt->pool, value.value); if (now > reass->last_heard + rm->timeout) { - ip6_reass_on_timeout (vm, node, rm, reass, icmp_bi, vec_timeout); + ip6_reass_on_timeout (vm, node, rm, reass, icmp_bi); ip6_reass_free (rm, rt, reass); reass = NULL; } @@ -430,13 +420,12 @@ ip6_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node, return reass; } -always_inline void +always_inline ip6_reass_rc_t ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt, - ip6_reass_t * reass, u32 * bi0, u32 * next0, - u32 * error0, u32 ** vec_drop_compress, bool is_feature) + ip6_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0, + bool is_feature) { - ASSERT (~0 != reass->first_bi); *bi0 = reass->first_bi; *error0 = IP6_ERROR_NONE; ip6_frag_hdr_t *frag_hdr; @@ -445,10 +434,20 @@ ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, u32 total_length = 0; u32 buf_cnt = 0; u32 dropped_cnt = 0; + u32 *vec_drop_compress = NULL; + ip6_reass_rc_t rv = IP6_REASS_RC_OK; do { u32 tmp_bi = sub_chain_bi; vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi); + vnet_buffer_opaque_t *vnb = vnet_buffer (tmp); + if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) && + !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first)) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } + u32 data_len = ip6_reass_buffer_get_data_len (tmp); u32 trim_front = vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr) 
+ ip6_reass_buffer_get_data_offset (tmp); @@ -457,12 +456,20 @@ ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, if (tmp_bi == reass->first_bi) { /* first buffer - keep ip6 header */ - ASSERT (0 == ip6_reass_buffer_get_data_offset (tmp)); + if (0 != ip6_reass_buffer_get_data_offset (tmp)) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } trim_front = 0; trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len - (vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr)); - ASSERT (vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0); + if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0)) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } } u32 keep_data = vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end; @@ -474,10 +481,13 @@ ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, if (trim_front > tmp->current_length) { /* drop whole buffer */ - vec_add1 (*vec_drop_compress, tmp_bi); - ++dropped_cnt; + vec_add1 (vec_drop_compress, tmp_bi); trim_front -= tmp->current_length; - ASSERT (tmp->flags & VLIB_BUFFER_NEXT_PRESENT); + if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT; tmp_bi = tmp->next_buffer; tmp = vlib_get_buffer (vm, tmp_bi); @@ -505,14 +515,22 @@ ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, else { keep_data -= tmp->current_length; - ASSERT (tmp->flags & VLIB_BUFFER_NEXT_PRESENT); + if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } } total_length += tmp->current_length; } else { - vec_add1 (*vec_drop_compress, tmp_bi); - ASSERT (reass->first_bi != tmp_bi); + vec_add1 (vec_drop_compress, tmp_bi); + if (reass->first_bi == tmp_bi) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } ++dropped_cnt; } if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT) @@ -531,10 +549,18 @@ ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, } while (~0 != sub_chain_bi); - ASSERT (last_b != NULL); + if (!last_b) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT; vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi); - ASSERT (total_length >= first_b->current_length); + if (total_length < first_b->current_length) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } total_length -= first_b->current_length; first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; first_b->total_length_not_including_first_buffer = total_length; @@ -553,7 +579,11 @@ ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, { ip->protocol = frag_hdr->next_hdr; } - ASSERT ((u8 *) frag_hdr - (u8 *) ip == ip6_frag_hdr_offset); + if (!((u8 *) frag_hdr - (u8 *) ip == ip6_frag_hdr_offset)) + { + rv = IP6_REASS_RC_INTERNAL_ERROR; + goto free_buffers_and_return; + } memmove (frag_hdr, (u8 *) frag_hdr + sizeof (*frag_hdr), first_b->current_length - ip6_frag_hdr_offset - sizeof (ip6_frag_hdr_t)); @@ -561,7 +591,11 @@ ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, ip->payload_length = clib_host_to_net_u16 (total_length + first_b->current_length - sizeof (*ip)); - vlib_buffer_chain_compress (vm, first_b, vec_drop_compress); + if (!vlib_buffer_chain_linearize (vm, first_b)) + { + rv = IP6_REASS_RC_NO_BUF; + goto free_buffers_and_return; + } if (PREDICT_FALSE (first_b->flags & 
VLIB_BUFFER_IS_TRACED)) { ip6_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0); @@ -603,25 +637,10 @@ ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node, vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length; ip6_reass_free (rm, rt, reass); reass = NULL; -} - -always_inline u32 -ip6_reass_get_buffer_chain_length (vlib_main_t * vm, vlib_buffer_t * b) -{ - u32 len = 0; - while (b) - { - ++len; - if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT)) - { - b = vlib_get_buffer (vm, b->next_buffer); - } - else - { - break; - } - } - return len; +free_buffers_and_return: + vlib_buffer_free (vm, vec_drop_compress, vec_len (vec_drop_compress)); + vec_free (vec_drop_compress); + return rv; } always_inline void @@ -649,16 +668,13 @@ ip6_reass_insert_range_in_chain (vlib_main_t * vm, ip6_reass_main_t * rm, reass->first_bi = new_next_bi; } reass->data_len += ip6_reass_buffer_get_data_len (new_next_b); - rt->buffers_n += ip6_reass_get_buffer_chain_length (vm, new_next_b); } -always_inline void +always_inline ip6_reass_rc_t ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node, ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt, - ip6_reass_t * reass, u32 * bi0, u32 * next0, - u32 * error0, ip6_frag_hdr_t * frag_hdr, - u32 ** vec_drop_overlap, u32 ** vec_drop_compress, - bool is_feature) + ip6_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0, + ip6_frag_hdr_t * frag_hdr, bool is_feature) { int consumed = 0; vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0); @@ -667,9 +683,13 @@ ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node, fvnb->ip.reass.ip6_frag_hdr_offset = (u8 *) frag_hdr - (u8 *) vlib_buffer_get_current (fb); ip6_header_t *fip = vlib_buffer_get_current (fb); - ASSERT (fb->current_length > sizeof (*fip)); - ASSERT (fvnb->ip.reass.ip6_frag_hdr_offset > 0 && - fvnb->ip.reass.ip6_frag_hdr_offset < fb->current_length); + if (fb->current_length < sizeof (*fip) || + fvnb->ip.reass.ip6_frag_hdr_offset == 0 || + fvnb->ip.reass.ip6_frag_hdr_offset >= fb->current_length) + { + return IP6_REASS_RC_INTERNAL_ERROR; + } + u32 fragment_first = fvnb->ip.reass.fragment_first = ip6_frag_hdr_offset_bytes (frag_hdr); u32 fragment_length = @@ -692,13 +712,9 @@ ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node, // starting a new reassembly ip6_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi, *bi0); - if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED)) - { - ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0); - } reass->min_fragment_length = clib_net_to_host_u16 (fip->payload_length); - *bi0 = ~0; - return; + consumed = 1; + goto check_if_done_maybe; } reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->payload_length), @@ -738,7 +754,7 @@ ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node, else { // overlapping fragment - not allowed by RFC 8200 - ip6_reass_drop_all (vm, rm, reass, vec_drop_overlap); + ip6_reass_drop_all (vm, rm, reass); ip6_reass_free (rm, rt, reass); if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED)) { @@ -747,9 +763,11 @@ ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node, } *next0 = IP6_REASSEMBLY_NEXT_DROP; *error0 = IP6_ERROR_REASS_OVERLAPPING_FRAGMENT; + return IP6_REASS_RC_OK; } break; } +check_if_done_maybe: if (consumed) { if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED)) @@ -760,8 +778,8 @@ ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node, if (~0 != reass->last_packet_octet && reass->data_len == 
reass->last_packet_octet + 1) { - ip6_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0, - vec_drop_compress, is_feature); + return ip6_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0, + is_feature); } else { @@ -772,10 +790,10 @@ ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node, else { *next0 = IP6_REASSEMBLY_NEXT_DROP; - ; *error0 = IP6_ERROR_REASS_DUPLICATE_FRAGMENT; } } + return IP6_REASS_RC_OK; } always_inline bool @@ -858,64 +876,15 @@ ip6_reassembly_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - static u32 *vec_timeout = NULL; // indexes of buffers which timed out - static u32 *vec_drop_overlap = NULL; // indexes of buffers dropped due to overlap - static u32 *vec_drop_compress = NULL; // indexes of buffers dropped due to buffer compression - while (n_left_from > 0 || vec_len (vec_timeout) > 0 || - vec_len (vec_drop_overlap) > 0 || vec_len (vec_drop_compress) > 0) + while (n_left_from > 0) { vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - while (vec_len (vec_timeout) > 0 && n_left_to_next > 0) - { - u32 bi = vec_pop (vec_timeout); - vlib_buffer_t *b = vlib_get_buffer (vm, bi); - b->error = node->errors[IP6_ERROR_REASS_TIMEOUT]; - to_next[0] = bi; - to_next += 1; - n_left_to_next -= 1; - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, - n_left_to_next, bi, - IP6_REASSEMBLY_NEXT_DROP); - ASSERT (rt->buffers_n > 0); - --rt->buffers_n; - } - - while (vec_len (vec_drop_overlap) > 0 && n_left_to_next > 0) - { - u32 bi = vec_pop (vec_drop_overlap); - vlib_buffer_t *b = vlib_get_buffer (vm, bi); - b->error = node->errors[IP6_ERROR_REASS_OVERLAPPING_FRAGMENT]; - to_next[0] = bi; - to_next += 1; - n_left_to_next -= 1; - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, - n_left_to_next, bi, - IP6_REASSEMBLY_NEXT_DROP); - ASSERT (rt->buffers_n > 0); - --rt->buffers_n; - } - - while (vec_len (vec_drop_compress) > 0 && n_left_to_next > 0) - { - u32 bi = vec_pop (vec_drop_compress); - vlib_buffer_t *b = vlib_get_buffer (vm, bi); - b->error = node->errors[IP6_ERROR_NONE]; - to_next[0] = bi; - to_next += 1; - n_left_to_next -= 1; - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, - n_left_to_next, bi, - IP6_REASSEMBLY_NEXT_DROP); - ASSERT (rt->buffers_n > 0); - --rt->buffers_n; - } - while (n_left_from > 0 && n_left_to_next > 0) { u32 bi0; vlib_buffer_t *b0; - u32 next0; + u32 next0 = IP6_REASSEMBLY_NEXT_DROP; u32 error0 = IP6_ERROR_NONE; u32 icmp_bi = ~0; @@ -965,14 +934,25 @@ ip6_reassembly_inline (vlib_main_t * vm, sw_if_index[VLIB_RX] << 32 | frag_hdr->identification; k.as_u64[5] = ip0->protocol; ip6_reass_t *reass = - ip6_reass_find_or_create (vm, node, rm, rt, &k, &icmp_bi, - &vec_timeout); + ip6_reass_find_or_create (vm, node, rm, rt, &k, &icmp_bi); if (reass) { - ip6_reass_update (vm, node, rm, rt, reass, &bi0, &next0, - &error0, frag_hdr, &vec_drop_overlap, - &vec_drop_compress, is_feature); + switch (ip6_reass_update (vm, node, rm, rt, reass, &bi0, &next0, + &error0, frag_hdr, is_feature)) + { + case IP6_REASS_RC_OK: + /* nothing to do here */ + break; + case IP6_REASS_RC_NO_BUF: + /* fallthrough */ + case IP6_REASS_RC_INTERNAL_ERROR: + /* drop everything and start with a clean slate */ + ip6_reass_drop_all (vm, rm, reass); + ip6_reass_free (rm, rt, reass); + goto next_packet; + break; + } } else { @@ -1007,6 +987,7 @@ ip6_reassembly_inline (vlib_main_t * vm, n_left_to_next, icmp_bi, next0); } + next_packet: from += 1; n_left_from -= 1; 
} @@ -1193,7 +1174,7 @@ ip6_reass_init_function (vlib_main_t * vm) rm->vlib_main = vm; rm->vnet_main = vnet_get_main (); - vec_validate (rm->per_thread_data, vlib_num_workers () + 1); + vec_validate (rm->per_thread_data, vlib_num_workers ()); ip6_reass_per_thread_t *rt; vec_foreach (rt, rm->per_thread_data) { @@ -1257,12 +1238,11 @@ ip6_reass_walk_expired (vlib_main_t * vm, f64 now = vlib_time_now (vm); ip6_reass_t *reass; - u32 *vec_timeout = NULL; int *pool_indexes_to_free = NULL; uword thread_index = 0; int index; - const uword nthreads = os_get_nthreads (); + const uword nthreads = vlib_num_workers () + 1; u32 *vec_icmp_bi = NULL; for (thread_index = 0; thread_index < nthreads; ++thread_index) { @@ -1285,7 +1265,6 @@ ip6_reass_walk_expired (vlib_main_t * vm, { ip6_reass_t *reass = pool_elt_at_index (rt->pool, i[0]); u32 icmp_bi = ~0; - u32 before = vec_len (vec_timeout); vlib_buffer_t *b = vlib_get_buffer (vm, reass->first_bi); if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED)) { @@ -1296,15 +1275,10 @@ ip6_reass_walk_expired (vlib_main_t * vm, b->flags &= ~VLIB_BUFFER_IS_TRACED; } } - ip6_reass_on_timeout (vm, node, rm, reass, &icmp_bi, &vec_timeout); - u32 after = vec_len (vec_timeout); - ASSERT (rt->buffers_n >= (after - before)); - rt->buffers_n -= (after - before); + ip6_reass_on_timeout (vm, node, rm, reass, &icmp_bi); if (~0 != icmp_bi) { vec_add1 (vec_icmp_bi, icmp_bi); - ASSERT (rt->buffers_n > 0); - --rt->buffers_n; } ip6_reass_free (rm, rt, reass); } @@ -1313,39 +1287,6 @@ ip6_reass_walk_expired (vlib_main_t * vm, clib_spinlock_unlock (&rt->lock); } - while (vec_len (vec_timeout) > 0) - { - vlib_frame_t *f = vlib_get_frame_to_node (vm, rm->ip6_drop_idx); - u32 *to_next = vlib_frame_vector_args (f); - u32 n_left_to_next = VLIB_FRAME_SIZE - f->n_vectors; - int trace_frame = 0; - while (vec_len (vec_timeout) > 0 && n_left_to_next > 0) - { - u32 bi = vec_pop (vec_timeout); - vlib_buffer_t *b = vlib_get_buffer (vm, bi); - if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED)) - { - if (pool_is_free_index (vm->trace_main.trace_buffer_pool, - b->trace_index)) - { - /* the trace is gone, don't trace this buffer anymore */ - b->flags &= ~VLIB_BUFFER_IS_TRACED; - } - else - { - trace_frame = 1; - } - } - b->error = node->errors[IP6_ERROR_REASS_TIMEOUT]; - to_next[0] = bi; - ++f->n_vectors; - to_next += 1; - n_left_to_next -= 1; - } - f->frame_flags |= (trace_frame * VLIB_FRAME_TRACE); - vlib_put_frame_to_node (vm, rm->ip6_drop_idx, f); - } - while (vec_len (vec_icmp_bi) > 0) { vlib_frame_t *f = @@ -1381,7 +1322,6 @@ ip6_reass_walk_expired (vlib_main_t * vm, } vec_free (pool_indexes_to_free); - vec_free (vec_timeout); vec_free (vec_icmp_bi); if (event_data) { @@ -1438,8 +1378,8 @@ format_ip6_reass (u8 * s, va_list * args) "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first, vnb->ip.reass.range_last, bi, - ip6_reass_buffer_get_data_offset_no_check (b), - ip6_reass_buffer_get_data_len_no_check (b), + ip6_reass_buffer_get_data_offset (b), + ip6_reass_buffer_get_data_len (b), vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last); if (b->flags & VLIB_BUFFER_NEXT_PRESENT) { @@ -1472,7 +1412,7 @@ show_ip6_reass (vlib_main_t * vm, unformat_input_t * input, u64 sum_buffers_n = 0; ip6_reass_t *reass; uword thread_index; - const uword nthreads = os_get_nthreads (); + const uword nthreads = vlib_num_workers () + 1; for (thread_index = 0; thread_index < nthreads; ++thread_index) { ip6_reass_per_thread_t *rt = &rm->per_thread_data[thread_index]; @@ -1486,7 +1426,6 @@ show_ip6_reass 
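
The init and walk-expired hunks above size the per-thread data with vlib_num_workers () as the highest index and iterate nthreads = vlib_num_workers () + 1 entries, i.e. the main thread plus every worker. A small sketch of that indexing convention, with plain C stand-ins:

#include <stdio.h>
#include <stdlib.h>

/* Sketch: one state slot per thread; index 0 is the main thread and
 * indices 1..n_workers are the workers, so n_workers + 1 slots total. */
typedef struct
{
  unsigned reass_n;
} per_thread_t;

int
main (void)
{
  unsigned n_workers = 4;	/* stand-in for vlib_num_workers () */
  unsigned nthreads = n_workers + 1;
  per_thread_t *per_thread = calloc (nthreads, sizeof (per_thread_t));

  for (unsigned thread_index = 0; thread_index < nthreads; thread_index++)
    per_thread[thread_index].reass_n = thread_index;	/* touch every slot */

  printf ("threads walked: %u (main + %u workers)\n", nthreads, n_workers);
  free (per_thread);
  return 0;
}
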
(vlib_main_t * vm, unformat_input_t * input, /* *INDENT-ON* */ } sum_reass_n += rt->reass_n; - sum_buffers_n += rt->buffers_n; clib_spinlock_unlock (&rt->lock); } vlib_cli_output (vm, "---------------------"); diff --git a/src/vnet/ip/ip_frag.c b/src/vnet/ip/ip_frag.c index 628d9d66474c..8de4dfc5d58f 100644 --- a/src/vnet/ip/ip_frag.c +++ b/src/vnet/ip/ip_frag.c @@ -101,7 +101,8 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer, ip4 = (ip4_header_t *) vlib_buffer_get_current (from_b); rem = clib_net_to_host_u16 (ip4->length) - sizeof (ip4_header_t); - max = (mtu - sizeof (ip4_header_t)) & ~0x7; + max = + (clib_min (mtu, VLIB_BUFFER_DATA_SIZE) - sizeof (ip4_header_t)) & ~0x7; if (rem > (vlib_buffer_length_in_chain (vm, from_b) - sizeof (ip4_header_t))) @@ -152,7 +153,7 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer, ip4_header_t *to_ip4; u8 *to_data; - len = (rem > (mtu - sizeof (ip4_header_t)) ? max : rem); + len = (rem > max ? max : rem); if (len != rem) /* Last fragment does not need to divisible by 8 */ len &= ~0x7; if ((to_b = frag_buffer_alloc (org_from_b, &to_bi)) == 0) diff --git a/src/vnet/ipsec/ah_decrypt.c b/src/vnet/ipsec/ah_decrypt.c index abe2e6f5f80f..29c59fdfd248 100644 --- a/src/vnet/ipsec/ah_decrypt.c +++ b/src/vnet/ipsec/ah_decrypt.c @@ -166,11 +166,8 @@ ah_decrypt_node_fn (vlib_main_t * vm, if (PREDICT_FALSE (rv)) { - clib_warning ("anti-replay SPI %u seq %u", sa0->spi, seq); vlib_node_increment_counter (vm, ah_decrypt_node.index, AH_DECRYPT_ERROR_REPLAY, 1); - to_next[0] = i_bi0; - to_next += 1; goto trace; } } @@ -220,8 +217,6 @@ ah_decrypt_node_fn (vlib_main_t * vm, vlib_node_increment_counter (vm, ah_decrypt_node.index, AH_DECRYPT_ERROR_INTEG_ERROR, 1); - to_next[0] = i_bi0; - to_next += 1; goto trace; } diff --git a/src/vnet/ipsec/esp_decrypt.c b/src/vnet/ipsec/esp_decrypt.c index a0eeed464da6..7f9be89ee4c0 100644 --- a/src/vnet/ipsec/esp_decrypt.c +++ b/src/vnet/ipsec/esp_decrypt.c @@ -185,7 +185,6 @@ esp_decrypt_node_fn (vlib_main_t * vm, if (PREDICT_FALSE (rv)) { - clib_warning ("anti-replay SPI %u seq %u", sa0->spi, seq); vlib_node_increment_counter (vm, esp_decrypt_node.index, ESP_DECRYPT_ERROR_REPLAY, 1); o_bi0 = i_bi0; @@ -330,7 +329,6 @@ esp_decrypt_node_fn (vlib_main_t * vm, next0 = ESP_DECRYPT_NEXT_IP6_INPUT; else { - clib_warning ("next header: 0x%x", f0->next_header); vlib_node_increment_counter (vm, esp_decrypt_node.index, ESP_DECRYPT_ERROR_DECRYPTION_FAILED, 1); diff --git a/src/vnet/ipsec/ipsec.api b/src/vnet/ipsec/ipsec.api index d6a28017fff1..523def284c32 100644 --- a/src/vnet/ipsec/ipsec.api +++ b/src/vnet/ipsec/ipsec.api @@ -455,7 +455,7 @@ define ipsec_spds_dump { @param spd_id - SPD instance id (control plane allocated) @param npolicies - number of policies in SPD */ - define ipsec_spds_details { +define ipsec_spds_details { u32 context; u32 spd_id; u32 npolicies; @@ -515,6 +515,31 @@ define ipsec_spd_details { u64 packets; }; +/** \brief IPsec: Get SPD interfaces + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param spd_index - SPD index + @param spd_index_valid - if 1 spd_index is used to filter + spd_index's, if 0 no filtering is done +*/ +define ipsec_spd_interface_dump { + u32 client_index; + u32 context; + u32 spd_index; + u8 spd_index_valid; +}; + +/** \brief IPsec: SPD interface response + @param context - sender context which was passed in the request + @param spd_index - SPD index + @param sw_if_index - index of the 
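
The ip_frag.c hunk above caps the per-fragment payload by both the MTU and the buffer data size, rounded down to a multiple of 8, and only the last fragment may be unaligned. A standalone sketch of that arithmetic follows; with a 6000-byte packet, MTU 5000 and 2048-byte buffers it yields three fragments, which matches the expectation in the TestIPv4Frag test added further down in this series. The constants are illustrative stand-ins for sizeof (ip4_header_t) and VLIB_BUFFER_DATA_SIZE.

#include <stdio.h>

#define IP4_HEADER_BYTES 20
#define BUFFER_DATA_BYTES 2048	/* stand-in for VLIB_BUFFER_DATA_SIZE */

static unsigned
min_u (unsigned a, unsigned b)
{
  return a < b ? a : b;
}

int
main (void)
{
  unsigned mtu = 5000;
  unsigned ip_len = 6000;	/* total IPv4 length of the original packet */

  /* Fragment payload is capped by both the MTU and the buffer size,
   * and all but the last fragment must carry a multiple of 8 bytes. */
  unsigned max = (min_u (mtu, BUFFER_DATA_BYTES) - IP4_HEADER_BYTES) & ~0x7u;
  unsigned rem = ip_len - IP4_HEADER_BYTES;
  unsigned n_fragments = 0;

  while (rem > 0)
    {
      unsigned len = rem > max ? max : rem;
      if (len != rem)		/* last fragment need not be 8-aligned */
	len &= ~0x7u;
      printf ("fragment %u carries %u payload bytes\n", n_fragments, len);
      rem -= len;
      n_fragments++;
    }
  printf ("total fragments: %u\n", n_fragments);
  return 0;
}
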
interface +*/ +define ipsec_spd_interface_details { + u32 context; + u32 spd_index; + u32 sw_if_index; +}; + /** \brief Add or delete IPsec tunnel interface @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request @@ -537,6 +562,7 @@ define ipsec_spd_details { @param remote_integ_key - integrity key for inbound IPsec SA @param renumber - intf display name uses a specified instance if != 0 @param show_instance - instance to display for intf if renumber is set + @param udp_encap - enable UDP encapsulation for NAT traversal */ define ipsec_tunnel_if_add_del { u32 client_index; @@ -560,6 +586,7 @@ define ipsec_tunnel_if_add_del { u8 remote_integ_key[128]; u8 renumber; u32 show_instance; + u8 udp_encap; }; /** \brief Add/delete IPsec tunnel interface response diff --git a/src/vnet/ipsec/ipsec_api.c b/src/vnet/ipsec/ipsec_api.c index 37daee0b64fb..a6bccf7a57c5 100644 --- a/src/vnet/ipsec/ipsec_api.c +++ b/src/vnet/ipsec/ipsec_api.c @@ -56,6 +56,7 @@ _(IPSEC_SA_SET_KEY, ipsec_sa_set_key) \ _(IPSEC_SA_DUMP, ipsec_sa_dump) \ _(IPSEC_SPDS_DUMP, ipsec_spds_dump) \ _(IPSEC_SPD_DUMP, ipsec_spd_dump) \ +_(IPSEC_SPD_INTERFACE_DUMP, ipsec_spd_interface_dump) \ _(IPSEC_TUNNEL_IF_ADD_DEL, ipsec_tunnel_if_add_del) \ _(IPSEC_TUNNEL_IF_SET_KEY, ipsec_tunnel_if_set_key) \ _(IPSEC_TUNNEL_IF_SET_SA, ipsec_tunnel_if_set_sa) \ @@ -366,6 +367,60 @@ vl_api_ipsec_spd_dump_t_handler (vl_api_ipsec_spd_dump_t * mp) #endif } +static void +send_ipsec_spd_interface_details (vl_api_registration_t * reg, u32 spd_index, + u32 sw_if_index, u32 context) +{ + vl_api_ipsec_spd_interface_details_t *mp; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = ntohs (VL_API_IPSEC_SPD_INTERFACE_DETAILS); + mp->context = context; + + mp->spd_index = htonl (spd_index); + mp->sw_if_index = htonl (sw_if_index); + + vl_api_send_msg (reg, (u8 *) mp); +} + +static void +vl_api_ipsec_spd_interface_dump_t_handler (vl_api_ipsec_spd_interface_dump_t * + mp) +{ + ipsec_main_t *im = &ipsec_main; + vl_api_registration_t *reg; + u32 k, v, spd_index; + +#if WITH_LIBSSL > 0 + reg = vl_api_client_index_to_registration (mp->client_index); + if (!reg) + return; + + if (mp->spd_index_valid) + { + spd_index = ntohl (mp->spd_index); + /* *INDENT-OFF* */ + hash_foreach(k, v, im->spd_index_by_sw_if_index, ({ + if (v == spd_index) + send_ipsec_spd_interface_details(reg, v, k, mp->context); + })); + /* *INDENT-ON* */ + } + else + { + /* *INDENT-OFF* */ + hash_foreach(k, v, im->spd_index_by_sw_if_index, ({ + send_ipsec_spd_interface_details(reg, v, k, mp->context); + })); + /* *INDENT-ON* */ + } + +#else + clib_warning ("unimplemented"); +#endif +} + static void vl_api_ipsec_sa_set_key_t_handler (vl_api_ipsec_sa_set_key_t * mp) { @@ -414,6 +469,7 @@ vl_api_ipsec_tunnel_if_add_del_t_handler (vl_api_ipsec_tunnel_if_add_del_t * tun.integ_alg = mp->integ_alg; tun.local_integ_key_len = mp->local_integ_key_len; tun.remote_integ_key_len = mp->remote_integ_key_len; + tun.udp_encap = mp->udp_encap; memcpy (&tun.local_ip, mp->local_ip, 4); memcpy (&tun.remote_ip, mp->remote_ip, 4); memcpy (&tun.local_crypto_key, &mp->local_crypto_key, diff --git a/src/vnet/l2/l2_api.c b/src/vnet/l2/l2_api.c index eb04459f2347..1e14b1c7a508 100644 --- a/src/vnet/l2/l2_api.c +++ b/src/vnet/l2/l2_api.c @@ -986,6 +986,9 @@ l2_api_hookup (vlib_main_t * vm) foreach_vpe_api_msg; #undef _ + /* Mark VL_API_BRIDGE_DOMAIN_DUMP as mp safe */ + am->is_mp_safe[VL_API_BRIDGE_DOMAIN_DUMP] = 1; + /* * Set up 
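
The new ipsec_spd_interface_dump handler above walks spd_index_by_sw_if_index and sends one details message per entry, optionally filtered by spd_index, with numeric fields converted to network byte order and the context echoed back unchanged. A minimal standalone sketch of that dump/details shape, using a plain struct and array rather than the generated vl_api_* types and the hash_foreach macro:

#include <arpa/inet.h>
#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for the generated details message. */
typedef struct
{
  uint32_t context;
  uint32_t spd_index;
  uint32_t sw_if_index;
} spd_interface_details_t;

static void
send_details (uint32_t spd_index, uint32_t sw_if_index, uint32_t context)
{
  spd_interface_details_t mp = { 0 };
  mp.context = context;		/* echoed back verbatim */
  mp.spd_index = htonl (spd_index);	/* numeric fields go out big-endian */
  mp.sw_if_index = htonl (sw_if_index);
  printf ("details: spd %u sw_if_index %u\n", spd_index, sw_if_index);
  (void) mp;			/* a real handler would enqueue mp here */
}

int
main (void)
{
  /* toy table: sw_if_index -> spd_index, mirroring spd_index_by_sw_if_index */
  uint32_t spd_by_sw_if[4] = { 1, 1, 2, 1 };
  uint32_t filter = 1;		/* the spd_index_valid case */
  uint32_t context = 0x42;

  for (uint32_t sw_if_index = 0; sw_if_index < 4; sw_if_index++)
    if (spd_by_sw_if[sw_if_index] == filter)
      send_details (filter, sw_if_index, context);
  return 0;
}
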
the (msg_name, crc, message-id) table */ diff --git a/src/vnet/l2/l2_flood.c b/src/vnet/l2/l2_flood.c index 97a4ff59da7b..aeac8ff10d84 100644 --- a/src/vnet/l2/l2_flood.c +++ b/src/vnet/l2/l2_flood.c @@ -209,77 +209,85 @@ l2flood_node_fn (vlib_main_t * vm, bi0, L2FLOOD_NEXT_DROP); continue; } - - vec_validate (msm->clones[thread_index], n_clones); - vec_reset_length (msm->clones[thread_index]); - - /* - * the header offset needs to be large enough to incorporate - * all the L3 headers that could be touched when doing BVI - * processing. So take the current l2 length plus 2 * IPv6 - * headers (for tunnel encap) - */ - n_cloned = vlib_buffer_clone (vm, bi0, - msm->clones[thread_index], - n_clones, - (vnet_buffer (b0)->l2.l2_len + - sizeof (udp_header_t) + - 2 * sizeof (ip6_header_t))); - - if (PREDICT_FALSE (n_cloned != n_clones)) + else if (n_clones > 1) { - b0->error = node->errors[L2FLOOD_ERROR_REPL_FAIL]; - } + vec_validate (msm->clones[thread_index], n_clones); + vec_reset_length (msm->clones[thread_index]); + + /* + * the header offset needs to be large enough to incorporate + * all the L3 headers that could be touched when doing BVI + * processing. So take the current l2 length plus 2 * IPv6 + * headers (for tunnel encap) + */ + n_cloned = vlib_buffer_clone (vm, bi0, + msm->clones[thread_index], + n_clones, + VLIB_BUFFER_CLONE_HEAD_SIZE); + + if (PREDICT_FALSE (n_cloned != n_clones)) + { + b0->error = node->errors[L2FLOOD_ERROR_REPL_FAIL]; + } - /* - * for all but the last clone, these are not BVI bound - */ - for (clone0 = 0; clone0 < n_cloned - 1; clone0++) - { - member = msm->members[thread_index][clone0]; - ci0 = msm->clones[thread_index][clone0]; - c0 = vlib_get_buffer (vm, ci0); + /* + * for all but the last clone, these are not BVI bound + */ + for (clone0 = 0; clone0 < n_cloned - 1; clone0++) + { + member = msm->members[thread_index][clone0]; + ci0 = msm->clones[thread_index][clone0]; + c0 = vlib_get_buffer (vm, ci0); - to_next[0] = ci0; - to_next += 1; - n_left_to_next -= 1; + to_next[0] = ci0; + to_next += 1; + n_left_to_next -= 1; - if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && - (b0->flags & VLIB_BUFFER_IS_TRACED))) - { - ethernet_header_t *h0; - l2flood_trace_t *t; - - if (c0 != b0) - vlib_buffer_copy_trace_flag (vm, b0, ci0); - - t = vlib_add_trace (vm, node, c0, sizeof (*t)); - h0 = vlib_buffer_get_current (c0); - t->sw_if_index = sw_if_index0; - t->bd_index = vnet_buffer (c0)->l2.bd_index; - clib_memcpy (t->src, h0->src_address, 6); - clib_memcpy (t->dst, h0->dst_address, 6); - } + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + ethernet_header_t *h0; + l2flood_trace_t *t; + + if (c0 != b0) + vlib_buffer_copy_trace_flag (vm, b0, ci0); + + t = vlib_add_trace (vm, node, c0, sizeof (*t)); + h0 = vlib_buffer_get_current (c0); + t->sw_if_index = sw_if_index0; + t->bd_index = vnet_buffer (c0)->l2.bd_index; + clib_memcpy (t->src, h0->src_address, 6); + clib_memcpy (t->dst, h0->dst_address, 6); + } - /* Do normal L2 forwarding */ - vnet_buffer (c0)->sw_if_index[VLIB_TX] = member->sw_if_index; + /* Do normal L2 forwarding */ + vnet_buffer (c0)->sw_if_index[VLIB_TX] = + member->sw_if_index; - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - ci0, next0); - if (PREDICT_FALSE (0 == n_left_to_next)) - { - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - vlib_get_next_frame (vm, node, next_index, - to_next, n_left_to_next); + vlib_validate_buffer_enqueue_x1 (vm, node, 
next_index, + to_next, n_left_to_next, + ci0, next0); + if (PREDICT_FALSE (0 == n_left_to_next)) + { + vlib_put_next_frame (vm, node, next_index, + n_left_to_next); + vlib_get_next_frame (vm, node, next_index, to_next, + n_left_to_next); + } } + member = msm->members[thread_index][clone0]; + ci0 = msm->clones[thread_index][clone0]; + } + else + { + /* one clone */ + ci0 = bi0; + member = msm->members[thread_index][0]; } /* * the last clone that might go to a BVI */ - member = msm->members[thread_index][clone0]; - ci0 = msm->clones[thread_index][clone0]; c0 = vlib_get_buffer (vm, ci0); to_next[0] = ci0; diff --git a/src/vnet/lisp-gpe/lisp_gpe.c b/src/vnet/lisp-gpe/lisp_gpe.c index c7b3d887cfa8..66304ae344aa 100644 --- a/src/vnet/lisp-gpe/lisp_gpe.c +++ b/src/vnet/lisp-gpe/lisp_gpe.c @@ -193,10 +193,15 @@ clib_error_t * vnet_lisp_gpe_enable_disable (vnet_lisp_gpe_enable_disable_args_t * a) { lisp_gpe_main_t *lgm = &lisp_gpe_main; + vlib_main_t *vm = vlib_get_main (); if (a->is_en) { lgm->is_en = 1; + udp_register_dst_port (vm, UDP_DST_PORT_lisp_gpe, + lisp_gpe_ip4_input_node.index, 1 /* is_ip4 */ ); + udp_register_dst_port (vm, UDP_DST_PORT_lisp_gpe6, + lisp_gpe_ip6_input_node.index, 0 /* is_ip4 */ ); } else { @@ -206,6 +211,8 @@ vnet_lisp_gpe_enable_disable (vnet_lisp_gpe_enable_disable_args_t * a) /* disable all l3 ifaces */ lisp_gpe_tenant_flush (); + udp_unregister_dst_port (vm, UDP_DST_PORT_lisp_gpe, 0 /* is_ip4 */ ); + udp_unregister_dst_port (vm, UDP_DST_PORT_lisp_gpe6, 1 /* is_ip4 */ ); lgm->is_en = 0; } @@ -612,11 +619,6 @@ lisp_gpe_init (vlib_main_t * vm) lgm->lisp_gpe_fwd_entries = hash_create_mem (0, sizeof (lisp_gpe_fwd_entry_key_t), sizeof (uword)); - udp_register_dst_port (vm, UDP_DST_PORT_lisp_gpe, - lisp_gpe_ip4_input_node.index, 1 /* is_ip4 */ ); - udp_register_dst_port (vm, UDP_DST_PORT_lisp_gpe6, - lisp_gpe_ip6_input_node.index, 0 /* is_ip4 */ ); - lgm->lisp_stats_index_by_key = hash_create_mem (0, sizeof (lisp_stats_key_t), sizeof (uword)); memset (&lgm->counters, 0, sizeof (lgm->counters)); diff --git a/src/vnet/mpls/interface.c b/src/vnet/mpls/interface.c index ec541f760de3..46d80f07a387 100644 --- a/src/vnet/mpls/interface.c +++ b/src/vnet/mpls/interface.c @@ -62,7 +62,7 @@ mpls_sw_interface_enable_disable (mpls_main_t * mm, fib_table_lock(lfib_index, FIB_PROTOCOL_MPLS, (is_api? 
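
The l2_flood rework above only pays for buffer cloning when the bridge domain has more than one member; with a single member the original buffer is forwarded as-is. A toy sketch of that branch structure, where cloning is represented by a reference-count bump rather than the vlib buffer API:

#include <stdio.h>

typedef struct
{
  int refcount;
} buffer_t;

static int
flood (buffer_t * b, int n_members)
{
  if (n_members == 0)
    return 0;			/* nothing to flood to: drop */
  if (n_members == 1)
    {
      /* one destination: reuse the original buffer, no clone */
      return 1;
    }
  /* many destinations: clone once per member (the last clone may be
   * BVI-bound and is handled separately in the node above) */
  b->refcount += n_members;
  return n_members;
}

int
main (void)
{
  buffer_t b = {.refcount = 1 };
  printf ("sent %d copies, refcount %d\n", flood (&b, 1), b.refcount);
  printf ("sent %d copies, refcount %d\n", flood (&b, 3), b.refcount);
  return 0;
}
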
FIB_SOURCE_API: FIB_SOURCE_CLI)); - vec_validate(mm->fib_index_by_sw_if_index, 0); + vec_validate(mm->fib_index_by_sw_if_index, sw_if_index); mm->fib_index_by_sw_if_index[sw_if_index] = lfib_index; } else diff --git a/src/vnet/session-apps/echo_client.c b/src/vnet/session-apps/echo_client.c index b47dcf21a4b9..1ece0196dde3 100644 --- a/src/vnet/session-apps/echo_client.c +++ b/src/vnet/session-apps/echo_client.c @@ -208,7 +208,7 @@ echo_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, connections_this_batch = ecm->connections_this_batch_by_thread[my_thread_index]; - if ((ecm->run_test == 0) || + if ((ecm->run_test != ECHO_CLIENTS_RUNNING) || ((vec_len (connection_indices) == 0) && vec_len (connections_this_batch) == 0)) return 0; @@ -361,9 +361,13 @@ echo_clients_session_connected_callback (u32 app_index, u32 api_context, u32 session_index; u8 thread_index; + if (PREDICT_FALSE (ecm->run_test != ECHO_CLIENTS_STARTING)) + return -1; + if (is_fail) { clib_warning ("connection %d failed!", api_context); + ecm->run_test = ECHO_CLIENTS_EXITING; signal_evt_to_cli (-1); return 0; } @@ -407,7 +411,7 @@ echo_clients_session_connected_callback (u32 app_index, u32 api_context, __sync_fetch_and_add (&ecm->ready_connections, 1); if (ecm->ready_connections == ecm->expected_connections) { - ecm->run_test = 1; + ecm->run_test = ECHO_CLIENTS_RUNNING; /* Signal the CLI process that the action is starting... */ signal_evt_to_cli (1); } @@ -441,12 +445,28 @@ echo_clients_session_disconnect_callback (stream_session_t * s) return; } +void +echo_clients_session_disconnect (stream_session_t * s) +{ + echo_client_main_t *ecm = &echo_client_main; + vnet_disconnect_args_t _a, *a = &_a; + a->handle = session_handle (s); + a->app_index = ecm->app_index; + vnet_disconnect_session (a); +} + static int echo_clients_rx_callback (stream_session_t * s) { echo_client_main_t *ecm = &echo_client_main; eclient_session_t *sp; + if (PREDICT_FALSE (ecm->run_test != ECHO_CLIENTS_RUNNING)) + { + echo_clients_session_disconnect (s); + return -1; + } + sp = pool_elt_at_index (ecm->sessions, s->server_rx_fifo->client_session_index); receive_data_chunk (ecm, sp); @@ -624,6 +644,7 @@ echo_clients_command_fn (vlib_main_t * vm, ecm->vlib_main = vm; ecm->tls_engine = TLS_ENGINE_OPENSSL; ecm->no_copy = 0; + ecm->run_test = ECHO_CLIENTS_STARTING; if (thread_main->n_vlib_mains > 1) clib_spinlock_init (&ecm->sessions_lock); @@ -745,7 +766,7 @@ echo_clients_command_fn (vlib_main_t * vm, /* Fire off connect requests */ time_before_connects = vlib_time_now (vm); if ((error = echo_clients_connect (vm, n_clients))) - return error; + goto cleanup; /* Park until the sessions come up, or ten seconds elapse... 
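
The echo client changes above gate the connect and rx callbacks on a run_test state (the ECHO_CLIENTS_* enum itself is added in echo_client.h just below): late connects are refused once the test has left the starting phase, a failed connect aborts the whole test, and rx after the test ends tears the session down. A simplified standalone sketch of that state machine, which collapses the ready_connections bookkeeping into a single transition:

#include <stdio.h>

typedef enum
{
  ECHO_CLIENTS_STARTING,
  ECHO_CLIENTS_RUNNING,
  ECHO_CLIENTS_EXITING,
} echo_clients_test_state_t;

static echo_clients_test_state_t run_test = ECHO_CLIENTS_STARTING;

static int
connected_callback (int is_fail)
{
  if (run_test != ECHO_CLIENTS_STARTING)
    return -1;			/* late connect: refuse it */
  if (is_fail)
    {
      run_test = ECHO_CLIENTS_EXITING;	/* abort the whole test */
      return 0;
    }
  run_test = ECHO_CLIENTS_RUNNING;	/* simplification: all sessions up */
  return 0;
}

static int
rx_callback (void)
{
  if (run_test != ECHO_CLIENTS_RUNNING)
    return -1;			/* test over: disconnect instead of reading */
  return 0;
}

int
main (void)
{
  connected_callback (0 /* success */ );
  printf ("rx while running: %d\n", rx_callback ());
  run_test = ECHO_CLIENTS_EXITING;
  printf ("rx while exiting: %d\n", rx_callback ());
  return 0;
}
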
*/ vlib_process_wait_for_event_or_clock (vm, syn_timeout); @@ -825,7 +846,7 @@ echo_clients_command_fn (vlib_main_t * vm, error = clib_error_return (0, "failed: test bytes"); cleanup: - ecm->run_test = 0; + ecm->run_test = ECHO_CLIENTS_EXITING; vlib_process_wait_for_event_or_clock (vm, 10e-3); for (i = 0; i < vec_len (ecm->connection_index_by_thread); i++) { diff --git a/src/vnet/session-apps/echo_client.h b/src/vnet/session-apps/echo_client.h index db5ba1636289..2270720dcdad 100644 --- a/src/vnet/session-apps/echo_client.h +++ b/src/vnet/session-apps/echo_client.h @@ -105,6 +105,12 @@ typedef struct vlib_main_t *vlib_main; } echo_client_main_t; +enum +{ + ECHO_CLIENTS_STARTING, + ECHO_CLIENTS_RUNNING, + ECHO_CLIENTS_EXITING +} echo_clients_test_state_e; extern echo_client_main_t echo_client_main; vlib_node_registration_t echo_clients_node; diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 83b96d317dcd..81c93064d38d 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -153,7 +153,7 @@ session_free (stream_session_t * s) memset (s, 0xFA, sizeof (*s)); } -static void +void session_free_w_fifos (stream_session_t * s) { segment_manager_dealloc_fifos (s->svm_segment_index, s->server_rx_fifo, @@ -197,7 +197,7 @@ session_alloc_for_connection (transport_connection_t * tc) s = session_alloc (thread_index); s->session_type = session_type_from_proto_and_ip (tc->proto, tc->is_ip4); s->session_state = SESSION_STATE_CONNECTING; - s->enqueue_epoch = ~0; + s->enqueue_epoch = (u64) ~ 0; /* Attach transport to session and vice versa */ s->connection_index = tc->c_index; @@ -393,7 +393,7 @@ session_enqueue_stream_connection (transport_connection_t * tc, * by calling stream_server_flush_enqueue_events () */ session_manager_main_t *smm = vnet_get_session_manager_main (); u32 thread_index = s->thread_index; - u32 enqueue_epoch = smm->current_enqueue_epoch[tc->proto][thread_index]; + u64 enqueue_epoch = smm->current_enqueue_epoch[tc->proto][thread_index]; if (s->enqueue_epoch != enqueue_epoch) { @@ -434,7 +434,7 @@ session_enqueue_dgram_connection (stream_session_t * s, * by calling stream_server_flush_enqueue_events () */ session_manager_main_t *smm = vnet_get_session_manager_main (); u32 thread_index = s->thread_index; - u32 enqueue_epoch = smm->current_enqueue_epoch[proto][thread_index]; + u64 enqueue_epoch = smm->current_enqueue_epoch[proto][thread_index]; if (s->enqueue_epoch != enqueue_epoch) { @@ -814,17 +814,22 @@ stream_session_delete_notify (transport_connection_t * tc) * from the app, do the whole disconnect since we might still * have lingering events */ stream_session_disconnect (s); + s->session_state = SESSION_STATE_CLOSED; break; case SESSION_STATE_CLOSING: /* Cleanup lookup table. 
Transport needs to still be valid */ session_lookup_del_session (s); + s->session_state = SESSION_STATE_CLOSED; break; case SESSION_STATE_CLOSED: + case SESSION_STATE_ACCEPTING: stream_session_delete (s); break; + default: + /* Assume connection was not yet added the lookup table */ + session_free_w_fifos (s); + break; } - - s->session_state = SESSION_STATE_CLOSED; } /** diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 1e08cccb6f7f..914e0581fecd 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -195,7 +195,7 @@ struct _session_manager_main clib_rwlock_t *peekers_rw_locks; /** Per-proto, per-worker enqueue epoch counters */ - u32 *current_enqueue_epoch[TRANSPORT_N_PROTO]; + u64 *current_enqueue_epoch[TRANSPORT_N_PROTO]; /** Per-proto, per-worker thread vector of sessions to enqueue */ u32 **session_to_enqueue[TRANSPORT_N_PROTO]; @@ -308,6 +308,7 @@ stream_session_is_valid (u32 si, u8 thread_index) stream_session_t *session_alloc (u32 thread_index); int session_alloc_fifos (segment_manager_t * sm, stream_session_t * s); void session_free (stream_session_t * s); +void session_free_w_fifos (stream_session_t * s); always_inline stream_session_t * session_get (u32 si, u32 thread_index) diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index fc63428277ee..565938653816 100755 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -278,30 +278,6 @@ send_session_accept_callback (stream_session_t * s) return 0; } -void -mq_send_local_session_disconnected_cb (u32 app_wrk_index, - local_session_t * ls) -{ - app_worker_t *app_wrk = app_worker_get (app_wrk_index); - svm_msg_q_msg_t _msg, *msg = &_msg; - session_disconnected_msg_t *mp; - svm_msg_q_t *app_mq; - session_event_t *evt; - application_t *app; - - app = application_get (app_wrk->app_index); - app_mq = app_wrk->event_queue; - svm_msg_q_lock_and_alloc_msg_w_ring (app_mq, SESSION_MQ_CTRL_EVT_RING, - SVM_Q_WAIT, msg); - evt = svm_msg_q_msg_data (app_mq, msg); - memset (evt, 0, sizeof (*evt)); - evt->event_type = SESSION_CTRL_EVT_DISCONNECTED; - mp = (session_disconnected_msg_t *) evt->data; - mp->handle = application_local_session_handle (ls); - mp->context = app->api_client_index; - svm_msg_q_add_and_unlock (app_mq, msg); -} - static void send_session_disconnect_callback (stream_session_t * s) { @@ -421,6 +397,23 @@ static session_cb_vft_t session_cb_vft = { .del_segment_callback = send_del_segment_callback, }; +static int +mq_try_lock_and_alloc_msg (svm_msg_q_t * app_mq, svm_msg_q_msg_t * msg) +{ + int rv; + u8 try = 0; + while (try < 100) + { + rv = svm_msg_q_lock_and_alloc_msg_w_ring (app_mq, + SESSION_MQ_CTRL_EVT_RING, + SVM_Q_NOWAIT, msg); + if (!rv) + return 0; + try++; + } + return -1; +} + static int mq_send_session_accepted_cb (stream_session_t * s) { @@ -436,8 +429,8 @@ mq_send_session_accepted_cb (stream_session_t * s) app = application_get (app_wrk->app_index); app_mq = app_wrk->event_queue; - svm_msg_q_lock_and_alloc_msg_w_ring (app_mq, SESSION_MQ_CTRL_EVT_RING, - SVM_Q_WAIT, msg); + if (mq_try_lock_and_alloc_msg (app_mq, msg)) + return -1; evt = svm_msg_q_msg_data (app_mq, msg); memset (evt, 0, sizeof (*evt)); @@ -523,8 +516,8 @@ mq_send_session_disconnected_cb (stream_session_t * s) app = application_get (app_wrk->app_index); app_mq = app_wrk->event_queue; - svm_msg_q_lock_and_alloc_msg_w_ring (app_mq, SESSION_MQ_CTRL_EVT_RING, - SVM_Q_WAIT, msg); + if (mq_try_lock_and_alloc_msg (app_mq, msg)) + return; evt = svm_msg_q_msg_data 
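
The session_api.c hunks above replace the blocking SVM_Q_WAIT allocation of control-event messages with mq_try_lock_and_alloc_msg, a bounded non-blocking retry, so a slow or stuck application cannot stall the callers. A standalone sketch of that retry shape, where try_alloc is a stand-in for svm_msg_q_lock_and_alloc_msg_w_ring with SVM_Q_NOWAIT:

#include <stdio.h>

static int
try_alloc (int *free_slots)
{
  if (*free_slots == 0)
    return -1;			/* ring full, would have blocked before */
  (*free_slots)--;
  return 0;
}

static int
mq_try_alloc_bounded (int *free_slots)
{
  for (int attempt = 0; attempt < 100; attempt++)
    if (try_alloc (free_slots) == 0)
      return 0;
  return -1;			/* give up: the caller skips the notification */
}

int
main (void)
{
  int free_slots = 1;
  printf ("first alloc: %d\n", mq_try_alloc_bounded (&free_slots));
  printf ("second alloc: %d\n", mq_try_alloc_bounded (&free_slots));
  return 0;
}
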
(app_mq, msg); memset (evt, 0, sizeof (*evt)); evt->event_type = SESSION_CTRL_EVT_DISCONNECTED; @@ -534,6 +527,30 @@ mq_send_session_disconnected_cb (stream_session_t * s) svm_msg_q_add_and_unlock (app_mq, msg); } +void +mq_send_local_session_disconnected_cb (u32 app_wrk_index, + local_session_t * ls) +{ + app_worker_t *app_wrk = app_worker_get (app_wrk_index); + svm_msg_q_msg_t _msg, *msg = &_msg; + session_disconnected_msg_t *mp; + svm_msg_q_t *app_mq; + session_event_t *evt; + application_t *app; + + app = application_get (app_wrk->app_index); + app_mq = app_wrk->event_queue; + if (mq_try_lock_and_alloc_msg (app_mq, msg)) + return; + evt = svm_msg_q_msg_data (app_mq, msg); + memset (evt, 0, sizeof (*evt)); + evt->event_type = SESSION_CTRL_EVT_DISCONNECTED; + mp = (session_disconnected_msg_t *) evt->data; + mp->handle = application_local_session_handle (ls); + mp->context = app->api_client_index; + svm_msg_q_add_and_unlock (app_mq, msg); +} + static void mq_send_session_reset_cb (stream_session_t * s) { @@ -544,8 +561,8 @@ mq_send_session_reset_cb (stream_session_t * s) session_event_t *evt; app_mq = app->event_queue; - svm_msg_q_lock_and_alloc_msg_w_ring (app_mq, SESSION_MQ_CTRL_EVT_RING, - SVM_Q_WAIT, msg); + if (mq_try_lock_and_alloc_msg (app_mq, msg)) + return; evt = svm_msg_q_msg_data (app_mq, msg); memset (evt, 0, sizeof (*evt)); evt->event_type = SESSION_CTRL_EVT_RESET; @@ -576,8 +593,8 @@ mq_send_session_connected_cb (u32 app_wrk_index, u32 api_context, return -1; } - svm_msg_q_lock_and_alloc_msg_w_ring (app_mq, SESSION_MQ_CTRL_EVT_RING, - SVM_Q_WAIT, msg); + if (mq_try_lock_and_alloc_msg (app_mq, msg)) + return -1; evt = svm_msg_q_msg_data (app_mq, msg); memset (evt, 0, sizeof (*evt)); evt->event_type = SESSION_CTRL_EVT_CONNECTED; @@ -656,8 +673,9 @@ mq_send_session_bound_cb (u32 app_wrk_index, u32 api_context, return -1; } - svm_msg_q_lock_and_alloc_msg_w_ring (app_mq, SESSION_MQ_CTRL_EVT_RING, - SVM_Q_WAIT, msg); + if (mq_try_lock_and_alloc_msg (app_mq, msg)) + return -1; + evt = svm_msg_q_msg_data (app_mq, msg); memset (evt, 0, sizeof (*evt)); evt->event_type = SESSION_CTRL_EVT_BOUND; diff --git a/src/vnet/session/stream_session.h b/src/vnet/session/stream_session.h index 30178d7a4539..287a8927339a 100644 --- a/src/vnet/session/stream_session.h +++ b/src/vnet/session/stream_session.h @@ -67,7 +67,7 @@ typedef struct _stream_session_t u8 thread_index; /** To avoid n**2 "one event per frame" check */ - u8 enqueue_epoch; + u64 enqueue_epoch; /** svm segment index where fifos were allocated */ u32 svm_segment_index; @@ -120,6 +120,9 @@ typedef struct local_session_ /** Port for connection. 
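
The session and stream_session hunks above widen enqueue_epoch to u64 so the "one event per frame" check cannot be fooled by a narrow counter wrapping on a long-lived session. A standalone sketch of that check, with illustrative types rather than the session layer structures:

#include <stdio.h>
#include <stdint.h>

typedef struct
{
  uint64_t enqueue_epoch;
} session_t;

static int
enqueue (session_t * s, uint64_t current_epoch)
{
  if (s->enqueue_epoch != current_epoch)
    {
      s->enqueue_epoch = current_epoch;
      return 1;			/* first enqueue this frame: schedule event */
    }
  return 0;			/* already scheduled in this frame */
}

int
main (void)
{
  session_t s = {.enqueue_epoch = ~0ULL };	/* fresh session marker */
  uint64_t epoch = 1;
  int first = enqueue (&s, epoch);
  int second = enqueue (&s, epoch);
  printf ("%d %d\n", first, second);	/* 1 0 */
  epoch++;			/* new frame */
  printf ("%d\n", enqueue (&s, epoch));	/* 1 */
  return 0;
}
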
Overlaps thread_index/enqueue_epoch */ u16 port; + /** Partly overlaps enqueue_epoch */ + u8 pad_epoch[7]; + /** Segment index where fifos were allocated */ u32 svm_segment_index; diff --git a/src/vnet/span/node.c b/src/vnet/span/node.c index 67f1d6e42bba..f4be2e928955 100644 --- a/src/vnet/span/node.c +++ b/src/vnet/span/node.c @@ -35,7 +35,7 @@ format_span_trace (u8 * s, va_list * args) CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); span_trace_t *t = va_arg (*args, span_trace_t *); - vnet_main_t *vnm = &vnet_main; + vnet_main_t *vnm = vnet_get_main (); s = format (s, "SPAN: mirrored %U -> %U", format_vnet_sw_if_index_name, vnm, t->src_sw_if_index, format_vnet_sw_if_index_name, vnm, t->mirror_sw_if_index); @@ -67,7 +67,7 @@ span_mirror (vlib_main_t * vm, vlib_node_runtime_t * node, u32 sw_if_index0, { vlib_buffer_t *c0; span_main_t *sm = &span_main; - vnet_main_t *vnm = &vnet_main; + vnet_main_t *vnm = vnet_get_main (); u32 *to_mirror_next = 0; u32 i; span_interface_t *si0; @@ -92,8 +92,7 @@ span_mirror (vlib_main_t * vm, vlib_node_runtime_t * node, u32 sw_if_index0, if (mirror_frames[i] == 0) { if (sf == SPAN_FEAT_L2) - mirror_frames[i] = vlib_get_frame_to_node (vnm->vlib_main, - l2output_node.index); + mirror_frames[i] = vlib_get_frame_to_node (vm, l2output_node.index); else mirror_frames[i] = vnet_get_frame_to_sw_interface (vnm, i); } @@ -134,7 +133,7 @@ span_node_inline_fn (vlib_main_t * vm, vlib_node_runtime_t * node, span_feat_t sf) { span_main_t *sm = &span_main; - vnet_main_t *vnm = &vnet_main; + vnet_main_t *vnm = vnet_get_main (); u32 n_left_from, *from, *to_next; u32 n_span_packets = 0; u32 next_index; @@ -262,7 +261,7 @@ span_node_inline_fn (vlib_main_t * vm, vlib_node_runtime_t * node, continue; if (sf == SPAN_FEAT_L2) - vlib_put_frame_to_node (vnm->vlib_main, l2output_node.index, f); + vlib_put_frame_to_node (vm, l2output_node.index, f); else vnet_put_frame_to_sw_interface (vnm, sw_if_index, f); mirror_frames[sw_if_index] = 0; diff --git a/src/vnet/span/span.c b/src/vnet/span/span.c index b7292cf1e800..fc923e0ce31b 100644 --- a/src/vnet/span/span.c +++ b/src/vnet/span/span.c @@ -206,7 +206,7 @@ show_interfaces_span_command_fn (vlib_main_t * vm, clib_bitmap_t *b = clib_bitmap_dup_or (d, l); if (header) { - vlib_cli_output (vm, "%-20s %-20s %6s %6s", "Source", "Destination", + vlib_cli_output (vm, "%-32s %-32s %6s %6s", "Source", "Destination", "Device", "L2"); header = 0; } @@ -219,7 +219,7 @@ show_interfaces_span_command_fn (vlib_main_t * vm, int l2 = (clib_bitmap_get (lrxm->mirror_ports, i) + clib_bitmap_get (ltxm->mirror_ports, i) * 2); - vlib_cli_output (vm, "%-20v %-20U (%6s) (%6s)", s, + vlib_cli_output (vm, "%-32v %-32U (%6s) (%6s)", s, format_vnet_sw_if_index_name, vnm, i, states[device], states[l2]); vec_reset_length (s); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 1c8ce34a728a..5378de1c1daa 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -210,7 +210,8 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Try to remove the half-open connection. If this is not the owning * thread, tc won't be removed. Retransmit or establish timers will * eventually expire and call again cleanup on the right thread. 
*/ - tcp_half_open_connection_cleanup (tc); + if (tcp_half_open_connection_cleanup (tc)) + tc->flags |= TCP_CONN_HALF_OPEN_DONE; } else { @@ -322,8 +323,10 @@ tcp_connection_close (tcp_connection_t * tc) tc->state = TCP_STATE_CLOSED; break; case TCP_STATE_SYN_RCVD: + tcp_connection_timers_reset (tc); tcp_send_fin (tc); tc->state = TCP_STATE_FIN_WAIT_1; + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; case TCP_STATE_ESTABLISHED: if (!session_tx_fifo_max_dequeue (&tc->connection)) @@ -939,9 +942,11 @@ format_tcp_scoreboard (u8 * s, va_list * args) s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n", sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes); s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n", - sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv); + sb->last_bytes_delivered, sb->high_sacked - tc->iss, + sb->snd_una_adv); s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u", - sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt); + sb->cur_rxt_hole, sb->high_rxt - tc->iss, + sb->rescue_rxt - tc->iss); hole = scoreboard_first_hole (sb); if (hole) diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 10f50fefc093..87bacc243546 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -135,7 +135,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) data = (const u8 *) (th + 1); /* Zero out all flags but those set in SYN */ - to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE); + to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE + | TCP_OPTS_FLAG_SACK); for (; opts_len > 0; opts_len -= opt_len, data += opt_len) { @@ -666,39 +667,45 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, static void scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb) { - sack_scoreboard_hole_t *hole, *prev; + sack_scoreboard_hole_t *left, *right; u32 bytes = 0, blks = 0; sb->lost_bytes = 0; sb->sacked_bytes = 0; - hole = scoreboard_last_hole (sb); - if (!hole) + left = scoreboard_last_hole (sb); + if (!left) return; - if (seq_gt (sb->high_sacked, hole->end)) + if (seq_gt (sb->high_sacked, left->end)) { - bytes = sb->high_sacked - hole->end; + bytes = sb->high_sacked - left->end; blks = 1; } - while ((prev = scoreboard_prev_hole (sb, hole)) - && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss - && blks < TCP_DUPACK_THRESHOLD)) + while ((right = left) + && bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss + && blks < TCP_DUPACK_THRESHOLD + /* left not updated if above conditions fail */ + && (left = scoreboard_prev_hole (sb, right))) { - bytes += hole->start - prev->end; + bytes += right->start - left->end; blks++; - hole = prev; } - while (hole) + /* left is first lost */ + if (left) { - sb->lost_bytes += scoreboard_hole_bytes (hole); - hole->is_lost = 1; - prev = hole; - hole = scoreboard_prev_hole (sb, hole); - if (hole) - bytes += prev->start - hole->end; + do + { + sb->lost_bytes += scoreboard_hole_bytes (right); + left->is_lost = 1; + left = scoreboard_prev_hole (sb, right); + if (left) + bytes += right->start - left->end; + } + while ((right = left)); } + sb->sacked_bytes = bytes; } @@ -814,7 +821,8 @@ tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc) { sack_scoreboard_hole_t *hole; hole = scoreboard_first_hole (&tc->sack_sb); - return (!hole || seq_geq (hole->start, tc->snd_una)); + return (!hole || (seq_geq (hole->start, tc->snd_una) + && seq_lt (hole->end, tc->snd_una_max))); } void @@ -973,6 +981,14 @@ 
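
The tightened scoreboard sanity check above requires every hole to lie inside [snd_una, snd_una_max). TCP sequence numbers wrap, so such comparisons are done on the signed difference of 32-bit values; here is a standalone sketch of those helpers, written along the same lines as VPP's seq_lt/seq_geq macros:

#include <stdio.h>
#include <stdint.h>

static int
seq_lt (uint32_t a, uint32_t b)
{
  return (int32_t) (a - b) < 0;
}

static int
seq_geq (uint32_t a, uint32_t b)
{
  return (int32_t) (a - b) >= 0;
}

static int
hole_is_sane (uint32_t hole_start, uint32_t hole_end,
	      uint32_t snd_una, uint32_t snd_una_max)
{
  return seq_geq (hole_start, snd_una) && seq_lt (hole_end, snd_una_max);
}

int
main (void)
{
  /* works across the 2^32 wrap: 0xffffff00 precedes 0x00000100 */
  printf ("%d\n", seq_lt (0xffffff00u, 0x00000100u));	/* 1 */
  printf ("%d\n", hole_is_sane (10, 20, 5, 100));	/* 1 */
  printf ("%d\n", hole_is_sane (10, 200, 5, 100));	/* 0 */
  return 0;
}
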
tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } } + if (pool_elts (sb->holes) == 1) + { + hole = scoreboard_first_hole (sb); + if (hole->start == ack + sb->snd_una_adv + && hole->end == tc->snd_una_max) + scoreboard_remove_hole (sb, hole); + } + scoreboard_update_bytes (tc, sb); sb->last_sacked_bytes = sb->sacked_bytes - (old_sacked_bytes - sb->last_bytes_delivered); @@ -2674,9 +2690,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0->state = TCP_STATE_CLOSED; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); - /* Delete the connection/session since the pipes should be - * clear by now */ - tcp_connection_del (tc0); + + /* Don't free the connection from the data path since + * we can't ensure that we have no packets already enqueued + * to output. Rely instead on the waitclose timer */ + tcp_connection_timers_reset (tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, 1); goto drop; @@ -3488,6 +3507,7 @@ do { \ _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 0d5feb976f86..7d7c32ad210a 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1076,16 +1076,25 @@ tcp_send_fin (tcp_connection_t * tc) u32 bi; u8 fin_snt = 0; - tcp_retransmit_timer_force_update (tc); - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - b = vlib_get_buffer (vm, bi); - tcp_init_buffer (vm, b); fin_snt = tc->flags & TCP_CONN_FINSNT; if (fin_snt) tc->snd_nxt = tc->snd_una; + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + { + /* Out of buffers so program fin retransmit ASAP */ + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); + goto post_enqueue; + } + + tcp_retransmit_timer_force_update (tc); + b = vlib_get_buffer (vm, bi); + tcp_init_buffer (vm, b); tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); + +post_enqueue: if (!fin_snt) { tc->flags |= TCP_CONN_FINSNT; @@ -1098,7 +1107,6 @@ tcp_send_fin (tcp_connection_t * tc) { tc->snd_nxt = tc->snd_una_max; } - TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } always_inline u8 diff --git a/src/vnet/tls/tls.c b/src/vnet/tls/tls.c index aba7919927c0..f4814a3ce217 100644 --- a/src/vnet/tls/tls.c +++ b/src/vnet/tls/tls.c @@ -26,6 +26,18 @@ static tls_engine_vft_t *tls_vfts; void tls_disconnect (u32 ctx_handle, u32 thread_index); +static void +tls_disconnect_transport (tls_ctx_t * ctx) +{ + vnet_disconnect_args_t a = { + .handle = ctx->tls_session_handle, + .app_index = tls_main.app_index, + }; + + if (vnet_disconnect_session (&a)) + clib_warning ("disconnect returned"); +} + tls_engine_type_t tls_get_available_engine (void) { @@ -91,6 +103,8 @@ tls_listener_ctx_alloc (void) void tls_listener_ctx_free (tls_ctx_t * ctx) { + if (CLIB_DEBUG) + memset (ctx, 0xfb, sizeof (*ctx)); pool_put (tls_main.listener_ctx_pool, ctx); } @@ -119,6 +133,7 @@ tls_ctx_half_open_alloc (void) { clib_rwlock_writer_lock (&tm->half_open_rwlock); pool_get (tm->half_open_ctx_pool, ctx); + ctx_index = ctx - tm->half_open_ctx_pool; clib_rwlock_writer_unlock (&tm->half_open_rwlock); } else @@ -126,10 +141,10 @@ tls_ctx_half_open_alloc (void) /* reader lock assumption: only main 
thread will call pool_get */ clib_rwlock_reader_lock (&tm->half_open_rwlock); pool_get (tm->half_open_ctx_pool, ctx); + ctx_index = ctx - tm->half_open_ctx_pool; clib_rwlock_reader_unlock (&tm->half_open_rwlock); } memset (ctx, 0, sizeof (*ctx)); - ctx_index = ctx - tm->half_open_ctx_pool; return ctx_index; } @@ -225,7 +240,7 @@ tls_notify_app_connected (tls_ctx_t * ctx, u8 is_failed) app_wrk = app_worker_get_if_valid (ctx->parent_app_index); if (!app_wrk) { - tls_disconnect (ctx->tls_ctx_handle, vlib_get_thread_index ()); + tls_disconnect_transport (ctx); return -1; } @@ -248,14 +263,16 @@ tls_notify_app_connected (tls_ctx_t * ctx, u8 is_failed) ctx->app_session_handle = session_handle (app_session); ctx->c_s_index = app_session->session_index; - app_session->session_state = SESSION_STATE_READY; + app_session->session_state = SESSION_STATE_CONNECTING; if (cb_fn (ctx->parent_app_index, ctx->parent_app_api_context, app_session, 0 /* not failed */ )) { TLS_DBG (1, "failed to notify app"); tls_disconnect (ctx->tls_ctx_handle, vlib_get_thread_index ()); + return -1; } + app_session->session_state = SESSION_STATE_READY; session_lookup_add_connection (&ctx->connection, session_handle (app_session)); @@ -550,15 +567,7 @@ tls_disconnect (u32 ctx_handle, u32 thread_index) TLS_DBG (1, "Disconnecting %x", ctx_handle); ctx = tls_ctx_get (ctx_handle); - - vnet_disconnect_args_t a = { - .handle = ctx->tls_session_handle, - .app_index = tls_main.app_index, - }; - - if (vnet_disconnect_session (&a)) - clib_warning ("disconnect returned"); - + tls_disconnect_transport (ctx); stream_session_delete_notify (&ctx->connection); tls_ctx_free (ctx); } diff --git a/src/vnet/util/throttle.h b/src/vnet/util/throttle.h index 97ebb2597b3a..28bf7aa2217d 100644 --- a/src/vnet/util/throttle.h +++ b/src/vnet/util/throttle.h @@ -17,6 +17,7 @@ #define __THROTTLE_H__ #include +#include /** * @brief A throttle @@ -28,7 +29,7 @@ typedef struct throttle_t_ { f64 time; uword **bitmaps; - u32 *seeds; + u64 *seeds; f64 *last_seed_change_time; } throttle_t; @@ -36,12 +37,12 @@ typedef struct throttle_t_ extern void throttle_init (throttle_t * t, u32 n_threads, f64 time); -always_inline u32 +always_inline u64 throttle_seed (throttle_t * t, u32 thread_index, f64 time_now) { if (time_now - t->last_seed_change_time[thread_index] > t->time) { - (void) random_u32 (&t->seeds[thread_index]); + (void) random_u64 (&t->seeds[thread_index]); memset (t->bitmaps[thread_index], 0, THROTTLE_BITS / BITS (u8)); t->last_seed_change_time[thread_index] = time_now; @@ -50,13 +51,14 @@ throttle_seed (throttle_t * t, u32 thread_index, f64 time_now) } always_inline int -throttle_check (throttle_t * t, u32 thread_index, u32 hash, u32 seed) +throttle_check (throttle_t * t, u32 thread_index, u64 hash, u64 seed) { int drop; uword m; u32 w; - hash ^= seed; + hash = clib_xxhash (hash ^ seed); + /* Select bit number */ hash &= THROTTLE_BITS - 1; w = hash / BITS (uword); diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c index 387539d8f4a2..b84a9b6eb56e 100644 --- a/src/vnet/vxlan/decap.c +++ b/src/vnet/vxlan/decap.c @@ -91,7 +91,7 @@ vxlan4_find_tunnel (vxlan_main_t * vxm, last_tunnel_cache4 * cache, }; if (PREDICT_TRUE - (key4.key[0] == cache->key[0] || key4.key[1] == cache->key[1])) + (key4.key[0] == cache->key[0] && key4.key[1] == cache->key[1])) { /* cache hit */ vxlan_decap_info_t di = {.as_u64 = cache->value }; diff --git a/src/vnet/vxlan/vxlan.c b/src/vnet/vxlan/vxlan.c index 93a4e2635792..4276d6689acf 100644 --- a/src/vnet/vxlan/vxlan.c +++ 
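
The vxlan decap fix above changes the tunnel cache hit test from || to &&: a two-word key only matches the cached entry when both words are equal, otherwise a packet sharing just one word would be wrongly treated as a hit. A standalone sketch with an illustrative key layout, not the actual vxlan4_tunnel_key_t:

#include <stdio.h>
#include <stdint.h>

typedef struct
{
  uint64_t key[2];
  uint64_t value;
} cache_entry_t;

static int
cache_hit (const cache_entry_t * cache, const uint64_t key[2])
{
  /* both key words must match for a cache hit */
  return key[0] == cache->key[0] && key[1] == cache->key[1];
}

int
main (void)
{
  cache_entry_t cache = {.key = {0x1111, 0x2222},.value = 7 };
  uint64_t full_match[2] = { 0x1111, 0x2222 };
  uint64_t half_match[2] = { 0x1111, 0x9999 };

  printf ("full match: %d\n", cache_hit (&cache, full_match));	/* 1 */
  printf ("half match: %d\n", cache_hit (&cache, half_match));	/* 0 */
  return 0;
}
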
b/src/vnet/vxlan/vxlan.c @@ -599,7 +599,8 @@ int vnet_vxlan_add_del_tunnel if (!p) return VNET_API_ERROR_NO_SUCH_ENTRY; - u32 instance = vxm->tunnel_index_by_sw_if_index[p->sw_if_index]; + u32 instance = is_ip6 ? key6.value : + vxm->tunnel_index_by_sw_if_index[p->sw_if_index]; vxlan_tunnel_t *t = pool_elt_at_index (vxm->tunnels, instance); sw_if_index = t->sw_if_index; diff --git a/src/vpp-api/client/libvppapiclient.map b/src/vpp-api/client/libvppapiclient.map index 00a26fbc90e2..cb3d18b4de0e 100644 --- a/src/vpp-api/client/libvppapiclient.map +++ b/src/vpp-api/client/libvppapiclient.map @@ -21,5 +21,6 @@ VPPAPICLIENT_18.10 { stat_segment_heartbeat; stat_segment_string_vector; stat_segment_vec_len; + stat_segment_vec_free; local: *; }; diff --git a/src/vpp-api/client/stat_client.c b/src/vpp-api/client/stat_client.c index 1c099edc7897..0042a2be4e43 100644 --- a/src/vpp-api/client/stat_client.c +++ b/src/vpp-api/client/stat_client.c @@ -364,6 +364,12 @@ stat_segment_vec_len (void *vec) return vec_len (vec); } +void +stat_segment_vec_free (void *vec) +{ + vec_free (vec); +} + /* Create a vector from a string (or add to existing) */ u8 ** stat_segment_string_vector (u8 ** string_vector, char *string) diff --git a/src/vpp-api/client/stat_client.h b/src/vpp-api/client/stat_client.h index c1a0ecf47537..ef16e4246dcb 100644 --- a/src/vpp-api/client/stat_client.h +++ b/src/vpp-api/client/stat_client.h @@ -38,6 +38,7 @@ int stat_segment_connect (char *socket_name); void stat_segment_disconnect (void); uint8_t **stat_segment_string_vector (uint8_t ** string_vector, char *string); int stat_segment_vec_len (void *vec); +void stat_segment_vec_free (void *vec); uint32_t *stat_segment_ls (uint8_t ** pattern); stat_segment_data_t *stat_segment_dump (uint32_t * counter_vec); stat_segment_data_t *stat_segment_dump_entry (uint32_t index); diff --git a/src/vpp/CMakeLists.txt b/src/vpp/CMakeLists.txt index 601bc0397d62..16843f73e8a3 100644 --- a/src/vpp/CMakeLists.txt +++ b/src/vpp/CMakeLists.txt @@ -82,6 +82,7 @@ add_vpp_executable(vpp add_vpp_headers(vpp api/vpe_msg_enum.h api/vpe_all_api_h.h + stats/stat_segment.h ) ############################################################################## diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c index 86865099532d..dca33e99e47d 100644 --- a/src/vpp/api/api.c +++ b/src/vpp/api/api.c @@ -525,6 +525,8 @@ vpe_api_hookup (vlib_main_t * vm) /* * Thread-safe API messages */ + am->is_mp_safe[VL_API_CONTROL_PING] = 1; + am->is_mp_safe[VL_API_CONTROL_PING_REPLY] = 1; am->is_mp_safe[VL_API_IP_ADD_DEL_ROUTE] = 1; am->is_mp_safe[VL_API_GET_NODE_GRAPH] = 1; diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h index 8f165dfa18e2..2c794baf71f0 100644 --- a/src/vppinfra/string.h +++ b/src/vppinfra/string.h @@ -356,7 +356,7 @@ clib_count_equal_u64 (u64 * data, uword max_count) #endif count += 2; data += 2; - while (count < max_count - 3 && + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | (data[2] ^ first) | (data[3] ^ first)) == 0) { @@ -424,7 +424,7 @@ clib_count_equal_u32 (u32 * data, uword max_count) #endif count += 2; data += 2; - while (count < max_count - 3 && + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | (data[2] ^ first) | (data[3] ^ first)) == 0) { @@ -492,7 +492,7 @@ clib_count_equal_u16 (u16 * data, uword max_count) #endif count += 2; data += 2; - while (count < max_count - 3 && + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | (data[2] ^ first) | (data[3] ^ first)) == 0) { @@ -560,7 
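
The clib_count_equal_* hunks above rewrite the loop bound from "count < max_count - 3" to "count + 3 < max_count". With unsigned arithmetic the first form underflows to a huge bound when fewer than four elements remain, allowing data[0..3] to be read past the end; the second form never underflows and states the access bound directly. A tiny sketch of the difference:

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uintptr_t count = 0, max_count = 2;	/* uword-like unsigned type */

  printf ("count < max_count - 3  -> %d (underflowed bound: %lu)\n",
	  count < max_count - 3, (unsigned long) (max_count - 3));
  printf ("count + 3 < max_count  -> %d\n", count + 3 < max_count);
  return 0;
}
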
+560,7 @@ clib_count_equal_u8 (u8 * data, uword max_count) #endif count += 2; data += 2; - while (count < max_count - 3 && + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | (data[2] ^ first) | (data[3] ^ first)) == 0) { diff --git a/test/framework.py b/test/framework.py index da34724befd6..a7f4fc774b44 100644 --- a/test/framework.py +++ b/test/framework.py @@ -284,13 +284,14 @@ def setUpConstants(cls): cls.vpp_cmdline = [cls.vpp_bin, "unix", "{", "nodaemon", debug_cli, "full-coredump", - coredump_size, "}", "api-trace", "{", "on", "}", - "api-segment", "{", "prefix", cls.shm_prefix, "}", - "cpu", "{", "main-core", str(cpu_core_number), "}", - "statseg", "{", "socket-name", cls.stats_sock, "}", - "plugins", "{", "plugin", "dpdk_plugin.so", "{", - "disable", "}", "plugin", "unittest_plugin.so", - "{", "enable", "}", "}", ] + coredump_size, "runtime-dir", cls.tempdir, "}", + "api-trace", "{", "on", "}", "api-segment", "{", + "prefix", cls.shm_prefix, "}", "cpu", "{", + "main-core", str(cpu_core_number), "}", "statseg", + "{", "socket-name", cls.stats_sock, "}", "plugins", + "{", "plugin", "dpdk_plugin.so", "{", "disable", + "}", "plugin", "unittest_plugin.so", "{", "enable", + "}", "}", ] if plugin_path is not None: cls.vpp_cmdline.extend(["plugin_path", plugin_path]) cls.logger.info("vpp_cmdline: %s" % cls.vpp_cmdline) diff --git a/test/test_abf.py b/test/test_abf.py index fb30fc3018ce..55eb552803a1 100644 --- a/test/test_abf.py +++ b/test/test_abf.py @@ -3,7 +3,7 @@ from framework import VppTestCase, VppTestRunner from vpp_udp_encap import * from vpp_ip import DpoProto -from vpp_ip_route import VppIpRoute, VppRoutePath, VppIpTable +from vpp_ip_route import VppIpRoute, VppRoutePath, VppMplsLabel, VppIpTable from scapy.packet import Raw from scapy.layers.l2 import Ether, ARP @@ -144,9 +144,9 @@ class TestAbf(VppTestCase): def setUp(self): super(TestAbf, self).setUp() - self.create_pg_interfaces(range(4)) + self.create_pg_interfaces(range(5)) - for i in self.pg_interfaces: + for i in self.pg_interfaces[:4]: i.admin_up() i.config_ip4() i.resolve_arp() @@ -266,6 +266,30 @@ def test_abf4(self): self.send_and_assert_no_replies(self.pg1, p_2 * 65, "Detached") + # + # Swap to route via a next-hop in the non-default table + # + table_20 = VppIpTable(self, 20) + table_20.add_vpp_config() + + self.pg4.set_table_ip4(table_20.table_id) + self.pg4.admin_up() + self.pg4.config_ip4() + self.pg4.resolve_arp() + + abf_13 = VppAbfPolicy(self, 13, acl_1, + [VppRoutePath(self.pg4.remote_ip4, + 0xffffffff, + nh_table_id=table_20.table_id)]) + abf_13.add_vpp_config() + attach_5 = VppAbfAttach(self, 13, self.pg0.sw_if_index, 30) + attach_5.add_vpp_config() + + self.send_and_expect(self.pg0, p_1*65, self.pg4) + + self.pg4.unconfig_ip4() + self.pg4.set_table_ip4(0) + def test_abf6(self): """ IPv6 ACL Based Forwarding """ diff --git a/test/test_bier.py b/test/test_bier.py index cc4c9b3ea0ca..5d69ec7cbb4d 100644 --- a/test/test_bier.py +++ b/test/test_bier.py @@ -581,10 +581,12 @@ def bier_e2e(self, hdr_len_id, n_bytes, max_bp): proto=DpoProto.DPO_PROTO_BIER, nh_table_id=8)]) bier_route_1.add_vpp_config() - bier_route_max = VppBierRoute(self, bti, max_bp, - [VppRoutePath("0.0.0.0", - 0xffffffff, - nh_table_id=8)]) + bier_route_max = VppBierRoute( + self, bti, max_bp, + [VppRoutePath("0.0.0.0", + 0xffffffff, + nh_table_id=8, + proto=DpoProto.DPO_PROTO_BIER)]) bier_route_max.add_vpp_config() # diff --git a/test/test_ip4.py b/test/test_ip4.py index 02a31be830a4..ca461f1d2760 100644 --- 
a/test/test_ip4.py +++ b/test/test_ip4.py @@ -1505,5 +1505,104 @@ def test_ip_input(self): self.pg1.unconfig_ip4() +class TestIPLPM(VppTestCase): + """ IPv4 longest Prefix Match """ + + def setUp(self): + super(TestIPLPM, self).setUp() + + self.create_pg_interfaces(range(4)) + + for i in self.pg_interfaces: + i.admin_up() + i.config_ip4() + i.resolve_arp() + + def tearDown(self): + super(TestIPLPM, self).tearDown() + for i in self.pg_interfaces: + i.admin_down() + i.unconfig_ip4() + + def test_ip_lpm(self): + """ IP longest Prefix Match """ + + s_24 = VppIpRoute(self, "10.1.2.0", 24, + [VppRoutePath(self.pg1.remote_ip4, + self.pg1.sw_if_index)]) + s_24.add_vpp_config() + s_8 = VppIpRoute(self, "10.0.0.0", 8, + [VppRoutePath(self.pg2.remote_ip4, + self.pg2.sw_if_index)]) + s_8.add_vpp_config() + + p_8 = (Ether(src=self.pg0.remote_mac, + dst=self.pg0.local_mac) / + IP(src="1.1.1.1", + dst="10.1.1.1") / + UDP(sport=1234, dport=1234) / + Raw('\xa5' * 2000)) + p_24 = (Ether(src=self.pg0.remote_mac, + dst=self.pg0.local_mac) / + IP(src="1.1.1.1", + dst="10.1.2.1") / + UDP(sport=1234, dport=1234) / + Raw('\xa5' * 2000)) + + self.logger.info(self.vapi.cli("sh ip fib mtrie")) + rx = self.send_and_expect(self.pg0, p_8 * 65, self.pg2) + rx = self.send_and_expect(self.pg0, p_24 * 65, self.pg1) + + +class TestIPv4Frag(VppTestCase): + """ IPv4 fragmentation """ + + @classmethod + def setUpClass(cls): + super(TestIPv4Frag, cls).setUpClass() + + cls.create_pg_interfaces([0, 1]) + cls.src_if = cls.pg0 + cls.dst_if = cls.pg1 + + # setup all interfaces + for i in cls.pg_interfaces: + i.admin_up() + i.config_ip4() + i.resolve_arp() + + def test_frag_large_packets(self): + """ Fragmentation of large packets """ + + p = (Ether(dst=self.src_if.local_mac, src=self.src_if.remote_mac) / + IP(src=self.src_if.remote_ip4, dst=self.dst_if.remote_ip4) / + UDP(sport=1234, dport=5678) / Raw()) + self.extend_packet(p, 6000, "abcde") + saved_payload = p[Raw].load + + # Force fragmentation by setting MTU of output interface + # lower than packet size + self.vapi.sw_interface_set_mtu(self.dst_if.sw_if_index, + [5000, 0, 0, 0]) + + self.pg_enable_capture() + self.src_if.add_stream(p) + self.pg_start() + + # Expecting 3 fragments because size of created fragments currently + # cannot be larger then VPP buffer size (which is 2048) + packets = self.dst_if.get_capture(3) + + # Assume VPP sends the fragments in order + payload = '' + for p in packets: + payload_offset = p.frag * 8 + if payload_offset > 0: + payload_offset -= 8 # UDP header is not in payload + self.assert_equal(payload_offset, len(payload)) + payload += p[Raw].load + self.assert_equal(payload, saved_payload, "payload") + + if __name__ == '__main__': unittest.main(testRunner=VppTestRunner) diff --git a/test/test_l2_flood.py b/test/test_l2_flood.py index 50a692e57e8d..8b8a3f0f960c 100644 --- a/test/test_l2_flood.py +++ b/test/test_l2_flood.py @@ -144,6 +144,39 @@ def test_flood(self): self.vapi.bridge_domain_add_del(1, is_add=0) + def test_flood_one(self): + """ L2 no-Flood Test """ + + # + # Create a single bridge Domain + # + self.vapi.bridge_domain_add_del(1) + + # + # add 2 interfaces to the BD. 
this means a flood goes to only + # one member + # + for i in self.pg_interfaces[:2]: + self.vapi.sw_interface_set_l2_bridge(i.sw_if_index, 1, 0) + + p = (Ether(dst="ff:ff:ff:ff:ff:ff", + src="00:00:de:ad:be:ef") / + IP(src="10.10.10.10", dst="1.1.1.1") / + UDP(sport=1234, dport=1234) / + Raw('\xa5' * 100)) + + # + # input on pg0 expect copies on pg1 + # + self.send_and_expect(self.pg0, p*65, self.pg1) + + # + # cleanup + # + for i in self.pg_interfaces[:2]: + self.vapi.sw_interface_set_l2_bridge(i.sw_if_index, 1, enable=0) + self.vapi.bridge_domain_add_del(1, is_add=0) + def test_uu_fwd(self): """ UU Flood """ diff --git a/test/test_memif.py b/test/test_memif.py index 8fe229986aa4..6d462bae018b 100644 --- a/test/test_memif.py +++ b/test/test_memif.py @@ -54,14 +54,14 @@ def test_memif_socket_filename_add_del(self): dump = self.vapi.memif_socket_filename_dump() self.assertTrue( self._check_socket_filename( - dump, 0, "/run/vpp/memif.sock")) + dump, 0, self.tempdir + "/memif.sock")) memif_sockets = [] # existing path memif_sockets.append( VppSocketFilename( - self, 1, "/run/vpp/memif1.sock")) - # default path ("/run/vpp") + self, 1, self.tempdir + "/memif1.sock")) + # default path (test tempdir) memif_sockets.append( VppSocketFilename( self, @@ -91,7 +91,7 @@ def test_memif_socket_filename_add_del(self): dump = self.vapi.memif_socket_filename_dump() self.assertTrue( self._check_socket_filename( - dump, 0, "/run/vpp/memif.sock")) + dump, 0, self.tempdir + "/memif.sock")) def _create_delete_test_one_interface(self, memif): memif.add_vpp_config() @@ -150,8 +150,8 @@ def test_memif_create_custom_socket(self): # existing path memif_sockets.append( VppSocketFilename( - self, 1, "/run/vpp/memif1.sock")) - # default path ("/run/vpp") + self, 1, self.tempdir + "/memif1.sock")) + # default path (test tempdir) memif_sockets.append( VppSocketFilename( self, @@ -178,18 +178,16 @@ def test_memif_create_custom_socket(self): def test_memif_connect(self): """ Memif connect """ - memif = VppMemif( - self, - MEMIF_ROLE.SLAVE, - MEMIF_MODE.ETHERNET, - ring_size=1024, - buffer_size=2048) - remote_memif = VppMemif( - self.remote_test, - MEMIF_ROLE.MASTER, - MEMIF_MODE.ETHERNET, - ring_size=1024, - buffer_size=2048) + memif = VppMemif(self, MEMIF_ROLE.SLAVE, MEMIF_MODE.ETHERNET, + ring_size=1024, buffer_size=2048) + + remote_socket = VppSocketFilename(self.remote_test, 1, + self.tempdir + "/memif.sock") + remote_socket.add_vpp_config() + + remote_memif = VppMemif(self.remote_test, MEMIF_ROLE.MASTER, + MEMIF_MODE.ETHERNET, socket_id=1, + ring_size=1024, buffer_size=2048) self._connect_test_interface_pair(memif, remote_memif) @@ -219,9 +217,14 @@ def _verify_icmp(self, pg, memif, rx, seq): def test_memif_ping(self): """ Memif ping """ - memif = VppMemif(self, MEMIF_ROLE.MASTER, MEMIF_MODE.ETHERNET) - remote_memif = VppMemif(self.remote_test, MEMIF_ROLE.SLAVE, - MEMIF_MODE.ETHERNET) + memif = VppMemif(self, MEMIF_ROLE.SLAVE, MEMIF_MODE.ETHERNET) + + remote_socket = VppSocketFilename(self.remote_test, 1, + self.tempdir + "/memif.sock") + remote_socket.add_vpp_config() + + remote_memif = VppMemif(self.remote_test, MEMIF_ROLE.MASTER, + MEMIF_MODE.ETHERNET, socket_id=1) memif.add_vpp_config() memif.config_ip4() diff --git a/test/test_nat.py b/test/test_nat.py index 3b9007f6e93e..3094dd429e2f 100644 --- a/test/test_nat.py +++ b/test/test_nat.py @@ -1937,6 +1937,10 @@ def test_identity_nat(self): sessions = self.vapi.nat44_user_session_dump(self.pg0.remote_ip4n, 0) self.assertEqual(len(sessions), 0) + 
self.vapi.nat44_add_del_identity_mapping(ip=self.pg0.remote_ip4n, + vrf_id=1) + identity_mappings = self.vapi.nat44_identity_mapping_dump() + self.assertEqual(len(identity_mappings), 2) def test_multiple_inside_interfaces(self): """ NAT44 multiple non-overlapping address space inside interfaces """ @@ -3331,36 +3335,25 @@ def test_frag_in_order(self): self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index, is_inside=0) - data = "A" * 4 + "B" * 16 + "C" * 3 - self.tcp_port_in = random.randint(1025, 65535) - - reass = self.vapi.nat_reass_dump() - reass_n_start = len(reass) + self.frag_in_order(proto=IP_PROTOS.tcp) + self.frag_in_order(proto=IP_PROTOS.udp) + self.frag_in_order(proto=IP_PROTOS.icmp) - # in2out - pkts = self.create_stream_frag(self.pg0, - self.pg1.remote_ip4, - self.tcp_port_in, - 20, - data) - self.pg0.add_stream(pkts) - self.pg_enable_capture(self.pg_interfaces) - self.pg_start() - frags = self.pg1.get_capture(len(pkts)) - p = self.reass_frags_and_verify(frags, - self.nat_addr, - self.pg1.remote_ip4) - self.assertEqual(p[TCP].dport, 20) - self.assertNotEqual(p[TCP].sport, self.tcp_port_in) - self.tcp_port_out = p[TCP].sport - self.assertEqual(data, p[Raw].load) + def test_frag_forwarding(self): + """ NAT44 forwarding fragment test """ + self.vapi.nat44_add_interface_addr(self.pg1.sw_if_index) + self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index) + self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index, + is_inside=0) + self.vapi.nat44_forwarding_enable_disable(1) - # out2in + data = "A" * 16 + "B" * 16 + "C" * 3 pkts = self.create_stream_frag(self.pg1, - self.nat_addr, - 20, - self.tcp_port_out, - data) + self.pg0.remote_ip4, + 4789, + 4789, + data, + proto=IP_PROTOS.udp) self.pg1.add_stream(pkts) self.pg_enable_capture(self.pg_interfaces) self.pg_start() @@ -3368,49 +3361,36 @@ def test_frag_in_order(self): p = self.reass_frags_and_verify(frags, self.pg1.remote_ip4, self.pg0.remote_ip4) - self.assertEqual(p[TCP].sport, 20) - self.assertEqual(p[TCP].dport, self.tcp_port_in) + self.assertEqual(p[UDP].sport, 4789) + self.assertEqual(p[UDP].dport, 4789) self.assertEqual(data, p[Raw].load) - reass = self.vapi.nat_reass_dump() - reass_n_end = len(reass) - - self.assertEqual(reass_n_end - reass_n_start, 2) - def test_reass_hairpinning(self): """ NAT44 fragments hairpinning """ - server = self.pg0.remote_hosts[1] - host_in_port = random.randint(1025, 65535) - server_in_port = random.randint(1025, 65535) - server_out_port = random.randint(1025, 65535) - data = "A" * 4 + "B" * 16 + "C" * 3 + self.server = self.pg0.remote_hosts[1] + self.host_in_port = random.randint(1025, 65535) + self.server_in_port = random.randint(1025, 65535) + self.server_out_port = random.randint(1025, 65535) self.nat44_add_address(self.nat_addr) self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index) self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index, is_inside=0) # add static mapping for server - self.nat44_add_static_mapping(server.ip4, self.nat_addr, - server_in_port, server_out_port, + self.nat44_add_static_mapping(self.server.ip4, self.nat_addr, + self.server_in_port, + self.server_out_port, proto=IP_PROTOS.tcp) + self.nat44_add_static_mapping(self.server.ip4, self.nat_addr, + self.server_in_port, + self.server_out_port, + proto=IP_PROTOS.udp) + self.nat44_add_static_mapping(self.server.ip4, self.nat_addr) - # send packet from host to server - pkts = self.create_stream_frag(self.pg0, - self.nat_addr, - host_in_port, - server_out_port, - data) - 
@@ -3368,49 +3361,36 @@
         p = self.reass_frags_and_verify(frags,
                                         self.pg1.remote_ip4,
                                         self.pg0.remote_ip4)
-        self.assertEqual(p[TCP].sport, 20)
-        self.assertEqual(p[TCP].dport, self.tcp_port_in)
+        self.assertEqual(p[UDP].sport, 4789)
+        self.assertEqual(p[UDP].dport, 4789)
         self.assertEqual(data, p[Raw].load)
-        reass = self.vapi.nat_reass_dump()
-        reass_n_end = len(reass)
-
-        self.assertEqual(reass_n_end - reass_n_start, 2)
-
     def test_reass_hairpinning(self):
         """ NAT44 fragments hairpinning """
-        server = self.pg0.remote_hosts[1]
-        host_in_port = random.randint(1025, 65535)
-        server_in_port = random.randint(1025, 65535)
-        server_out_port = random.randint(1025, 65535)
-        data = "A" * 4 + "B" * 16 + "C" * 3
+        self.server = self.pg0.remote_hosts[1]
+        self.host_in_port = random.randint(1025, 65535)
+        self.server_in_port = random.randint(1025, 65535)
+        self.server_out_port = random.randint(1025, 65535)
         self.nat44_add_address(self.nat_addr)
         self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index)
         self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index,
                                                   is_inside=0)
         # add static mapping for server
-        self.nat44_add_static_mapping(server.ip4, self.nat_addr,
-                                      server_in_port, server_out_port,
+        self.nat44_add_static_mapping(self.server.ip4, self.nat_addr,
+                                      self.server_in_port,
+                                      self.server_out_port,
                                       proto=IP_PROTOS.tcp)
+        self.nat44_add_static_mapping(self.server.ip4, self.nat_addr,
+                                      self.server_in_port,
+                                      self.server_out_port,
+                                      proto=IP_PROTOS.udp)
+        self.nat44_add_static_mapping(self.server.ip4, self.nat_addr)
-        # send packet from host to server
-        pkts = self.create_stream_frag(self.pg0,
-                                       self.nat_addr,
-                                       host_in_port,
-                                       server_out_port,
-                                       data)
-        self.pg0.add_stream(pkts)
-        self.pg_enable_capture(self.pg_interfaces)
-        self.pg_start()
-        frags = self.pg0.get_capture(len(pkts))
-        p = self.reass_frags_and_verify(frags,
-                                        self.nat_addr,
-                                        server.ip4)
-        self.assertNotEqual(p[TCP].sport, host_in_port)
-        self.assertEqual(p[TCP].dport, server_in_port)
-        self.assertEqual(data, p[Raw].load)
+        self.reass_hairpinning(proto=IP_PROTOS.tcp)
+        self.reass_hairpinning(proto=IP_PROTOS.udp)
+        self.reass_hairpinning(proto=IP_PROTOS.icmp)
     def test_frag_out_of_order(self):
         """ NAT44 translate fragments arriving out of order """
@@ -3420,45 +3400,9 @@ def test_frag_out_of_order(self):
         self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index,
                                                   is_inside=0)
-        data = "A" * 4 + "B" * 16 + "C" * 3
-        random.randint(1025, 65535)
-
-        # in2out
-        pkts = self.create_stream_frag(self.pg0,
-                                       self.pg1.remote_ip4,
-                                       self.tcp_port_in,
-                                       20,
-                                       data)
-        pkts.reverse()
-        self.pg0.add_stream(pkts)
-        self.pg_enable_capture(self.pg_interfaces)
-        self.pg_start()
-        frags = self.pg1.get_capture(len(pkts))
-        p = self.reass_frags_and_verify(frags,
-                                        self.nat_addr,
-                                        self.pg1.remote_ip4)
-        self.assertEqual(p[TCP].dport, 20)
-        self.assertNotEqual(p[TCP].sport, self.tcp_port_in)
-        self.tcp_port_out = p[TCP].sport
-        self.assertEqual(data, p[Raw].load)
-
-        # out2in
-        pkts = self.create_stream_frag(self.pg1,
-                                       self.nat_addr,
-                                       20,
-                                       self.tcp_port_out,
-                                       data)
-        pkts.reverse()
-        self.pg1.add_stream(pkts)
-        self.pg_enable_capture(self.pg_interfaces)
-        self.pg_start()
-        frags = self.pg0.get_capture(len(pkts))
-        p = self.reass_frags_and_verify(frags,
-                                        self.pg1.remote_ip4,
-                                        self.pg0.remote_ip4)
-        self.assertEqual(p[TCP].sport, 20)
-        self.assertEqual(p[TCP].dport, self.tcp_port_in)
-        self.assertEqual(data, p[Raw].load)
+        self.frag_out_of_order(proto=IP_PROTOS.tcp)
+        self.frag_out_of_order(proto=IP_PROTOS.udp)
+        self.frag_out_of_order(proto=IP_PROTOS.icmp)
     def test_port_restricted(self):
         """ Port restricted NAT44 (MAP-E CE) """
@@ -3967,8 +3911,7 @@ def test_reass_hairpinning(self):
                                       self.server_in_port,
                                       self.server_out_port,
                                       proto=IP_PROTOS.udp)
-        self.nat44_add_static_mapping(self.server.ip4, self.nat_addr,
-                                      proto=IP_PROTOS.icmp)
+        self.nat44_add_static_mapping(self.server.ip4, self.nat_addr)
         self.reass_hairpinning(proto=IP_PROTOS.tcp)
         self.reass_hairpinning(proto=IP_PROTOS.udp)
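The test_tcp_close test added in the next hunk repeats the same four-step idiom (add_stream, pg_enable_capture, pg_start, get_capture) once per TCP segment. A small, hypothetical local helper, not part of the patch, shows that idiom in one place:

    def send_one_expect_one(self, tx_if, rx_if, pkt):
        # send a single packet on tx_if and expect exactly one on rx_if
        tx_if.add_stream(pkt)
        self.pg_enable_capture(self.pg_interfaces)
        self.pg_start()
        return rx_if.get_capture(1)[0]

With such a helper each segment exchange below would reduce to a single call per direction.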
@@ -4921,6 +4864,88 @@ def test_twice_nat_interface_addr(self):
         adresses = self.vapi.nat44_address_dump()
         self.assertEqual(0, len(adresses))
+    def test_tcp_close(self):
+        """ Close TCP session from inside network - output feature """
+        self.vapi.nat44_forwarding_enable_disable(1)
+        self.nat44_add_address(self.pg1.local_ip4)
+        twice_nat_addr = '10.0.1.3'
+        service_ip = '192.168.16.150'
+        self.nat44_add_address(twice_nat_addr, twice_nat=1)
+        self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index)
+        self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index,
+                                                  is_inside=0)
+        self.vapi.nat44_interface_add_del_output_feature(self.pg1.sw_if_index,
+                                                         is_inside=0)
+        self.nat44_add_static_mapping(self.pg0.remote_ip4,
+                                      service_ip,
+                                      80,
+                                      80,
+                                      proto=IP_PROTOS.tcp,
+                                      out2in_only=1,
+                                      twice_nat=1)
+        sessions = self.vapi.nat44_user_session_dump(self.pg0.remote_ip4n, 0)
+        start_sessnum = len(sessions)
+
+        # SYN packet out->in
+        p = (Ether(src=self.pg1.remote_mac, dst=self.pg1.local_mac) /
+             IP(src=self.pg1.remote_ip4, dst=service_ip) /
+             TCP(sport=33898, dport=80, flags="S"))
+        self.pg1.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        capture = self.pg0.get_capture(1)
+        p = capture[0]
+        tcp_port = p[TCP].sport
+
+        # SYN + ACK packet in->out
+        p = (Ether(src=self.pg0.remote_mac, dst=self.pg0.local_mac) /
+             IP(src=self.pg0.remote_ip4, dst=twice_nat_addr) /
+             TCP(sport=80, dport=tcp_port, flags="SA"))
+        self.pg0.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        self.pg1.get_capture(1)
+
+        # ACK packet out->in
+        p = (Ether(src=self.pg1.remote_mac, dst=self.pg1.local_mac) /
+             IP(src=self.pg1.remote_ip4, dst=service_ip) /
+             TCP(sport=33898, dport=80, flags="A"))
+        self.pg1.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        self.pg0.get_capture(1)
+
+        # FIN packet in -> out
+        p = (Ether(src=self.pg0.remote_mac, dst=self.pg0.local_mac) /
+             IP(src=self.pg0.remote_ip4, dst=twice_nat_addr) /
+             TCP(sport=80, dport=tcp_port, flags="FA", seq=100, ack=300))
+        self.pg0.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        self.pg1.get_capture(1)
+
+        # FIN+ACK packet out -> in
+        p = (Ether(src=self.pg1.remote_mac, dst=self.pg1.local_mac) /
+             IP(src=self.pg1.remote_ip4, dst=service_ip) /
+             TCP(sport=33898, dport=80, flags="FA", seq=300, ack=101))
+        self.pg1.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        self.pg0.get_capture(1)
+
+        # ACK packet in -> out
+        p = (Ether(src=self.pg0.remote_mac, dst=self.pg0.local_mac) /
+             IP(src=self.pg0.remote_ip4, dst=twice_nat_addr) /
+             TCP(sport=80, dport=tcp_port, flags="A", seq=101, ack=301))
+        self.pg0.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        self.pg1.get_capture(1)
+
+        sessions = self.vapi.nat44_user_session_dump(self.pg0.remote_ip4n,
+                                                     0)
+        self.assertEqual(len(sessions) - start_sessnum, 0)
+
     def test_tcp_session_close_in(self):
         """ Close TCP session from inside network """
         self.tcp_port_out = 10505
@@ -5633,7 +5658,7 @@ def test_session_timeout(self):
         pkts = []
         for i in range(0, max_sessions):
-            src = "10.10.%u.%u" % ((i & 0xFF00) >> 8, i & 0xFF)
+            src = "10.11.%u.%u" % ((i & 0xFF00) >> 8, i & 0xFF)
             p = (Ether(dst=self.pg0.local_mac, src=self.pg0.remote_mac) /
                  IP(src=src, dst=self.pg1.remote_ip4) /
                  ICMP(id=1026, type='echo-request'))
             pkts.append(p)
@@ -5649,6 +5674,42 @@
             nsessions = nsessions + user.nsessions
         self.assertLess(nsessions, 2 * max_sessions)
+    @unittest.skipUnless(running_extended_tests(), "part of extended tests")
+    def test_session_rst_timeout(self):
+        """ NAT44 session RST timeouts """
+        self.nat44_add_address(self.nat_addr)
+        self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index)
+        self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index,
+                                                  is_inside=0)
+        self.vapi.nat_set_timeouts(tcp_transitory=5)
+
+        self.initiate_tcp_session(self.pg0, self.pg1)
+        p = (Ether(src=self.pg0.remote_mac, dst=self.pg0.local_mac) /
+             IP(src=self.pg0.remote_ip4, dst=self.pg1.remote_ip4) /
+             TCP(sport=self.tcp_port_in, dport=self.tcp_external_port,
+                 flags="R"))
+        self.pg0.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        self.pg1.get_capture(1)
+
+        sleep(6)
+
+        p = (Ether(src=self.pg0.remote_mac, dst=self.pg0.local_mac) /
+             IP(src=self.pg0.remote_ip4, dst=self.pg1.remote_ip4) /
+             TCP(sport=self.tcp_port_in + 1, dport=self.tcp_external_port + 1,
+                 flags="S"))
+        self.pg0.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        self.pg1.get_capture(1)
+
+        nsessions = 0
+        users = self.vapi.nat44_user_dump()
+        self.assertEqual(len(users), 1)
+        self.assertEqual(users[0].ip_address, self.pg0.remote_ip4n)
+        self.assertEqual(users[0].nsessions, 1)
+
     @unittest.skipUnless(running_extended_tests(), "part of extended tests")
     def test_session_limit_per_user(self):
         """ Maximum sessions per user limit """
diff --git a/test/test_neighbor.py b/test/test_neighbor.py
index a15106af1f9a..674240487974 100644
--- a/test/test_neighbor.py
+++ b/test/test_neighbor.py
@@ -6,11 +6,12 @@
 from framework import VppTestCase, VppTestRunner
 from vpp_neighbor import VppNeighbor, find_nbr
 from vpp_ip_route import VppIpRoute, VppRoutePath, find_route, \
-    VppIpTable
+    VppIpTable, DpoProto
 from scapy.packet import Raw
 from scapy.layers.l2 import Ether, ARP, Dot1Q
 from scapy.layers.inet import IP, UDP
+from scapy.layers.inet6 import IPv6
 from scapy.contrib.mpls import MPLS
 from scapy.layers.inet6 import IPv6
@@ -1321,14 +1322,16 @@ def test_arp_incomplete(self):
         """ Incomplete Entries """
         #
-        # ensure that we throttle the ARP requests
+        # ensure that we throttle the ARP and ND requests
         #
         self.pg0.generate_remote_hosts(2)
+        #
+        # IPv4/ARP
+        #
         ip_10_0_0_1 = VppIpRoute(self, "10.0.0.1", 32,
                                  [VppRoutePath(self.pg0.remote_hosts[1].ip4,
-                                               self.pg0.sw_if_index,
-                                               labels=[55])])
+                                               self.pg0.sw_if_index)])
         ip_10_0_0_1.add_vpp_config()
         p1 = (Ether(dst=self.pg1.local_mac,
@@ -1349,6 +1352,34 @@
         #
         self.assertTrue(len(rx) < 64)
+        #
+        # IPv6/ND
+        #
+        ip_10_1 = VppIpRoute(self, "10::1", 128,
+                             [VppRoutePath(self.pg0.remote_hosts[1].ip6,
+                                           self.pg0.sw_if_index,
+                                           proto=DpoProto.DPO_PROTO_IP6)],
+                             is_ip6=1)
+        ip_10_1.add_vpp_config()
+
+        p1 = (Ether(dst=self.pg1.local_mac,
+                    src=self.pg1.remote_mac) /
+              IPv6(src=self.pg1.remote_ip6,
+                   dst="10::1") /
+              UDP(sport=1234, dport=1234) /
+              Raw())
+
+        self.pg1.add_stream(p1 * 257)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        rx = self.pg0._get_capture(1)
+
+        #
+        # how many we get is going to be dependent on the time for packet
+        # processing but it should be small
+        #
+        self.assertTrue(len(rx) < 64)
+
 class NeighborStatsTestCase(VppTestCase):
     """ ARP Test Case """
diff --git a/test/vpp_memif.py b/test/vpp_memif.py
index 2095480a7c2b..24e8d19b1c71 100644
--- a/test/vpp_memif.py
+++ b/test/vpp_memif.py
@@ -50,7 +50,8 @@ def add_vpp_config(self):
         rv = self._test.vapi.memif_socket_filename_add_del(
             1, self.socket_id, self.socket_filename)
         if self.add_default_folder:
-            self.socket_filename = "/run/vpp/" + self.socket_filename
+            self.socket_filename = self._test.tempdir + "/" \
+                + self.socket_filename
         return rv
     def remove_vpp_config(self):
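Finally, the vpp_memif.py hunk above prefixes default-folder socket names with the per-test tempdir instead of the fixed /run/vpp. A standalone sketch of that path handling, using hypothetical stand-ins for the test-framework objects:

    # stand-ins for the real VppTestCase plumbing; names are illustrative only
    class FakeTest(object):
        tempdir = "/tmp/vpp-unittest-example"

    def default_socket_path(test, socket_filename, add_default_folder=True):
        # mirrors the behaviour added above: bare names get the tempdir prefix
        if add_default_folder:
            return test.tempdir + "/" + socket_filename
        return socket_filename

    print(default_socket_path(FakeTest(), "memif.sock"))
    # prints: /tmp/vpp-unittest-example/memif.sock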