Skip to content
Permalink
Browse files
net: switchdev: move SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE to the blocking notifier chain

Currently, br_switchdev_fdb_notify() uses call_switchdev_notifiers (and
br_fdb_replay() open-codes the same thing). This means that drivers
handle the SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE events on the atomic
switchdev notifier block.

Most existing switchdev drivers either talk to firmware, or to a device
over a bus where the I/O is sleepable (SPI, I2C, MDIO etc). So there
exists an (anti)pattern where drivers make a sleepable context for
offloading the given FDB entry by registering an ordered workqueue and
scheduling work items on it, and doing all the work from there.

The problem is the inherent limitation that this design imposes upon
what a switchdev driver can do with those FDB entries.

For example, a switchdev driver might want to perform FDB isolation,
i.e. associate each FDB entry with the bridge it belongs to. Maybe the
driver associates each bridge with a number, allocating that number when
the first port of the driver joins that bridge, and freeing it when the
last port leaves it.

And this is where the problem is. When user space deletes a bridge and
all the ports leave, the bridge will notify us of the deletion of all
FDB entries in atomic context, and switchdev drivers will schedule their
private work items on their private workqueue.

The FDB entry deletion notifications will succeed, the bridge will then
finish deleting itself, but the switchdev work items have not run yet.
When they do eventually get scheduled, the aforementioned association
between the bridge_dev and a number will have already been broken by the
switchdev driver. All ports are standalone now, the bridge is a foreign
interface!

One might say "why don't you cache all your associations while you're
still in the atomic context and they're still valid, pass them by value
through your switchdev_work and work with the cached values as opposed
to the current ones?"

This option smells of poor design, because instead of fixing a central
problem, we add tens of lateral workarounds to avoid it. It should be
easier to use switchdev, not harder, and we should look at the common
patterns which lead to code duplication and eliminate them.

In this case, we must notice that
(a) switchdev already has the concept of notifiers emitted from the fast
    path that are still processed by drivers from blocking context. This
    is accomplished through the SWITCHDEV_F_DEFER flag which is used by
    e.g. SWITCHDEV_OBJ_ID_HOST_MDB.
(b) the bridge del_nbp() function already calls switchdev_deferred_process().
    So if we could hook into that, we could have a chance that the
    bridge simply waits for our FDB entry offloading procedure to finish
    before it calls netdev_upper_dev_unlink() - which is almost
    immediately afterwards, and also when switchdev drivers typically
    break their stateful associations between the bridge upper and
    private data.

So it is in fact possible to use switchdev's generic
switchdev_deferred_enqueue mechanism to get a sleepable callback, and
from there we can call_switchdev_blocking_notifiers().

In the case of br_fdb_replay(), the only code path is from
switchdev_bridge_port_offload(), which is already in blocking context.
So we don't need to go through switchdev_deferred_enqueue, and we can
just call the blocking notifier block directly.

To preserve the same behavior as before, all drivers need to have their
SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE handlers moved from their switchdev
atomic notifier blocks to the blocking ones. This patch attempts to make
that trivial movement. Note that now they might schedule a work item for
nothing (since they are now called from a work item themselves), but I
don't have the energy or hardware to test all of them, so this will have
to do.

Note that previously, we were under rcu_read_lock() but now we're not.
I have eyeballed the drivers that make any sort of RCU assumption and
enclosed them between a private rcu_read_lock()/rcu_read_unlock(). This
can be dropped when the drivers themselves are reworked.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
  • Loading branch information
vladimiroltean authored and intel-lab-lkp committed Aug 18, 2021
1 parent f0b6745 commit c3f584a3a5fcbaf747a1c754720afa897885c9a7
Show file tree
Hide file tree
Showing 14 changed files with 443 additions and 248 deletions.
@@ -2254,52 +2254,11 @@ static int dpaa2_switch_port_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
struct ethsw_port_priv *port_priv = netdev_priv(dev);
struct ethsw_switchdev_event_work *switchdev_work;
struct switchdev_notifier_fdb_info *fdb_info = ptr;
struct ethsw_core *ethsw = port_priv->ethsw_data;

if (event == SWITCHDEV_PORT_ATTR_SET)
return dpaa2_switch_port_attr_set_event(dev, ptr);

if (!dpaa2_switch_port_dev_check(dev))
return NOTIFY_DONE;

switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
if (!switchdev_work)
return NOTIFY_BAD;

INIT_WORK(&switchdev_work->work, dpaa2_switch_event_work);
switchdev_work->dev = dev;
switchdev_work->event = event;

switch (event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE:
case SWITCHDEV_FDB_DEL_TO_DEVICE:
memcpy(&switchdev_work->fdb_info, ptr,
sizeof(switchdev_work->fdb_info));
switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
if (!switchdev_work->fdb_info.addr)
goto err_addr_alloc;

ether_addr_copy((u8 *)switchdev_work->fdb_info.addr,
fdb_info->addr);

/* Take a reference on the device to avoid being freed. */
dev_hold(dev);
break;
default:
kfree(switchdev_work);
return NOTIFY_DONE;
}

queue_work(ethsw->workqueue, &switchdev_work->work);

return NOTIFY_DONE;

err_addr_alloc:
kfree(switchdev_work);
return NOTIFY_BAD;
}

static int dpaa2_switch_port_obj_event(unsigned long event,
@@ -2324,6 +2283,46 @@ static int dpaa2_switch_port_obj_event(unsigned long event,
return notifier_from_errno(err);
}

/*
 * Deferred handler for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE, invoked from the
 * blocking switchdev notifier chain. Copies the FDB info — including a deep
 * copy of the MAC address, whose notifier-provided buffer is only valid for
 * the duration of this call — into a work item queued on the driver's
 * ordered workqueue.
 *
 * Returns NOTIFY_DONE on success or when the event is not for one of our
 * ports, NOTIFY_BAD on allocation failure.
 */
static int dpaa2_switch_fdb_event(unsigned long event,
				  struct net_device *dev,
				  struct switchdev_notifier_fdb_info *fdb_info)
{
	struct ethsw_switchdev_event_work *switchdev_work;
	struct ethsw_port_priv *port_priv;
	struct ethsw_core *ethsw;

	/* Verify port ownership before touching netdev_priv(): for a foreign
	 * netdev the private area is not an ethsw_port_priv, so reading
	 * ->ethsw_data through it would dereference arbitrary memory.
	 */
	if (!dpaa2_switch_port_dev_check(dev))
		return NOTIFY_DONE;

	port_priv = netdev_priv(dev);
	ethsw = port_priv->ethsw_data;

	switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
	if (!switchdev_work)
		return NOTIFY_BAD;

	INIT_WORK(&switchdev_work->work, dpaa2_switch_event_work);
	switchdev_work->dev = dev;
	switchdev_work->event = event;

	memcpy(&switchdev_work->fdb_info, fdb_info,
	       sizeof(switchdev_work->fdb_info));

	/* Deep-copy the MAC; the caller's buffer does not outlive this call */
	switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
	if (!switchdev_work->fdb_info.addr)
		goto err_addr_alloc;

	ether_addr_copy((u8 *)switchdev_work->fdb_info.addr,
			fdb_info->addr);

	/* Take a reference on the device to avoid being freed. */
	dev_hold(dev);

	queue_work(ethsw->workqueue, &switchdev_work->work);

	return NOTIFY_DONE;

err_addr_alloc:
	kfree(switchdev_work);
	return NOTIFY_BAD;
}

static int dpaa2_switch_port_blocking_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
@@ -2335,6 +2334,9 @@ static int dpaa2_switch_port_blocking_event(struct notifier_block *nb,
return dpaa2_switch_port_obj_event(event, dev, ptr);
case SWITCHDEV_PORT_ATTR_SET:
return dpaa2_switch_port_attr_set_event(dev, ptr);
case SWITCHDEV_FDB_ADD_TO_DEVICE:
case SWITCHDEV_FDB_DEL_TO_DEVICE:
return dpaa2_switch_fdb_event(event, dev, ptr);
}

return NOTIFY_DONE;
@@ -845,10 +845,6 @@ static int prestera_switchdev_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
struct switchdev_notifier_fdb_info *fdb_info;
struct switchdev_notifier_info *info = ptr;
struct prestera_fdb_event_work *swdev_work;
struct net_device *upper;
int err;

if (event == SWITCHDEV_PORT_ATTR_SET) {
@@ -858,54 +854,7 @@ static int prestera_switchdev_event(struct notifier_block *unused,
return notifier_from_errno(err);
}

if (!prestera_netdev_check(dev))
return NOTIFY_DONE;

upper = netdev_master_upper_dev_get_rcu(dev);
if (!upper)
return NOTIFY_DONE;

if (!netif_is_bridge_master(upper))
return NOTIFY_DONE;

swdev_work = kzalloc(sizeof(*swdev_work), GFP_ATOMIC);
if (!swdev_work)
return NOTIFY_BAD;

swdev_work->event = event;
swdev_work->dev = dev;

switch (event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE:
case SWITCHDEV_FDB_DEL_TO_DEVICE:
fdb_info = container_of(info,
struct switchdev_notifier_fdb_info,
info);

INIT_WORK(&swdev_work->work, prestera_fdb_event_work);
memcpy(&swdev_work->fdb_info, ptr,
sizeof(swdev_work->fdb_info));

swdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
if (!swdev_work->fdb_info.addr)
goto out_bad;

ether_addr_copy((u8 *)swdev_work->fdb_info.addr,
fdb_info->addr);
dev_hold(dev);
break;

default:
kfree(swdev_work);
return NOTIFY_DONE;
}

queue_work(swdev_wq, &swdev_work->work);
return NOTIFY_DONE;

out_bad:
kfree(swdev_work);
return NOTIFY_BAD;
}

static int
@@ -1101,6 +1050,53 @@ static int prestera_port_obj_del(struct net_device *dev, const void *ctx,
}
}

static int prestera_switchdev_fdb_event(struct net_device *dev,
unsigned long event,
struct switchdev_notifier_info *info)
{
struct switchdev_notifier_fdb_info *fdb_info;
struct prestera_fdb_event_work *swdev_work;
struct net_device *upper;

if (!prestera_netdev_check(dev))
return 0;

upper = netdev_master_upper_dev_get_rcu(dev);
if (!upper)
return 0;

if (!netif_is_bridge_master(upper))
return 0;

swdev_work = kzalloc(sizeof(*swdev_work), GFP_ATOMIC);
if (!swdev_work)
return -ENOMEM;

swdev_work->event = event;
swdev_work->dev = dev;

fdb_info = container_of(info, struct switchdev_notifier_fdb_info,
info);

INIT_WORK(&swdev_work->work, prestera_fdb_event_work);
memcpy(&swdev_work->fdb_info, fdb_info, sizeof(swdev_work->fdb_info));

swdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
if (!swdev_work->fdb_info.addr)
goto out_bad;

ether_addr_copy((u8 *)swdev_work->fdb_info.addr,
fdb_info->addr);
dev_hold(dev);

queue_work(swdev_wq, &swdev_work->work);
return 0;

out_bad:
kfree(swdev_work);
return -ENOMEM;
}

static int prestera_switchdev_blk_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
@@ -1123,8 +1119,12 @@ static int prestera_switchdev_blk_event(struct notifier_block *unused,
prestera_netdev_check,
prestera_port_obj_attr_set);
break;
default:
err = -EOPNOTSUPP;
case SWITCHDEV_FDB_ADD_TO_DEVICE:
case SWITCHDEV_FDB_DEL_TO_DEVICE:
rcu_read_lock();
err = prestera_switchdev_fdb_event(dev, event, ptr);
rcu_read_unlock();
break;
}

return notifier_from_errno(err);
@@ -276,6 +276,55 @@ mlx5_esw_bridge_port_obj_attr_set(struct net_device *dev,
return err;
}

static struct mlx5_bridge_switchdev_fdb_work *
mlx5_esw_bridge_init_switchdev_fdb_work(struct net_device *dev, bool add,
struct switchdev_notifier_fdb_info *fdb_info,
struct mlx5_esw_bridge_offloads *br_offloads);

/*
 * Handle SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE from the blocking notifier chain
 * by allocating a work item and queueing it on br_offloads->wq.
 *
 * The caller wraps this in rcu_read_lock()/rcu_read_unlock(), which
 * netdev_master_upper_dev_get_rcu() requires.
 *
 * Returns 0 when the event was queued or is not relevant (no bridge master,
 * not one of our reps, or a local entry), or a negative errno if the work
 * item could not be initialized.
 */
static int
mlx5_esw_bridge_fdb_event(struct net_device *dev, unsigned long event,
struct switchdev_notifier_info *info,
struct mlx5_esw_bridge_offloads *br_offloads)
{
struct switchdev_notifier_fdb_info *fdb_info;
struct mlx5_bridge_switchdev_fdb_work *work;
struct mlx5_eswitch *esw = br_offloads->esw;
u16 vport_num, esw_owner_vhca_id;
struct net_device *upper, *rep;

/* Ignore FDB entries on ports that are not under a bridge */
upper = netdev_master_upper_dev_get_rcu(dev);
if (!upper)
return 0;
if (!netif_is_bridge_master(upper))
return 0;

/* Resolve dev to an eswitch representor; presumably also fills in the
 * vport/vhca identifiers for the peer check below — TODO confirm against
 * the helper's definition.
 */
rep = mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, esw,
&vport_num,
&esw_owner_vhca_id);
if (!rep)
return 0;

/* only handle the event on peers */
if (mlx5_esw_bridge_is_local(dev, rep, esw))
return 0;

fdb_info = container_of(info, struct switchdev_notifier_fdb_info, info);

/* The helper copies the FDB info into a self-contained work item; an
 * add event is distinguished from a delete via the bool argument.
 */
work = mlx5_esw_bridge_init_switchdev_fdb_work(dev,
event == SWITCHDEV_FDB_ADD_TO_DEVICE,
fdb_info,
br_offloads);
if (IS_ERR(work)) {
/* Loud but rate-limited: losing an FDB event is worth flagging */
WARN_ONCE(1, "Failed to init switchdev work, err=%ld",
PTR_ERR(work));
return PTR_ERR(work);
}

queue_work(br_offloads->wq, &work->work);

return 0;
}

static int mlx5_esw_bridge_event_blocking(struct notifier_block *nb,
unsigned long event, void *ptr)
{
@@ -295,6 +344,12 @@ static int mlx5_esw_bridge_event_blocking(struct notifier_block *nb,
case SWITCHDEV_PORT_ATTR_SET:
err = mlx5_esw_bridge_port_obj_attr_set(dev, ptr, br_offloads);
break;
case SWITCHDEV_FDB_ADD_TO_DEVICE:
case SWITCHDEV_FDB_DEL_TO_DEVICE:
rcu_read_lock();
err = mlx5_esw_bridge_fdb_event(dev, event, ptr, br_offloads);
rcu_read_unlock();
break;
default:
err = 0;
}
@@ -415,9 +470,7 @@ static int mlx5_esw_bridge_switchdev_event(struct notifier_block *nb,
/* only handle the event on peers */
if (mlx5_esw_bridge_is_local(dev, rep, esw))
break;
fallthrough;
case SWITCHDEV_FDB_ADD_TO_DEVICE:
case SWITCHDEV_FDB_DEL_TO_DEVICE:

fdb_info = container_of(info,
struct switchdev_notifier_fdb_info,
info);

0 comments on commit c3f584a

Please sign in to comment.