Skip to content

concurrent creates when min-free-disk is exceeded can lead to gfid mismatches between linkto and data files #4548

Open
@pranithk

Description

@pranithk

We hit an issue in production where 28 files had a gfid mismatch between the linkto file and the data file. After reading the code, we were able to reproduce the issue with the help of gdb.
TLDR of the flow:

mount-1 and mount-2 try to create the same file while the min-free-disk threshold on the hashed subvol is breached:
1. mount-1 creates a linkto file for file1.txt (gfid-1) in subvol-1 (hashed) and is in the process of creating the data file in subvol-2
2. mount-2 does a lookup on file1.txt before create is issued and doesn't find the data file for it.
3. mount-2 deletes the linkto file file1.txt (gfid-1) on subvol-1, thinking it is a stale file
4. mount-1 goes ahead and creates the data file on subvol-2 (gfid-1)
5. mount-2 creates a linkto file on subvol-1 (gfid-2)
6. mount-2 tries to create the data file on subvol-2 with gfid-2, but since the file already exists, the create succeeds and the response carries gfid-1 in the stat structures.

This leads to a gfid mismatch between the linkto file and the data file.

Detailed steps:
Prepare the volume so that some of the bricks (brick1 and brick2 below) breach the min-free-disk threshold:

root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=brick1.file bs=1M count=300
300+0 records in
300+0 records out
314572800 bytes (315 MB, 300 MiB) copied, 0.0984521 s, 3.2 GB/s
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=brick2.file bs=1M count=300
300+0 records in
300+0 records out
314572800 bytes (315 MB, 300 MiB) copied, 0.180304 s, 1.7 GB/s
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=brick3.file bs=1M count=300
300+0 records in
300+0 records out
314572800 bytes (315 MB, 300 MiB) copied, 0.13029 s, 2.4 GB/s
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=brick4.file bs=1M count=300
300+0 records in
300+0 records out
314572800 bytes (315 MB, 300 MiB) copied, 0.146483 s, 2.1 GB/s
root@glusterfs-ubuntu-24:~# sudo mkfs.xfs -f brick1.file
meta-data=brick1.file            isize=512    agcount=4, agsize=19200 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=1        finobt=1, sparse=1, rmapbt=1
         =                       reflink=1    bigtime=1 inobtcount=1 nrext64=0
data     =                       bsize=4096   blocks=76800, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
log      =internal log           bsize=4096   blocks=16384, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0
root@glusterfs-ubuntu-24:~# sudo mkfs.xfs -f brick2.file
meta-data=brick2.file            isize=512    agcount=4, agsize=19200 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=1        finobt=1, sparse=1, rmapbt=1
         =                       reflink=1    bigtime=1 inobtcount=1 nrext64=0
data     =                       bsize=4096   blocks=76800, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
log      =internal log           bsize=4096   blocks=16384, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0
root@glusterfs-ubuntu-24:~# sudo mkfs.xfs -f brick3.file
meta-data=brick3.file            isize=512    agcount=4, agsize=19200 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=1        finobt=1, sparse=1, rmapbt=1
         =                       reflink=1    bigtime=1 inobtcount=1 nrext64=0
data     =                       bsize=4096   blocks=76800, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
log      =internal log           bsize=4096   blocks=16384, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0
root@glusterfs-ubuntu-24:~# sudo mkfs.xfs -f brick4.file
meta-data=brick4.file            isize=512    agcount=4, agsize=19200 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=1        finobt=1, sparse=1, rmapbt=1
         =                       reflink=1    bigtime=1 inobtcount=1 nrext64=0
data     =                       bsize=4096   blocks=76800, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
log      =internal log           bsize=4096   blocks=16384, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0
root@glusterfs-ubuntu-24:~# mkdir /brick{1..4}
root@glusterfs-ubuntu-24:~# for i in {1..4}; do mount -t xfs brick$i.file /brick$i; done

root@glusterfs-ubuntu-24:~# df -Th
Filesystem     Type   Size  Used Avail Use% Mounted on
...
/dev/loop16    xfs    236M   20M  217M   9% /brick1
/dev/loop17    xfs    236M   20M  217M   9% /brick2
/dev/loop18    xfs    236M   20M  217M   9% /brick3
/dev/loop19    xfs    236M   20M  217M   9% /brick4
root@glusterfs-ubuntu-24:~# glusterd
root@glusterfs-ubuntu-24:~# gluster v create vol replica 2 glusterfs-ubuntu-24:/brick{1..4} force
volume create: vol: success: please start the volume to access data
root@glusterfs-ubuntu-24:~# gluster v start vol
volume start: vol: success
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=/brick1/.glusterfs/filler_file bs=1M count=200
200+0 records in
200+0 records out
209715200 bytes (210 MB, 200 MiB) copied, 0.0414856 s, 5.1 GB/s
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=/brick2/.glusterfs/filler_file bs=1M count=200
200+0 records in
200+0 records out
209715200 bytes (210 MB, 200 MiB) copied, 0.0483113 s, 4.3 GB/s
root@glusterfs-ubuntu-24:~# df -Th
Filesystem     Type   Size  Used Avail Use% Mounted on
tmpfs          tmpfs  392M  1.4M  391M   1% /run
/dev/vda2      ext4    49G   14G   34G  29% /
tmpfs          tmpfs  2.0G     0  2.0G   0% /dev/shm
tmpfs          tmpfs  5.0M  8.0K  5.0M   1% /run/lock
tmpfs          tmpfs  392M   80K  392M   1% /run/user/1000
/dev/loop16    xfs    236M  221M   16M  94% /brick1
/dev/loop17    xfs    236M  221M   16M  94% /brick2
/dev/loop18    xfs    236M   21M  216M   9% /brick3
/dev/loop19    xfs    236M   21M  216M   9% /brick4
root@glusterfs-ubuntu-24:~# mount -t glusterfs glusterfs-ubuntu-24:/vol /mnt/glusterfs/0
root@glusterfs-ubuntu-24:~# mount -t glusterfs glusterfs-ubuntu-24:/vol /mnt/glusterfs/1

Create 10 files on the mount to see which files have corresponding data file:

root@glusterfs-ubuntu-24:~# cd /mnt/glusterfs/0/
root@glusterfs-ubuntu-24:/mnt/glusterfs/0# touch {1..10}
root@glusterfs-ubuntu-24:/mnt/glusterfs/0# ls /brick*
/brick1:
1  5  7  8  9

/brick2:
1  5  7  8  9

/brick3:
1  10  2  3  4  5  6  7  8  9

/brick4:
1  10  2  3  4  5  6  7  8  9

Files 1, 5, 7, 8 and 9 are hashed to brick1/brick2, but their data is on brick3/brick4.
Pick file 9 as the one on which we will simulate the race using gdb.

root@glusterfs-ubuntu-24:~# cd /mnt/glusterfs/1
root@glusterfs-ubuntu-24:/mnt/glusterfs/1# rm -f 9

Attach gdb to both mount processes and set a breakpoint on each of the following two functions:
dht_create_linkfile_create_cbk and afr_create_wind

On the /mnt/glusterfs/0 mount, run touch 9:

root@glusterfs-ubuntu-24:/mnt/glusterfs/0# touch 9

The breakpoint will be hit in the corresponding gdb session. Continue until it reaches the first afr_create_wind.
At this point a linkto file has been created on brick1 and brick2.

Now run touch 9 on the /mnt/glusterfs/1 mount:

root@glusterfs-ubuntu-24:/mnt/glusterfs/1# touch 9

This mount detects the linkto file on brick1 and brick2 but does not see the data file on brick3 and brick4, so it concludes that the linkto file is stale and deletes it from brick1 and brick2. The following messages appear in the logs of both bricks:

[2025-05-22 01:31:32.894237 +0000] I [MSGID: 113030] [posix-entry-ops.c:1401:posix_unlink] 0-vol-posix: open-fd-key-status: 0 for /brick1/9
[2025-05-22 01:31:32.894266 +0000] I [MSGID: 113031] [posix-entry-ops.c:1292:posix_skip_non_linkto_unlink] 0-posix: linkto_xattr status: 0 for /brick1/9

Press continue in the gdb session of /mnt/glusterfs/0 to let its data file creation go through. Then press continue in the gdb session of /mnt/glusterfs/1 to let its data file creation go through.

File 9 is now in a gfid-mismatch state:

root@glusterfs-ubuntu-24:~# ls /brick*
/brick1:
1  5  7  8  9

/brick2:
1  5  7  8  9

/brick3:
1  10  2  3  4  5  6  7  8  9

/brick4:
1  10  2  3  4  5  6  7  8  9
root@glusterfs-ubuntu-24:~# getfattr -d -m. -e hex /brick{1..4}/9
getfattr: Removing leading '/' from absolute path names
# file: brick1/9
trusted.gfid=0x9e8ff9fcbc5c47af9f668266fdbc981c
trusted.gfid2path.cb798f8b54503c71=0x30303030303030302d303030302d303030302d303030302d3030303030303030303030312f39
trusted.glusterfs.dht.linkto=0x766f6c2d7265706c69636174652d3100
trusted.glusterfs.mdata=0x01000000000000000000000000682e7e74000000003588f3b400000000682e7e74000000003588f3b400000000682e7e74000000003588f3b4

# file: brick2/9
trusted.gfid=0x9e8ff9fcbc5c47af9f668266fdbc981c
trusted.gfid2path.cb798f8b54503c71=0x30303030303030302d303030302d303030302d303030302d3030303030303030303030312f39
trusted.glusterfs.dht.linkto=0x766f6c2d7265706c69636174652d3100
trusted.glusterfs.mdata=0x01000000000000000000000000682e7e74000000003588f3b400000000682e7e74000000003588f3b400000000682e7e74000000003588f3b4

# file: brick3/9
trusted.gfid=0x6df65a349aa24d2bb133f0038afdf379
trusted.gfid2path.cb798f8b54503c71=0x30303030303030302d303030302d303030302d303030302d3030303030303030303030312f39
trusted.glusterfs.mdata=0x01000000000000000000000000682e7ec1000000002e38e0c300000000682e7ec1000000002e38e0c300000000682e7ec1000000002e38e0c3

# file: brick4/9
trusted.gfid=0x6df65a349aa24d2bb133f0038afdf379
trusted.gfid2path.cb798f8b54503c71=0x30303030303030302d303030302d303030302d303030302d3030303030303030303030312f39
trusted.glusterfs.mdata=0x01000000000000000000000000682e7ec1000000002e38e0c300000000682e7ec1000000002e38e0c300000000682e7ec1000000002e38e0c3

root@glusterfs-ubuntu-24:~# getfattr -d -m. -e text /brick{1..4}/9
getfattr: Removing leading '/' from absolute path names
# file: brick1/9
trusted.gfid="�����\\G��f�f����"
trusted.gfid2path.cb798f8b54503c71="00000000-0000-0000-0000-000000000001/9"
trusted.glusterfs.dht.linkto="vol-replicate-1"
trusted.glusterfs.mdata="\000\000\000\000\000\000\000\000\000\000\000\000h.~t\000\000\000\0005��\000\000\000\000h.~t\000\000\000\0005��\000\000\000\000h.~t\000\000\000\0005��"

# file: brick2/9
trusted.gfid="�����\\G��f�f����"
trusted.gfid2path.cb798f8b54503c71="00000000-0000-0000-0000-000000000001/9"
trusted.glusterfs.dht.linkto="vol-replicate-1"
trusted.glusterfs.mdata="\000\000\000\000\000\000\000\000\000\000\000\000h.~t\000\000\000\0005��\000\000\000\000h.~t\000\000\000\0005��\000\000\000\000h.~t\000\000\000\0005��"

# file: brick3/9
trusted.gfid="m�Z4��M+�3����y"
trusted.gfid2path.cb798f8b54503c71="00000000-0000-0000-0000-000000000001/9"
trusted.glusterfs.mdata="\000\000\000\000\000\000\000\000\000\000\000\000h.~�\000\000\000\000.8��\000\000\000\000h.~�\000\000\000\000.8��\000\000\000\000h.~�\000\000\000\000.8��"

# file: brick4/9
trusted.gfid="m�Z4��M+�3����y"
trusted.gfid2path.cb798f8b54503c71="00000000-0000-0000-0000-000000000001/9"
trusted.glusterfs.mdata="\000\000\000\000\000\000\000\000\000\000\000\000h.~�\000\000\000\000.8��\000\000\000\000h.~�\000\000\000\000.8��\000\000\000\000h.~�\000\000\000\000.8��"

root@glusterfs-ubuntu-24:~#

Metadata

Metadata

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions