Description
We hit an issue in production where 28 files ended up with a gfid mismatch between the linkto file and the data file. After reading the code, we were able to reproduce the issue with the help of gdb.
TLDR of the flow:
mount-1 and mount-2 both try to create the same file while the min-free-disk threshold on the hashed subvol is breached:
1. mount-1 creates a linkto file for file1.txt (gfid-1) on subvol-1 (hashed) and is in the process of creating the data file on subvol-2.
2. mount-2 does a lookup on file1.txt before create is issued and doesn't find the data file for it.
3. mount-2 deletes the linkto file file1.txt (gfid-1) on subvol-1, concluding that it is a stale linkto file.
4. mount-1 goes ahead and creates the data file on subvol-2 (gfid-1)
5. mount-2 creates a linkto file on subvol-1 (gfid-2)
6. mount-2 tries to create the data file on subvol-2 with gfid-2. Since the file already exists, the create still returns success, but the response carries gfid-1 in the stat structures.
This leads to a gfid mismatch: the linkto file has gfid-2 while the data file has gfid-1.
Detailed steps:
Prepare the volume so that the bricks of the hashed subvol breach the min-free-disk threshold:
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=brick1.file bs=1M count=300
300+0 records in
300+0 records out
314572800 bytes (315 MB, 300 MiB) copied, 0.0984521 s, 3.2 GB/s
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=brick2.file bs=1M count=300
300+0 records in
300+0 records out
314572800 bytes (315 MB, 300 MiB) copied, 0.180304 s, 1.7 GB/s
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=brick3.file bs=1M count=300
300+0 records in
300+0 records out
314572800 bytes (315 MB, 300 MiB) copied, 0.13029 s, 2.4 GB/s
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=brick4.file bs=1M count=300
300+0 records in
300+0 records out
314572800 bytes (315 MB, 300 MiB) copied, 0.146483 s, 2.1 GB/s
root@glusterfs-ubuntu-24:~# sudo mkfs.xfs -f brick1.file
meta-data=brick1.file isize=512 agcount=4, agsize=19200 blks
= sectsz=512 attr=2, projid32bit=1
= crc=1 finobt=1, sparse=1, rmapbt=1
= reflink=1 bigtime=1 inobtcount=1 nrext64=0
data = bsize=4096 blocks=76800, imaxpct=25
= sunit=0 swidth=0 blks
naming =version 2 bsize=4096 ascii-ci=0, ftype=1
log =internal log bsize=4096 blocks=16384, version=2
= sectsz=512 sunit=0 blks, lazy-count=1
realtime =none extsz=4096 blocks=0, rtextents=0
root@glusterfs-ubuntu-24:~# sudo mkfs.xfs -f brick2.file
meta-data=brick2.file isize=512 agcount=4, agsize=19200 blks
= sectsz=512 attr=2, projid32bit=1
= crc=1 finobt=1, sparse=1, rmapbt=1
= reflink=1 bigtime=1 inobtcount=1 nrext64=0
data = bsize=4096 blocks=76800, imaxpct=25
= sunit=0 swidth=0 blks
naming =version 2 bsize=4096 ascii-ci=0, ftype=1
log =internal log bsize=4096 blocks=16384, version=2
= sectsz=512 sunit=0 blks, lazy-count=1
realtime =none extsz=4096 blocks=0, rtextents=0
root@glusterfs-ubuntu-24:~# sudo mkfs.xfs -f brick3.file
meta-data=brick3.file isize=512 agcount=4, agsize=19200 blks
= sectsz=512 attr=2, projid32bit=1
= crc=1 finobt=1, sparse=1, rmapbt=1
= reflink=1 bigtime=1 inobtcount=1 nrext64=0
data = bsize=4096 blocks=76800, imaxpct=25
= sunit=0 swidth=0 blks
naming =version 2 bsize=4096 ascii-ci=0, ftype=1
log =internal log bsize=4096 blocks=16384, version=2
= sectsz=512 sunit=0 blks, lazy-count=1
realtime =none extsz=4096 blocks=0, rtextents=0
root@glusterfs-ubuntu-24:~# sudo mkfs.xfs -f brick4.file
meta-data=brick4.file isize=512 agcount=4, agsize=19200 blks
= sectsz=512 attr=2, projid32bit=1
= crc=1 finobt=1, sparse=1, rmapbt=1
= reflink=1 bigtime=1 inobtcount=1 nrext64=0
data = bsize=4096 blocks=76800, imaxpct=25
= sunit=0 swidth=0 blks
naming =version 2 bsize=4096 ascii-ci=0, ftype=1
log =internal log bsize=4096 blocks=16384, version=2
= sectsz=512 sunit=0 blks, lazy-count=1
realtime =none extsz=4096 blocks=0, rtextents=0
root@glusterfs-ubuntu-24:~# mkdir /brick{1..4}
root@glusterfs-ubuntu-24:~# for i in {1..4}; do mount -t xfs brick$i.file /brick$i; done
root@glusterfs-ubuntu-24:~# df -Th
Filesystem Type Size Used Avail Use% Mounted on
...
/dev/loop16 xfs 236M 20M 217M 9% /brick1
/dev/loop17 xfs 236M 20M 217M 9% /brick2
/dev/loop18 xfs 236M 20M 217M 9% /brick3
/dev/loop19 xfs 236M 20M 217M 9% /brick4
root@glusterfs-ubuntu-24:~# glusterd
root@glusterfs-ubuntu-24:~# gluster v create vol replica 2 glusterfs-ubuntu-24:/brick{1..4} force
volume create: vol: success: please start the volume to access data
root@glusterfs-ubuntu-24:~# gluster v start vol
volume start: vol: success
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=/brick1/.glusterfs/filler_file bs=1M count=200
200+0 records in
200+0 records out
209715200 bytes (210 MB, 200 MiB) copied, 0.0414856 s, 5.1 GB/s
root@glusterfs-ubuntu-24:~# dd if=/dev/zero of=/brick2/.glusterfs/filler_file bs=1M count=200
200+0 records in
200+0 records out
209715200 bytes (210 MB, 200 MiB) copied, 0.0483113 s, 4.3 GB/s
root@glusterfs-ubuntu-24:~# df -Th
Filesystem Type Size Used Avail Use% Mounted on
tmpfs tmpfs 392M 1.4M 391M 1% /run
/dev/vda2 ext4 49G 14G 34G 29% /
tmpfs tmpfs 2.0G 0 2.0G 0% /dev/shm
tmpfs tmpfs 5.0M 8.0K 5.0M 1% /run/lock
tmpfs tmpfs 392M 80K 392M 1% /run/user/1000
/dev/loop16 xfs 236M 221M 16M 94% /brick1
/dev/loop17 xfs 236M 221M 16M 94% /brick2
/dev/loop18 xfs 236M 21M 216M 9% /brick3
/dev/loop19 xfs 236M 21M 216M 9% /brick4
root@glusterfs-ubuntu-24:~# mount -t glusterfs glusterfs-ubuntu-24:/vol /mnt/glusterfs/0
root@glusterfs-ubuntu-24:~# mount -t glusterfs glusterfs-ubuntu-24:/vol /mnt/glusterfs/1
Create 10 files on the mount to see which of them get a linkto file on the full bricks and a data file on the other bricks:
root@glusterfs-ubuntu-24:~# cd /mnt/glusterfs/0/
root@glusterfs-ubuntu-24:/mnt/glusterfs/0# touch {1..10}
root@glusterfs-ubuntu-24:/mnt/glusterfs/0# ls /brick*
/brick1:
1 5 7 8 9
/brick2:
1 5 7 8 9
/brick3:
1 10 2 3 4 5 6 7 8 9
/brick4:
1 10 2 3 4 5 6 7 8 9
Files 1, 5, 7, 8 and 9 are hashed to brick1 and brick2, but their data is on brick3 and brick4.
Pick file 9 as the one where we will simulate the race using gdb
root@glusterfs-ubuntu-24:~# cd /mnt/glusterfs/1
root@glusterfs-ubuntu-24:/mnt/glusterfs/1# rm -f 9
Attach gdb to both mount processes and set breakpoints on the following two functions: `dht_create_linkfile_create_cbk` and `afr_create_wind`.
On the `/mnt/glusterfs/0` mount, do a `touch 9`:
root@glusterfs-ubuntu-24:/mnt/glusterfs/0# touch 9
The breakpoint will be hit in the corresponding gdb session. Continue until execution reaches the first `afr_create_wind`.
At this point a linkto file has been created on brick1 and brick2.
Now do a `touch 9` on `/mnt/glusterfs/1`:
root@glusterfs-ubuntu-24:/mnt/glusterfs/1# touch 9
This mount detects the linkto file on brick1 and brick2 but does not see the data file on brick3 and brick4, so it concludes that the linkto file is stale and deletes it from brick1 and brick2. The following messages appear in the logs of both bricks:
[2025-05-22 01:31:32.894237 +0000] I [MSGID: 113030] [posix-entry-ops.c:1401:posix_unlink] 0-vol-posix: open-fd-key-status: 0 for /brick1/9
[2025-05-22 01:31:32.894266 +0000] I [MSGID: 113031] [posix-entry-ops.c:1292:posix_skip_non_linkto_unlink] 0-posix: linkto_xattr status: 0 for /brick1/9
Press continue in the gdb session of `/mnt/glusterfs/0` to let the data file creation go through. Then press continue in the gdb session of `/mnt/glusterfs/1` to let its data file creation go through.
The file `9` is now in a gfid-mismatch state:
root@glusterfs-ubuntu-24:~# ls /brick*
/brick1:
1 5 7 8 9
/brick2:
1 5 7 8 9
/brick3:
1 10 2 3 4 5 6 7 8 9
/brick4:
1 10 2 3 4 5 6 7 8 9
root@glusterfs-ubuntu-24:~# getfattr -d -m. -e hex /brick{1..4}/9
getfattr: Removing leading '/' from absolute path names
# file: brick1/9
trusted.gfid=0x9e8ff9fcbc5c47af9f668266fdbc981c
trusted.gfid2path.cb798f8b54503c71=0x30303030303030302d303030302d303030302d303030302d3030303030303030303030312f39
trusted.glusterfs.dht.linkto=0x766f6c2d7265706c69636174652d3100
trusted.glusterfs.mdata=0x01000000000000000000000000682e7e74000000003588f3b400000000682e7e74000000003588f3b400000000682e7e74000000003588f3b4
# file: brick2/9
trusted.gfid=0x9e8ff9fcbc5c47af9f668266fdbc981c
trusted.gfid2path.cb798f8b54503c71=0x30303030303030302d303030302d303030302d303030302d3030303030303030303030312f39
trusted.glusterfs.dht.linkto=0x766f6c2d7265706c69636174652d3100
trusted.glusterfs.mdata=0x01000000000000000000000000682e7e74000000003588f3b400000000682e7e74000000003588f3b400000000682e7e74000000003588f3b4
# file: brick3/9
trusted.gfid=0x6df65a349aa24d2bb133f0038afdf379
trusted.gfid2path.cb798f8b54503c71=0x30303030303030302d303030302d303030302d303030302d3030303030303030303030312f39
trusted.glusterfs.mdata=0x01000000000000000000000000682e7ec1000000002e38e0c300000000682e7ec1000000002e38e0c300000000682e7ec1000000002e38e0c3
# file: brick4/9
trusted.gfid=0x6df65a349aa24d2bb133f0038afdf379
trusted.gfid2path.cb798f8b54503c71=0x30303030303030302d303030302d303030302d303030302d3030303030303030303030312f39
trusted.glusterfs.mdata=0x01000000000000000000000000682e7ec1000000002e38e0c300000000682e7ec1000000002e38e0c300000000682e7ec1000000002e38e0c3
root@glusterfs-ubuntu-24:~# getfattr -d -m. -e text /brick{1..4}/9
getfattr: Removing leading '/' from absolute path names
# file: brick1/9
trusted.gfid="�����\\G��f�f����"
trusted.gfid2path.cb798f8b54503c71="00000000-0000-0000-0000-000000000001/9"
trusted.glusterfs.dht.linkto="vol-replicate-1"
trusted.glusterfs.mdata="\000\000\000\000\000\000\000\000\000\000\000\000h.~t\000\000\000\0005��\000\000\000\000h.~t\000\000\000\0005��\000\000\000\000h.~t\000\000\000\0005��"
# file: brick2/9
trusted.gfid="�����\\G��f�f����"
trusted.gfid2path.cb798f8b54503c71="00000000-0000-0000-0000-000000000001/9"
trusted.glusterfs.dht.linkto="vol-replicate-1"
trusted.glusterfs.mdata="\000\000\000\000\000\000\000\000\000\000\000\000h.~t\000\000\000\0005��\000\000\000\000h.~t\000\000\000\0005��\000\000\000\000h.~t\000\000\000\0005��"
# file: brick3/9
trusted.gfid="m�Z4��M+�3����y"
trusted.gfid2path.cb798f8b54503c71="00000000-0000-0000-0000-000000000001/9"
trusted.glusterfs.mdata="\000\000\000\000\000\000\000\000\000\000\000\000h.~�\000\000\000\000.8��\000\000\000\000h.~�\000\000\000\000.8��\000\000\000\000h.~�\000\000\000\000.8��"
# file: brick4/9
trusted.gfid="m�Z4��M+�3����y"
trusted.gfid2path.cb798f8b54503c71="00000000-0000-0000-0000-000000000001/9"
trusted.glusterfs.mdata="\000\000\000\000\000\000\000\000\000\000\000\000h.~�\000\000\000\000.8��\000\000\000\000h.~�\000\000\000\000.8��\000\000\000\000h.~�\000\000\000\000.8��"
root@glusterfs-ubuntu-24:~#