Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jewel mds: order directories by hash and fix simultaneous readdir races #9655

Merged
merged 12 commits into from Jun 13, 2016
Merged
448 changes: 239 additions & 209 deletions src/client/Client.cc

Large diffs are not rendered by default.

90 changes: 62 additions & 28 deletions src/client/Client.h
Expand Up @@ -161,65 +161,98 @@ struct client_callback_args {
struct dir_result_t {
// Bit layout of an encoded directory position: the low SHIFT bits hold the
// "low" part (see fpos_low), the bits above them hold the "high" part
// (see fpos_high).
static const int SHIFT = 28;
// Selects the low SHIFT (28) bits of a position.
static const int64_t MASK = (1 << SHIFT) - 1;
// Marker occupying bits 52..59; a position is in hash order only when all of
// these bits are set (see hash_order()).
static const int64_t HASH = 0xFFULL << (SHIFT + 24); // impossible frag bits
// Bit 60; set by set_end() and tested by at_end().
static const loff_t END = 1ULL << (SHIFT + 32);

static uint64_t make_fpos(unsigned frag, unsigned off) {
return ((uint64_t)frag << SHIFT) | (uint64_t)off;
// Pack a (high, low) pair into a 64-bit directory position.
//   h    - value placed above the low SHIFT bits
//   l    - value OR-ed into the low bits as-is (it is not masked)
//   hash - when true, the HASH marker bits are added to the result
// When `hash` is false the packed value must not already carry every HASH bit;
// this is asserted.
static uint64_t make_fpos(unsigned h, unsigned l, bool hash) {
  const uint64_t packed =
    (static_cast<uint64_t>(h) << SHIFT) | static_cast<uint64_t>(l);
  if (!hash) {
    assert((packed & HASH) != HASH);
    return packed;
  }
  return packed | HASH;
}
static unsigned fpos_frag(uint64_t p) {
return (p & ~END) >> SHIFT;
// Extract the high part of a position: keep only the bits below END, then
// shift out the low SHIFT bits. For a hash-order position (all HASH bits set)
// the result is additionally passed through ceph_frag_value().
static unsigned fpos_high(uint64_t p) {
  const unsigned high = (p & (END-1)) >> SHIFT;
  if ((p & HASH) != HASH)
    return high;
  return ceph_frag_value(high);
}
static unsigned fpos_off(uint64_t p) {
// Extract the low part of a position (the bits selected by MASK).
static unsigned fpos_low(uint64_t p) {
  const unsigned low = p & MASK;
  return low;
}

// Three-way comparison of two encoded positions.
// The high parts are ordered with ceph_frag_compare(); only when they compare
// equal are the low parts compared numerically.
// Returns the non-zero ceph_frag_compare() result when the high parts differ,
// otherwise -1 / 0 / 1 according to the low parts.
static int fpos_cmp(uint64_t l, uint64_t r) {
  const int high_order = ceph_frag_compare(fpos_high(l), fpos_high(r));
  if (high_order != 0)
    return high_order;

  const unsigned low_l = fpos_low(l);
  const unsigned low_r = fpos_low(r);
  if (low_l < low_r)
    return -1;
  return low_l > low_r ? 1 : 0;
}

InodeRef inode;
int owner_uid;
int owner_gid;

int64_t offset; // high bits: frag_t, low bits: an offset
int64_t offset; // current position; one of two encodings:
// hash order:
//   (0xff << 52) | ((24 bits hash) << 28) |
//   (index n among the entries whose hashes collide)
// frag+name order:
//   ((frag value) << 28) | (the nth entry in frag)

uint64_t this_offset; // offset of last chunk, adjusted for . and ..
uint64_t next_offset; // offset of next chunk (last_name's + 1)
unsigned next_offset; // offset of next chunk (last_name's + 1)
string last_name; // last entry in previous chunk

uint64_t release_count;
uint64_t ordered_count;
unsigned cache_index;
int start_shared_gen; // dir shared_gen at start of readdir

frag_t buffer_frag;
vector<pair<string,InodeRef> > *buffer;

string at_cache_name; // last entry we successfully returned
// One buffered directory entry: an encoded position plus the name and inode
// stored alongside it.
struct dentry {
int64_t offset;
string name;
InodeRef inode;
// Position-only entry; name and inode are default-constructed.
dentry(int64_t o) : offset(o) {}
// Fully populated entry; all three members are copied from the arguments.
dentry(int64_t o, const string& n, const InodeRef& in) :
offset(o), name(n), inode(in) {}
};
// Comparator between a buffered dentry and a raw position: true when the
// dentry's offset orders strictly before `off` according to fpos_cmp().
struct dentry_off_lt {
  bool operator()(const dentry& d, int64_t off) const {
    const int order = dir_result_t::fpos_cmp(d.offset, off);
    return order < 0;
  }
};
vector<dentry> buffer;

explicit dir_result_t(Inode *in);

frag_t frag() { return frag_t(offset >> SHIFT); }
unsigned fragpos() { return offset & MASK; }
// High part of the current position, as decoded by fpos_high().
unsigned offset_high() { return fpos_high(offset); }
// Low part of the current position, as decoded by fpos_low().
unsigned offset_low() { return fpos_low(offset); }

void next_frag() {
frag_t fg = offset >> SHIFT;
if (fg.is_rightmost())
set_end();
else
set_frag(fg.next());
}
// Point the position at fragment f: the fragment goes above the low SHIFT
// bits and every other bit of offset is overwritten with zero.
void set_frag(frag_t f) {
  // offset must be a 64-bit field for the shifted value to fit.
  assert(sizeof(offset) == 8);
  offset = static_cast<uint64_t>(f) << SHIFT;
}
// Set the END bit in offset; all other bits are kept.
void set_end() { offset |= END; }
// True once the END bit is set in offset.
bool at_end() { return (offset & END); }

// Set every HASH marker bit in offset; all other bits are kept.
void set_hash_order() { offset |= HASH; }
// True only when every HASH marker bit is set in offset.
bool hash_order() { return (offset & HASH) == HASH; }

// Report whether the buffered entries cover the current position.
// An empty buffer never does. In hash order, buffer_frag must contain the
// high part of the position; otherwise buffer_frag must equal the frag built
// from the high part.
bool is_cached() {
  if (buffer.empty())
    return false;

  const unsigned high = offset_high();
  if (!hash_order())
    return buffer_frag == frag_t(high);
  return buffer_frag.contains(high);
}

// Clear the iteration state: remembered names, offsets, cache cursor and
// buffered entries.
void reset() {
last_name.clear();
at_cache_name.clear();
next_offset = 2; // NOTE(review): presumably skips the "." and ".." slots — confirm
this_offset = 0;
offset = 0;
// NOTE(review): buffer is released here as an owning pointer (delete / = 0)
// and also cleared as a container further down; these look like two
// revisions of the same cleanup — confirm which one applies.
delete buffer;
buffer = 0;
ordered_count = 0;
cache_index = 0;
buffer.clear();
}
};

Expand Down Expand Up @@ -663,6 +696,7 @@ class Client : public Dispatcher, public md_config_obs_t {
// metadata cache
void update_dir_dist(Inode *in, DirStat *st);

void clear_dir_complete_and_ordered(Inode *diri, bool complete);
void insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri);
Inode* insert_trace(MetaRequest *request, MetaSession *session);
void update_inode_file_bits(Inode *in,
Expand Down
8 changes: 3 additions & 5 deletions src/client/Dentry.h
Expand Up @@ -17,15 +17,13 @@ class Dentry : public LRUObject {
Dir *dir;
InodeRef inode;
int ref; // 1 if there's a dir beneath me.
uint64_t offset;
int64_t offset;
mds_rank_t lease_mds;
utime_t lease_ttl;
uint64_t lease_gen;
ceph_seq_t lease_seq;
int cap_shared_gen;

xlist<Dentry*>::item item_dentry_list;

/*
* ref==1 -> cached, unused
* ref >1 -> pinned in lru
Expand All @@ -49,8 +47,8 @@ class Dentry : public LRUObject {

Dentry() :
dir(0), ref(1), offset(0),
lease_mds(-1), lease_gen(0), lease_seq(0), cap_shared_gen(0),
item_dentry_list(this) { }
lease_mds(-1), lease_gen(0), lease_seq(0), cap_shared_gen(0)
{ }
private:
~Dentry() {
assert(ref == 0);
Expand Down
6 changes: 2 additions & 4 deletions src/client/Dir.h
Expand Up @@ -7,11 +7,9 @@ class Dir {
public:
Inode *parent_inode; // my inode
ceph::unordered_map<string, Dentry*> dentries;
xlist<Dentry*> dentry_list;
uint64_t release_count;
uint64_t ordered_count;
vector<Dentry*> readdir_cache;

explicit Dir(Inode* in) : release_count(0), ordered_count(0) { parent_inode = in; }
// Build the directory state for inode `in`, which is stored as parent_inode.
explicit Dir(Inode* in) : parent_inode(in) {}

bool is_empty() { return dentries.empty(); }
};
Expand Down
14 changes: 7 additions & 7 deletions src/client/Inode.h
Expand Up @@ -226,8 +226,11 @@ struct Inode {
}

// about the dir (if this is one!)
Dir *dir; // if i'm a dir.
fragtree_t dirfragtree;
set<int> dir_contacts;
bool dir_hashed, dir_replicated;
uint64_t dir_release_count, dir_ordered_count;
bool dir_hashed, dir_replicated;

// per-mds caps
map<mds_rank_t, Cap*> caps; // mds -> Cap
Expand Down Expand Up @@ -256,10 +259,8 @@ struct Inode {

int _ref; // ref count. 1 for each dentry, fh that links to me.
int ll_ref; // separate ref count for ll client
Dir *dir; // if i'm a dir.
set<Dentry*> dn_set; // if i'm linked to a dentry.
string symlink; // symlink content, if it's a symlink
fragtree_t dirfragtree;
map<string,bufferptr> xattrs;
map<frag_t,int> fragmap; // known frag -> mds mappings

Expand Down Expand Up @@ -300,9 +301,8 @@ struct Inode {
rdev(0), mode(0), uid(0), gid(0), nlink(0),
size(0), truncate_seq(1), truncate_size(-1),
time_warp_seq(0), max_size(0), version(0), xattr_version(0),
inline_version(0),
flags(0),
qtree(NULL),
inline_version(0), flags(0), qtree(NULL),
dir(0), dir_release_count(1), dir_ordered_count(1),
dir_hashed(false), dir_replicated(false), auth_cap(NULL),
cap_dirtier_uid(-1), cap_dirtier_gid(-1),
dirty_caps(0), flushing_caps(0), shared_gen(0), cache_gen(0),
Expand All @@ -311,7 +311,7 @@ struct Inode {
snaprealm(0), snaprealm_item(this),
oset((void *)this, newlayout->pool_id, ino),
reported_size(0), wanted_max_size(0), requested_max_size(0),
_ref(0), ll_ref(0), dir(0), dn_set(),
_ref(0), ll_ref(0), dn_set(),
fcntl_locks(NULL), flock_locks(NULL),
async_err(0)
{
Expand Down
7 changes: 0 additions & 7 deletions src/client/MetaRequest.cc
Expand Up @@ -37,13 +37,6 @@ void MetaRequest::dump(Formatter *f) const

f->dump_int("got_unsafe", got_unsafe);

if (head.op == CEPH_MDS_OP_READDIR ||
head.op == CEPH_MDS_OP_LSSNAP) {
f->dump_stream("readdir_frag") << readdir_frag;
f->dump_string("readdir_start", readdir_start);
f->dump_unsigned("readdir_offset", readdir_offset);
}

f->dump_unsigned("uid", head.caller_uid);
f->dump_unsigned("gid", head.caller_gid);

Expand Down
12 changes: 2 additions & 10 deletions src/client/MetaRequest.h
Expand Up @@ -19,6 +19,7 @@

class MClientReply;
class Dentry;
struct dir_result_t;  // forward declaration; class-key matches the `struct dir_result_t` definition

struct MetaRequest {
private:
Expand Down Expand Up @@ -56,15 +57,7 @@ struct MetaRequest {
bool success;

// readdir result
frag_t readdir_frag;
string readdir_start; // starting _after_ this name
uint64_t readdir_offset;

frag_t readdir_reply_frag;
vector<pair<string,InodeRef> > readdir_result;
bool readdir_end;
int readdir_num;
string readdir_last_name;
dir_result_t *dirp;

//possible responses
bool got_unsafe;
Expand Down Expand Up @@ -93,7 +86,6 @@ struct MetaRequest {
num_fwd(0), retry_attempt(0),
ref(1), reply(0),
kick(false), success(false),
readdir_offset(0), readdir_end(false), readdir_num(0),
got_unsafe(false), item(this), unsafe_item(this),
unsafe_dir_item(this), unsafe_target_item(this),
caller_cond(0), dispatch_cond(0) {
Expand Down
13 changes: 13 additions & 0 deletions src/include/ceph_fs.h
Expand Up @@ -387,6 +387,18 @@ extern const char *ceph_mds_op_name(int op);
#define CEPH_XATTR_REPLACE (1 << 1)
#define CEPH_XATTR_REMOVE (1 << 31)

/*
* readdir request flags.
*/
#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)

/*
* readdir reply flags.
*/
#define CEPH_READDIR_FRAG_END (1<<0)
#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
#define CEPH_READDIR_HASH_ORDER (1<<9)

union ceph_mds_request_args {
struct {
__le32 mask; /* CEPH_CAP_* */
Expand All @@ -404,6 +416,7 @@ union ceph_mds_request_args {
__le32 frag; /* which dir fragment */
__le32 max_entries; /* how many dentries to grab */
__le32 max_bytes;
__le16 flags;
} __attribute__ ((packed)) readdir;
struct {
__le32 mode;
Expand Down
2 changes: 1 addition & 1 deletion src/mds/CDentry.h
Expand Up @@ -110,7 +110,7 @@ class CDentry : public MDSCacheObject, public LRUObject {
snapid_t first, last;

dentry_key_t key() {
return dentry_key_t(last, name.c_str());
return dentry_key_t(last, name.c_str(), hash);
}

public:
Expand Down
15 changes: 10 additions & 5 deletions src/mds/CDir.cc
Expand Up @@ -295,10 +295,11 @@ bool CDir::check_rstats(bool scrub)
return good;
}

CDentry *CDir::lookup(const char *name, snapid_t snap)
CDentry *CDir::lookup(const string& name, snapid_t snap)
{
dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl;
map_t::iterator iter = items.lower_bound(dentry_key_t(snap, name));
map_t::iterator iter = items.lower_bound(dentry_key_t(snap, name.c_str(),
inode->hash_dentry_name(name)));
if (iter == items.end())
return 0;
if (iter->second->name == name &&
Expand All @@ -310,9 +311,13 @@ CDentry *CDir::lookup(const char *name, snapid_t snap)
return 0;
}




// Look up the dentry stored under exactly (last, name). The search key also
// carries the name hash computed by the directory inode.
// Returns NULL when items has no entry with that key.
CDentry *CDir::lookup_exact_snap(const string& name, snapid_t last) {
  const dentry_key_t key(last, name.c_str(), inode->hash_dentry_name(name));
  map_t::iterator found = items.find(key);
  if (found != items.end())
    return found->second;
  return NULL;
}

/***
* linking fun
Expand Down
13 changes: 4 additions & 9 deletions src/mds/CDir.h
Expand Up @@ -448,16 +448,11 @@ class CDir : public MDSCacheObject {

// -- dentries and inodes --
public:
CDentry* lookup_exact_snap(const std::string& dname, snapid_t last) {
map_t::iterator p = items.find(dentry_key_t(last, dname.c_str()));
if (p == items.end())
return NULL;
return p->second;
CDentry* lookup_exact_snap(const std::string& dname, snapid_t last);
CDentry* lookup(const std::string& n, snapid_t snap=CEPH_NOSNAP);
// C-string convenience overload: copies n into a std::string and forwards to
// the std::string overload with the same snap.
CDentry* lookup(const char *n, snapid_t snap=CEPH_NOSNAP) {
  const std::string name(n);
  return lookup(name, snap);
}
CDentry* lookup(const std::string& n, snapid_t snap=CEPH_NOSNAP) {
return lookup(n.c_str(), snap);
}
CDentry* lookup(const char *n, snapid_t snap=CEPH_NOSNAP);

CDentry* add_null_dentry(const std::string& dname,
snapid_t first=2, snapid_t last=CEPH_NOSNAP);
Expand Down
3 changes: 3 additions & 0 deletions src/mds/MDCache.cc
Expand Up @@ -4558,6 +4558,7 @@ CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
if (!in->is_dir()) {
assert(in->state_test(CInode::STATE_REJOINUNDEF));
in->inode.mode = S_IFDIR;
in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
}
CDir *dir = in->get_or_open_dirfrag(this, df.frag);
dir->state_set(CDir::STATE_REJOINUNDEF);
Expand Down Expand Up @@ -5753,6 +5754,8 @@ void MDCache::opened_undef_inode(CInode *in) {
dout(10) << "opened_undef_inode " << *in << dendl;
rejoin_undef_inodes.erase(in);
if (in->is_dir()) {
// FIXME: re-hash dentries if necessary
assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
CDir *dir = in->get_dirfrag(frag_t());
assert(dir);
Expand Down