Skip to content

Commit

Permalink
Merge pull request #9655 from gregsfortytwo/wip-jewel-15508
Browse files Browse the repository at this point in the history
Jewel mds: order directories by hash and fix simultaneous readdir races
  • Loading branch information
gregsfortytwo committed Jun 13, 2016
2 parents 89d6545 + d61e3dd commit f902309
Show file tree
Hide file tree
Showing 15 changed files with 442 additions and 313 deletions.
448 changes: 239 additions & 209 deletions src/client/Client.cc

Large diffs are not rendered by default.

90 changes: 62 additions & 28 deletions src/client/Client.h
Expand Up @@ -161,65 +161,98 @@ struct client_callback_args {
struct dir_result_t {
static const int SHIFT = 28;
static const int64_t MASK = (1 << SHIFT) - 1;
static const int64_t HASH = 0xFFULL << (SHIFT + 24); // impossible frag bits
static const loff_t END = 1ULL << (SHIFT + 32);

static uint64_t make_fpos(unsigned frag, unsigned off) {
return ((uint64_t)frag << SHIFT) | (uint64_t)off;
static uint64_t make_fpos(unsigned h, unsigned l, bool hash) {
uint64_t v = ((uint64_t)h<< SHIFT) | (uint64_t)l;
if (hash)
v |= HASH;
else
assert((v & HASH) != HASH);
return v;
}
static unsigned fpos_frag(uint64_t p) {
return (p & ~END) >> SHIFT;
static unsigned fpos_high(uint64_t p) {
unsigned v = (p & (END-1)) >> SHIFT;
if ((p & HASH) == HASH)
return ceph_frag_value(v);
return v;
}
static unsigned fpos_off(uint64_t p) {
static unsigned fpos_low(uint64_t p) {
return p & MASK;
}

// Three-way comparison of two readdir positions.
// The high parts are ordered by ceph_frag_compare(); if that yields a
// non-zero result it is returned unchanged. Only when the high parts tie
// are the low parts compared numerically, giving -1, 0 or 1.
static int fpos_cmp(uint64_t l, uint64_t r) {
  const int high_order = ceph_frag_compare(fpos_high(l), fpos_high(r));
  if (high_order != 0)
    return high_order;

  const unsigned low_l = fpos_low(l);
  const unsigned low_r = fpos_low(r);
  if (low_l < low_r)
    return -1;
  return low_l == low_r ? 0 : 1;
}

InodeRef inode;
int owner_uid;
int owner_gid;

int64_t offset; // high bits: frag_t, low bits: an offset
int64_t offset; // hash order:
// (0xff << 52) | ((24 bits hash) << 28) |
// (n, the index of this entry among those sharing the hash);
// frag+name order:
// ((frag value) << 28) | (n, the index of this entry in the frag);

uint64_t this_offset; // offset of last chunk, adjusted for . and ..
uint64_t next_offset; // offset of next chunk (last_name's + 1)
unsigned next_offset; // offset of next chunk (last_name's + 1)
string last_name; // last entry in previous chunk

uint64_t release_count;
uint64_t ordered_count;
unsigned cache_index;
int start_shared_gen; // dir shared_gen at start of readdir

frag_t buffer_frag;
vector<pair<string,InodeRef> > *buffer;

string at_cache_name; // last entry we successfully returned
// One buffered readdir entry: a position together with the name and inode
// stored for it.
struct dentry {
  int64_t offset;   // position of the entry; dentry_off_lt feeds it to fpos_cmp()
  string name;      // entry name (left empty by the offset-only constructor)
  InodeRef inode;   // inode reference (left default-constructed by the offset-only constructor)

  // Build an entry that carries only a position.
  dentry(int64_t pos)
    : offset(pos) {}

  // Build a fully populated entry.
  dentry(int64_t pos, const string& entry_name, const InodeRef& entry_inode)
    : offset(pos), name(entry_name), inode(entry_inode) {}
};
// Comparator between a buffered dentry and a raw position: yields true when
// the dentry's offset sorts strictly before `off` according to fpos_cmp().
// NOTE(review): presumably used for binary search over `buffer` — confirm
// against the callers.
struct dentry_off_lt {
  bool operator()(const dentry& d, int64_t off) const {
    const int order = dir_result_t::fpos_cmp(d.offset, off);
    return order < 0;
  }
};
vector<dentry> buffer;

explicit dir_result_t(Inode *in);

frag_t frag() { return frag_t(offset >> SHIFT); }
unsigned fragpos() { return offset & MASK; }
unsigned offset_high() { return fpos_high(offset); }
unsigned offset_low() { return fpos_low(offset); }

void next_frag() {
frag_t fg = offset >> SHIFT;
if (fg.is_rightmost())
set_end();
else
set_frag(fg.next());
}
void set_frag(frag_t f) {
offset = (uint64_t)f << SHIFT;
assert(sizeof(offset) == 8);
}
// Flag the position as finished by setting the END bit in `offset`.
void set_end() { offset = offset | END; }
// True when the END bit is set in `offset`.
bool at_end() { return (offset & END) != 0; }

// Switch the position to hash order by setting every HASH bit in `offset`.
void set_hash_order() { offset = offset | HASH; }
// True only when all HASH bits are set in `offset`.
bool hash_order() {
  const int64_t hash_bits = offset & HASH;
  return hash_bits == HASH;
}

// Reports whether the buffered entries cover the current position.
// An empty buffer never does. Otherwise the high part of `offset` is checked
// against `buffer_frag`: by containment in hash order, by exact frag
// equality in frag+name order.
bool is_cached() {
  if (buffer.empty())
    return false;

  const unsigned high = offset_high();
  if (!hash_order())
    return buffer_frag == frag_t(high);
  return buffer_frag.contains(high);
}

void reset() {
last_name.clear();
at_cache_name.clear();
next_offset = 2;
this_offset = 0;
offset = 0;
delete buffer;
buffer = 0;
ordered_count = 0;
cache_index = 0;
buffer.clear();
}
};

Expand Down Expand Up @@ -663,6 +696,7 @@ class Client : public Dispatcher, public md_config_obs_t {
// metadata cache
void update_dir_dist(Inode *in, DirStat *st);

void clear_dir_complete_and_ordered(Inode *diri, bool complete);
void insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri);
Inode* insert_trace(MetaRequest *request, MetaSession *session);
void update_inode_file_bits(Inode *in,
Expand Down
8 changes: 3 additions & 5 deletions src/client/Dentry.h
Expand Up @@ -17,15 +17,13 @@ class Dentry : public LRUObject {
Dir *dir;
InodeRef inode;
int ref; // 1 if there's a dir beneath me.
uint64_t offset;
int64_t offset;
mds_rank_t lease_mds;
utime_t lease_ttl;
uint64_t lease_gen;
ceph_seq_t lease_seq;
int cap_shared_gen;

xlist<Dentry*>::item item_dentry_list;

/*
* ref==1 -> cached, unused
* ref >1 -> pinned in lru
Expand All @@ -49,8 +47,8 @@ class Dentry : public LRUObject {

Dentry() :
dir(0), ref(1), offset(0),
lease_mds(-1), lease_gen(0), lease_seq(0), cap_shared_gen(0),
item_dentry_list(this) { }
lease_mds(-1), lease_gen(0), lease_seq(0), cap_shared_gen(0)
{ }
private:
~Dentry() {
assert(ref == 0);
Expand Down
6 changes: 2 additions & 4 deletions src/client/Dir.h
Expand Up @@ -7,11 +7,9 @@ class Dir {
public:
Inode *parent_inode; // my inode
ceph::unordered_map<string, Dentry*> dentries;
xlist<Dentry*> dentry_list;
uint64_t release_count;
uint64_t ordered_count;
vector<Dentry*> readdir_cache;

explicit Dir(Inode* in) : release_count(0), ordered_count(0) { parent_inode = in; }
explicit Dir(Inode* in) { parent_inode = in; }

bool is_empty() { return dentries.empty(); }
};
Expand Down
14 changes: 7 additions & 7 deletions src/client/Inode.h
Expand Up @@ -226,8 +226,11 @@ struct Inode {
}

// about the dir (if this is one!)
Dir *dir; // if i'm a dir.
fragtree_t dirfragtree;
set<int> dir_contacts;
bool dir_hashed, dir_replicated;
uint64_t dir_release_count, dir_ordered_count;
bool dir_hashed, dir_replicated;

// per-mds caps
map<mds_rank_t, Cap*> caps; // mds -> Cap
Expand Down Expand Up @@ -256,10 +259,8 @@ struct Inode {

int _ref; // ref count. 1 for each dentry, fh that links to me.
int ll_ref; // separate ref count for ll client
Dir *dir; // if i'm a dir.
set<Dentry*> dn_set; // if i'm linked to a dentry.
string symlink; // symlink content, if it's a symlink
fragtree_t dirfragtree;
map<string,bufferptr> xattrs;
map<frag_t,int> fragmap; // known frag -> mds mappings

Expand Down Expand Up @@ -300,9 +301,8 @@ struct Inode {
rdev(0), mode(0), uid(0), gid(0), nlink(0),
size(0), truncate_seq(1), truncate_size(-1),
time_warp_seq(0), max_size(0), version(0), xattr_version(0),
inline_version(0),
flags(0),
qtree(NULL),
inline_version(0), flags(0), qtree(NULL),
dir(0), dir_release_count(1), dir_ordered_count(1),
dir_hashed(false), dir_replicated(false), auth_cap(NULL),
cap_dirtier_uid(-1), cap_dirtier_gid(-1),
dirty_caps(0), flushing_caps(0), shared_gen(0), cache_gen(0),
Expand All @@ -311,7 +311,7 @@ struct Inode {
snaprealm(0), snaprealm_item(this),
oset((void *)this, newlayout->pool_id, ino),
reported_size(0), wanted_max_size(0), requested_max_size(0),
_ref(0), ll_ref(0), dir(0), dn_set(),
_ref(0), ll_ref(0), dn_set(),
fcntl_locks(NULL), flock_locks(NULL),
async_err(0)
{
Expand Down
7 changes: 0 additions & 7 deletions src/client/MetaRequest.cc
Expand Up @@ -37,13 +37,6 @@ void MetaRequest::dump(Formatter *f) const

f->dump_int("got_unsafe", got_unsafe);

if (head.op == CEPH_MDS_OP_READDIR ||
head.op == CEPH_MDS_OP_LSSNAP) {
f->dump_stream("readdir_frag") << readdir_frag;
f->dump_string("readdir_start", readdir_start);
f->dump_unsigned("readdir_offset", readdir_offset);
}

f->dump_unsigned("uid", head.caller_uid);
f->dump_unsigned("gid", head.caller_gid);

Expand Down
12 changes: 2 additions & 10 deletions src/client/MetaRequest.h
Expand Up @@ -19,6 +19,7 @@

class MClientReply;
class Dentry;
class dir_result_t;

struct MetaRequest {
private:
Expand Down Expand Up @@ -56,15 +57,7 @@ struct MetaRequest {
bool success;

// readdir result
frag_t readdir_frag;
string readdir_start; // starting _after_ this name
uint64_t readdir_offset;

frag_t readdir_reply_frag;
vector<pair<string,InodeRef> > readdir_result;
bool readdir_end;
int readdir_num;
string readdir_last_name;
dir_result_t *dirp;

//possible responses
bool got_unsafe;
Expand Down Expand Up @@ -93,7 +86,6 @@ struct MetaRequest {
num_fwd(0), retry_attempt(0),
ref(1), reply(0),
kick(false), success(false),
readdir_offset(0), readdir_end(false), readdir_num(0),
got_unsafe(false), item(this), unsafe_item(this),
unsafe_dir_item(this), unsafe_target_item(this),
caller_cond(0), dispatch_cond(0) {
Expand Down
13 changes: 13 additions & 0 deletions src/include/ceph_fs.h
Expand Up @@ -387,6 +387,18 @@ extern const char *ceph_mds_op_name(int op);
#define CEPH_XATTR_REPLACE (1 << 1)
#define CEPH_XATTR_REMOVE (1 << 31)

/*
* readdir request flags.
*/
#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)

/*
* readdir reply flags.
*/
#define CEPH_READDIR_FRAG_END (1<<0)
#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
#define CEPH_READDIR_HASH_ORDER (1<<9)

union ceph_mds_request_args {
struct {
__le32 mask; /* CEPH_CAP_* */
Expand All @@ -404,6 +416,7 @@ union ceph_mds_request_args {
__le32 frag; /* which dir fragment */
__le32 max_entries; /* how many dentries to grab */
__le32 max_bytes;
__le16 flags;
} __attribute__ ((packed)) readdir;
struct {
__le32 mode;
Expand Down
2 changes: 1 addition & 1 deletion src/mds/CDentry.h
Expand Up @@ -110,7 +110,7 @@ class CDentry : public MDSCacheObject, public LRUObject {
snapid_t first, last;

dentry_key_t key() {
return dentry_key_t(last, name.c_str());
return dentry_key_t(last, name.c_str(), hash);
}

public:
Expand Down
15 changes: 10 additions & 5 deletions src/mds/CDir.cc
Expand Up @@ -295,10 +295,11 @@ bool CDir::check_rstats(bool scrub)
return good;
}

CDentry *CDir::lookup(const char *name, snapid_t snap)
CDentry *CDir::lookup(const string& name, snapid_t snap)
{
dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl;
map_t::iterator iter = items.lower_bound(dentry_key_t(snap, name));
map_t::iterator iter = items.lower_bound(dentry_key_t(snap, name.c_str(),
inode->hash_dentry_name(name)));
if (iter == items.end())
return 0;
if (iter->second->name == name &&
Expand All @@ -310,9 +311,13 @@ CDentry *CDir::lookup(const char *name, snapid_t snap)
return 0;
}




// Look up the dentry stored under exactly (last, name, hash-of-name).
// The hash comes from the owning inode's hash_dentry_name(). Returns NULL
// when `items` holds no entry with that key.
CDentry *CDir::lookup_exact_snap(const string& name, snapid_t last) {
  const dentry_key_t key(last, name.c_str(), inode->hash_dentry_name(name));
  map_t::iterator found = items.find(key);
  if (found != items.end())
    return found->second;
  return NULL;
}

/***
* linking fun
Expand Down
13 changes: 4 additions & 9 deletions src/mds/CDir.h
Expand Up @@ -448,16 +448,11 @@ class CDir : public MDSCacheObject {

// -- dentries and inodes --
public:
CDentry* lookup_exact_snap(const std::string& dname, snapid_t last) {
map_t::iterator p = items.find(dentry_key_t(last, dname.c_str()));
if (p == items.end())
return NULL;
return p->second;
CDentry* lookup_exact_snap(const std::string& dname, snapid_t last);
CDentry* lookup(const std::string& n, snapid_t snap=CEPH_NOSNAP);
CDentry* lookup(const char *n, snapid_t snap=CEPH_NOSNAP) {
return lookup(std::string(n), snap);
}
CDentry* lookup(const std::string& n, snapid_t snap=CEPH_NOSNAP) {
return lookup(n.c_str(), snap);
}
CDentry* lookup(const char *n, snapid_t snap=CEPH_NOSNAP);

CDentry* add_null_dentry(const std::string& dname,
snapid_t first=2, snapid_t last=CEPH_NOSNAP);
Expand Down
3 changes: 3 additions & 0 deletions src/mds/MDCache.cc
Expand Up @@ -4558,6 +4558,7 @@ CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
if (!in->is_dir()) {
assert(in->state_test(CInode::STATE_REJOINUNDEF));
in->inode.mode = S_IFDIR;
in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
}
CDir *dir = in->get_or_open_dirfrag(this, df.frag);
dir->state_set(CDir::STATE_REJOINUNDEF);
Expand Down Expand Up @@ -5753,6 +5754,8 @@ void MDCache::opened_undef_inode(CInode *in) {
dout(10) << "opened_undef_inode " << *in << dendl;
rejoin_undef_inodes.erase(in);
if (in->is_dir()) {
// FIXME: re-hash dentries if necessary
assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
CDir *dir = in->get_dirfrag(frag_t());
assert(dir);
Expand Down

0 comments on commit f902309

Please sign in to comment.