Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bitstream][CAS] Enable BitstreamWriter and BitstreamReader to use CAS as a backing storage #8419

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 15 additions & 0 deletions llvm/include/llvm/Bitstream/BitstreamReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,19 @@ class BitstreamCursor : SimpleBitstreamCursor {

BitstreamBlockInfo *BlockInfo = nullptr;

/// Serializes the whole bitstream rooted at \p RootNodeID in \p CAS through
/// the object's schema and returns the resulting bytes as an ArrayRef.
///
/// TODO: This reads the entire bitstream out of the CAS in one go, which is
/// not the intended final design. The plan is to read it lazily in chunks and
/// to map BitstreamCursor onto the CAS properly; the cursor design is still
/// to be worked out.
///
/// NOTE(review): the returned ArrayRef appears to view storage owned by the
/// local stream \c OS, which is destroyed when this function returns, so the
/// caller would receive a dangling range. The bytes need an owner that
/// outlives the cursor.
/// NOTE(review): \c raw_ostream is an abstract base class and has no
/// \c str(); a concrete buffer-backed stream (e.g. raw_svector_ostream) is
/// presumably intended -- confirm.
/// NOTE(review): confirm whether \c getProxy() can fail and, if so, how the
/// error should be surfaced here.
static ArrayRef<uint8_t>
getArrayRefFromCASObject(cas::ObjectStore &CAS, cas::ObjectRef &RootNodeID) {
BitstreamObjectProxy objectProxy = CAS.getProxy(RootNodeID);
raw_ostream OS;
objectProxy.getSchema().serializeBitstreamFile(objectProxy, OS);
return arrayRefFromStringRef(OS.str());
}

public:
static const size_t MaxChunkSize = 32;

Expand All @@ -384,6 +397,8 @@ class BitstreamCursor : SimpleBitstreamCursor {
: SimpleBitstreamCursor(BitcodeBytes) {}
explicit BitstreamCursor(MemoryBufferRef BitcodeBytes)
: SimpleBitstreamCursor(BitcodeBytes) {}
explicit BitstreamCASCursor(cas::ObjectStore &CAS, cas::ObjectRef RootNodeID)
: SimpleBitstreamCursor(getArrayRefFromCASObject(CAS, RootNodeID)) {}

using SimpleBitstreamCursor::AtEndOfStream;
using SimpleBitstreamCursor::canSkipToPos;
Expand Down
171 changes: 151 additions & 20 deletions llvm/include/llvm/Bitstream/BitstreamWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,19 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Bitstream/BitCodes.h"
#include "llvm/BitstreamCAS/BitstreamCASNode.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <optional>
#include <utility>
#include <vector>

using namespace llvm::bitstreamcasformats::v1;

namespace llvm {

class BitstreamWriter {
class BitstreamCASWriter {
/// Out - The buffer that keeps unflushed bytes.
SmallVectorImpl<char> &Out;

Expand Down Expand Up @@ -56,6 +59,23 @@ class BitstreamWriter {
/// CurAbbrevs - Abbrevs installed at in this block.
std::vector<std::shared_ptr<BitCodeAbbrev>> CurAbbrevs;

// We will move some of these data structures to the BitstreamCASBuilder
// eventually as create() methods of Ref take this argument.
std::shared_ptr<cas::Node> CurNode;
std::stack<std::shared_ptr<cas::Node>> NodeStack;
std::unordered_map<std::shared_ptr<cas::Node>,
std::vector<std::shared_ptr<cas::Node>>>
DAG; // The refs are implicitly stored in the DAG.
std::unordered_map<unsigned, unsigned>
NextAvailableID; // Maps from block/record code to the next available ID.

bool insideBlock = false;
cas::ObjectStore CAS;
mc::CASBackendMode
CASMode; // Can be changed using a separate CASMode enum if preferred.
std::function<Error(cas::ObjectProxy, cas::ObjectStore &, raw_fd_stream &)>
SerializeBitstreamFile; // This will depend on BitstreamCASReader.

struct Block {
unsigned PrevCodeSize;
size_t StartSizeWord;
Expand All @@ -75,6 +95,11 @@ class BitstreamWriter {
std::vector<BlockInfo> BlockInfoRecords;

void WriteWord(unsigned Value) {
if (!insideBlock) {
CurNode->Data.append(reinterpret_cast<const char *>(&Value),
reinterpret_cast<const char *>(&Value + 1));
return;
}
Value =
support::endian::byte_swap<uint32_t, llvm::endianness::little>(Value);
Out.append(reinterpret_cast<const char *>(&Value),
Expand All @@ -91,17 +116,6 @@ class BitstreamWriter {
return Offset / 4;
}

/// Hand the buffered bytes over to the file stream and empty the buffer, but
/// only when a file stream is attached and the buffer has reached
/// FlushThreshold bytes; otherwise do nothing.
void FlushToFile() {
  const bool ReadyToFlush = FS && Out.size() >= FlushThreshold;
  if (ReadyToFlush) {
    FS->write(&Out.front(), Out.size());
    Out.clear();
  }
}

public:
/// Create a BitstreamWriter that writes to Buffer \p O.
///
Expand All @@ -110,12 +124,16 @@ class BitstreamWriter {
///
/// \p FlushThreshold is the threshold (unit M) to flush \p O if \p FS is
/// valid. Flushing only occurs at (sub)block boundaries.
BitstreamWriter(SmallVectorImpl<char> &O, raw_fd_stream *FS = nullptr,
uint32_t FlushThreshold = 512)
: Out(O), FS(FS), FlushThreshold(uint64_t(FlushThreshold) << 20), CurBit(0),
CurValue(0), CurCodeSize(2) {}

~BitstreamWriter() {
/// Create a BitstreamCASWriter that buffers emitted bytes in \p O and records
/// the emitted blocks and records as nodes, starting from a fresh root node.
///
/// \p FS is the file stream EndStream() writes to; pass nullptr to make
/// EndStream() a no-op.
/// \p CAS is the object store handed to \p SerializeBitstreamFile.
/// \p CASMode selects what EndStream() writes to \p FS.
/// \p SerializeBitstreamFile is the callback EndStream() invokes in Native
/// mode.
///
/// \p FS has no default argument: every parameter after it is required, and a
/// defaulted parameter may not be followed by non-defaulted ones.
///
/// The mem-initializers list CurNode and NodeStack before CAS to match the
/// order in which those members are declared (and therefore initialized);
/// NodeStack is seeded with the root node held by CurNode.
///
/// NOTE(review): the CAS member is declared by value while a reference is
/// passed in here -- presumably the member should be a reference; confirm.
BitstreamCASWriter(SmallVectorImpl<char> &O, raw_fd_stream *FS,
                   cas::ObjectStore &CAS, mc::CASBackendMode CASMode,
                   std::function<Error(cas::ObjectProxy, cas::ObjectStore &,
                                       raw_fd_stream &)>
                       SerializeBitstreamFile)
    : Out(O), FS(FS), CurBit(0), CurValue(0), CurCodeSize(2),
      CurNode(std::make_shared<cas::Node>()), NodeStack({CurNode}), CAS(CAS),
      CASMode(CASMode),
      SerializeBitstreamFile(std::move(SerializeBitstreamFile)) {}

/// Sanity-checks (via assert) that the writer is being destroyed in a
/// finished state: no partially filled word is pending in CurBit, and every
/// entered block has been exited, i.e. both the block scope stack and the
/// current abbreviation list are empty.
~BitstreamCASWriter() {
assert(CurBit == 0 && "Unflushed data remaining");
assert(BlockScope.empty() && CurAbbrevs.empty() && "Block imbalance");
}
Expand Down Expand Up @@ -288,6 +306,8 @@ class BitstreamWriter {
}

void EnterSubblock(unsigned BlockID, unsigned CodeLen) {
insideBlock = true;
Out.clear();
// Block header:
// [ENTER_SUBBLOCK, blockid, newcodelen, <align4bytes>, blocklen]
EmitCode(bitc::ENTER_SUBBLOCK);
Expand All @@ -312,9 +332,14 @@ class BitstreamWriter {
// to the abbrev list.
if (BlockInfo *Info = getBlockInfo(BlockID))
append_range(CurAbbrevs, Info->Abbrevs);
CurNode = std::make_shared<cas::Node>(BlockID, NextAvailableID[BlockID]++);
CurNode->Data.append(Out.begin(), Out.end());
NodeStack.push(CurNode); // As new block is entered, we push to stack.
Out.clear();
}

void ExitBlock() {
Out.clear();
assert(!BlockScope.empty() && "Block scope imbalance!");
const Block &B = BlockScope.back();

Expand All @@ -334,7 +359,55 @@ class BitstreamWriter {
CurCodeSize = B.PrevCodeSize;
CurAbbrevs = std::move(B.PrevAbbrevs);
BlockScope.pop_back();
FlushToFile();
CurNode->Data.append(Out.begin(), Out.end());
Out.clear();

// Creating the block in CAS using the data and all the child node refs.
auto ref = CurNode->Code == bitc::BLOCKINFO_BLOCK_ID
? BlockInfoBlockRef::create(DAG[CurNode], CurNode->Data)
: GenericBlockRef::create(DAG[CurNode], CurNode->Data);
CurNode->CASRef = std::move(ref);

// As the block is completed, we remove the CurNode from the stack.
NodeStack.pop();

// Now add the edge to the current node's parent.
auto parent = NodeStack.top();
DAG[parent].push_back(CurNode);

// Now the current block's parent might have some other nodes to be added.
// So we point to it.
CurNode = parent;
insideBlock = false;
}

/// Depending upon the CASMode, this method will either:
/// - Serialize the bitstream and write the entire content to the file.
/// - Write the CASID to the file.
/// - Verify the content and write to the file.
void EndStream() {

// Note: Clients are expected to call this method after they are done with
// all the emission. This is just the first API change in addition to
// initializing the client in new way.
if (!FS)
return;

// Create the root node.
auto ref = BitstreamRef::create(DAG[CurNode], CurNode->Data);
CurNode->CASRef = std::move(ref);

switch (CASMode) {
case mc::CASBackendMode::Native:
FS->write(SerializeBitstreamFile(CurNode->CASRef, CAS, *FS));
break;
case mc::CASBackendMode::CASID:
FS->write(CurNode->CASRef.getCASID());
break;
case mc::CASBackendMode::Verify:
// Verify the content and write to the file.
break;
}
}

//===--------------------------------------------------------------------===//
Expand Down Expand Up @@ -496,6 +569,8 @@ class BitstreamWriter {
/// we have one to compress the output.
template <typename Container>
void EmitRecord(unsigned Code, const Container &Vals, unsigned Abbrev = 0) {
Out.clear();
CurNode = std::make_shared<cas::Node>(Code, NextAvailableID[Code]++);
if (!Abbrev) {
// If we don't have an abbrev to use, emit this in its fully unabbreviated
// form.
Expand All @@ -505,18 +580,48 @@ class BitstreamWriter {
EmitVBR(Count, 6);
for (unsigned i = 0, e = Count; i != e; ++i)
EmitVBR64(Vals[i], 6);
CurNode->Data.append(Out.begin(), Out.end());
Out.clear();
addCurNodeToDAG(UnAbbrevRecordRef::create({}, CurNode->Data));
return;
}

EmitRecordWithAbbrevImpl(Abbrev, ArrayRef(Vals), StringRef(), Code);
CurNode->Data.append(Out.begin(), Out.end());
Out.clear();
BitstreamObjectProxy CASRef;
switch (Code) { // Only common record codes are considered as we are not
// specializing per client currently.
case llvm::bitc::BLOCKINFO_CODE_SETBID:
CASRef = SetBidRecordRef::create({}, CurNode->Data);
break;
case llvm::bitc::BLOCKINFO_CODE_BLOCKNAME:
CASRef = BlockNameRecordRef::create({}, CurNode->Data);
break;
case llvm::bitc::BLOCKINFO_CODE_SETRECORDNAME:
CASRef = SetRecordNameRecordRef::create({}, CurNode->Data);
break;
case llvm::bitc::DEFINE_ABBREV:
CASRef = DefineAbbrevRecordRef::create({}, CurNode->Data);
break;
default:
CASRef = GenericRecordRef::create({}, CurNode->Data);
break;
}
addCurNodeToDAG(CASRef);
}

/// EmitRecordWithAbbrev - Emit a record with the specified abbreviation.
/// Unlike EmitRecord, the code for the record should be included in Vals as
/// the first entry.
template <typename Container>
void EmitRecordWithAbbrev(unsigned Abbrev, const Container &Vals) {
Out.clear();
CurNode = std::make_shared<cas::Node>(Abbrev, NextAvailableID[Abbrev]++);
EmitRecordWithAbbrevImpl(Abbrev, ArrayRef(Vals), StringRef(), std::nullopt);
CurNode->Data.append(Out.begin(), Out.end());
Out.clear();
addCurNodeToDAG(GenericRecordWithAbbrevRef::create({}, CurNode->Data));
}

/// EmitRecordWithBlob - Emit the specified record to the stream, using an
Expand All @@ -527,34 +632,61 @@ class BitstreamWriter {
template <typename Container>
void EmitRecordWithBlob(unsigned Abbrev, const Container &Vals,
StringRef Blob) {
Out.clear();
CurNode = std::make_shared<cas::Node>(Abbrev, NextAvailableID[Abbrev]++);
EmitRecordWithAbbrevImpl(Abbrev, ArrayRef(Vals), Blob, std::nullopt);
CurNode->Data.append(Out.begin(), Out.end());
Out.clear();
addCurNodeToDAG(GenericRecordWithBlobRef::create({}, CurNode->Data));
}
/// Overload of EmitRecordWithBlob taking the blob as a pointer/length pair.
///
/// Forwards to the StringRef overload so that the record's bytes are captured
/// in a node and attached to the DAG. Returning directly from
/// EmitRecordWithAbbrevImpl would skip that bookkeeping and leave CurNode
/// pointing at a node that was never attached.
template <typename Container>
void EmitRecordWithBlob(unsigned Abbrev, const Container &Vals,
                        const char *BlobData, unsigned BlobLen) {
  EmitRecordWithBlob(Abbrev, Vals, StringRef(BlobData, BlobLen));
}

/// EmitRecordWithArray - Just like EmitRecordWithBlob, works with records
/// that end with an array.
template <typename Container>
void EmitRecordWithArray(unsigned Abbrev, const Container &Vals,
StringRef Array) {
Out.clear();
CurNode = std::make_shared<cas::Node>(Abbrev, NextAvailableID[Abbrev]++);
EmitRecordWithAbbrevImpl(Abbrev, ArrayRef(Vals), Array, std::nullopt);
CurNode->Data.append(Out.begin(), Out.end());
Out.clear();
addCurNodeToDAG(GenericRecordWithArrayRef::create({}, CurNode->Data));
}
/// Overload of EmitRecordWithArray taking the array as a pointer/length pair.
///
/// Forwards to the StringRef overload so that the record's bytes are captured
/// in a node and attached to the DAG. Returning directly from
/// EmitRecordWithAbbrevImpl would skip that bookkeeping and leave CurNode
/// pointing at a node that was never attached.
template <typename Container>
void EmitRecordWithArray(unsigned Abbrev, const Container &Vals,
                         const char *ArrayData, unsigned ArrayLen) {
  EmitRecordWithArray(Abbrev, Vals, StringRef(ArrayData, ArrayLen));
}

//===--------------------------------------------------------------------===//
// Abbrev Emission
//===--------------------------------------------------------------------===//

private:
/// Finalize the node currently being built: store its CAS object in it,
/// record it as a child of the node on top of NodeStack, and make that parent
/// the current node again.
///
/// \p CASRef is a sink parameter taken by value and moved into the node, so
/// callers may pass either a temporary (the result of a create() call) or a
/// named object. A named object is copied and left untouched.
void addCurNodeToDAG(BitstreamObjectProxy CASRef) {
  CurNode->CASRef = std::move(CASRef);
  auto Parent = NodeStack.top();
  DAG[Parent].push_back(CurNode);
  CurNode = Parent;
}

// Emit the abbreviation as a DEFINE_ABBREV record.
void EncodeAbbrev(const BitCodeAbbrev &Abbv) {
EmitCode(bitc::DEFINE_ABBREV);
Expand Down Expand Up @@ -629,7 +761,6 @@ class BitstreamWriter {
}
};


} // End llvm namespace

#endif
15 changes: 15 additions & 0 deletions llvm/include/llvm/BitstreamCAS/BitstreamCASNode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include "llvm/CAS/CASID.h"
#include "llvm/CAS/ObjectStore.h"

namespace llvm {
namespace cas {

/// One node of the in-memory DAG the bitstream CAS writer builds while
/// emitting: it pairs the raw bytes emitted for a block or record with the
/// CAS object created from them.
struct Node {
  /// Block ID, record code or abbreviation ID the node was created for.
  unsigned Code = 0;
  /// Sequence number the writer hands out per Code value.
  unsigned ID = 0;
  /// Raw bytes emitted for this node.
  SmallString<256> Data;
  /// CAS object created for this node once its data is complete.
  BitstreamObjectProxy CASRef;

  /// Creates a node with Code and ID zeroed (used for the root node).
  Node() = default;

  /// Creates a node for \p Code with sequence number \p ID. Needed because
  /// std::make_shared<Node>(Code, ID) uses parenthesized initialization,
  /// which does not perform aggregate initialization before C++20.
  Node(unsigned Code, unsigned ID) : Code(Code), ID(ID) {}
};

} // namespace cas
} // namespace llvm