Skip to content
Permalink
Browse files
RowTableAnalyzer
  • Loading branch information
MikeHopcroft committed Oct 23, 2016
1 parent 8474b51 commit a2f601af0f240966fa6f3acbd76d10e632135aab
@@ -78,6 +78,8 @@ namespace BitFunnel
//virtual FileDescriptor0 CommonNegatedTerms() = 0;
//virtual FileDescriptor0 CommonPhrases() = 0;
//virtual FileDescriptor0 DocFreqTable() = 0;
// Returns the descriptor for the column densities file. This file takes
// no shard parameter. NOTE(review): FileManager backs it with a
// ParameterizedFile0 constructed from the statistics directory,
// "ColumnDensities" and ".csv".
virtual FileDescriptor0 ColumnDensities() = 0;
// Returns the descriptor for the column density summary file. This file
// takes no shard parameter. NOTE(review): FileManager backs it with a
// ParameterizedFile0 constructed from the statistics directory,
// "ColumnDensitySummary" and ".txt".
virtual FileDescriptor0 ColumnDensitySummary() = 0;
virtual FileDescriptor0 DocumentLengthHistogram() = 0;
//virtual FileDescriptor0 L1RankerConfig() = 0;
//virtual FileDescriptor0 Manifest() = 0;
@@ -106,6 +108,7 @@ namespace BitFunnel
virtual FileDescriptor1 IndexedIdfTable(size_t shard) = 0;
//virtual FileDescriptor1 DocTable(size_t shard) = 0;
//virtual FileDescriptor1 ScoreTable(size_t shard) = 0;
// Returns the descriptor for the row densities file of the given shard.
// NOTE(review): FileManager backs it with a ParameterizedFile1 constructed
// from the statistics directory, "RowDensities" and ".csv".
virtual FileDescriptor1 RowDensities(size_t shard) = 0;
virtual FileDescriptor1 TermTable(size_t shard) = 0;

//virtual FileDescriptor2 IndexSlice(size_t shard,
@@ -56,6 +56,9 @@ namespace BitFunnel

namespace Factories
{
// NOTE(review): presumably analyzes the row tables of `index` and writes
// its reports under the directory `outDir` -- the implementation is not
// visible from this header, so confirm before relying on this description.
void AnalyzeRowTables(ISimpleIndex const & index,
char const * outDir);

std::unique_ptr<IChunkManifestIngestor>
CreateBuiltinChunkManifest(
std::vector<std::pair<size_t, char const *>> const & chunks,
@@ -30,7 +30,7 @@

namespace BitFunnel
{
class TermToText;
class ITermToText;

//*************************************************************************
//
@@ -52,7 +52,7 @@ namespace BitFunnel
// via the ITermToText. Note: method is not const because it sorts
// the entries.
virtual void Write(std::ostream & output,
TermToText const * termToText) = 0;
ITermToText const * termToText) = 0;

// Adds an Entry to the table. Note that this method does not guard
// against duplicate Term::Hash values and it does not enforce any
@@ -23,6 +23,7 @@
#pragma once

#include <cstddef> // ptrdiff_t return value.
#include <iosfwd> // std::ostream parameter.

#include "BitFunnel/BitFunnelTypes.h" // DocIndex return value.
#include "BitFunnel/IInterface.h" // Base class.
@@ -31,8 +32,7 @@

namespace BitFunnel
{

class TermToText;
class ITermToText;

class IShard : public IInterface
{
@@ -52,8 +52,15 @@ namespace BitFunnel
// Returns the offset of the row in the slice buffer in a shard.
virtual ptrdiff_t GetRowOffset(RowId rowId) const = 0;

virtual void TemporaryWriteDocumentFrequencyTable(std::ostream& out,
TermToText const * termToText) const = 0;
// NOTE(review): presumably writes this shard's document frequency table
// to `out`, using `termToText` (an ITermToText, passed by pointer) to
// render terms as text -- confirm against the implementing class. The
// "Temporary" prefix suggests this entry point is not meant to be
// permanent.
virtual void TemporaryWriteDocumentFrequencyTable(
std::ostream& out,
ITermToText const * termToText) const = 0;

// Returns an std::vector containing the bit densities for each row in
// the RowTable with the specified rank. Bit densities are computed
// over all slices, for those columns that correspond to active
// documents.
virtual std::vector<double>
GetDensities(Rank rank) const = 0;
};
}
@@ -48,7 +48,16 @@ namespace BitFunnel
char const * statisticsDirectory,
char const * indexDirectory,
IFileSystem & fileSystem)
: m_cumulativeTermCounts(new ParameterizedFile1(fileSystem,
: m_columnDensities(new ParameterizedFile0(fileSystem,
statisticsDirectory,
"ColumnDensities",
".csv")),
m_columnDensitySummary(
new ParameterizedFile0(fileSystem,
statisticsDirectory,
"ColumnDensitySummary",
".txt")),
m_cumulativeTermCounts(new ParameterizedFile1(fileSystem,
statisticsDirectory,
"CumulativeTermCounts",
".csv")),
@@ -63,6 +72,11 @@ namespace BitFunnel
indexDirectory,
"IndexedIdfTable",
".bin")),
m_rowDensities(
new ParameterizedFile1(fileSystem,
statisticsDirectory,
"RowDensities",
".csv")),
m_termTable(new ParameterizedFile1(fileSystem,
indexDirectory,
"TermTable",
@@ -79,6 +93,18 @@ namespace BitFunnel
// FileDescriptor0 files.
//

FileDescriptor0 FileManager::ColumnDensities()
{
    // Bind the parameterized file owned by m_columnDensities and wrap it
    // in a descriptor for the caller.
    auto & columnDensitiesFile = *m_columnDensities;
    return FileDescriptor0(columnDensitiesFile);
}


FileDescriptor0 FileManager::ColumnDensitySummary()
{
    // Bind the parameterized file owned by m_columnDensitySummary and wrap
    // it in a descriptor for the caller.
    auto & summaryFile = *m_columnDensitySummary;
    return FileDescriptor0(summaryFile);
}


FileDescriptor0 FileManager::DocumentLengthHistogram()
{
return FileDescriptor0(*m_documentLengthHistogram);
@@ -113,6 +139,12 @@ namespace BitFunnel
}


FileDescriptor1 FileManager::RowDensities(size_t shard)
{
    // Bind the per-shard parameterized file owned by m_rowDensities and
    // pair it with the requested shard number.
    auto & rowDensitiesFile = *m_rowDensities;
    return FileDescriptor1(rowDensitiesFile, shard);
}


FileDescriptor1 FileManager::TermTable(size_t shard)
{
return FileDescriptor1(*m_termTable, shard);
@@ -48,6 +48,8 @@ namespace BitFunnel
//virtual FileDescriptor0 CommonNegatedTerms() override;
//virtual FileDescriptor0 CommonPhrases() override;
//virtual FileDescriptor0 DocFreqTable() override;
virtual FileDescriptor0 ColumnDensities() override;
virtual FileDescriptor0 ColumnDensitySummary() override;
virtual FileDescriptor0 DocumentLengthHistogram() override;
//virtual FileDescriptor0 L1RankerConfig() override;
//virtual FileDescriptor0 Manifest() override;
@@ -73,15 +75,19 @@ namespace BitFunnel
virtual FileDescriptor1 IndexedIdfTable(size_t shard) override;
//virtual FileDescriptor1 DocTable(size_t shard) override;
//virtual FileDescriptor1 ScoreTable(size_t shard) override;
virtual FileDescriptor1 RowDensities(size_t shard) override;
virtual FileDescriptor1 TermTable(size_t shard) override;

//virtual FileDescriptor2 IndexSlice(size_t shard, size_t slice) override;

private:
std::unique_ptr<IParameterizedFile0> m_columnDensities;
std::unique_ptr<IParameterizedFile0> m_columnDensitySummary;
std::unique_ptr<IParameterizedFile1> m_cumulativeTermCounts;
std::unique_ptr<IParameterizedFile1> m_docFreqTable;
std::unique_ptr<IParameterizedFile0> m_documentLengthHistogram;
std::unique_ptr<IParameterizedFile1> m_indexedIdfTable;
std::unique_ptr<IParameterizedFile1> m_rowDensities;
std::unique_ptr<IParameterizedFile1> m_termTable;
std::unique_ptr<IParameterizedFile0> m_termToText;
};
@@ -28,8 +28,8 @@ set(CPPFILES
RowId.cpp
RowIdSequence.cpp
RowConfiguration.cpp
RowTableAnalyzer.cpp
RowTableDescriptor.cpp
RowTableStatistics.cpp
Shard.cpp
SimpleIndex.cpp
Slice.cpp
@@ -72,7 +72,7 @@ set(PRIVATE_HFILES
IRecyclable.h
Recycler.h
RowTableDescriptor.h
RowTableStatistics.h
RowTableAnalyzer.h
Shard.h
SimpleIndex.h
Slice.h
@@ -26,6 +26,7 @@

#include "BitFunnel/Exceptions.h"
#include "BitFunnel/Index/Factories.h"
#include "BitFunnel/Index/ITermToText.h"
#include "CsvTsv/Csv.h"
#include "DocumentFrequencyTable.h"
#include "TermToText.h"
@@ -124,7 +125,7 @@ namespace BitFunnel


void DocumentFrequencyTable::Write(std::ostream & output,
TermToText const * termToText)
ITermToText const * termToText)
{
//
// Sort entries by descending frequency.
@@ -32,6 +32,8 @@

namespace BitFunnel
{
class ITermToText;

class DocumentFrequencyTable : public IDocumentFrequencyTable
{
public:
@@ -55,7 +57,7 @@ namespace BitFunnel
// via the ITermToText. Note: method is not const because it sorts
// the entries.
virtual void Write(std::ostream & output,
TermToText const * termToText) override;
ITermToText const * termToText) override;

// Adds an Entry to the table. Note that this method does not guard
// against duplicate Term::Hash values and it does not enforce any
@@ -49,7 +49,7 @@ namespace BitFunnel
// Write out sorted truncated list, sorted by count (TODO: frequency).
void DocumentFrequencyTableBuilder::WriteFrequencies(std::ostream& output,
double truncateBelowFrequency,
TermToText const * termToText) const
ITermToText const * termToText) const
{
DocumentFrequencyTable table;

@@ -32,7 +32,7 @@

namespace BitFunnel
{
class TermToText;
class ITermToText;

//*************************************************************************
//
@@ -83,7 +83,7 @@ namespace BitFunnel
// (i.e., callers of OnDocumentEnter() and OnTerm()).
void WriteFrequencies(std::ostream& output,
double truncateBelowFrequency,
TermToText const * termToText) const;
ITermToText const * termToText) const;


// Writes the document frequency data to a stream in the binary format
Loading

0 comments on commit a2f601a

Please sign in to comment.