Permalink
Browse files

RowTableAnalyzer

  • Loading branch information...
MikeHopcroft committed Oct 23, 2016
1 parent 8474b51 commit a2f601af0f240966fa6f3acbd76d10e632135aab
@@ -78,6 +78,8 @@ namespace BitFunnel
//virtual FileDescriptor0 CommonNegatedTerms() = 0;
//virtual FileDescriptor0 CommonPhrases() = 0;
//virtual FileDescriptor0 DocFreqTable() = 0;
+ virtual FileDescriptor0 ColumnDensities() = 0;
+ virtual FileDescriptor0 ColumnDensitySummary() = 0;
virtual FileDescriptor0 DocumentLengthHistogram() = 0;
//virtual FileDescriptor0 L1RankerConfig() = 0;
//virtual FileDescriptor0 Manifest() = 0;
@@ -106,6 +108,7 @@ namespace BitFunnel
virtual FileDescriptor1 IndexedIdfTable(size_t shard) = 0;
//virtual FileDescriptor1 DocTable(size_t shard) = 0;
//virtual FileDescriptor1 ScoreTable(size_t shard) = 0;
+ virtual FileDescriptor1 RowDensities(size_t shard) = 0;
virtual FileDescriptor1 TermTable(size_t shard) = 0;
//virtual FileDescriptor2 IndexSlice(size_t shard,
@@ -56,6 +56,9 @@ namespace BitFunnel
namespace Factories
{
+ void AnalyzeRowTables(ISimpleIndex const & index,
+ char const * outDir);
+
std::unique_ptr<IChunkManifestIngestor>
CreateBuiltinChunkManifest(
std::vector<std::pair<size_t, char const *>> const & chunks,
@@ -30,7 +30,7 @@
namespace BitFunnel
{
- class TermToText;
+ class ITermToText;
//*************************************************************************
//
@@ -52,7 +52,7 @@ namespace BitFunnel
// via the ITermToText. Note: method is not const because it sorts
// the entries.
virtual void Write(std::ostream & output,
- TermToText const * termToText) = 0;
+ ITermToText const * termToText) = 0;
// Adds an Entry to the table. Note that this method does not guard
// against duplicate Term::Hash values and it does not enforce any
@@ -23,6 +23,7 @@
#pragma once
#include <cstddef> // ptrdiff_t return value.
+#include <iosfwd> // std::ostream parameter.
#include "BitFunnel/BitFunnelTypes.h" // DocIndex return value.
#include "BitFunnel/IInterface.h" // Base class.
@@ -31,8 +32,7 @@
namespace BitFunnel
{
-
- class TermToText;
+ class ITermToText;
class IShard : public IInterface
{
@@ -52,8 +52,15 @@ namespace BitFunnel
// Returns the offset of the row in the slice buffer in a shard.
virtual ptrdiff_t GetRowOffset(RowId rowId) const = 0;
- virtual void TemporaryWriteDocumentFrequencyTable(std::ostream& out,
- TermToText const * termToText) const = 0;
+ virtual void TemporaryWriteDocumentFrequencyTable(
+ std::ostream& out,
+ ITermToText const * termToText) const = 0;
+ // Returns a std::vector containing the bit densities for each row in
+ // the RowTable with the specified rank. Bit densities are computed
+ // over all slices, for those columns that correspond to active
+ // documents.
+ virtual std::vector<double>
+ GetDensities(Rank rank) const = 0;
};
}
@@ -48,7 +48,16 @@ namespace BitFunnel
char const * statisticsDirectory,
char const * indexDirectory,
IFileSystem & fileSystem)
- : m_cumulativeTermCounts(new ParameterizedFile1(fileSystem,
+ : m_columnDensities(new ParameterizedFile0(fileSystem,
+ statisticsDirectory,
+ "ColumnDensities",
+ ".csv")),
+ m_columnDensitySummary(
+ new ParameterizedFile0(fileSystem,
+ statisticsDirectory,
+ "ColumnDensitySummary",
+ ".txt")),
+ m_cumulativeTermCounts(new ParameterizedFile1(fileSystem,
statisticsDirectory,
"CumulativeTermCounts",
".csv")),
@@ -63,6 +72,11 @@ namespace BitFunnel
indexDirectory,
"IndexedIdfTable",
".bin")),
+ m_rowDensities(
+ new ParameterizedFile1(fileSystem,
+ statisticsDirectory,
+ "RowDensities",
+ ".csv")),
m_termTable(new ParameterizedFile1(fileSystem,
indexDirectory,
"TermTable",
@@ -79,6 +93,18 @@ namespace BitFunnel
// FileDescriptor0 files.
//
+ FileDescriptor0 FileManager::ColumnDensities()
+ {
+ return FileDescriptor0(*m_columnDensities);
+ }
+
+
+ FileDescriptor0 FileManager::ColumnDensitySummary()
+ {
+ return FileDescriptor0(*m_columnDensitySummary);
+ }
+
+
FileDescriptor0 FileManager::DocumentLengthHistogram()
{
return FileDescriptor0(*m_documentLengthHistogram);
@@ -113,6 +139,12 @@ namespace BitFunnel
}
+ FileDescriptor1 FileManager::RowDensities(size_t shard)
+ {
+ return FileDescriptor1(*m_rowDensities, shard);
+ }
+
+
FileDescriptor1 FileManager::TermTable(size_t shard)
{
return FileDescriptor1(*m_termTable, shard);
@@ -48,6 +48,8 @@ namespace BitFunnel
//virtual FileDescriptor0 CommonNegatedTerms() override;
//virtual FileDescriptor0 CommonPhrases() override;
//virtual FileDescriptor0 DocFreqTable() override;
+ virtual FileDescriptor0 ColumnDensities() override;
+ virtual FileDescriptor0 ColumnDensitySummary() override;
virtual FileDescriptor0 DocumentLengthHistogram() override;
//virtual FileDescriptor0 L1RankerConfig() override;
//virtual FileDescriptor0 Manifest() override;
@@ -73,15 +75,19 @@ namespace BitFunnel
virtual FileDescriptor1 IndexedIdfTable(size_t shard) override;
//virtual FileDescriptor1 DocTable(size_t shard) override;
//virtual FileDescriptor1 ScoreTable(size_t shard) override;
+ virtual FileDescriptor1 RowDensities(size_t shard) override;
virtual FileDescriptor1 TermTable(size_t shard) override;
//virtual FileDescriptor2 IndexSlice(size_t shard, size_t slice) override;
private:
+ std::unique_ptr<IParameterizedFile0> m_columnDensities;
+ std::unique_ptr<IParameterizedFile0> m_columnDensitySummary;
std::unique_ptr<IParameterizedFile1> m_cumulativeTermCounts;
std::unique_ptr<IParameterizedFile1> m_docFreqTable;
std::unique_ptr<IParameterizedFile0> m_documentLengthHistogram;
std::unique_ptr<IParameterizedFile1> m_indexedIdfTable;
+ std::unique_ptr<IParameterizedFile1> m_rowDensities;
std::unique_ptr<IParameterizedFile1> m_termTable;
std::unique_ptr<IParameterizedFile0> m_termToText;
};
@@ -28,8 +28,8 @@ set(CPPFILES
RowId.cpp
RowIdSequence.cpp
RowConfiguration.cpp
+ RowTableAnalyzer.cpp
RowTableDescriptor.cpp
- RowTableStatistics.cpp
Shard.cpp
SimpleIndex.cpp
Slice.cpp
@@ -72,7 +72,7 @@ set(PRIVATE_HFILES
IRecyclable.h
Recycler.h
RowTableDescriptor.h
- RowTableStatistics.h
+ RowTableAnalyzer.h
Shard.h
SimpleIndex.h
Slice.h
@@ -26,6 +26,7 @@
#include "BitFunnel/Exceptions.h"
#include "BitFunnel/Index/Factories.h"
+#include "BitFunnel/Index/ITermToText.h"
#include "CsvTsv/Csv.h"
#include "DocumentFrequencyTable.h"
#include "TermToText.h"
@@ -124,7 +125,7 @@ namespace BitFunnel
void DocumentFrequencyTable::Write(std::ostream & output,
- TermToText const * termToText)
+ ITermToText const * termToText)
{
//
// Sort entries by descending frequency.
@@ -32,6 +32,8 @@
namespace BitFunnel
{
+ class ITermToText;
+
class DocumentFrequencyTable : public IDocumentFrequencyTable
{
public:
@@ -55,7 +57,7 @@ namespace BitFunnel
// via the ITermToText. Note: method is not const because it sorts
// the entries.
virtual void Write(std::ostream & output,
- TermToText const * termToText) override;
+ ITermToText const * termToText) override;
// Adds an Entry to the table. Note that this method does not guard
// against duplicate Term::Hash values and it does not enforce any
@@ -49,7 +49,7 @@ namespace BitFunnel
// Writes out the truncated list, sorted by count (TODO: frequency).
void DocumentFrequencyTableBuilder::WriteFrequencies(std::ostream& output,
double truncateBelowFrequency,
- TermToText const * termToText) const
+ ITermToText const * termToText) const
{
DocumentFrequencyTable table;
@@ -32,7 +32,7 @@
namespace BitFunnel
{
- class TermToText;
+ class ITermToText;
//*************************************************************************
//
@@ -83,7 +83,7 @@ namespace BitFunnel
// (ie. callers to OnDocumentEnter() and OnTerm()).
void WriteFrequencies(std::ostream& output,
double truncateBelowFrequency,
- TermToText const * termToText) const;
+ ITermToText const * termToText) const;
// Writes the document frequency data to a stream in the binary format
Oops, something went wrong.

0 comments on commit a2f601a

Please sign in to comment.