Skip to content

Commit a2f601a

Browse files
committed
RowTableAnalyzer
1 parent 8474b51 commit a2f601a

24 files changed

+592
-172
lines changed

inc/BitFunnel/IFileManager.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,8 @@ namespace BitFunnel
7878
//virtual FileDescriptor0 CommonNegatedTerms() = 0;
7979
//virtual FileDescriptor0 CommonPhrases() = 0;
8080
//virtual FileDescriptor0 DocFreqTable() = 0;
81+
virtual FileDescriptor0 ColumnDensities() = 0;
82+
virtual FileDescriptor0 ColumnDensitySummary() = 0;
8183
virtual FileDescriptor0 DocumentLengthHistogram() = 0;
8284
//virtual FileDescriptor0 L1RankerConfig() = 0;
8385
//virtual FileDescriptor0 Manifest() = 0;
@@ -106,6 +108,7 @@ namespace BitFunnel
106108
virtual FileDescriptor1 IndexedIdfTable(size_t shard) = 0;
107109
//virtual FileDescriptor1 DocTable(size_t shard) = 0;
108110
//virtual FileDescriptor1 ScoreTable(size_t shard) = 0;
111+
virtual FileDescriptor1 RowDensities(size_t shard) = 0;
109112
virtual FileDescriptor1 TermTable(size_t shard) = 0;
110113

111114
//virtual FileDescriptor2 IndexSlice(size_t shard,

inc/BitFunnel/Index/Factories.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,9 @@ namespace BitFunnel
5656

5757
namespace Factories
5858
{
59+
void AnalyzeRowTables(ISimpleIndex const & index,
60+
char const * outDir);
61+
5962
std::unique_ptr<IChunkManifestIngestor>
6063
CreateBuiltinChunkManifest(
6164
std::vector<std::pair<size_t, char const *>> const & chunks,

inc/BitFunnel/Index/IDocumentFrequencyTable.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@
3030

3131
namespace BitFunnel
3232
{
33-
class TermToText;
33+
class ITermToText;
3434

3535
//*************************************************************************
3636
//
@@ -52,7 +52,7 @@ namespace BitFunnel
5252
// via the ITermToText. Note: method is not const because it sorts
5353
// the entries.
5454
virtual void Write(std::ostream & output,
55-
TermToText const * termToText) = 0;
55+
ITermToText const * termToText) = 0;
5656

5757
// Adds an Entry to the table. Note that this method does not guard
5858
// against duplicate Term::Hash values and it does not enforce any

inc/BitFunnel/Index/IShard.h

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
#pragma once
2424

2525
#include <cstddef> // ptrdiff_t return value.
26+
#include <iosfwd> // std::ostream parameter.
2627

2728
#include "BitFunnel/BitFunnelTypes.h" // DocIndex return value.
2829
#include "BitFunnel/IInterface.h" // Base class.
@@ -31,8 +32,7 @@
3132

3233
namespace BitFunnel
3334
{
34-
35-
class TermToText;
35+
class ITermToText;
3636

3737
class IShard : public IInterface
3838
{
@@ -52,8 +52,15 @@ namespace BitFunnel
5252
// Returns the offset of the row in the slice buffer in a shard.
5353
virtual ptrdiff_t GetRowOffset(RowId rowId) const = 0;
5454

55-
virtual void TemporaryWriteDocumentFrequencyTable(std::ostream& out,
56-
TermToText const * termToText) const = 0;
55+
virtual void TemporaryWriteDocumentFrequencyTable(
56+
std::ostream& out,
57+
ITermToText const * termToText) const = 0;
5758

59+
// Returns an std::vector containing the bit densities for each row in
60+
// the RowTable with the specified rank. Bit densities are computed
61+
// over all slices, for those columns that correspond to active
62+
// documents.
63+
virtual std::vector<double>
64+
GetDensities(Rank rank) const = 0;
5865
};
5966
}

src/Common/Configuration/src/FileManager.cpp

Lines changed: 33 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,16 @@ namespace BitFunnel
4848
char const * statisticsDirectory,
4949
char const * indexDirectory,
5050
IFileSystem & fileSystem)
51-
: m_cumulativeTermCounts(new ParameterizedFile1(fileSystem,
51+
: m_columnDensities(new ParameterizedFile0(fileSystem,
52+
statisticsDirectory,
53+
"ColumnDensities",
54+
".csv")),
55+
m_columnDensitySummary(
56+
new ParameterizedFile0(fileSystem,
57+
statisticsDirectory,
58+
"ColumnDensitySummary",
59+
".txt")),
60+
m_cumulativeTermCounts(new ParameterizedFile1(fileSystem,
5261
statisticsDirectory,
5362
"CumulativeTermCounts",
5463
".csv")),
@@ -63,6 +72,11 @@ namespace BitFunnel
6372
indexDirectory,
6473
"IndexedIdfTable",
6574
".bin")),
75+
m_rowDensities(
76+
new ParameterizedFile1(fileSystem,
77+
statisticsDirectory,
78+
"RowDensities",
79+
".csv")),
6680
m_termTable(new ParameterizedFile1(fileSystem,
6781
indexDirectory,
6882
"TermTable",
@@ -79,6 +93,18 @@ namespace BitFunnel
7993
// FileDescriptor0 files.
8094
//
8195

96+
FileDescriptor0 FileManager::ColumnDensities()
97+
{
98+
return FileDescriptor0(*m_columnDensities);
99+
}
100+
101+
102+
FileDescriptor0 FileManager::ColumnDensitySummary()
103+
{
104+
return FileDescriptor0(*m_columnDensitySummary);
105+
}
106+
107+
82108
FileDescriptor0 FileManager::DocumentLengthHistogram()
83109
{
84110
return FileDescriptor0(*m_documentLengthHistogram);
@@ -113,6 +139,12 @@ namespace BitFunnel
113139
}
114140

115141

142+
FileDescriptor1 FileManager::RowDensities(size_t shard)
143+
{
144+
return FileDescriptor1(*m_rowDensities, shard);
145+
}
146+
147+
116148
FileDescriptor1 FileManager::TermTable(size_t shard)
117149
{
118150
return FileDescriptor1(*m_termTable, shard);

src/Common/Configuration/src/FileManager.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,8 @@ namespace BitFunnel
4848
//virtual FileDescriptor0 CommonNegatedTerms() override;
4949
//virtual FileDescriptor0 CommonPhrases() override;
5050
//virtual FileDescriptor0 DocFreqTable() override;
51+
virtual FileDescriptor0 ColumnDensities() override;
52+
virtual FileDescriptor0 ColumnDensitySummary() override;
5153
virtual FileDescriptor0 DocumentLengthHistogram() override;
5254
//virtual FileDescriptor0 L1RankerConfig() override;
5355
//virtual FileDescriptor0 Manifest() override;
@@ -73,15 +75,19 @@ namespace BitFunnel
7375
virtual FileDescriptor1 IndexedIdfTable(size_t shard) override;
7476
//virtual FileDescriptor1 DocTable(size_t shard) override;
7577
//virtual FileDescriptor1 ScoreTable(size_t shard) override;
78+
virtual FileDescriptor1 RowDensities(size_t shard) override;
7679
virtual FileDescriptor1 TermTable(size_t shard) override;
7780

7881
//virtual FileDescriptor2 IndexSlice(size_t shard, size_t slice) override;
7982

8083
private:
84+
std::unique_ptr<IParameterizedFile0> m_columnDensities;
85+
std::unique_ptr<IParameterizedFile0> m_columnDensitySummary;
8186
std::unique_ptr<IParameterizedFile1> m_cumulativeTermCounts;
8287
std::unique_ptr<IParameterizedFile1> m_docFreqTable;
8388
std::unique_ptr<IParameterizedFile0> m_documentLengthHistogram;
8489
std::unique_ptr<IParameterizedFile1> m_indexedIdfTable;
90+
std::unique_ptr<IParameterizedFile1> m_rowDensities;
8591
std::unique_ptr<IParameterizedFile1> m_termTable;
8692
std::unique_ptr<IParameterizedFile0> m_termToText;
8793
};

src/Index/src/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,8 @@ set(CPPFILES
2828
RowId.cpp
2929
RowIdSequence.cpp
3030
RowConfiguration.cpp
31+
RowTableAnalyzer.cpp
3132
RowTableDescriptor.cpp
32-
RowTableStatistics.cpp
3333
Shard.cpp
3434
SimpleIndex.cpp
3535
Slice.cpp
@@ -72,7 +72,7 @@ set(PRIVATE_HFILES
7272
IRecyclable.h
7373
Recycler.h
7474
RowTableDescriptor.h
75-
RowTableStatistics.h
75+
RowTableAnalyzer.h
7676
Shard.h
7777
SimpleIndex.h
7878
Slice.h

src/Index/src/DocumentFrequencyTable.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626

2727
#include "BitFunnel/Exceptions.h"
2828
#include "BitFunnel/Index/Factories.h"
29+
#include "BitFunnel/Index/ITermToText.h"
2930
#include "CsvTsv/Csv.h"
3031
#include "DocumentFrequencyTable.h"
3132
#include "TermToText.h"
@@ -124,7 +125,7 @@ namespace BitFunnel
124125

125126

126127
void DocumentFrequencyTable::Write(std::ostream & output,
127-
TermToText const * termToText)
128+
ITermToText const * termToText)
128129
{
129130
//
130131
// Sort entries by descending frequency.

src/Index/src/DocumentFrequencyTable.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@
3232

3333
namespace BitFunnel
3434
{
35+
class ITermToText;
36+
3537
class DocumentFrequencyTable : public IDocumentFrequencyTable
3638
{
3739
public:
@@ -55,7 +57,7 @@ namespace BitFunnel
5557
// via the ITermToText. Note: method is not const because it sorts
5658
// the entries.
5759
virtual void Write(std::ostream & output,
58-
TermToText const * termToText) override;
60+
ITermToText const * termToText) override;
5961

6062
// Adds an Entry to the table. Note that this method does not guard
6163
// against duplicate Term::Hash values and it does not enforce any

src/Index/src/DocumentFrequencyTableBuilder.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ namespace BitFunnel
4949
// Write out sorted truncated list, sorted by count (TODO: frequency).
5050
void DocumentFrequencyTableBuilder::WriteFrequencies(std::ostream& output,
5151
double truncateBelowFrequency,
52-
TermToText const * termToText) const
52+
ITermToText const * termToText) const
5353
{
5454
DocumentFrequencyTable table;
5555

0 commit comments

Comments
 (0)