forked from xitongsys/parquet-go
/
columnreader.go
97 lines (79 loc) · 2.89 KB
/
columnreader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
package reader
import (
"fmt"
"github.com/xitongsys/parquet-go/source"
"github.com/xitongsys/parquet-go/schema"
)
// NewParquetColumnReader builds a ParquetReader for column-oriented access to
// pFile. np is stored on the reader as NP.
//
// The footer is read first; if that fails the error is returned together with
// a nil reader. On success the reader gets an empty ColumnBuffers map and a
// SchemaHandler built from the footer's schema list, and RenameSchema is
// applied before the reader is returned.
func NewParquetColumnReader(pFile source.ParquetFile, np int64) (*ParquetReader, error) {
	reader := &ParquetReader{}
	reader.PFile = pFile
	reader.NP = np

	err := reader.ReadFooter()
	if err != nil {
		return nil, err
	}

	// Column buffers are not created here; the map starts empty.
	reader.ColumnBuffers = map[string]*ColumnBufferType{}

	schemaList := reader.Footer.GetSchema()
	reader.SchemaHandler = schema.NewSchemaHandlerFromSchemaList(schemaList)
	reader.RenameSchema()

	return reader, nil
}
// SkipRowsByPath skips num rows of the column identified by pathStr.
//
// pathStr is first converted with SchemaHandler.ConvertToInPathStr; a
// conversion error is returned as is. If num <= 0 or the converted path is
// empty the call does nothing and returns nil. A "path ... not found" error is
// returned when the converted path is not a key of SchemaHandler.MapIndex, and
// any error from NewColumnBuffer is returned unchanged.
//
// The column buffer for the path is created on first use and kept in
// ColumnBuffers. It is stored only after NewColumnBuffer succeeds, so a failed
// creation is retried on the next call instead of leaving an unusable (possibly
// nil) entry behind that a later call would dereference.
func (self *ParquetReader) SkipRowsByPath(pathStr string, num int) error {
	inPathStr, err := self.SchemaHandler.ConvertToInPathStr(pathStr)
	if err != nil {
		return err
	}
	if num <= 0 || len(inPathStr) == 0 {
		return nil
	}

	if _, ok := self.SchemaHandler.MapIndex[inPathStr]; !ok {
		// Report the path as the caller supplied it, not the converted form.
		return fmt.Errorf("path %v not found", pathStr)
	}

	// A missing key and a nil entry are both treated as "no buffer yet".
	cb := self.ColumnBuffers[inPathStr]
	if cb == nil {
		newCB, cbErr := NewColumnBuffer(self.PFile, self.Footer, self.SchemaHandler, inPathStr)
		if cbErr != nil {
			return cbErr
		}
		self.ColumnBuffers[inPathStr] = newCB
		cb = newCB
	}

	cb.SkipRows(int64(num))
	return nil
}
// SkipRowsByIndex skips num rows of the column at position index in
// SchemaHandler.ValueColumns by delegating to SkipRowsByPath.
//
// An index outside [0, len(ValueColumns)) is ignored; this includes negative
// values, which would otherwise panic when indexing the slice.
func (self *ParquetReader) SkipRowsByIndex(index int, num int) {
	columns := self.SchemaHandler.ValueColumns
	if index < 0 || index >= len(columns) {
		return
	}

	// This method has no error result, so a failure reported by
	// SkipRowsByPath cannot be surfaced here. Callers that need the error
	// should call SkipRowsByPath directly.
	_ = self.SkipRowsByPath(columns[index], num)
}
// ReadColumnByPath reads up to num rows of the column identified by pathStr
// and returns the values together with their repetition levels (rls) and
// definition levels (dls).
//
// pathStr is first converted with SchemaHandler.ConvertToInPathStr. Empty
// (non-nil) slices are returned in every non-success case:
//   - the conversion failed (the conversion error is returned),
//   - num <= 0 or the converted path is empty (err is nil),
//   - the converted path is not a key of SchemaHandler.MapIndex
//     ("path ... not found"),
//   - NewColumnBuffer failed (its error is returned).
//
// The column buffer for the path is created on first use and kept in
// ColumnBuffers. It is stored only after NewColumnBuffer succeeds, so a failed
// creation is retried on the next call instead of leaving an unusable (possibly
// nil) entry behind that a later call would dereference.
func (self *ParquetReader) ReadColumnByPath(pathStr string, num int) (values []interface{}, rls []int32, dls []int32, err error) {
	inPathStr, convErr := self.SchemaHandler.ConvertToInPathStr(pathStr)
	if convErr != nil || num <= 0 || len(inPathStr) == 0 {
		// convErr is nil when only num or the path is empty.
		return []interface{}{}, []int32{}, []int32{}, convErr
	}

	if _, ok := self.SchemaHandler.MapIndex[inPathStr]; !ok {
		// Report the path as the caller supplied it, not the converted form.
		return []interface{}{}, []int32{}, []int32{}, fmt.Errorf("path %v not found", pathStr)
	}

	// A missing key and a nil entry are both treated as "no buffer yet".
	cb := self.ColumnBuffers[inPathStr]
	if cb == nil {
		newCB, cbErr := NewColumnBuffer(self.PFile, self.Footer, self.SchemaHandler, inPathStr)
		if cbErr != nil {
			return []interface{}{}, []int32{}, []int32{}, cbErr
		}
		self.ColumnBuffers[inPathStr] = newCB
		cb = newCB
	}

	// The second result of ReadRows is not used here.
	// NOTE(review): presumably the number of rows actually read — confirm.
	table, _ := cb.ReadRows(int64(num))
	return table.Values, table.RepetitionLevels, table.DefinitionLevels, nil
}
// ReadColumnByIndex reads up to num rows of the column at position index in
// SchemaHandler.ValueColumns by delegating to ReadColumnByPath. The index of
// the first column is 0.
//
// An index outside [0, len(ValueColumns)) yields nil slices and an
// "index ... out of range" error; this includes negative values, which would
// otherwise panic when indexing the slice.
func (self *ParquetReader) ReadColumnByIndex(index int, num int) (values []interface{}, rls []int32, dls []int32, err error) {
	columns := self.SchemaHandler.ValueColumns
	if index < 0 || index >= len(columns) {
		return nil, nil, nil, fmt.Errorf("index %v out of range %v", index, len(columns))
	}
	return self.ReadColumnByPath(columns[index], num)
}