/
processor.go
215 lines (203 loc) · 6.82 KB
/
processor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
// Copyright 2017 Applatix, Inc.
package ingest
import (
"archive/zip"
"bufio"
"compress/gzip"
"encoding/csv"
"io"
"log"
"os"
"path"
"path/filepath"
"strings"
"time"
"github.com/applatix/claudia"
"github.com/applatix/claudia/billingbucket"
"github.com/applatix/claudia/costdb"
"github.com/applatix/claudia/errors"
"github.com/applatix/claudia/parser"
)
// GetCSVReader returns a CSV reader given a file
func GetCSVReader(file *os.File) (*csv.Reader, error) {
var reader *csv.Reader
if strings.HasSuffix(file.Name(), ".gz") {
gzReader, err := gzip.NewReader(bufio.NewReader(file))
if err != nil {
return nil, errors.InternalError(err)
}
reader = csv.NewReader(gzReader)
} else {
reader = csv.NewReader(bufio.NewReader(file))
}
return reader, nil
}
// unzipReport extracts a .zip file in place.
// If user's reports are in zip format, we must first unzip the report instead of using a gzip reader stream
// as zip files cannot be streamed safely: https://github.com/golang/go/issues/10568
func unzipReport(zipPath string) (string, error) {
if !strings.HasSuffix(zipPath, ".zip") {
return "", errors.Errorf("%s does not have .zip extension: cannot formulate safe extraction path", zipPath)
}
r, err := zip.OpenReader(zipPath)
if err != nil {
return "", errors.InternalError(err)
}
defer func() {
if err := r.Close(); err != nil {
panic(err)
}
}()
extractDir := path.Dir(zipPath)
if len(r.File) != 1 {
return "", errors.Errorf("Failed to unzip %s: zip archive had %d files (expected 1)", zipPath, len(r.File))
}
extractPath := filepath.Join(extractDir, r.File[0].Name)
fileReader, err := r.File[0].Open()
if err != nil {
return "", errors.InternalError(err)
}
defer fileReader.Close()
targetFile, err := os.OpenFile(extractPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, r.File[0].Mode())
if err != nil {
return "", errors.InternalError(err)
}
defer targetFile.Close()
if _, err := io.Copy(targetFile, fileReader); err != nil {
return "", errors.InternalError(err)
}
return extractPath, nil
}
// shouldIngest checks whether or not we should ingest the report based on if
// we have seen this manifest's billingPeriod/assemblyID before and it has been successfully processed
func shouldIngest(repCtx *costdb.CostReportContext, manifest *billingbucket.Manifest) bool {
log.Printf("Determining if %s/%s/%s AssemblyID %s should be ingested", manifest.Bucket, manifest.ReportPath(), manifest.BillingPeriodString(), manifest.AssemblyID)
ingStatusByBillingPeriod := func() (*costdb.IngestStatus, error) {
return repCtx.CostDB.GetIngestStatusByBillingPeriod(manifest.Bucket, manifest.ReportPath(), manifest.BillingPeriodString())
}
ingStatusByAssemblyID := func() (*costdb.IngestStatus, error) {
return repCtx.CostDB.GetIngestStatus(manifest.AssemblyID)
}
funcs := map[string]func() (*costdb.IngestStatus, error){
"billing period": ingStatusByBillingPeriod,
"assemblyID": ingStatusByAssemblyID,
}
for method, getIngestStatusFunc := range funcs {
ingestStatus, err := getIngestStatusFunc()
if err != nil {
log.Printf("Failed to retrieve ingest status by %s: %s", method, err)
return false
}
if ingestStatus == nil {
log.Printf("Ingest status by %s has never been processed", method)
return true
}
if ingestStatus.ErrorMessage != "" {
log.Printf("Previous ingest status by %s had error: %s", method, ingestStatus.ErrorMessage)
return true
}
if ingestStatus.FinishTime == nil {
log.Printf("Found interrupted ingest by %s. Proceeding with ingest.", method)
return true
}
if ingestStatus.ParserVersion < parser.ParserVersion {
log.Printf("Previous ingest was parsed with older parser version: %d. Current version: %d. Proceeding with ingest", ingestStatus.ParserVersion, parser.ParserVersion)
return true
}
}
log.Printf("Ingest of %s/%s/%s AssemblyID %s already finished. Skipping ingest", manifest.Bucket, manifest.ReportPath(), manifest.BillingPeriodString(), manifest.AssemblyID)
return false
}
// IngestReportFile ingests a report file into the cost database
func IngestReportFile(repCtx *costdb.CostReportContext, job *manifestJob, reportPath string, run *bool) error {
var err error
log.Printf("Processing %s.\n", reportPath)
if strings.HasSuffix(reportPath, ".zip") {
reportPath, err = unzipReport(reportPath)
if err != nil {
return err
}
}
file, err := os.Open(reportPath)
if err != nil {
return errors.InternalError(err)
}
defer file.Close()
startRecords, _ := repCtx.CountRecords()
reader, err := GetCSVReader(file)
if err != nil {
return err
}
fields, err := reader.Read()
if err != nil {
return errors.InternalError(err)
}
log.Printf("Fields determined to be %s", fields)
// Create a new point batch
bp, err := repCtx.NewBatchPoints()
if err != nil {
return err
}
var lineNum int64
var wrote int64
startTime := time.Now()
var elapsed time.Duration
billingPeriodStr := job.manifest.BillingPeriodString()
for {
if run != nil && !*run {
break
}
line, err := reader.Read()
if err == io.EOF {
break
}
if err != nil {
return errors.InternalError(err)
}
lineNum++
lineItem, err := parser.ParseLine(fields, line)
if err != nil {
return err
}
if lineItem == nil {
log.Printf("Report %s (%s) line %d skipped", repCtx.ReportID, reportPath, lineNum)
continue
}
// Add the line number as a nanosecond offset to ensure data points are not deduped by InfluxDB
lineItem.Timestamp = lineItem.Timestamp.Add(time.Duration(lineNum))
// Add billing bucket and report path to the line item
lineItem.Tags[parser.ColumnBillingBucket.ColumnName] = job.bucket.Bucketname
lineItem.Tags[parser.ColumnBillingReportPath.ColumnName] = job.bucket.ReportPath
lineItem.Tags[parser.ColumnBillingPeriod.ColumnName] = billingPeriodStr
pt, err := repCtx.NewPoint(lineItem.Tags, lineItem.Fields, lineItem.Timestamp)
if err != nil {
return err
}
bp.AddPoint(pt)
wrote++
if lineNum%int64(claudia.IngestdBatchInterval) == 0 {
// Write the batch and print some progress
elapsed = time.Now().Sub(startTime)
log.Printf("Report %s (%s) processed %d lines (%d points written) %f/s", repCtx.ReportID, reportPath, lineNum, wrote, float64(lineNum)/elapsed.Seconds())
err = repCtx.CostDB.Write(bp)
if err != nil {
return err
}
bp, err = repCtx.NewBatchPoints()
if err != nil {
return err
}
}
}
err = repCtx.CostDB.Write(bp)
if err != nil {
return err
}
elapsed = time.Now().Sub(startTime)
currRecords, _ := repCtx.CountRecords()
created := currRecords - startRecords
cardinality, _ := repCtx.SeriesCardinality()
log.Printf("Report %s (%s) completed import in %fs (%f/s) (%d items) (wrote: %d, skipped: %d) New Records: %d (duplicated: %d) Cardinality: %d",
repCtx.ReportID, reportPath, elapsed.Seconds(), float64(wrote)/elapsed.Seconds(), lineNum, wrote, lineNum-wrote, created, wrote-created, cardinality)
return nil
}