From dbec878cdb7a47f27ef39c8dbfda1ffdef542645 Mon Sep 17 00:00:00 2001 From: AdamDrewsTR Date: Wed, 8 Apr 2026 11:31:26 -0500 Subject: [PATCH] Add compression options for ZIP archives and optimize memory usage during writes --- excelize.go | 61 ++++++++++++++- excelize_test.go | 2 +- file.go | 182 ++++++++++++++++++++++++++++++++++++++++--- file_test.go | 144 +++++++++++++++++++++++++++++++++- go.mod | 1 + go.sum | 2 + lib.go | 3 +- lib_test.go | 2 +- stream.go | 31 +++++++- stream_bench_test.go | 65 ++++++++++++++++ 10 files changed, 476 insertions(+), 17 deletions(-) diff --git a/excelize.go b/excelize.go index 5b4569e971..df34c7baf4 100644 --- a/excelize.go +++ b/excelize.go @@ -13,7 +13,6 @@ package excelize import ( - "archive/zip" "bytes" "encoding/xml" "io" @@ -24,6 +23,8 @@ import ( "strings" "sync" + "github.com/klauspost/compress/flate" + "github.com/klauspost/compress/zip" "golang.org/x/net/html/charset" ) @@ -73,6 +74,28 @@ type ZipWriter interface { Close() error } +// Compression defines the compression level for the ZIP archive used to store +// the spreadsheet. +type Compression int + +const ( + // CompressionDefault uses standard deflate compression (the default). + // This produces the smallest files but uses more CPU and memory during + // Save/WriteTo. + CompressionDefault Compression = iota + // CompressionNone disables ZIP compression entirely. The spreadsheet + // parts are stored uncompressed. This significantly reduces CPU time and + // memory usage during Save/WriteTo at the cost of larger output files + // (typically 5-10× larger). Recommended for memory-constrained + // environments (e.g. AWS Lambda) or when the output will be compressed + // by another layer (e.g. gzip transport, S3 transfer acceleration). + CompressionNone + // CompressionBestSpeed uses the fastest deflate compression level. This + // is a good middle ground: roughly 2× faster than default compression + // with only moderately larger output. 
+ CompressionBestSpeed +) + // Options define the options for opening and reading the spreadsheet. // // MaxCalcIterations specifies the maximum iterations for iterative @@ -124,7 +147,10 @@ type Options struct { // StreamingChunkSize is the number of bytes of XML data accumulated in // memory before a streaming worksheet spills to a temp file. A smaller // value reduces peak memory usage at the cost of more disk I/O. Zero - // means use the default (StreamChunkSize = 16 MiB). + // means use the default (StreamChunkSize = 16 MiB). Set to -1 to + // disable temp files entirely (all data stays in memory); this + // eliminates disk I/O overhead and can be significantly faster when + // sufficient memory is available. StreamingChunkSize int // StreamingBufSize is the size of the bufio.Writer used for all disk // writes after the StreamingChunkSize threshold is crossed. Larger values @@ -132,6 +158,12 @@ type Options struct { // measured inflection point on NVMe and HDD alike is 128 KiB. Zero means // use the default (defaultBioSize = 128 KiB). StreamingBufSize int + // Compression specifies the compression level for the output ZIP + // archive. The default (CompressionDefault) uses standard deflate. Use + // CompressionNone in memory-constrained environments like AWS Lambda to + // eliminate compressor overhead, or CompressionBestSpeed for a balance + // of speed and size. + Compression Compression } // OpenFile take the name of a spreadsheet file and returns a populated @@ -257,6 +289,31 @@ func (f *File) CharsetTranscoder(fn func(charset string, input io.Reader) (rdr i // SetZipWriter set user defined zip writer function for saving the workbook. func (f *File) SetZipWriter(fn func(io.Writer) ZipWriter) *File { f.ZipWriter = fn; return f } +// configureZipCompression applies the Compression option to the zip writer. +// It is a no-op for the default compression level or for custom ZipWriter +// implementations that are not *zip.Writer. 
+func (f *File) configureZipCompression(zw ZipWriter) { + if f.options == nil || f.options.Compression == CompressionDefault { + return + } + zipW, ok := zw.(*zip.Writer) + if !ok { + return + } + var level int + switch f.options.Compression { + case CompressionNone: + level = flate.NoCompression + case CompressionBestSpeed: + level = flate.BestSpeed + default: + return + } + zipW.RegisterCompressor(zip.Deflate, func(out io.Writer) (io.WriteCloser, error) { + return flate.NewWriter(out, level) + }) +} + // Creates new XML decoder with charset reader. func (f *File) xmlNewDecoder(rdr io.Reader) (ret *xml.Decoder) { ret = xml.NewDecoder(rdr) diff --git a/excelize_test.go b/excelize_test.go index 13f309d413..3d13a78bb5 100644 --- a/excelize_test.go +++ b/excelize_test.go @@ -1,7 +1,6 @@ package excelize import ( - "archive/zip" "bytes" "compress/gzip" "encoding/xml" @@ -20,6 +19,7 @@ import ( "testing" "time" + "github.com/klauspost/compress/zip" "github.com/stretchr/testify/assert" "golang.org/x/net/html/charset" ) diff --git a/file.go b/file.go index 06282fd620..7f40e5bcd6 100644 --- a/file.go +++ b/file.go @@ -113,7 +113,10 @@ func (f *File) Write(w io.Writer, opts ...Options) error { return err } -// WriteTo implements io.WriterTo to write the file. +// WriteTo implements io.WriterTo to write the file. When no password +// encryption is required, the ZIP archive is streamed directly to w without +// buffering the entire compressed output in memory. When password encryption +// is required, a temporary file is used to reduce memory usage. func (f *File) WriteTo(w io.Writer, opts ...Options) (int64, error) { for i := range opts { f.options = &opts[i] @@ -127,18 +130,100 @@ func (f *File) WriteTo(w io.Writer, opts ...Options) (int64, error) { return 0, err } } - buf, err := f.WriteToBuffer() + // Password encryption requires post-processing the entire output. + // Use a temporary file to reduce peak memory usage. 
+ if f.options != nil && f.options.Password != "" { + return f.writeToWithEncryption(w) + } + // Stream the ZIP directly to w. This avoids holding the full compressed + // archive in a bytes.Buffer, which can be 50-200 MB+ for large reports. + cw := &countWriter{w: w} + zw := f.ZipWriter(cw) + f.configureZipCompression(zw) + if err := f.writeToZip(zw); err != nil { + _ = zw.Close() + return cw.n, err + } + return cw.n, zw.Close() +} + +// writeToWithEncryption writes an encrypted file, staging the plain ZIP in a +// temporary file so ZIP64 header fixups happen on disk. The finished ZIP is +// still read fully into memory, because Encrypt operates on a byte slice. +func (f *File) writeToWithEncryption(w io.Writer) (int64, error) { + var tmpDir string + if f.options != nil { + tmpDir = f.options.TmpDir + } + // Create temporary file for the unencrypted ZIP + tmpFile, err := os.CreateTemp(tmpDir, "excelize-encrypt-*.zip") + if err != nil { + return 0, err + } + tmpPath := tmpFile.Name() + defer func() { + _ = tmpFile.Close() + _ = os.Remove(tmpPath) + }() + + // Write ZIP to temp file + f.zip64Entries = nil // Reset before writing + zw := f.ZipWriter(tmpFile) + f.configureZipCompression(zw) + if err := f.writeToZip(zw); err != nil { + _ = zw.Close() + return 0, err + } + if err := zw.Close(); err != nil { + return 0, err + } + + // If ZIP64 entries exist, we need to fix up the local file headers + if len(f.zip64Entries) > 0 { + if err := f.writeZip64LFHFile(tmpFile); err != nil { + return 0, err + } + } + + // Read the whole ZIP back into memory and encrypt it + if _, err := tmpFile.Seek(0, 0); err != nil { + return 0, err + } + rawZip, err := io.ReadAll(tmpFile) + if err != nil { + return 0, err + } + + // Encrypt and write to output + encrypted, err := Encrypt(rawZip, f.options) if err != nil { return 0, err } - return buf.WriteTo(w) + n, err := w.Write(encrypted) + return int64(n), err +} + +// countWriter wraps an io.Writer and counts bytes written.
+type countWriter struct { + w io.Writer + n int64 +} + +func (cw *countWriter) Write(p []byte) (int, error) { + n, err := cw.w.Write(p) + cw.n += int64(n) + return n, err } // WriteToBuffer provides a function to get bytes.Buffer from the saved file, // and it allocates space in memory. Be careful when the file size is large. +// Consider using WriteTo with a file for large password-protected files to +// reduce memory usage. func (f *File) WriteToBuffer() (*bytes.Buffer, error) { buf := new(bytes.Buffer) + f.zip64Entries = nil // Reset before writing zw := f.ZipWriter(buf) + f.configureZipCompression(zw) if err := f.writeToZip(zw); err != nil { _ = zw.Close() @@ -147,7 +232,11 @@ func (f *File) WriteToBuffer() (*bytes.Buffer, error) { if err := zw.Close(); err != nil { return buf, err } - err := f.writeZip64LFH(buf) + // Only perform ZIP64 fixup if we actually have ZIP64 entries + var err error + if len(f.zip64Entries) > 0 { + err = f.writeZip64LFH(buf) + } if f.options != nil && f.options.Password != "" { b, err := Encrypt(buf.Bytes(), f.options) if err != nil { @@ -180,13 +269,9 @@ func (f *File) writeToZip(zw ZipWriter) error { if err != nil { return err } - var from io.Reader - if from, err = stream.rawData.Reader(); err != nil { - _ = stream.rawData.Close() - return err - } - written, err := io.Copy(fi, from) + written, err := stream.rawData.CopyTo(fi) if err != nil { + _ = stream.rawData.Close() return err } if written > math.MaxUint32 { @@ -272,3 +357,80 @@ func (f *File) writeZip64LFH(buf *bytes.Buffer) error { } return nil } + +// writeZip64LFHFile performs ZIP64 local file header fixup on a file. +// This is used when encrypting to avoid loading the entire file into memory. 
+func (f *File) writeZip64LFHFile(file *os.File) error {
+	if len(f.zip64Entries) == 0 {
+		return nil
+	}
+	// Seek to start of file
+	if _, err := file.Seek(0, 0); err != nil {
+		return err
+	}
+	// Read file info to get size
+	info, err := file.Stat()
+	if err != nil {
+		return err
+	}
+	fileSize := info.Size()
+
+	// Process file in chunks to avoid loading entire file into memory
+	const chunkSize = 1024 * 1024 // 1MB chunks
+	buf := make([]byte, chunkSize)
+	var offset int64
+
+	for offset < fileSize {
+		// Read chunk; ReadAt reports io.EOF on the final, short chunk
+		n, err := file.ReadAt(buf, offset)
+		if err != nil && err != io.EOF {
+			return err
+		}
+		if n == 0 {
+			break
+		}
+
+		// Search for local file header signatures ("PK\x03\x04") in this chunk
+		searchBuf := buf[:n]
+		searchOffset := 0
+		for searchOffset < n {
+			idx := bytes.Index(searchBuf[searchOffset:], []byte{0x50, 0x4b, 0x03, 0x04})
+			if idx == -1 {
+				break
+			}
+			idx += searchOffset
+			absoluteIdx := offset + int64(idx)
+
+			// Check if we have enough data for the fixed 30-byte header
+			if idx+30 > n {
+				// Header spans chunk boundary, will be caught in next iteration
+				break
+			}
+
+			filenameLen := int(binary.LittleEndian.Uint16(searchBuf[idx+26 : idx+28]))
+			if idx+30+filenameLen > n {
+				// Filename spans chunk boundary, will be caught in next iteration
+				break
+			}
+
+			filename := string(searchBuf[idx+30 : idx+30+filenameLen])
+			if inStrSlice(f.zip64Entries, filename, true) != -1 {
+				// Set "version needed to extract" (2 bytes at header offset 4) to 45
+				versionBuf := make([]byte, 2)
+				binary.LittleEndian.PutUint16(versionBuf, 45)
+				if _, err := file.WriteAt(versionBuf, absoluteIdx+4); err != nil {
+					return err
+				}
+			}
+			searchOffset = idx + 1
+		}
+
+		offset += int64(n)
+		// Overlap by a full header (30 bytes + 65535-byte max name) so none is cut
+		if offset < fileSize {
+			offset -= 30 + 0xffff
+		}
+	}
+
+	return nil
+}
diff --git a/file_test.go b/file_test.go
index 5fea1cb0b0..19308fbbe2 100644
--- a/file_test.go
+++ b/file_test.go
@@ -1,7 +1,6 @@
 package excelize
 
 import (
-	"archive/zip"
 	"bufio"
 	"bytes"
 	"encoding/binary"
@@ -16,6 +15,7 @@
import ( "sync" "testing" + "github.com/klauspost/compress/zip" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -274,3 +274,145 @@ func TestRemoveTempFiles(t *testing.T) { assert.NoError(t, os.Remove(tmpName)) } } + +func TestStreamingWriteTo(t *testing.T) { + // Verify that WriteTo streams directly when no password is set, + // producing a valid XLSX file. + f := NewFile() + sw, err := f.NewStreamWriter("Sheet1") + assert.NoError(t, err) + for row := 1; row <= 100; row++ { + rowData := make([]interface{}, 10) + for col := range 10 { + rowData[col] = "test" + } + cell, _ := CoordinatesToCellName(1, row) + assert.NoError(t, sw.SetRow(cell, rowData)) + } + assert.NoError(t, sw.Flush()) + // WriteTo a buffer (exercises the streaming path, no password) + var buf bytes.Buffer + _, err = f.WriteTo(&buf) + assert.NoError(t, err) + assert.Greater(t, buf.Len(), 0) + assert.NoError(t, f.Close()) + // Verify the output is a valid ZIP/XLSX by reading it back + f2, err := OpenReader(bytes.NewReader(buf.Bytes())) + assert.NoError(t, err) + val, err := f2.GetCellValue("Sheet1", "A1") + assert.NoError(t, err) + assert.Equal(t, "test", val) + assert.NoError(t, f2.Close()) +} + +func TestCompressionOption(t *testing.T) { + // Generate a file with known content, then save with different + // compression levels and compare sizes. 
+ makeFile := func() *File { + f := NewFile() + sw, _ := f.NewStreamWriter("Sheet1") + for row := 1; row <= 500; row++ { + rowData := make([]interface{}, 20) + for col := range 20 { + rowData[col] = "Hello World" + } + cell, _ := CoordinatesToCellName(1, row) + _ = sw.SetRow(cell, rowData) + } + _ = sw.Flush() + return f + } + + // Default compression + f1 := makeFile() + var buf1 bytes.Buffer + _, err := f1.WriteTo(&buf1) + assert.NoError(t, err) + assert.NoError(t, f1.Close()) + + // No compression + f2 := makeFile() + f2.options.Compression = CompressionNone + var buf2 bytes.Buffer + _, err = f2.WriteTo(&buf2) + assert.NoError(t, err) + assert.NoError(t, f2.Close()) + + // Best speed + f3 := makeFile() + f3.options.Compression = CompressionBestSpeed + var buf3 bytes.Buffer + _, err = f3.WriteTo(&buf3) + assert.NoError(t, err) + assert.NoError(t, f3.Close()) + + // No compression should produce the largest file + assert.Greater(t, buf2.Len(), buf1.Len(), "uncompressed should be larger than default") + assert.Greater(t, buf2.Len(), buf3.Len(), "uncompressed should be larger than best-speed") + + // All should be valid XLSX files + for _, buf := range []*bytes.Buffer{&buf1, &buf2, &buf3} { + f, err := OpenReader(bytes.NewReader(buf.Bytes())) + assert.NoError(t, err) + val, err := f.GetCellValue("Sheet1", "A1") + assert.NoError(t, err) + assert.Equal(t, "Hello World", val) + assert.NoError(t, f.Close()) + } +} + +func TestWriteToBufferCompression(t *testing.T) { + // Ensure WriteToBuffer also respects the Compression option + f := NewFile(Options{Compression: CompressionNone}) + sw, err := f.NewStreamWriter("Sheet1") + assert.NoError(t, err) + for row := 1; row <= 100; row++ { + cell, _ := CoordinatesToCellName(1, row) + _ = sw.SetRow(cell, []interface{}{"data"}) + } + _ = sw.Flush() + buf, err := f.WriteToBuffer() + assert.NoError(t, err) + assert.Greater(t, buf.Len(), 0) + assert.NoError(t, f.Close()) +} + +func TestWriteToWithPassword(t *testing.T) { + // Test 
that WriteTo with password uses temp file approach + f := NewFile() + assert.NoError(t, f.SetCellValue("Sheet1", "A1", "Encrypted Data")) + + // Write with password encryption to a buffer + var buf bytes.Buffer + n, err := f.WriteTo(&buf, Options{Password: "testpass"}) + assert.NoError(t, err) + assert.Greater(t, n, int64(0)) + assert.Greater(t, buf.Len(), 0) + + // Verify it can be opened with the password + encrypted := buf.Bytes() + f2, err := OpenReader(bytes.NewReader(encrypted), Options{Password: "testpass"}) + assert.NoError(t, err) + val, err := f2.GetCellValue("Sheet1", "A1") + assert.NoError(t, err) + assert.Equal(t, "Encrypted Data", val) + assert.NoError(t, f2.Close()) + assert.NoError(t, f.Close()) +} + +func TestWriteToWithPasswordAndCompression(t *testing.T) { + // Test that compression settings work with password encryption + f := NewFile(Options{Compression: CompressionBestSpeed}) + assert.NoError(t, f.SetCellValue("Sheet1", "A1", "Test")) + + var buf bytes.Buffer + _, err := f.WriteTo(&buf, Options{Password: "pass", Compression: CompressionBestSpeed}) + assert.NoError(t, err) + assert.Greater(t, buf.Len(), 0) + + // Verify it opens correctly + f2, err := OpenReader(bytes.NewReader(buf.Bytes()), Options{Password: "pass"}) + assert.NoError(t, err) + assert.NoError(t, f2.Close()) + assert.NoError(t, f.Close()) +} diff --git a/go.mod b/go.mod index 4601d7bc9d..6cdf6a3e2f 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/xuri/excelize/v2 go 1.25.0 require ( + github.com/klauspost/compress v1.18.5 github.com/richardlehane/mscfb v1.0.6 github.com/stretchr/testify v1.11.1 github.com/tiendc/go-deepcopy v1.7.2 diff --git a/go.sum b/go.sum index c011ad0b26..2c0e37cff0 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/klauspost/compress v1.18.5 
h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= +github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/richardlehane/mscfb v1.0.6 h1:eN3bvvZCp00bs7Zf52bxNwAx5lJDBK1tCuH19qq5aC8= diff --git a/lib.go b/lib.go index 31199ad973..545bd9426a 100644 --- a/lib.go +++ b/lib.go @@ -12,7 +12,6 @@ package excelize import ( - "archive/zip" "bytes" "container/list" "encoding/xml" @@ -27,6 +26,8 @@ import ( "strconv" "strings" "unicode/utf16" + + "github.com/klauspost/compress/zip" ) // ReadZipReader extract spreadsheet with given options. diff --git a/lib_test.go b/lib_test.go index 225e24b668..7df1bbd2c9 100644 --- a/lib_test.go +++ b/lib_test.go @@ -1,7 +1,6 @@ package excelize import ( - "archive/zip" "bytes" "encoding/xml" "fmt" @@ -13,6 +12,7 @@ import ( "sync" "testing" + "github.com/klauspost/compress/zip" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) diff --git a/stream.go b/stream.go index 87d329e2a8..b39230726c 100644 --- a/stream.go +++ b/stream.go @@ -17,6 +17,7 @@ import ( "encoding/xml" "fmt" "io" + "math" "os" "reflect" "strconv" @@ -122,7 +123,10 @@ func (f *File) NewStreamWriter(sheet string) (*StreamWriter, error) { return nil, ErrSheetNotExist{sheet} } chunkSize := f.options.StreamingChunkSize - if chunkSize <= 0 { + switch { + case chunkSize < 0: + chunkSize = math.MaxInt // never spill to disk + case chunkSize == 0: chunkSize = StreamChunkSize } bufSize := f.options.StreamingBufSize @@ -1113,6 +1117,31 @@ func (bw *bufferedWriter) Bytes() []byte { return bw.buf.Bytes() } +// CopyTo efficiently copies all buffered data to w. For in-memory buffers +// this is a simple WriteTo. For temp files this uses a large read buffer to +// minimize syscalls (one read per bioSize bytes instead of per 32 KB). 
+func (bw *bufferedWriter) CopyTo(w io.Writer) (int64, error) {
+	if bw.tmp == nil {
+		return io.Copy(w, bytes.NewReader(bw.buf.Bytes()))
+	}
+	if err := bw.Flush(); err != nil {
+		return 0, err
+	}
+	if _, err := bw.tmp.Seek(0, 0); err != nil {
+		return 0, err
+	}
+	// Copy through one large buffer to batch read syscalls (32 KB default
+	// → 256 KB, or bioSize if larger). The source is wrapped to hide
+	// *os.File's WriterTo (Go 1.22+): io.Copy and bufio.Reader both defer
+	// to that method, which would bypass the buffer and read in 32 KB steps.
+	readBufSize := 256 * 1024
+	if bw.bioSize > readBufSize {
+		readBufSize = bw.bioSize
+	}
+	buf := make([]byte, readBufSize)
+	return io.CopyBuffer(w, struct{ io.Reader }{bw.tmp}, buf)
+}
+
 // Reader provides read-access to the underlying buffer/file.
 func (bw *bufferedWriter) Reader() (io.Reader, error) {
 	if bw.tmp == nil {
diff --git a/stream_bench_test.go b/stream_bench_test.go
index 4f1f08bf44..836ce230f8 100644
--- a/stream_bench_test.go
+++ b/stream_bench_test.go
@@ -253,3 +253,68 @@ func BenchmarkStringCellSpecial(b *testing.B) {
 		_ = file.Close()
 	}
 }
+
+// writeExcelBenchWithOpts is like writeExcelBench but accepts Options.
+func writeExcelBenchWithOpts(data [][]string, out io.Writer, opts Options) error {
+	file := NewFile(opts)
+	defer func() { _ = file.Close() }() // best-effort cleanup on every return path
+	if len(data) == 0 {
+		return nil
+	}
+	sw, err := file.NewStreamWriter("Sheet1")
+	if err != nil {
+		return err
+	}
+	lineInterface := make([]interface{}, len(data[0]))
+	for excelLineNum, line := range data {
+		lineInterface = lineInterface[:0]
+		for x := range line {
+			lineInterface = append(lineInterface, line[x])
+		}
+		cell, _ := CoordinatesToCellName(1, excelLineNum+1)
+		if err = sw.SetRow(cell, lineInterface); err != nil {
+			return err
+		}
+	}
+	if err = sw.Flush(); err != nil {
+		return err
+	}
+	_, err = file.WriteTo(out)
+	return err
+}
+
+func BenchmarkCompressionLevels(b *testing.B) {
+	const rows, cols = 50000, 20
+	data := make([][]string, rows)
+	for x := range data {
+		data[x] = make([]string, cols)
+		for y := range data[x] {
+			data[x][y] = "test value " + strconv.Itoa(x*cols+y)
+		}
+	}
+
+	for _, tc := range []struct {
+		name  string
+		comp  Compression
+		chunk int // StreamingChunkSize; 0 = default
+	}{
+		{"Default", CompressionDefault, 0},
+		{"BestSpeed", CompressionBestSpeed, 0},
+		{"None", CompressionNone, 0},
+		{"Default/InMemory", CompressionDefault, -1},
+		{"BestSpeed/InMemory", CompressionBestSpeed, -1},
+		{"None/InMemory", CompressionNone, -1},
+	} {
+		b.Run(tc.name, func(b *testing.B) {
+			b.ReportAllocs()
+			for n := 0; n < b.N; n++ {
+				b.StopTimer()
+				var buf bytes.Buffer // fresh output sink, set up outside the timed region
+				b.StartTimer()
+				if err := writeExcelBenchWithOpts(data, &buf, Options{Compression: tc.comp, StreamingChunkSize: tc.chunk}); err != nil {
+					b.Fatal(err)
+				}
+			}
+		})
+	}
+}