-
-
Notifications
You must be signed in to change notification settings - Fork 133
/
Archive.pm
364 lines (278 loc) · 11.9 KB
/
Archive.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
package LANraragi::Utils::Archive;
use strict;
use warnings;
use utf8;
use feature qw(say);
use feature qw(signatures);
no warnings 'experimental::signatures';
use Time::HiRes qw(gettimeofday);
use File::Basename;
use File::Path qw(remove_tree make_path);
use File::Find qw(finddepth);
use File::Copy qw(move);
use Encode;
use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
use Redis;
use Cwd;
use Data::Dumper;
use Image::Magick;
use Archive::Libarchive qw( ARCHIVE_OK );
use Archive::Libarchive::Extract;
use Archive::Libarchive::Peek;
use File::Temp qw(tempdir);
use LANraragi::Utils::TempFolder qw(get_temp);
use LANraragi::Utils::Logging qw(get_logger);
use LANraragi::Utils::Generic qw(is_image shasum);
# Utilitary functions for handling Archives.
# Relies on Libarchive, ImageMagick and GhostScript for PDFs.
use Exporter 'import';
our @EXPORT_OK =
qw(is_file_in_archive extract_file_from_archive extract_single_file extract_archive extract_thumbnail generate_thumbnail get_filelist);
sub is_pdf {
my ( $filename, $dirs, $suffix ) = fileparse( $_[0], qr/\.[^.]*/ );
return ( $suffix eq ".pdf" );
}
# use ImageMagick to make a thumbnail, height = 500px (view in index is 280px tall)
# If use_hq is true, the scale algorithm will be used instead of sample.
# If use_jxl is true, JPEG XL will be used instead of JPEG.
sub generate_thumbnail ( $orig_path, $thumb_path, $use_hq, $use_jxl ) {
my $img = Image::Magick->new;
my $format = $use_jxl ? 'jxl' : 'jpg';
# For JPEG, the size option (or jpeg:size option) provides a hint to the JPEG decoder
# that it can reduce the size on-the-fly during decoding. This saves memory because
# it never has to allocate memory for the full-sized image
if ($format eq 'jpg') {
$img->Set(option => 'jpeg:size=500x');
}
# If the image is a gif, only take the first frame
if ($orig_path =~ /\.gif$/) {
$img->Read($orig_path . "[0]");
} else {
$img->Read($orig_path);
}
# The "-scale" resize operator is a simplified, faster form of the resize command.
if ($use_hq) {
$img->Scale(geometry => '500x1000');
} else { # Sample is very fast due to not applying filters.
$img->Sample(geometry => '500x1000');
}
$img->Set(quality => "50", magick => $format);
$img->Write($thumb_path);
undef $img;
}
# Extract the given archive to the given path.
# This sub won't re-extract files already present in the destination unless force = 1.
sub extract_archive ( $destination, $to_extract, $force_extract ) {
my $logger = get_logger( "Archive", "lanraragi" );
$logger->debug("Fully extracting archive $to_extract");
# PDFs are handled by Ghostscript (alas)
if ( is_pdf($to_extract) ) {
return extract_pdf( $destination, $to_extract );
}
# Prepare libarchive with a callback to skip over existing files (unless force=1)
my $ae = Archive::Libarchive::Extract->new(
filename => $to_extract,
entry => sub {
my $e = shift;
if ($force_extract) { return 1; }
my $filename = $e->pathname;
if ( -e "$destination/$filename" ) {
$logger->debug("$filename already exists in $destination");
return 0;
}
$logger->debug("Extracting $filename");
# Pre-emptively create the file to signal we're working on it
open( my $fh, ">", "$destination/$filename" )
or
$logger->error("Couldn't create placeholder file $destination/$filename (might be a folder?), moving on nonetheless");
close $fh;
return 1;
}
);
# Extract to $destination. This method throws if extraction fails.
$ae->extract( to => $destination );
# Get extraction folder
my $result_dir = $ae->to;
my $cwd = getcwd();
# chdir back to the base cwd in case finddepth died midway
chdir $cwd;
# Return the directory we extracted the files to.
return $result_dir;
}
sub extract_pdf ( $destination, $to_extract ) {
my $logger = get_logger( "Archive", "lanraragi" );
# Raw Perl strings won't necessarily work in a terminal command, so we must decode the filepath here
$logger->debug("Decoding PDF filepath $to_extract before sending it to GhostScript");
eval {
# Try a guess to regular japanese encodings first
$to_extract = decode( "Guess", $to_extract );
};
# Fallback to utf8
$to_extract = decode_utf8($to_extract) if $@;
make_path($destination);
my $gscmd = "gs -dNOPAUSE -sDEVICE=jpeg -r200 -o '$destination/\%d.jpg' '$to_extract'";
$logger->debug("Sending PDF $to_extract to GhostScript...");
$logger->debug($gscmd);
`$gscmd`;
return $destination;
}
# Extracts a thumbnail from the specified archive ID and page. Returns the path to the thumbnail.
# Non-cover thumbnails land in a folder named after the ID. Specify page=0 if you want the cover.
# Thumbnails will be generated at low quality by default unless you specify use_hq=1.
sub extract_thumbnail ( $thumbdir, $id, $page, $use_hq ) {
my $logger = get_logger( "Archive", "lanraragi" );
# JPG is used for thumbnails by default
my $use_jxl = LANraragi::Model::Config->get_jxlthumbpages;
my $format = $use_jxl ? 'jxl' : 'jpg';
# Another subfolder with the first two characters of the id is used for FS optimization.
my $subfolder = substr( $id, 0, 2 );
my $thumbname = "$thumbdir/$subfolder/$id.$format";
make_path("$thumbdir/$subfolder");
my $redis = LANraragi::Model::Config->get_redis;
my $file = $redis->hget( $id, "file" );
my $temppath = tempdir();
# Get first image from archive using filelist
my ( $images, $sizes ) = get_filelist($file);
# Dereference arrays
my @filelist = @$images;
my $requested_image = $filelist[ $page > 0 ? $page - 1 : 0 ];
die "Requested image not found: $requested_image" unless $requested_image;
$logger->debug("Extracting thumbnail for $id page $page from $requested_image");
# Extract first image to temp dir
my $arcimg = extract_single_file( $file, $requested_image, $temppath );
if ( $page - 1 > 0 ) {
# Non-cover thumbnails land in a dedicated folder.
$thumbname = "$thumbdir/$subfolder/$id/$page.$format";
make_path("$thumbdir/$subfolder/$id");
} else {
# For cover thumbnails, grab the SHA-1 hash for tag research.
# That way, no need to repeat a costly extraction later.
my $shasum = shasum( $arcimg, 1 );
$logger->debug("Setting thumbnail hash: $shasum");
$redis->hset( $id, "thumbhash", $shasum );
$redis->quit();
}
# Thumbnail generation
generate_thumbnail( $arcimg, $thumbname, $use_hq, $use_jxl );
# Clean up safe folder
remove_tree($temppath);
return $thumbname;
}
#magical sort function used below
sub expand {
my $file = shift;
$file =~ s{(\d+)}{sprintf "%04d", $1}eg;
return lc($file);
}
# Returns a list of all the files contained in the given archive.
sub get_filelist($archive) {
my @files = ();
my @sizes = ();
if ( is_pdf($archive) ) {
# For pdfs, extraction returns images from 1.jpg to x.jpg, where x is the pdf pagecount.
# Using -dNOSAFER or --permit-file-read is required since GS 9.50, see https://github.com/doxygen/doxygen/issues/7290
my $pages = `gs -q -dNOSAFER -c "($archive) (r) file runpdfbegin pdfpagecount = quit"`;
for my $num ( 1 .. $pages ) {
push @files, "$num.jpg";
push @sizes, 0;
}
} else {
my $r = Archive::Libarchive::ArchiveRead->new;
$r->support_filter_all;
$r->support_format_all;
my $ret = $r->open_filename( $archive, 10240 );
die unless ( $ret == ARCHIVE_OK );
my $e = Archive::Libarchive::Entry->new;
while ( $r->next_header($e) == ARCHIVE_OK ) {
my $filesize = ( $e->size_is_set eq 64 ) ? $e->size : 0;
my $filename = $e->pathname;
if ( is_image($filename) ) {
push @files, $filename;
push @sizes, $filesize;
}
$r->read_data_skip;
}
}
@files = sort { &expand($a) cmp &expand($b) } @files;
# Move front cover pages to the start of a gallery, and miscellaneous pages such as translator credits to the end.
my @cover_pages = grep { /^(?!.*(back|end|rear|recover|discover)).*cover.*/i } @files;
my @credit_pages = grep { /^end_card_save_file|notes\.[^\.]*$|note\.[^\.]*$|^artist_info|credit|999nhnl\./i } @files;
# Get all the leftover pages
my %credit_hash = map { $_ => 1 } @credit_pages;
my %cover_hash = map { $_ => 1 } @cover_pages;
my @other_pages = grep { !$credit_hash{$_} && !$cover_hash{$_} } @files;
@files = ( @cover_pages, @other_pages, @credit_pages );
# Return files and sizes in a hashref
return ( \@files, \@sizes );
}
# Uses libarchive::peek to figure out if $archive contains $file.
# Returns the exact in-archive path of the file if it exists, undef otherwise.
sub is_file_in_archive ( $archive, $wantedname ) {
my $logger = get_logger( "Archive", "lanraragi" );
if ( is_pdf($archive) ) {
$logger->debug("$archive is a pdf, no sense looking for specific files");
return;
}
$logger->debug("Iterating files of archive $archive, looking for '$wantedname'");
$Data::Dumper::Useqq = 1;
my $peek = Archive::Libarchive::Peek->new( filename => $archive );
my $found;
my @files = $peek->files;
for my $file (@files) {
$logger->debug( "Found file " . Dumper($file) );
my ( $name, $path, $suffix ) = fileparse( $file, qr/\.[^.]*/ );
# If the end of the file contains $wantedname we're good
if ( "$name$suffix" =~ /$wantedname$/ ) {
$logger->debug("OK!");
$found = $file;
last;
}
}
return $found;
}
# Extract $file from $archive to $destination and returns the filesystem path it's extracted to.
# If the file doesn't exist in the archive, this will still create a file, but empty.
sub extract_single_file ( $archive, $filepath, $destination ) {
my $logger = get_logger( "Archive", "lanraragi" );
my $outfile = "$destination/$filepath";
$logger->debug("Output for single file extraction: $outfile");
# Remove file from $outfile and hand the full directory to make_path
my ( $name, $path, $suffix ) = fileparse( $outfile, qr/\.[^.]*/ );
make_path($path);
if ( is_pdf($archive) ) {
# For pdfs the filenames are always x.jpg, so we pull the page number from that
my $page = $filepath;
$page =~ s/^(\d+).jpg$/$1/;
my $gscmd = "gs -dNOPAUSE -dFirstPage=$page -dLastPage=$page -sDEVICE=jpeg -r200 -o '$outfile' '$archive'";
$logger->debug("Extracting page $filepath from PDF $archive");
$logger->debug($gscmd);
`$gscmd`;
} else {
my $contents = "";
my $peek = Archive::Libarchive::Peek->new( filename => $archive );
my @files = $peek->files;
for my $name (@files) {
my $decoded_name = LANraragi::Utils::Database::redis_decode($name);
# This sub can receive either encoded or raw filenames, so we have to test for both.
if ( $decoded_name eq $filepath || $name eq $filepath ) {
$logger->debug("Found file $filepath in archive $archive");
$contents = $peek->file($name);
last;
}
}
open( my $fh, '>', $outfile )
or die "Could not open file '$outfile' $!";
print $fh $contents;
close $fh;
}
return $outfile;
}
# Variant for plugins.
# Extracts the file to a folder in /temp/plugin.
sub extract_file_from_archive ( $archive, $filename ) {
my $path = get_temp . "/plugin";
mkdir $path;
my $tmp = tempdir( DIR => $path, CLEANUP => 1 );
return extract_single_file( $archive, $filename, $tmp );
}
1;