Skip to content

Honour pod encoding #213

Closed
wants to merge 9 commits into from
View
26 lib/MetaCPAN/Document/File.pm
@@ -42,23 +42,20 @@ sub _build_abstract {
my ( $documentation, $abstract );
my $section = MetaCPAN::Util::extract_section( $text, 'NAME' );
return undef unless ($section);
- $section =~ s/^=\w+.*$//mg;
- $section =~ s/X<.*?>//mg;
- if ( $section =~ /^\s*(\S+)((\h+-+\h+(.+))|(\r?\n\h*\r?\n\h*(.+)))?/ms ) {
+ $section =~ s/^=(?!encoding\b)\w+.*$//mg; # strip non-encoding commands
+ $section =~ s/^\h+//mg;
+ $section = MetaCPAN::Util::strip_pod( $section );
+ if ( $section =~ /^\s*(\S+)((\h+[\x{2013}\x{2014}-]+\h+(.+))|(\s+(.+)))?/ms ) {
chomp( $abstract = $4 || $6 ) if ( $4 || $6 );
- my $name = MetaCPAN::Util::strip_pod($1);
+ my $name = $1;
$documentation = $name if ( $name =~ /^[\w\.:\-_']+$/ );
}
if ($abstract) {
- $abstract =~ s/^=\w+.*$//xms;
- $abstract =~ s{\r?\n\h*\r?\n\h*.*$}{}xms;
- $abstract =~ s{\n}{ }gxms;
- $abstract =~ s{\s+$}{}gxms;
- $abstract =~ s{(\s)+}{$1}gxms;
- $abstract = MetaCPAN::Util::strip_pod($abstract);
+ $abstract =~ s{\n.*$}{}xms;
+ $abstract =~ s{\n}{}gxms;
}
if ($documentation) {
- $self->documentation( MetaCPAN::Util::strip_pod($documentation) );
+ $self->documentation( $documentation );
}
return $abstract;
}
@@ -128,13 +125,8 @@ sub _build_description {
my $section = MetaCPAN::Util::extract_section( ${ $self->content },
'DESCRIPTION' );
return undef unless ($section);
- my $parser = Pod::Text->new;
- my $text = "";
- $parser->output_string( \$text );
- $parser->parse_string_document("=pod\n\n$section");
+ my $text = MetaCPAN::Util::strip_pod( $section );
$text =~ s/\s+/ /g;
- $text =~ s/^\s+//;
- $text =~ s/\s+$//;
return $text;
}
View
49 lib/MetaCPAN/Util.pm
@@ -5,7 +5,7 @@ use warnings;
use Digest::SHA1;
use version;
use Try::Tiny;
-use Encode;
+use Pod::Simple::Text 3.23;
sub digest {
my $digest = Digest::SHA1::sha1_base64(join("\0", grep { defined } @_));
@@ -43,24 +43,48 @@ sub author_dir {
}
-# TODO: E<escape>
sub strip_pod {
my $pod = shift;
- $pod =~ s/L<([^\/]*?)\/([^\/]*?)>/$2 in $1/g;
- $pod =~ s/\w<(.*?)(\|.*?)?>/$1/g;
- return $pod;
+
+ # Was encoding explicitly declared or inferred by POD parser?
+ my $have_encoding = $pod =~ /^=encoding/m;
+
+ my $parser = Pod::Simple::Text->new();
+ my $text = "";
+ $parser->output_string( \$text );
+ $parser->no_whining( 1 );
+ {
+ local($Text::Wrap::columns) = 10_000;
+ $parser->parse_string_document("=pod\n\n$pod");
+ }
+ if($have_encoding and $text =~ /POD ERRORS.*unsupported encoding/s) {
+ $pod =~ s/^=encoding.*$//mg;
+ return strip_pod($pod);
+ }
+
+ # If encoding was not declared, replace "smart-quote" chars with ASCII
+ if(!$have_encoding) {
+ $text =~ tr/\x{91}\x{92}\x{93}\x{94}\x{96}\x{97}/''""\-\-/;
+ }
+
+ $text =~ s/\h+/ /g;
+ $text =~ s/^\s+//mg;
+ $text =~ s/\s+$//mg;
+
+ return $text;
}
sub extract_section {
my ( $pod, $section ) = @_;
- eval { $pod = Encode::decode_utf8($pod, Encode::FB_CROAK) };
@monken
MetaCPAN member
monken added a note Aug 25, 2012

The idea here was to try to utf8_decode $pod. Your patch will just assume no encoding if =encoding is missing?

@grantm
grantm added a note Aug 25, 2012

No, if the =encoding is missing, the POD parser will guess so we don't have to. Pod::Simple 3.21 or later will choose between UTF-8 or Latin-1 based on the presence or absence of a valid UTF-8 byte sequence. This is better than blindly assuming UTF-8 so this decode is no longer necessary.

Also the decode should really only have been applied if no =encoding was specified - not much point attempting to interpret the bytes as UTF-8 (and potentially corrupting them) if the source was explicitly telling us which encoding was used.

Also, calling decode_utf8 transforms a byte string into a character string. However Pod::Simple was only ever designed to accept byte strings - passing it a character string could cause it to crash. You could be forgiven for not knowing that since it was only explicitly documented in release 3.23 but it has been implicit in the design for ever.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ my $encoding = $pod =~ /^(=encoding.*?\n)/m ? "$1\n" : '';
return undef
unless ( $pod =~ /^=head1 $section\b(.*?)(^((\=head1)|(\=cut)))/msi
|| $pod =~ /^=head1 $section\b(.*)/msi );
my $out = $1;
$out =~ s/^\s*//g;
$out =~ s/\s*$//g;
- return $out;
+ $out =~ s/^=encoding.*$//m;
+ return $encoding . $out;
}
@@ -108,3 +132,14 @@ This function will digest the passed parameters to a 32 byte string and makes it
It consists of the characters A-Z, a-z, 0-9, - and _.
The digest is built using L<Digest::SHA1>.
+
+=head2 strip_pod
+
+Takes a string of POD source code (bytes) and returns a plain text rendering
+(which may include 'wide' characters). If the source POD declares an encoding,
+it will be honoured where possible.
+
+The returned text will use single newlines as paragraph separators and all
+whitespace will be collapsed.
+
+=cut
View
174 t/util.t
@@ -1,8 +1,10 @@
use Test::Most;
use strict;
use warnings;
+use utf8; # This file contains literal UTF8 strings
use MetaCPAN::Util;
use CPAN::Meta;
+use Pod::Text;
is( MetaCPAN::Util::numify_version(1), 1.000 );
is( MetaCPAN::Util::numify_version('010'), 10.000 );
@@ -19,22 +21,111 @@ lives_ok { is(version("V0.01"), 0.01) };
lives_ok { is(version('0.99_1'), '0.99001') };
lives_ok { is(version('0.99.01'), '0.99.01') };
-is(MetaCPAN::Util::strip_pod('hello L<link|http://www.google.com> foo'), 'hello link foo');
-is(MetaCPAN::Util::strip_pod('hello L<Module/section> foo'), 'hello section in Module foo');
-is(MetaCPAN::Util::strip_pod('for L<Dist::Zilla>'), 'for Dist::Zilla');
-is(MetaCPAN::Util::strip_pod('without a leading C<$>.'), 'without a leading $.');
+is(
+ MetaCPAN::Util::strip_pod('hello L<link|http://www.google.com> foo'),
+ 'hello link <http://www.google.com> foo',
+ 'link to URL'
+);
+is(
+ MetaCPAN::Util::strip_pod('hello L<Module/section> foo'),
+ 'hello "section" in Module foo',
+ 'link to Module/section'
+);
+is(
+ MetaCPAN::Util::strip_pod('for L<Dist::Zilla>'),
+ 'for Dist::Zilla',
+ 'link to Module'
+);
+is(
+ MetaCPAN::Util::strip_pod('without a leading C<$>.'),
+ 'without a leading $.',
+ 'code section'
+);
+is(
+ MetaCPAN::Util::strip_pod('B<bold> I<italics> C<code> F<file>'),
+ 'bold italics code file',
+ 'character formatting stripped'
+);
+is(
+ MetaCPAN::Util::strip_pod('E<lt>me@example.comE<gt>'),
+ '<me@example.com>',
+ 'POD escapes decoded'
+);
+is(
+ MetaCPAN::Util::strip_pod("Para one.\n\nPara\ntwo.\n\nPara three.\n"),
+ "Para one.\nPara two.\nPara three.",
+ 'whitespace collapsed, paras as lines'
+);
+is(
+ MetaCPAN::Util::strip_pod("Para one.\n\n verbatim a\n verbatim b\nPara two.\n"),
+ "Para one.\nverbatim a\nverbatim b\nPara two.",
+ 'verbatim lines not wrapped'
+);
+is(
+ MetaCPAN::Util::strip_pod("=encoding utf8\n\nMoose - \xC3\x89lan\n"),
+ "Moose - Élan",
+ 'utf8 bytes decoded'
+);
+is(
+ MetaCPAN::Util::strip_pod("Moose - \xC3\x89lan\n"),
+ "Moose - Élan",
+ 'utf8 bytes decoded - even without encoding declaration'
+);
+is(
+ MetaCPAN::Util::strip_pod("=encoding iso8859-1\n\nMoose - \xC9lan\n"),
+ "Moose - Élan",
+ 'Latin1 bytes decoded'
+);
+is(
+ MetaCPAN::Util::strip_pod("Moose - \xC9lan\n"),
+ "Moose - Élan",
+ 'Latin1 bytes decoded - even without encoding declaration'
+);
+is(
+ MetaCPAN::Util::strip_pod(
+ "=encoding CP1252\n\nMoose \x96 \xC9lan \x97 \x93 Dou\xE9 \x94 \x91 Fut\xE9 \x92"
+ ),
+ "Moose – Élan — “ Doué ” ‘ Futé ’",
+ 'CP1252 bytes decoded'
+);
+is(
+ MetaCPAN::Util::strip_pod("Moose \x96 \xC9lan \x97 \x93 Dou\xE9 \x94 \x91 Fut\xE9 \x92"),
+ q{Moose - Élan - " Doué " ' Futé '},
+ 'CP1252 bytes de-smarted without encoding declaration'
+);
+is(
+ MetaCPAN::Util::strip_pod(
+ "=encoding iso8859-2\n\nAlien::Not - \xD2\xF4\xFE \xE3\xF1 \xB1\xE5\xED\xEA\xF2"
+ ),
+ "Alien::Not - Ňôţ ăń ąĺíęň",
+ 'strip_pod honoured latin2 encoding'
+);
+is(
+ MetaCPAN::Util::strip_pod("=encoding BOGUS-ENC-9000\n\nMoose - \xC9lan\n"),
+ "Moose - Élan",
+ 'bytes in unknown encoding decoded as Latin1'
+);
+my $chars = Encode::decode_utf8(
+ "=encoding utf8\x0A\x0AModule \xE2\x80\x93 \xE2\x80\x9CName\xE2\x80\x9D\x0A"
+);
+is(
+ MetaCPAN::Util::strip_pod($chars),
+ "Module – “Name”",
+ 'when source POD is already char string, characters are not decoded'
+);
sub version {
- CPAN::Meta->new(
- { name => 'foo',
- license => 'unknown',
- version => MetaCPAN::Util::fix_version(shift) } )->version;
+ CPAN::Meta->new({
+ name => 'foo',
+ license => 'unknown',
+ version => MetaCPAN::Util::fix_version(shift)
+ })->version;
}
# extract_section tests
{
- my $content = <<EOF;
+ my $content = <<EOF;
=head1 NAME
Some::Thing - Test
@@ -45,22 +136,77 @@ Some data about a named pipe
EOF
- my $section = MetaCPAN::Util::extract_section( $content, 'NAME');
- is($section, 'Some::Thing - Test', 'NAME matched correct head1 section');
+ my $section = MetaCPAN::Util::extract_section( $content, 'NAME');
+ is($section, 'Some::Thing - Test', 'NAME matched correct head1 section');
}
# https://github.com/CPAN-API/cpan-api/issues/167
{
- my $content = <<EOF;
+ my $content = <<EOF;
=head1 NAMED PIPE
Some description
=cut
EOF
- my $section = MetaCPAN::Util::extract_section( $content, 'NAME');
- is($section, undef, 'NAMED did not match requested section NAME');
+ my $section = MetaCPAN::Util::extract_section( $content, 'NAME');
+ is($section, undef, 'NAMED did not match requested section NAME');
+}
+
+# section extraction should honour =encoding declaration
+
+sub pod_to_text {
+ my($pod) = @_;
+
+ my $parser = Pod::Text->new;
+ my $text = "";
+ $parser->output_string( \$text );
+ $parser->parse_string_document("=pod\n\n$pod");
+ return $text;
+}
+
+{
+ my $content = <<"EOF";
+
+=encoding CP1252
+
+=head1 NAME
+
+Some::Thing - Somethin\x92 or nothin\x92
+
+=head1 DESCRIPTION
+
+This is meant to be \x93descriptive\x94.
+
+EOF
+
+ my $section = MetaCPAN::Util::extract_section( $content, 'NAME');
+ is(
+ $section,
+ "=encoding CP1252\n\nSome::Thing - Somethin\x92 or nothin\x92",
+ 'NAME section came through as bytes with =encoding declaration'
+ );
+ my $formatted = pod_to_text( $section );
+ like(
+ $formatted,
+ qr/Some::Thing - Somethin’ or nothin’/,
+ 'POD parser was able to decode bytes'
+ );
+
+ $section = MetaCPAN::Util::extract_section( $content, 'DESCRIPTION');
+ is(
+ $section,
+ "=encoding CP1252\n\nThis is meant to be \x93descriptive\x94.",
+ 'DESCRIPTION section came through as bytes with =encoding declaration'
+ );
+ $formatted = pod_to_text( $section );
+ like(
+ $formatted,
+ qr/This is meant to be “descriptive”./,
+ 'POD parser was able to decode bytes'
+ );
+
}
done_testing;
Something went wrong with that request. Please try again.