johnl / deb-sphinx-search

Debian/Ubuntu package for Sphinx, the free SQL full-text search engine

This URL has Read+Write access

johnl (author)
Sat Dec 13 14:37:29 -0800 2008
commit  9e16b906e8dae79d131950c5d674f40d352ed24e
tree    1d36fb6f7f5deb95cf2bfbb275c90c31e53ef984
parent  f5acaa2d9c0791dc19da50a95ae5b1daef672e1c
deb-sphinx-search / doc / html2txt.pl
100644 185 lines (156 sloc) 3.497 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#
# $Id: html2txt.pl 1125 2008-02-04 06:08:30Z shodan $
#
 
use Text::Wrap;
use Data::Dumper;
# Text::Wrap settings shared by every wrap() call in this script: wrap at
# column 76 and never convert runs of spaces back into tabs. The column
# setting is also reused as the width of the <hr> rule.
( $Text::Wrap::columns, $Text::Wrap::unexpand ) = ( 76, 0 );

# Placeholder standing in for a non-breaking space while the text is being
# processed. unentity() refuses input that already contains it, and the main
# script turns it back into a plain space just before printing.
$MAGIC_NBSP = "\xff";
 
# Decode the few HTML entities this script knows about into plain text.
#
# Takes one HTML fragment and returns the decoded copy:
#   &nbsp;            -> $MAGIC_NBSP (placeholder, see below)
#   &lt; / &gt;       -> < / >
#   &#8220; / &#8221; -> << / >>
#   &copy;            -> (c)
#   &amp;             -> &
#
# Non-breaking spaces are not turned into real spaces here; they become the
# $MAGIC_NBSP placeholder, which the main script converts to a space just
# before printing.
#
# Dies if the input already contains $MAGIC_NBSP, because the placeholder
# could then no longer be told apart from genuine text.
sub unentity
{
    my $html = shift;
    die ( "INTERNAL ERROR: magic nbsp code found in html" ) if ( $html =~ /$MAGIC_NBSP/ );

    # only the &nbsp; entity is a non-breaking space; ordinary spaces must
    # stay ordinary so that the text can still be wrapped at them
    $html =~ s/&nbsp;/$MAGIC_NBSP/g;

    $html =~ s/&lt;/</g;
    $html =~ s/&gt;/>/g;
    $html =~ s/&#8220;/<</g;
    $html =~ s/&#8221;/>>/g;
    $html =~ s/&copy;/(c)/g;

    # &amp; goes last, otherwise "&amp;lt;" would end up decoded twice
    $html =~ s/&amp;/&/g;

    return $html;
}
 
# Flatten a piece of text into a single line ready for wrapping.
#
# Takes one string and returns the normalised copy:
#   - newlines and tabs become spaces,
#   - runs of two or more whitespace characters collapse to one space,
#   - leading and trailing whitespace is removed,
#   - a one-letter word surrounded by whitespace is glued to the following
#     word with $MAGIC_NBSP, so that it cannot be separated from it.
#
# The gluing pattern needs whitespace in front of the letter, so a one-letter
# word at the very start of the text is left alone.
sub linify
{
    my $text = shift;

    $text =~ tr/\n\t/  /;       # both line breaks and tabs turn into spaces
    $text =~ s/\s{2,}/ /g;
    $text =~ s/^\s+//ms;
    $text =~ s/\s+$//ms;

    # glue 1-letter words with nbsp's; $1 (not \1) is the correct way to
    # refer to a capture on the replacement side
    $text =~ s/\s+(\w)\s+/ $1$MAGIC_NBSP/g;

    return $text;
}
 
# Report whether a value occurs in a list.
#
#   1st argument: reference to the array to search
#   2nd argument: string to look for (compared with "eq")
#
# Returns 1 if at least one element equals the value, 0 otherwise.
sub in
{
    my ( $haystack, $needle ) = @_;
    return scalar ( grep { $_ eq $needle } @$haystack ) ? 1 : 0;
}
 
###############
## entry point
###############
 
# Slurp the whole input (files named on the command line, or STDIN) in one
# read. $/ is undefined only inside the do-block, so the global record
# separator is left untouched for the rest of the program.
$html = do { local $/; <> };

# keep only what is inside <body>; the closing tag and everything after it
# is replaced by an end marker
$html =~ s/.*?<body.*?>//ms;
$html =~ s/<\/body.*/\n--eof--\n/ms;

$res = "";      # finished output text
$acc = "";      # text collected since the last block-level tag
$hdr = "";      # underline character of the heading being collected, "" if none

$left = 0;      # current left indentation, in columns
$red = "";      # prefix of the first line of the next block (list bullet/number)
@lists = "";    # stack of open list tags; it starts with one empty element, so
                # $#lists equals the number of tags pushed so far
$li = 0;        # number of the next <ol> item
$pre = 0;       # true while inside preformatted text

# Consume the document one tag at a time: everything in front of the tag is
# appended to $acc, then the tag decides what happens to the collected text.
# The attribute group captures the complete attribute string up to the
# closing '>', so the class test further down sees every attribute.
while ( $html =~ s/^(.*?)<([\/]*\w+)(\s+[^>]*)?>//ms )
{
    $text = $1;
    $tag = lc $2;
    $attrs = lc ( defined $3 ? $3 : "" );

    $acc .= $text;

    # inline markup is dropped; its text simply stays in the accumulator
    next if ( in ( [ "a","/a","i","/i","p","code","/code","b","/b","span","/span",
        "strong","/strong","br","em","/em" ], $tag ) );

    # block-level tags flush the accumulated text into the result
    if ( in ( [ "div","/div","/h1","/h2","/h3","/h4","/h5","/p","hr","h5",
        "dl","ol","ul","dt","dd","/dl","/ol","/ul","/dt","/dd",
        "li","/li","pre","/pre" ], $tag ) )
    {
        # fixme! handle userinput instead of literallayout
        if ( !$pre )
        {
            # ordinary text: decode, flatten to one line, wrap with the
            # current indentation; $red prefixes the first line only
            $acc = linify(unentity($acc));
            $acc = wrap ( (" " x $left).$red, " "x($left+length($red)), $acc );
            die if ( $acc =~ /\t/ );
        }
        else
        {
            # preformatted text: keep the line structure, prefix every line
            # with "| " and skip blank lines in front of the first kept line
            my @lines = split ( /\n/, unentity($acc) );
            $acc = "";
            foreach my $line ( @lines )
            {
                # compare against "" so that a line holding just "0" is kept
                next if ( $acc eq "" && $line eq "" );
                $acc .= ( " " x $left ) . "| $line\n";
            }
            $acc = "\n$acc\n";
        }

        # the bullet/number was used up; continuation blocks of the same item
        # get blanks of the same width instead
        $red = " "x length($red);

        # length() rather than truth: a block consisting of "0" is not empty
        if ( length($acc) )
        {
            if ( length($hdr) )
            {
                # heading: underline it with the heading character
                $res .= "$acc\n" . $hdr x length($acc) . "\n\n";
                $hdr = "";
            }
            else
            {
                $res .= "$acc\n";
            }
        }
        $acc = "";

        $res .= "\n" if ( in ( [ "div","/div","/h1","/h2","/h3","/p","hr" ], $tag ) );
        $res .= "\n" if ( $#lists<=2 && in ( ["/dl","/ol","/ul" ], $tag ) );

        if ( $tag eq "hr" )
        {
            $res .= "-" x $Text::Wrap::columns . "\n\n";
        }

        if ( in ( [ "dd","ol","ul" ], $tag ) )
        {
            $left += 3;
            $li = 1 if ( $tag eq "ol" ); # fixme! allow nested ol
            push ( @lists, $tag );
            next;
        }

        if ( in ( [ "/dd","/ol","/ul" ], $tag ) )
        {
            $left -= 3;
            pop @lists;
            $red = ""; # just in case
            $res .= "\n";
            next;
        }

        if ( $tag eq "li" )
        {
            my $list = $lists[$#lists];

            if ( $list eq "ol" )
            {
                $red = "$li. ";
                $li++;
            }
            elsif ( $list eq "ul" )
            {
                $red = "* ";
            }
            else
            {
                # diagnostics go to STDERR so they never end up in the
                # generated text
                print STDERR Dumper(@lists);
                die ( "INTERNAL ERROR: 'li' in unknown list '$list'" );
            }
            next;
        }

        if ( $tag eq "pre" || ( $tag eq "div" && $attrs =~ /literallayout/ ) )
        {
            $pre = 1;
            $left += 3;
            next;
        }

        if ( $pre && ( $tag eq "/pre" || $tag eq "/div" ) )
        {
            $pre = 0;
            $left -= 3;
            next;
        }

        next;
    }

    # opening heading tags only choose the underline character; the heading
    # text is emitted when the matching closing tag flushes the accumulator
    if ( $tag eq "h1" ) { $hdr = "="; next; }
    if ( $tag eq "h2" ) { $hdr = "="; next; }
    if ( $tag eq "h3" ) { $hdr = "-"; next; }
    if ( $tag eq "h4" ) { $hdr = "-"; next; }

    die ( "unknown tag='$tag' attrs='$attrs'\n" );
}

# final clean-up: add the end marker, drop leading blank lines, squeeze runs
# of blank lines, and turn the nbsp placeholders back into plain spaces
$res .= "\n\n--eof--\n";
$res =~ s/^\n+//;
$res =~ s/\n{3,}/\n\n/gms;
$res =~ s/$MAGIC_NBSP/ /g;
print $res;
 
#
# $Id: html2txt.pl 1125 2008-02-04 06:08:30Z shodan $
#