crcx / ent

Ent provides tools and services via an email interface

This URL has Read+Write access

ent / html2txt.php
100644 155 lines (138 sloc) 7.131 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/************************************************************
* Modified 12/27/2007 by Charles Childers for use with Ent
************************************************************/
 
 
/************************************************************
Library to convert HTML into an approximate text equivalent
v1.0.4 update 11/10/2008 to convert HTML entities
*************************************************************
 
Please see http://www.howtocreate.co.uk/php/ for details
Please see http://www.howtocreate.co.uk/jslibs/termsOfUse.html
for terms and conditions of use
 
The reason this library was written was to convert HTML email
contents into a text based email content, where the rendering
does not have to be as accurate as with a text based browser.
However, there must be many more uses for it.
 
This library attempts to deal with non-standard HTML, but may
occasionally suffer from problems with pages that are not
properly written - most especially: Tags written as
<tagName attribute=somethingWithA"or'InItButNotSurroundedByQuotes>,
Closing </pre> or </textarea> tags without their corresponding
opening tags, Tags within <textarea> </textarea> tags, which
will be rendered, even though they should not be.
 
Conversion requires a lot of preg_replace statements, so it can
be quite slow with large HTML files.
 
******
To use
******
 
This library requires PHP 4+
 
To use this library, put the following line in your script
before the part that needs it:
require('PATH_TO_THIS_FILE/html2txt.php');
 
To convert HTML/PHP to text:
$textVersion = html2text( $HTMLversion );
 
************
Further info
************
 
For the technically minded, this is the process I use for
converting HTML to approx text:
 
REMOVE php start and end tags
REMOVE <!-- -->
ensure HTML uses entities in the right places (like inside tags) so strip_tags works properly
<STYLE|SCRIPT|OPTION>
carefully remove everything between them
strip_tags except the important ones
replace all \s that are after the start or a </pre> and before <pre> or end with a single space
</TITLE|HR>
\n --------------------
<H1|H2|H3|H4|H5|H6|DIV|P|PRE>
\n\n
<SUP>
^
<UL|OL|BR|DL|DT|TABLE|CAPTION|TR->(TH|TD)>
\n
<LI>
\n·
<DD>
\n\t
<TH|TD>
\t
<A|AREA href=(!javascript:&&!#)>
(opening tag removed; this version outputs no link marker, only the link text remains)
<IMG>
[IMAGE: alt]
<FORM>
[FORM:action]
<INPUT|TEXTAREA|BUTTON|SELECT>
[INPUT]
strip tags again, leaving nothing this time
un-htmlspecialchars
*/
 
<?
/**
 * Convert an HTML document (or fragment) into an approximate plain-text
 * rendering.
 *
 * Processing order:
 *   1. remove PHP blocks and HTML comments,
 *   2. escape stray <, > and " so strip_tags() sees well-formed tags, dropping
 *      <style> and <script> elements together with their contents,
 *   3. strip every tag except the ones that get a text equivalent,
 *   4. collapse whitespace outside <pre> and <textarea>,
 *   5. replace the remaining tags with text equivalents (line breaks, tabs,
 *      "[IMAGE: alt]", "[FORM: action]", "[INPUT]"); link tags are dropped,
 *   6. strip what is left, decode HTML entities, normalise line breaks.
 *
 * Text is treated as UTF-8: entities are decoded to UTF-8 and the UTF-8
 * no-break space becomes a plain space. Input that is not valid UTF-8 falls
 * back to replacing the single byte 0xA0 instead.
 *
 * @param string $badStr HTML to convert.
 * @return string Plain-text approximation with leading and trailing line
 *                breaks removed.
 */
function html2text($badStr) {
    $badStr = (string) $badStr;

    // Remove PHP blocks, then HTML comments. The PHP delimiters are built by
    // concatenation so this source file never contains them literally.
    $badStr = _html2text_remove_delimited( $badStr, '<'.'?', '?'.'>' );
    $badStr = _html2text_remove_delimited( $badStr, '<!--', '-->' );

    // Make sure all HTML tags are correctly written (> not in between quotes).
    $goodStr = _html2text_escape_stray_markup( $badStr );

    // Now that the markup is safe for strip_tags, strip all unwanted tags.
    $goodStr = strip_tags( $goodStr, '<title><hr><h1><h2><h3><h4><h5><h6><div><p><pre><sup><ul><ol><br><dl><dt><table><caption><tr><li><dd><th><td><a><area><img><form><input><textarea><button><select><option>' );

    // Strip extra whitespace except between <pre> and <textarea> tags.
    $goodStr = _html2text_collapse_whitespace( $goodStr );

    // Remove all options from select inputs.
    $goodStr = preg_replace( "/<option[^>]*>[^<]*/i", '', $goodStr );

    // Replace all tags with their text equivalents. Order matters: <hr> must
    // be consumed before the generic "<h" rule, and "<tr><td" before "<td".
    $goodStr = preg_replace( "/<(\/title|hr)[^>]*>/i", "\n --------------------\n", $goodStr );
    $goodStr = preg_replace( "/<(h|div|p)[^>]*>/i", "\n\n", $goodStr );
    $goodStr = preg_replace( "/<sup[^>]*>/i", '^', $goodStr );
    $goodStr = preg_replace( "/<(ul|ol|br|dl|dt|table|caption|\/textarea|tr[^>]*>\s*<(td|th))[^>]*>/i", "\n", $goodStr );
    $goodStr = preg_replace( "/<li[^>]*>/i", "\n· ", $goodStr );
    $goodStr = preg_replace( "/<dd[^>]*>/i", "\n\t", $goodStr );
    $goodStr = preg_replace( "/<(th|td)[^>]*>/i", "\t", $goodStr );
    // Link opening tags are removed without leaving a marker.
    $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>/i", "", $goodStr );
    $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[IMAGE: $2$3$4] ", $goodStr );
    $goodStr = preg_replace( "/<form[^>]* action=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "\n[FORM: $2$3$4] ", $goodStr );
    $goodStr = preg_replace( "/<(input|textarea|button|select)[^>]*>/i", "[INPUT] ", $goodStr );

    // Strip all remaining tags (mostly closing tags).
    $goodStr = strip_tags( $goodStr );

    // Convert HTML entities (named, decimal and hexadecimal) in one pass.
    // This replaces the removed preg_replace "/e" modifier and chr(), which
    // could only produce single bytes.
    $goodStr = html_entity_decode( $goodStr, ENT_QUOTES | ENT_HTML5, 'UTF-8' );

    // Turn no-break spaces into ordinary spaces. On valid UTF-8 only the
    // two-byte sequence is replaced, so the 0xA0 byte inside other multibyte
    // characters is left alone; legacy single-byte input keeps the old rule.
    $nbsp = preg_match( '//u', $goodStr ) ? "\xC2\xA0" : "\xA0";
    $goodStr = str_replace( $nbsp, ' ', $goodStr );

    // Normalise line endings, allow no more than 3 linebreaks in a row, drop
    // trailing blanks on each line, and trim leading/trailing linebreaks.
    $goodStr = preg_replace( "/\r\n?|\f/", "\n", $goodStr );
    $goodStr = preg_replace( "/\n(\s*\n){2}/", "\n\n\n", $goodStr );
    $goodStr = preg_replace( "/[ \t]+(\n|$)/", "$1", $goodStr );
    return preg_replace( "/^\n*|\n*$/", '', $goodStr );
}

/**
 * Repeatedly cut out the first $open ... $close span until no complete span
 * is left. The text is rescanned from the start after every cut, so spans
 * that only form once an inner span is removed are cut as well.
 *
 * @param string $text  Text to clean.
 * @param string $open  Opening delimiter.
 * @param string $close Closing delimiter.
 * @return string Text without any complete delimited span.
 */
function _html2text_remove_delimited($text, $open, $close) {
    $closeLen = strlen( $close );
    while( ( $start = strpos( $text, $open ) ) !== false
        && ( $end = strpos( $text, $close, $start ) ) !== false ) {
        $text = substr( $text, 0, $start ) . substr( $text, $end + $closeLen );
    }
    return $text;
}

/**
 * Walk the markup one byte at a time and entity-escape every <, > and " that
 * cannot be part of a tag, so that strip_tags() parses it reliably.
 * <style> and <script> elements are dropped together with their contents;
 * when the closing tag is missing, everything up to the end of the input is
 * dropped instead of splicing the string at a bogus offset.
 *
 * @param string $html Raw markup.
 * @return string Markup with stray special characters escaped.
 */
function _html2text_escape_stray_markup($html) {
    $out = '';
    $inTag = false;
    $inSingle = false;
    $inDouble = false;
    $len = strlen( $html );
    $i = 0;

    while( $i < $len ) {
        $chr = $html[$i];

        if( $chr === '<' ) {
            if( $inTag ) {
                // A "<" inside a tag can only be literal text.
                $chr = '&lt;';
            } else {
                $closer = null;
                if( strtolower( substr( $html, $i + 1, 5 ) ) === 'style' ) {
                    $closer = '</style>';
                } elseif( strtolower( substr( $html, $i + 1, 6 ) ) === 'script' ) {
                    $closer = '</script>';
                }
                if( $closer !== null ) {
                    $end = stripos( $html, $closer, $i );
                    if( $end === false ) {
                        // Unclosed element: the rest of the input is its content.
                        break;
                    }
                    // Resume right after the closing tag, emitting nothing.
                    $i = $end + strlen( $closer );
                    continue;
                }
                $inTag = true;
            }
        } elseif( $chr === '>' ) {
            if( !$inTag || $inDouble || $inSingle ) {
                $chr = '&gt;';
            } else {
                $inTag = false;
            }
        } elseif( $chr === '"' ) {
            if( $inTag && !$inSingle ) {
                $inDouble = !$inDouble;
            } else {
                $chr = '&quot;';
            }
        } elseif( $chr === "'" ) {
            // Single quotes are never escaped; they only toggle quote state.
            if( $inTag && !$inDouble ) {
                $inSingle = !$inSingle;
            }
        }

        $out .= $chr;
        $i++;
    }

    return $out;
}

/**
 * Collapse every run of whitespace to a single space, except inside <pre>
 * and <textarea> elements, whose contents are kept verbatim. The preserved
 * elements are re-emitted with bare tags (attributes are dropped).
 *
 * @param string $html Markup already reduced to the permitted tags.
 * @return string Markup with whitespace collapsed outside preserved elements.
 */
function _html2text_collapse_whitespace($html) {
    $out = '';
    // Splitting on opening and closing tags alike puts element contents at
    // the odd indexes and the surrounding markup at the even ones.
    foreach( preg_split( "/<\/?pre[^>]*>/i", $html ) as $i => $chunk ) {
        if( $i % 2 ) {
            $out .= '<pre>'.$chunk.'</pre>';
            continue;
        }
        foreach( preg_split( "/<\/?textarea[^>]*>/i", $chunk ) as $j => $part ) {
            if( $j % 2 ) {
                $out .= '<textarea>'.$part.'</textarea>';
            } else {
                $out .= preg_replace( "/\s+/", ' ', $part );
            }
        }
    }
    return $out;
}
 
?>