-
Notifications
You must be signed in to change notification settings - Fork 2
/
TextCleanser.php
57 lines (42 loc) · 1.6 KB
/
TextCleanser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
<?php
/**
 * Strips markup, punctuation noise and non-ASCII bytes from a piece of text.
 *
 * The only public entry point is cleanse(); every private method performs one
 * narrowly scoped substitution and returns the resulting string.
 */
class TextCleanser {
    /**
     * Runs every cleansing step over the text, in a fixed order.
     *
     * Article link markup is removed first, because the later steps strip the
     * brackets and quotes that its pattern relies on.
     *
     * @param string $str Raw text.
     * @return string Cleansed text. Most removed fragments are replaced by a
     *                single space, so the result may contain runs of spaces
     *                and leading/trailing spaces.
     */
    public function cleanse($str) {
        $str = $this->cleanse_article_link_markup($str);
        $str = $this->cleanse_non_ascii_characters($str);
        $str = $this->cleanse_quotes($str);
        $str = $this->cleanse_brackets($str);
        $str = $this->cleanse_inbetween_commas($str);
        $str = $this->cleanse_html_entities($str);
        $str = $this->cleanse_newline_characters($str);
        $str = $this->cleanse_preceding_and_following_special_characters($str);

        return $str;
    }

    /**
     * Replaces a comma that is immediately followed by an ASCII letter or "+"
     * with a space (e.g. "a,b" becomes "a b").
     *
     * The previous pattern also carried a negative lookahead in front of the
     * comma; it could never fail (the next character is the comma itself), so
     * it has been dropped without changing what is matched.
     *
     * @param string $str
     * @return string
     */
    private function cleanse_inbetween_commas($str) {
        return preg_replace('/,(?=[a-zA-Z+])/', ' ', $str);
    }

    /**
     * Deletes single quotes, double quotes, backticks and asterisks.
     *
     * @param string $str
     * @return string
     */
    private function cleanse_quotes($str) {
        return preg_replace('/[\'\"\`\*]/', '', $str);
    }

    /**
     * Replaces round, square and curly brackets with a space.
     *
     * @param string $str
     * @return string
     */
    private function cleanse_brackets($str) {
        return preg_replace('/[\(\)\[\]\{\}]/', ' ', $str);
    }

    /**
     * Replaces HTML entities of the form "&name;" or "&#digits;" with a space.
     *
     * @param string $str
     * @return string
     */
    private function cleanse_html_entities($str) {
        return preg_replace('/&#?[a-zA-Z0-9]+;/', ' ', $str);
    }

    /**
     * Replaces every line feed and carriage return with a space.
     *
     * @param string $str
     * @return string
     */
    private function cleanse_newline_characters($str) {
        // Case-insensitive matching is meaningless for control characters,
        // so plain str_replace is used.
        return str_replace(["\n", "\r"], ' ', $str);
    }

    /**
     * Removes article link markup shaped like: [label](123 "title").
     *
     * @param string $str
     * @return string
     */
    private function cleanse_article_link_markup($str) {
        return preg_replace('/\[.+?\]\(\d+\s\".*?\"\)/', '', $str);
    }

    /**
     * Deletes every byte outside the 7-bit ASCII range.
     *
     * @param string $str
     * @return string
     */
    private function cleanse_non_ascii_characters($str) {
        return preg_replace('/[^\x00-\x7F]/', '', $str);
    }

    /**
     * Strips special characters glued to the start or the end of a word.
     *
     * Each match, including the delimiter that precedes or follows it, is
     * replaced by a single space.
     *
     * @param string $str
     * @return string
     */
    private function cleanse_preceding_and_following_special_characters($str) {
        $patterns = [
            // Chars to be removed only at the beginning of a word: a run of
            // @, # or $ preceded by start of string, whitespace, "." or ",".
            // The delimiter group used to read "\.\,", which only matched the
            // literal sequence ".,"; it now mirrors the trailing pattern.
            '/(^|\s|\.|\,)([\@\#\$])+\b/',
            // Chars to be removed only at the end of a word: a run of
            // ! ? % . , : ; / $ followed by end of string, whitespace, "." or ",".
            '/\b([\!\?\%\.\,\:\;\/\$])+(\Z|\s|\.|\,)/',
        ];

        return preg_replace($patterns, ' ', $str);
    }
}