-
Notifications
You must be signed in to change notification settings - Fork 1
/
config.php
151 lines (123 loc) · 7.2 KB
/
config.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
<?php
// Host URL handed to the processor.
// NOTE(review): presumably the address this reader itself is served from -- confirm against Processor.
\MirrorReader\Processor::$host = 'http://localhost';

// Location where the mirrored domains are stored offline.
\MirrorReader\Processor::$store = '/Library/';
// Per-domain configuration, keyed by host name. The 'default' entry carries the full option set;
// the other entries list only the options they set themselves.
// NOTE(review): presumably 'default' supplies any option a domain entry omits -- confirm against Processor.
\MirrorReader\Processor::$domainConfiguration = [
    'default' => [
        // File names treated as a site's home file.
        'homeFiles' => [
            'index.html',
            'index.php',
            'Main_Page', // MediaWiki
        ],

        // If enabled, when the script encounters a non-stored file it will instead include the
        // live one from the web -- say, PayPal and Twitter links.
        'passthru' => false,

        // Which script hacks should be used. Script processing is tricky, as there are a million
        // ways things can be done (indeed, URLs could even be encrypted or specially encoded,
        // making it impossible to work with them). Instead, we have four common hacks:
        //   - 'suspectFileString' and 'suspectFileAnywhere' are mutually exclusive; the former
        //     usually works without breaking anything, the latter is more likely to both work and
        //     break something.
        //   - 'suspectDirString' is more likely to break something, but can work with directories
        //     that are placed in strings (it is almost guaranteed to break something if
        //     implemented using the "anywhere" method, due to regex, etc.).
        //   - 'suspectDomainAnywhere' usually won't break anything, but should still be used with
        //     caution.
        'scriptHacks' => [
            // Disabled: removes all scripts and activates <noscript> nodes. An easy way to
            // disable JavaScript on a per-site basis.
            // 'removeAll',

            // Remove comments from the JavaScript body. Results in a speed boost, but can also
            // break things.
            'removeComments',

            // Disabled: searches for file patterns inside the whole JS body. It will probably
            // find all URLs, but will likely break some as well.
            // 'suspectFileAnywhere',

            // Searches for file patterns inside JS strings. Fairly effective, and unlikely to
            // break things.
            'suspectFileString',

            // Disabled: searches for file patterns anywhere in the JS body. As a full domain will
            // almost never exist in the JS body just coincidentally, this shouldn't break things,
            // and will still be fairly effective.
            // 'suspectDomainAnywhere',
        ],

        'htmlHacks' => [
            // Checks <body background>, <table background>, <tr background>, etc. No reason not
            // to enable, but because the behaviour is not compliant with HTML it is available as
            // a flag.
            'backgroundHack',

            // Checks <option> values for URL-looking patterns, and rewrites as appropriate. This
            // can, in very rare situations, break things. No noticeable performance penalty.
            'selectHack',

            // Disabled: processes style and JavaScript attributes. Currently quite slow, but can
            // probably be sped up, and in any case is unlikely to break anything.
            // 'dirtyAttributes',
        ],

        // Custom attributes containing URLs that should be parsed. data-src is used by Wikia.
        'customSrcAttributes' => ['data-src'],

        // NOTE(review): presumably GET parameter names to disregard -- confirm against Processor.
        'ignoreGETs' => ['PHPSESSID', 'sid', 'highlight', 's'],

        // Heritrix MirrorReader has a lot of trouble with 301s. This is a hack that will fix some
        // instances of this: either "none" (which does nothing) or "dir", which will redirect any
        // file to a directory with the same name and a "1" appended.
        // Note that the fixer script will generally remove most 301s, so after running it on a
        // site you can set this parameter to "none" for a modest speed boost.
        '301mode' => 'none',

        // List of recognised extensions.
        'recognisedExtensions' => [
            'asp',
            'css',
            'doc',
            'docx',
            'gif',
            'htm',
            'html',
            'jpeg',
            'jpg',
            'js',
            'pdf',
            'php',
            'png',
            'rss',
            'txt',
            'xml',
        ],

        'cacheStore' => '/var/www/cache/',

        // An array in the form of "find => replace" that redirects domains, directories, and
        // files.
        'redirect' => [],
    ],

    'www.youtube.com' => [
        'passthru' => true,
    ],

    'youtube.com' => [
        'passthru' => true,
    ],

    'arstechnica.com' => [
        'redirect' => [
            'http://arstechnica.com/archive/' => 'http://archive.arstechnica.org/archive/',
            '/journals/microsoft.ars/' => '/information-technology/',
            '/microsoft/' => '/information-technology/',
            '/journals/thumbs.ars/' => '/gaming/',
            '/news/' => '/',
            '/1/' => '/',
            '/articles/' => '/features/',
        ],
        // Pattern => replacement pairs; replacements refer to capture groups as $1, $2.
        // NOTE(review): the dots in "arstechnica.com" in the first pattern are unescaped, so they
        // would match any character if this is evaluated as a regex -- confirm that is acceptable.
        'redirectRegex' => [
            'http://arstechnica.com(.*)\.(jpg|png)$' => 'http://cdn.arstechnica.net$1.$2',
            '/(\d{4})/(\d{2})/\d{2}/' => '/$1/$2/',
            '\.(html|ars)$' => '/',
        ],
    ],

    /* There seems to be the occasional stray unknown glyph character, but I can't tell where from -- we seem to be using the same encoding as the server. */
    'shrines.rpgclassics.com' => [
        'redirect' => [
            'shrines.rpgclassics.com/shrines/' => 'shrines.rpgclassics.com/',
        ],
    ],

    'www.rpgclassics.com' => [
        'redirect' => [
            'www.rpgclassics.com/shrines/' => 'shrines.rpgclassics.com/',
            'www.rpgclassics.com/staff/' => 'staff.rpgclassics.com/',
        ],
    ],

    /* Working 100%. */
    'www.zeldawiki.org' => [
        'redirect' => [
            'www.zeldawiki.org' => 'zeldawiki.org',
        ],
    ],

    'jaytheham.com' => [
        'redirect' => [
            'jaytheham.com' => 'www.jaytheham.com',
        ],
    ],

    'wikibound.info' => [
        'htmlHacks' => ['dirtyAttributes'],
    ],

    'niwanetwork.org' => [
        'redirect' => [
            'niwanetwork.org' => 'www.niwanetwork.org',
        ],
    ],

    'lostlevels.org' => [
        'redirect' => [
            'lostlevels.org' => 'www.lostlevels.org',
        ],
    ],

    /* Need to fix some glitchdex URLs. */
    'www.glitchcity.info' => [
        'redirect' => [
            'www.glitchcity.info' => 'glitchcity.info',
        ],
    ],

    'ajax.googleapis.com' => [
        'passthru' => true,
    ],

    /* Still needs fixes. */
    'www.psypokes.com' => [
        // Set to an empty list here; see the 'default' entry for a description of each hack.
        'scriptHacks' => [
            // Disabled:
            // 'suspectDomainAnywhere',
        ],
    ],

    /* Working 100%. */
    'www.serebii.net' => [
        'homeFiles' => [
            'index.shtml',
            'index.html',
        ],
    ],

    'www.victoryroad.net' => [
        'redirect' => [
            '/clientscript/misc.php' => '/misc.php',
        ],
    ],

    // TODO: inherit subdomains
    'wikia.com' => [
        'scriptHacks' => ['removeAll'],
    ],
];
?>