-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathWalkSection.java
executable file
·252 lines (236 loc) · 8.77 KB
/
WalkSection.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
package html_parser;
import java.sql.Connection;
import java.util.ArrayList;
import html_parser.page.PageWalker;
import html_parser.page.PageWalkerAware;
import html_parser.record.Record;
import html_parser.record_processor.RecordProcessor;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import database.Connector;
import sites.autozvuk_com_ua.AvtozvukPageWalker;
import sites.autozvuk_com_ua.AvtozvukPageWalkerAware;
import sites.autozvuk_com_ua.AvtozvukSaver;
import sites.autozvuk_com_ua.AvtozvukSectionXml;
/**
 * Walks the section tree described by an XML document and, for every section it
 * visits, reads the section's pages and hands the resulting records to a saver.
 * Every section node is expected to carry the attributes {@code href} and
 * {@code caption}.
 */
public class WalkSection {
    /** Filters that are asked whether a given section may be scanned. */
    private final ArrayList<ScanSectionFilter> allowListeners = new ArrayList<ScanSectionFilter>();

    /**
     * Registers a filter that takes part in the decision whether a section is parsed.
     *
     * @param listener filter to add
     */
    public void addAllowListener(ScanSectionFilter listener) {
        this.allowListeners.add(listener);
    }

    /**
     * Unregisters a filter previously added with {@link #addAllowListener}.
     *
     * @param listener filter to remove
     */
    public void removeAllowListener(ScanSectionFilter listener) {
        this.allowListeners.remove(listener);
    }

    /**
     * Decides whether the given tree element should be parsed.
     *
     * @param element section element under consideration
     * @return {@code false} as soon as one registered filter returns {@code false};
     *         {@code true} otherwise (including when no filter is registered)
     */
    private boolean isProcessAllow(Element element) {
        for (ScanSectionFilter listener : this.allowListeners) {
            if (!listener.isFilter(element)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Walks through all sections listed in the XML document.
     *
     * <p>{@code saver.begin()} is called before the walk and {@code saver.finish()}
     * after it. If the current thread is interrupted, the remaining pages and
     * sections are skipped and the interrupt status is left set for the caller;
     * {@code saver.finish()} is still called.
     *
     * @param xmlDocument document whose root element contains the section elements,
     *        e.g. {@code <leaf caption="" href="">}
     * @param urlPrefix prefix that is prepended to a section's {@code href} to build its address
     * @param processParent whether nodes that contain other nodes are parsed as well
     *        ({@code true}) or skipped ({@code false})
     * @param processLeaf whether the terminal nodes of the tree (leaves) are parsed
     * @param pageWalker page parser that extracts the records from a page
     * @param pageWalkerAware object that is reset to each section's address and passed to
     *        {@code pageWalker} for moving between pages
     * @param delay pauses made after each page and after each section (user emulation);
     *        {@code null} means no pauses
     * @param preProcessor optional hook invoked before and after each block of records
     *        is saved; may be {@code null}
     * @param saver optional sink for the records; may be {@code null}
     */
    public void walk(Document xmlDocument,
            String urlPrefix,
            boolean processParent,
            boolean processLeaf,
            PageWalker pageWalker,
            PageWalkerAware pageWalkerAware,
            Delay delay,
            RecordProcessor preProcessor,
            Saver saver) {
        if (saver != null) {
            saver.begin();
        }
        // Use the document element rather than the first child of the document:
        // the first child may be a doctype, comment or processing instruction,
        // and an empty document has no children at all.
        Element root = xmlDocument.getDocumentElement();
        if (root != null) {
            NodeList nodes = root.getChildNodes();
            for (int index = 0; index < nodes.getLength(); index++) {
                walkNode(nodes.item(index),
                        urlPrefix,
                        processParent,
                        processLeaf,
                        pageWalker,
                        pageWalkerAware,
                        delay,
                        preProcessor,
                        saver);
            }
        }
        if (saver != null) {
            saver.finish();
        }
    }

    /* Disabled debugging helper, kept for reference:
    private String getStringFromXmlDocument(Node document){
    Writer out=null;
    try{
    javax.xml.transform.TransformerFactory transformer_factory = javax.xml.transform.TransformerFactory.newInstance();
    javax.xml.transform.Transformer transformer = transformer_factory.newTransformer();
    javax.xml.transform.dom.DOMSource dom_source = new javax.xml.transform.dom.DOMSource(document); // Pass in your document object here
    out=new StringWriter();
    //string_writer = new Packages.java.io.StringWriter();
    javax.xml.transform.stream.StreamResult stream_result = new javax.xml.transform.stream.StreamResult(out);
    transformer.transform(dom_source, stream_result);
    }catch(Exception ex){
    System.err.println("getStringFromXmlDocument:"+ex.getMessage());
    }
    return (out==null)?"":out.toString();
    }*/

    /**
     * Recursively walks a node: children first, then the node itself.
     *
     * <p>A node with children is processed only when {@code processParent} is set;
     * a node without children only when {@code processLeaf} is set. Nothing is done
     * once the current thread has been interrupted.
     */
    private void walkNode(Node node,
            String urlPrefix,
            boolean processParent,
            boolean processLeaf,
            PageWalker pageWalker,
            PageWalkerAware pageWalkerAware,
            Delay delay,
            RecordProcessor preProcessor,
            Saver saver) {
        if (Thread.currentThread().isInterrupted()) {
            return;
        }
        if (node.hasChildNodes()) {
            // The node has child elements: descend into them first.
            NodeList children = node.getChildNodes();
            for (int index = 0; index < children.getLength(); index++) {
                this.walkNode(children.item(index),
                        urlPrefix,
                        processParent,
                        processLeaf,
                        pageWalker,
                        pageWalkerAware,
                        delay,
                        preProcessor,
                        saver);
            }
            if (processParent) {
                this.processNode(node, urlPrefix, pageWalker, pageWalkerAware, delay, preProcessor, saver);
            }
        } else if (processLeaf) {
            // The node is a leaf, i.e. it has no child elements.
            this.processNode(node, urlPrefix, pageWalker, pageWalkerAware, delay, preProcessor, saver);
        }
    }

    /**
     * Processes one node of the section tree: reads all pages of the section and
     * passes every record to the saver.
     *
     * <p>Nodes that are not {@link Element}s and sections rejected by a registered
     * filter are skipped. An interrupt received while pausing stops the processing
     * of this section; the interrupt status stays set.
     *
     * @param node node to process
     * @param urlPrefix URL prefix to which the node's {@code href} is appended
     * @param pageWalker object that reads the pages
     * @param pageWalkerAware object used for moving between pages
     * @param delay pauses between page reads and after the section; {@code null} means none
     * @param preProcessor optional hook called before and after each block is saved
     * @param saver object that stores the records; may be {@code null}
     */
    private void processNode(Node node,
            String urlPrefix,
            PageWalker pageWalker,
            PageWalkerAware pageWalkerAware,
            Delay delay,
            RecordProcessor preProcessor,
            Saver saver) {
        if (!(node instanceof Element)) {
            // Intermediate (non-element) node: nothing to parse.
            return;
        }
        if (Thread.currentThread().isInterrupted()) {
            return;
        }
        Element element = (Element) node;
        String href = element.getAttribute("href");
        String caption = element.getAttribute("caption");
        System.out.println(">>> Section:" + caption + " Href: " + href);
        if (!this.isProcessAllow(element)) {
            // Parsing of this element was vetoed by a listener.
            return;
        }
        pageWalker.updatePageWalkerAware(pageWalkerAware);
        pageWalkerAware.reset(urlPrefix + href);
        while (pageWalker.hasMoreElements()) {
            // Fetch and parse the next page of the section.
            ArrayList<Record> block = pageWalker.nextElement();
            if (preProcessor != null) {
                preProcessor.beforeSave(block);
            }
            if (saver != null) {
                for (Record record : block) {
                    if (!saver.save(caption, record)) {
                        System.err.println("WalkSection save Error:");
                    }
                }
            }
            if (preProcessor != null) {
                preProcessor.afterSave(block);
            }
            System.out.println(">>> NextPage ( after " + pageWalker.getLastUrl() + ") ");
            // Pause between requests to the same server so the crawl is less conspicuous.
            boolean keepGoing = delay == null || pause(delay.getDelayReadPage());
            if (!keepGoing) {
                return;
            }
        }
        if (delay != null) {
            pause(delay.getDelayReadSection());
        }
    }

    /**
     * Sleeps for the given duration.
     *
     * @param millis value handed to {@link Thread#sleep(long)}; values {@code <= 0} mean no pause
     * @return {@code true} if the pause completed (or was skipped), {@code false} if the
     *         thread was interrupted, in which case the interrupt status is restored
     */
    private static boolean pause(long millis) {
        if (millis <= 0) {
            return true;
        }
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException ex) {
            // Preserve the interrupt so that callers can react to the cancellation.
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Entry point. The sample crawl below is currently disabled, so running this
     * method does nothing.
     */
    public static void main(String[] args) {
        /* System.out.println("begin");
        // object that reads the pages
        PageWalker pageWalker=new AvtozvukPageWalker();
        // object that supplies the number of the next page
        PageWalkerAware pageWalkerAware=new AvtozvukPageWalkerAware();
        // object that produces the XML from the site's rubricator / element tree
        AvtozvukSectionXml avtozvuk=new AvtozvukSectionXml();
        Document treeXml=avtozvuk.getXmlDocument();
        // delay object used to emulate a human reading the data rather than a parser
        Delay delay=new Delay(5,2);
        // object for saving the data
        Connection connection=null;
        try{
        Connector connector=new Connector("V:/eclipse_workspace/ShopList_HtmlParser/shop_list.gdb");
        connection=connector.getConnector().getConnection();
        }catch(Exception ex){
        System.out.println("WalkSection#main getConnection Exception: "+ex.getMessage());
        }
        AvtozvukSaver saver=new AvtozvukSaver(connection);
        //saver.resetAllRecord();
        WalkSection walk=new WalkSection();
        // install a parsing filter
        // NOTE(review): the literal below is mis-decoded Cyrillic text (presumably the
        // word for "Crossovers"); restore the proper encoding before re-enabling.
        walk.addAllowListener(new ScanSectionFilter(){
        private boolean parse=false;
        @Override
        public boolean isFilter(Element element) {
        if(element.getAttribute("caption").indexOf("Êðîññîâåðû")>=0){
        parse=true;
        }
        return this.parse;
        }
        });
        walk.walk(treeXml,
        "http://avtozvuk.ua",
        true,
        true,
        pageWalker,
        pageWalkerAware,
        delay,
        null,
        saver
        );
        try{
        connection.close();
        }catch(Exception ex){};
        System.out.println("end");
        */
    }
}