-
Notifications
You must be signed in to change notification settings - Fork 2
/
Harvester.java
239 lines (197 loc) · 7.57 KB
/
Harvester.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
package de.biofid.services.crawler;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import de.biofid.services.crawler.Item.UnsupportedOutputFormatException;
import de.biofid.services.crawler.filter.Filter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONObject;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
 * A Harvester crawls the data of some website and draws the literature items from it.
 *
 * <p>This is an abstract class; subclasses must override {@link #nextItem(Item)} and
 * {@link #getFolderName()}. The static base output directory has to be set via
 * {@link #setOutputDirectory(String)} before any Harvester is instantiated.
 *
 * @author Adrian Pachzelt (University Library Johann Christian Senckenberg, Frankfurt)
 * @author https://www.biofid.de
 * @version 1.0
 */
public abstract class Harvester {

    protected static final String ITEM_COMPLETE_METADATA = "Item";

    /** Single mapper reused by {@link #toJsonObject(Object)} instead of creating one per call. */
    private static final ObjectMapper JSON_MAPPER = new ObjectMapper();

    /** Root directory below which every harvester creates its own working directory. */
    private static String baseOutputDirectory = null;

    protected Configuration configuration;
    protected long millisecondsDelayBetweenRequests = 0;
    protected List<Filter> filters = new ArrayList<>();

    // Logging for all sub-classes
    protected Logger logger = LogManager.getLogger(LiteratureHarvester.LOGGER_NAME);

    /**
     * Subclasses HAVE TO have a constructor that takes a Configuration object as the only parameter!
     *
     * @param configuration The configuration of this harvester.
     * @throws UnsetHarvesterBaseDirectoryException if {@link #setOutputDirectory(String)} has not been
     *         called with a non-null directory beforehand.
     */
    protected Harvester(Configuration configuration) throws UnsetHarvesterBaseDirectoryException {
        if (baseOutputDirectory == null) {
            logger.fatal("The harvester output directory is not set!");
            throw new UnsetHarvesterBaseDirectoryException("The base output directory has to be set!");
        }

        this.configuration = configuration;
        logger.info("Instantiated {} with the configuration: \n{}", this.getClass().getName(), configuration);
    }

    /**
     * Set the directory where to write all downloaded data.
     *
     * @param outputDirectory The base directory shared by all harvesters.
     */
    public static void setOutputDirectory(String outputDirectory) {
        Harvester.baseOutputDirectory = outputDirectory;
    }

    /**
     * Sets the filters every harvested item is checked against.
     *
     * @param filters The filters to apply; {@code null} is treated as "no filters".
     */
    public void setFilters(List<Filter> filters) {
        // Guard against null so that isFilteredOut() can always iterate safely.
        this.filters = (filters == null) ? new ArrayList<>() : filters;
    }

    /**
     * Returns the directory this harvester writes to: the base output directory plus the
     * lower-cased folder name given by {@link #getFolderName()}.
     *
     * @return The working directory path of this harvester.
     */
    public final Path getWorkingDirectory() {
        // Locale.ROOT keeps the folder name independent of the JVM's default locale
        // (e.g. the Turkish locale would otherwise map 'I' to a dotless 'ı').
        return Paths.get(baseOutputDirectory, getFolderName().toLowerCase(Locale.ROOT));
    }

    /**
     * This function can be called to start the harvesting of a specific internet source.
     *
     * <p>It creates the working directory and then requests items via {@link #nextItem(Item)} until
     * that method returns {@code false} or the executing thread is interrupted. Items that pass all
     * filters, or that have an open license, are written to the working directory.
     */
    public final void run() {
        String harvesterName = this.getClass().getName();

        try {
            createWorkingDirectory();
            createOutputDirectory();
        } catch (IOException e) {
            // The trailing throwable is logged with its stack trace.
            logger.fatal("Could not create working directories for '{}'!", harvesterName, e);
            return;
        }

        while (true) {
            Item item = createNewEmptyItem();
            item.setSaveMetadataOnly(this.configuration.isOnlyMetadata());

            pause();
            if (Thread.currentThread().isInterrupted()) {
                // Honor the cancellation request instead of continuing without request delays.
                logger.warn("Harvester {} was interrupted and stops processing.", harvesterName);
                break;
            }

            boolean next = nextItem(item);
            if (!next) {
                logger.info("All items of Harvester {} processed!", harvesterName);
                break;
            }

            if (!isFilteredOut(item) || hasItemOpenLicense(item)) {
                processItem(item);
            } else {
                logger.info("Item ID {} did not comply with the given filters and is not provided under " +
                        "an open license and hence is not further processed.",
                        item.getItemId());
            }
        }
    }

    /**
     * Checks, if a given {@link Item} is rejected by any of the configured {@link Filter}s.
     *
     * @param item The {@link Item} object to check.
     * @return True, if at least one filter reports the {@link Item} as invalid. False, if the item
     *         complies with all filters (or no filters are configured).
     */
    public boolean isFilteredOut(Item item) {
        for (Filter filter : filters) {
            if (!filter.isItemValid(item)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the copyright status of the item is either "not in copyright" or
     * "Creative Commons license".
     */
    private boolean hasItemOpenLicense(Item item) {
        Item.CopyrightStatus status = item.getCopyrightStatus();
        return status == Item.CopyrightStatus.NOT_IN_COPYRIGHT
                || status == Item.CopyrightStatus.CREATIVE_COMMONS_LICENSE;
    }

    /**
     * Creates the empty item that is handed to {@link #nextItem(Item)} to be filled.
     *
     * @return A new, empty {@link Item}.
     */
    public Item createNewEmptyItem() {
        return new Item();
    }

    /**
     * Sets the time to wait before each item request.
     *
     * @param millisecondsDelay The delay in milliseconds; values of zero or below disable the pause.
     */
    public void setRequestDelayInMilliseconds(long millisecondsDelay) {
        millisecondsDelayBetweenRequests = millisecondsDelay;
    }

    /**
     * A function that simply returns the name of the folder where the Items should be stored.
     *
     * @return A folder name
     */
    protected abstract String getFolderName();

    /**
     * This function produces a single item per call.
     *
     * <p>Child classes should provide the necessary metadata to fill the given, empty Item object.
     * However, the given Item object is not returned! Instead, a boolean is returned that indicates
     * whether or not the item was filled. If the returned boolean is false, the stream stops and no
     * further items are requested. This also means that even if the given item object was filled and
     * the function returns false, this item object is NOT processed!
     *
     * <p>This approach was chosen because it gives a better granularity and higher flexibility in the
     * data. If an item was returned, the harvester might not be able to tell if this item was filled
     * with all necessary data and when to stop. Setting some variable within the Item object would be
     * less explicit than simply returning a boolean from the function.
     *
     * @param item An Item object to be filled.
     * @return A boolean to indicate if there are more items to come. If false, the stream stops and
     *         the function is not called anymore.
     */
    protected abstract boolean nextItem(Item item);

    /**
     * Sleeps for the configured request delay. If the thread is interrupted while sleeping, the
     * interrupt flag is restored so that callers can react to the cancellation.
     */
    protected void pause() {
        if (millisecondsDelayBetweenRequests <= 0) {
            // Nothing to wait for; also avoids the IllegalArgumentException of Thread.sleep(negative).
            return;
        }

        try {
            Thread.sleep(millisecondsDelayBetweenRequests);
        } catch (InterruptedException ex) {
            logger.error(ex.getMessage());
            // Catching InterruptedException clears the flag; set it again for the caller.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Serializes the given object to a JSON string with Jackson and parses it into a
     * {@link JSONObject}.
     *
     * @param obj The object to convert.
     * @return The JSON representation of the given object.
     * @throws JsonProcessingException if Jackson fails to serialize the object.
     */
    protected JSONObject toJsonObject(Object obj) throws JsonProcessingException {
        String metadataJsonString = JSON_MAPPER.writeValueAsString(obj);
        return new JSONObject(metadataJsonString);
    }

    /**
     * Makes sure the given path exists as a directory, creating it (and missing parents) if needed.
     *
     * @return True, if the path is a directory after the call. False, if it could not be created,
     *         e.g. because a regular file with that name already exists.
     */
    private boolean createDirectoryIfNotExisting(Path pathToCreate) {
        File pathFile = pathToCreate.toFile();
        // The final isDirectory() covers the case that mkdirs() returned false only because the
        // directory was created by someone else in the meantime.
        return pathFile.isDirectory() || pathFile.mkdirs() || pathFile.isDirectory();
    }

    /**
     * Verifies that the working directory is present.
     *
     * @throws IOException if the working directory does not exist as a directory.
     */
    private boolean createOutputDirectory() throws IOException {
        Path baseDirectory = getWorkingDirectory();
        if (!baseDirectory.toFile().isDirectory()) {
            throw new CouldNotCreateDirectoryException("The base directory " + baseDirectory.toString() +
                    " does not exists and so the working folders could not be created!");
        }
        return true;
    }

    /**
     * Creates the working directory of this harvester if it is missing.
     *
     * @throws IOException if the directory could not be created.
     */
    private boolean createWorkingDirectory() throws IOException {
        Path workingDirectory = getWorkingDirectory();
        if (!createDirectoryIfNotExisting(workingDirectory)) {
            throw new CouldNotCreateDirectoryException("The directory " + workingDirectory.toString() +
                    " could not be created");
        }
        return true;
    }

    /**
     * Writes the text files and the XML metadata file of the given item to the working directory.
     *
     * @return True, if the metadata file was written. False, if the output format was not supported.
     */
    private boolean processItem(Item item) {
        String outputPathString = getWorkingDirectory().toString();
        boolean overwriteExistingFiles = configuration.isOverwrittingEnabled();

        item.writeTextFiles(outputPathString, overwriteExistingFiles);

        try {
            item.writeMetadataFile(outputPathString, Item.FileType.XML);
        } catch (UnsupportedOutputFormatException ex) {
            // Hand the exception to the logger so the stack trace is rendered properly.
            logger.error("Writing of the metadata of item ID {} failed!", item.getItemId(), ex);
            return false;
        }

        return true;
    }

    /**
     * Signals that a directory required by the harvester is missing or could not be created.
     * Declared static so that instances do not hold a reference to the enclosing Harvester.
     */
    static class CouldNotCreateDirectoryException extends IOException {
        private static final long serialVersionUID = -8144628595804556669L;

        CouldNotCreateDirectoryException(String s) {
            super(s);
        }
    }

    /**
     * Signals that a Harvester was instantiated before the base output directory was set.
     * Declared static so that instances do not hold a reference to the enclosing Harvester.
     */
    static class UnsetHarvesterBaseDirectoryException extends IOException {
        private static final long serialVersionUID = -235562017414278915L;

        UnsetHarvesterBaseDirectoryException(String s) {
            super(s);
        }
    }
}