Skip to content

Commit

Permalink
Raise version to 0.4 and fix depth 0 so that it only runs indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
nitAI committed Jan 26, 2010
1 parent 92275c8 commit 738d9ac
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 24 deletions.
2 changes: 1 addition & 1 deletion default.properties
@@ -1,6 +1,6 @@
Name=Nutch
name=nutch-gui
version=0.4-dev
version=0.4
final.name=${name}-${version}
year=2009

Expand Down
49 changes: 26 additions & 23 deletions src/java/org/apache/nutch/crawl/CrawlTool.java
Expand Up @@ -147,27 +147,29 @@ public void crawl(Integer topn, Integer depth) throws IOException {
Path[] mergeSegments = HadoopFSUtil.getPaths(listStatus);
// list of all segments that will be deleted after indexing
Path[] segmentsToDelete = null;
try {
// merge segments
SegmentMerger segmentMerger = new SegmentMerger(_configuration);
Path mergeDir = new Path(segments, "merge-segments");
segmentMerger.merge(mergeDir, mergeSegments, false, false, 0);
// get merged segment
Path mergeSegTemp = _fileSystem.listStatus(mergeDir)[0].getPath();
// move merged segment to others
Path mergeSegment = new Path(segments, mergeSegTemp.getName());
_fileSystem.rename(mergeSegTemp, mergeSegment);
_fileSystem.delete(mergeDir, true);
// create statistic
hostStatistic.statistic(crawlDb, mergeSegment);
// use only merged segment
segmentsToDelete = mergeSegments;
mergeSegments = new Path[] { mergeSegment };
} catch (Exception e) {
e.printStackTrace();
if (i > 0) {
try {
// merge segments
SegmentMerger segmentMerger = new SegmentMerger(_configuration);
Path mergeDir = new Path(segments, "merge-segments");
segmentMerger.merge(mergeDir, mergeSegments, false, false, 0);
// get merged segment
Path mergeSegTemp = _fileSystem.listStatus(mergeDir)[0].getPath();
// move merged segment to others
Path mergeSegment = new Path(segments, mergeSegTemp.getName());
_fileSystem.rename(mergeSegTemp, mergeSegment);
_fileSystem.delete(mergeDir, true);
// create statistic
hostStatistic.statistic(crawlDb, mergeSegment);
// use only merged segment
segmentsToDelete = mergeSegments;
mergeSegments = new Path[] { mergeSegment };
} catch (Exception e) {
e.printStackTrace();
}
}

if (i > 0) {
if (mergeSegments.length > 0) {
linkDbTool.invert(linkDb, mergeSegments, true, true, false); // invert links

if (indexes != null) {
Expand Down Expand Up @@ -197,16 +199,17 @@ public void crawl(Integer topn, Integer depth) throws IOException {
} else {
LOG.warn("No URLs to fetch - check your seed list and URL filters.");
}
if (LOG.isInfoEnabled()) {
LOG.info("crawl finished: " + _crawlDir);
}


// delete old segments (only after indexing, so searching stays possible in the meantime)
if (segmentsToDelete != null) {
for (Path p : segmentsToDelete) {
_fileSystem.delete(p, true);
}
}

if (LOG.isInfoEnabled()) {
LOG.info("crawl finished: " + _crawlDir);
}
}

public FileSystem getFileSystem() {
Expand Down

0 comments on commit 738d9ac

Please sign in to comment.