Skip to content

Code smells #1007

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
6 changes: 2 additions & 4 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
Original file line number Diff line number Diff line change
@@ -49,12 +49,10 @@ public class Page {

private byte[] bytes;

private List<Request> targetRequests = new ArrayList<Request>();
private List<Request> targetRequests = new ArrayList<>();

private String charset;

public Page() {
}

public static Page fail(){
Page page = new Page();
@@ -105,9 +103,9 @@ public Json getJson() {

/**
* Stores the given {@link Html} as this page's html.
*
* @param html html
* @deprecated since 0.4.0
* The html is parsed only on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
*/

public void setHtml(Html html) {
this.html = html;
}
17 changes: 10 additions & 7 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
Original file line number Diff line number Diff line change
@@ -62,7 +62,7 @@ public class Spider implements Runnable, Task {

protected Downloader downloader;

protected List<Pipeline> pipelines = new ArrayList<Pipeline>();
protected List<Pipeline> pipelines = new ArrayList<>();

protected PageProcessor pageProcessor;

@@ -86,11 +86,11 @@ public class Spider implements Runnable, Task {

protected boolean exitWhenComplete = true;

protected final static int STAT_INIT = 0;
protected static final int STAT_INIT = 0;

protected final static int STAT_RUNNING = 1;
protected static final int STAT_RUNNING = 1;

protected final static int STAT_STOPPED = 2;
protected static final int STAT_STOPPED = 2;

protected boolean spawnUrl = true;

@@ -246,7 +246,7 @@ public Spider setPipelines(List<Pipeline> pipelines) {
* @return this
*/
public Spider clearPipeline() {
pipelines = new ArrayList<Pipeline>();
pipelines = new ArrayList<>();
return this;
}

@@ -313,7 +313,8 @@ public void run() {
// wait until new url added
waitNewUrl();
} else {
threadPool.execute(new Runnable() {
threadPool.execute(
new Runnable() {
@Override
public void run() {
try {
@@ -427,7 +428,6 @@ private void onDownloadSuccess(Request request, Page page) {
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
sleep(site.getSleepTime());
return;
}

private void onDownloaderFail(Request request) {
@@ -458,6 +458,8 @@ protected void sleep(int time) {
Thread.sleep(time);
} catch (InterruptedException e) {
logger.error("Thread interrupted when sleep",e);
//restore interrupted thread
Thread.currentThread().interrupt();
}
}

@@ -564,6 +566,7 @@ private void waitNewUrl() {
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
logger.warn("waitNewUrl - interrupted, error {}", e);
Thread.currentThread().interrupt();
} finally {
newUrlLock.unlock();
}
Original file line number Diff line number Diff line change
@@ -15,7 +15,9 @@
*/
@Experimental
public class SmartContentSelector implements Selector {

/**
* Default no-argument constructor for SmartContentSelector; its body is empty.
*/
public SmartContentSelector() {
}

@@ -33,7 +35,7 @@ public String select(String html) {
int start;
int end;
StringBuilder text = new StringBuilder();
ArrayList<Integer> indexDistribution = new ArrayList<Integer>();
ArrayList<Integer> indexDistribution = new ArrayList<>();

lines = Arrays.asList(html.split("\n"));

@@ -47,39 +49,42 @@ public String select(String html) {
}

start = -1; end = -1;
boolean boolstart = false, boolend = false;
boolean boolstart = false;
boolean boolend = false;
text.setLength(0);

for (int i = 0; i < indexDistribution.size() - 1; i++) {
if (indexDistribution.get(i) > threshold && ! boolstart) {
if (indexDistribution.get(i+1).intValue() != 0

int i=0;
while (i < indexDistribution.size() - 1) {

if ((indexDistribution.get(i) > threshold && ! boolstart)
&& (indexDistribution.get(i+1).intValue() != 0
|| indexDistribution.get(i+2).intValue() != 0
|| indexDistribution.get(i+3).intValue() != 0) {
|| indexDistribution.get(i+3).intValue() != 0) ){
boolstart = true;
start = i;
continue;
i++;
}
}
if (boolstart) {
if (indexDistribution.get(i).intValue() == 0
|| indexDistribution.get(i+1).intValue() == 0) {

if ((boolstart) && (indexDistribution.get(i).intValue() == 0
|| indexDistribution.get(i+1).intValue() == 0) ){
end = i;
boolend = true;
}
}


StringBuilder tmp = new StringBuilder();
if (boolend) {
//System.out.println(start+1 + "\t\t" + end+1);
for (int ii = start; ii <= end; ii++) {
if (lines.get(ii).length() < 5) continue;
if (lines.get(ii).length() < 5) i++;
tmp.append(lines.get(ii) + "\n");
}
String str = tmp.toString();
//System.out.println(str);
if (str.contains("Copyright") ) continue;

if (str.contains("Copyright")) i++;
text.append(str);
boolstart = boolend = false;
}
i++;
}
return text.toString();
}
Original file line number Diff line number Diff line change
@@ -11,6 +11,14 @@
public class FilePersistentBase {

protected String path;

/**
* Creates an instance that uses the default path "/data/webmagic/".
*
* NOTE(review): setPath is invoked from the constructor; if subclasses can
* override it, the override would run before the subclass is initialized — confirm.
*/
public FilePersistentBase() {
setPath("/data/webmagic/");
}

/**
* Creates an instance that uses the given path.
*
* NOTE(review): setPath is invoked from the constructor; if subclasses can
* override it, the override would run before the subclass is initialized — confirm.
*
* @param path the path handed to setPath
*/
public FilePersistentBase(String path) {
setPath(path);
}

// NOTE(review): public, static and not final, so any code can reassign it;
// presumably meant to be a constant — confirm before adding `final`.
// The identifier is misspelled ("SEPERATOR") but is public, so renaming would break callers.
public static String PATH_SEPERATOR = "/";

Original file line number Diff line number Diff line change
@@ -28,11 +28,11 @@ public class FilePageModelPipeline extends FilePersistentBase implements PageMod
* new FilePageModelPipeline with default path "/data/webmagic/"
*/
public FilePageModelPipeline() {
setPath("/data/webmagic/");
super();
}

public FilePageModelPipeline(String path) {
setPath(path);
super(path);
}

@Override
Original file line number Diff line number Diff line change
@@ -29,11 +29,11 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag
* new JsonFilePageModelPipeline with default path "/data/webmagic/"
*/
public JsonFilePageModelPipeline() {
setPath("/data/webmagic/");
super();
}

public JsonFilePageModelPipeline(String path) {
setPath(path);
super(path);
}

@Override
Original file line number Diff line number Diff line change
@@ -12,7 +12,7 @@
*/
public abstract class IPUtils {

public static String getFirstNoLoopbackIPAddresses() throws SocketException {
public static String getFirstNoLoopbackIPAddresses() throws SocketException, NullPointerException{

Enumeration<NetworkInterface> networkInterfaces = NetworkInterface.getNetworkInterfaces();