Skip to content

Commit

Permalink
fix pr merge conflict
Browse files: browse the repository at this point in the history
  • Loading branch information
DeeJay0921 committed Jan 9, 2020
2 parents 60c5249 + 29cc929 commit 73531e2
Showing 1 changed file with 35 additions and 41 deletions.
76 changes: 35 additions & 41 deletions src/main/java/com/github/DeeJay0921/Main.java
Expand Up @@ -23,7 +23,6 @@
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

Expand All @@ -33,42 +32,47 @@ public static void main(String[] args) throws SQLException {
String DBurl = "jdbc:h2:file://" + workDir + "/news";
Connection connection = DriverManager.getConnection(DBurl);

while (true) {
List<String> linkPool = loadLinkPoolFromDataBase(connection, "select LINK from LINKS_TO_BE_PROCESSED"); // 从库里读取未处理过的连接池

if (linkPool.isEmpty()) {
break;
}

String link = linkPool.remove(linkPool.size() - 1); // 获取连接池最后一个链接并从数据库及内存中删除该链接
insertLinkIntoDatabase(connection, link, "delete from LINKS_TO_BE_PROCESSED where link = ?");

String link;
while ((link = getNextLinkThenDelete(connection)) != null) { // 从库里去加载下一条链接 如果能加载到才进行循环
// 直接去查询数据库看该link有没有被处理过
if (isLinkProcessed(connection, link)) {
continue;
}

if (isInterestingLink(link)) { // 如果是感兴趣的页面
String stringHtml = getStringHtml(validateLink(link));
System.out.println("link = " + link);
insertLinkIntoDatabase(connection, link, "insert into LINKS_ALREADY_PROCESSED values ( ? )");
Document document = Jsoup.parse(stringHtml);
Elements aLinks = document.select("a"); // 获取所有的a标签

// 将链接加入连接池
for (Element alink : aLinks) {
String href = alink.attr("href");
if (isInterestingLink(href)) {
insertLinkIntoDatabase(connection, href, "insert into LINKS_TO_BE_PROCESSED values ( ? )");
}
}
Document document = Jsoup.parse(getStringHtml(validateLink(link)));
// 将爬取到的新页面上的链接入库
insertNewLinksToDatabase(connection, document);
// 对于新闻页做额外处理
storeIntoDataBaseIfIsNews(connection, link, document);
// 将访问过的链接加入已处理的数据库
updateDataBase(connection, link, "insert into LINKS_ALREADY_PROCESSED values ( ? )");
}
}
}

/**
 * Queues the interesting hyperlinks found on a crawled page.
 *
 * <p>Reads the {@code href} attribute of every {@code <a>} element in {@code document}; each
 * value accepted by {@code isInterestingLink} is handed to {@code updateDataBase} together with
 * the insert statement for {@code LINKS_TO_BE_PROCESSED}.
 *
 * @param connection JDBC connection passed through to {@code updateDataBase}
 * @param document parsed page whose anchor elements are scanned
 * @throws SQLException propagated from {@code updateDataBase}
 */
private static void insertNewLinksToDatabase(Connection connection, Document document) throws SQLException {
    final String insertSql = "insert into LINKS_TO_BE_PROCESSED values ( ? )";
    // Walk every anchor tag on the page.
    for (Element anchor : document.select("a")) {
        String target = anchor.attr("href");
        if (!isInterestingLink(target)) {
            continue; // not a link we want to crawl later
        }
        updateDataBase(connection, target, insertSql);
    }
}

private static void insertLinkIntoDatabase(Connection connection, String link, String sql) throws SQLException {
/**
 * Fetches one pending link and then removes it from the pending table.
 *
 * <p>The link is read via {@code getNextLink}; when one is found, a delete for that link is
 * issued through {@code updateDataBase} before it is returned.
 *
 * @param connection JDBC connection used for both the query and the delete
 * @return the fetched link, or {@code null} when {@code getNextLink} returned none
 * @throws SQLException propagated from {@code getNextLink} or {@code updateDataBase}
 */
private static String getNextLinkThenDelete(Connection connection) throws SQLException {
    // LIMIT 1: pull a single link per call.
    String next = getNextLink(connection, "select LINK from LINKS_TO_BE_PROCESSED LIMIT 1");
    if (next == null) {
        return null;
    }
    updateDataBase(connection, next, "delete from LINKS_TO_BE_PROCESSED where link = ?");
    return next;
}

private static void updateDataBase(Connection connection, String link, String sql) throws SQLException {
PreparedStatement preparedStatement = null;
try {
preparedStatement = connection.prepareStatement(sql);
Expand Down Expand Up @@ -103,7 +107,7 @@ private static boolean isLinkProcessed(Connection connection, String link) throw
}

private static void storeIntoDataBaseIfIsNews(Connection connection, String link, Document document) throws SQLException {
Elements articleTags = document.select("article");
Elements articleTags = document.select(".art_box");
if (!articleTags.isEmpty()) {
for (Element articleTag : articleTags) {
String articleTitle = articleTag.child(0).text();
Expand All @@ -118,25 +122,15 @@ private static void storeIntoDataBaseIfIsNews(Connection connection, String link
}
}

private static List<String> loadLinkPoolFromDataBase(Connection connection, String sql) throws SQLException {
List<String> linkPool = new ArrayList<>();
PreparedStatement preparedStatement = null;
ResultSet resultSet = null;
try {
preparedStatement = connection.prepareStatement(sql);
resultSet = preparedStatement.executeQuery();
private static String getNextLink(Connection connection, String sql) throws SQLException {
String link = null;
try (PreparedStatement preparedStatement = connection.prepareStatement(sql);
ResultSet resultSet = preparedStatement.executeQuery()) {
while (resultSet.next()) {
linkPool.add(resultSet.getString(1));
}
} finally {
if (preparedStatement != null) {
preparedStatement.close();
}
if (resultSet != null) {
resultSet.close();
link = resultSet.getString(1);
}
}
return linkPool;
return link;
}

private static boolean isInterestingLink(String url) {
Expand Down

0 comments on commit 73531e2

Please sign in to comment.