From a6ad4609a35ad4143f279e7146f6f8e86ede1ea4 Mon Sep 17 00:00:00 2001 From: DeeJay0921 <1018805743@qq.com> Date: Tue, 7 Jan 2020 15:37:40 +0800 Subject: [PATCH 1/2] =?UTF-8?q?#=20=E5=88=9D=E6=AD=A5=E5=AE=8C=E6=88=90?= =?UTF-8?q?=E4=BA=86=E5=9F=BA=E6=9C=AC=E7=AE=97=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 初步的开发完成了一个可以运行的算法,代码需要下次重构 且解决了公司内网使用httpClient的证书信任问题,默认信任全部证书,该逻辑后期会删除掉 --- pom.xml | 7 ++ src/main/java/com/github/DeeJay0921/Main.java | 95 +++++++++++++++++-- src/main/java/com/github/DeeJay0921/MiTM.java | 30 ++++++ 3 files changed, 125 insertions(+), 7 deletions(-) create mode 100644 src/main/java/com/github/DeeJay0921/MiTM.java diff --git a/pom.xml b/pom.xml index 90db3ff..25d9531 100644 --- a/pom.xml +++ b/pom.xml @@ -39,6 +39,13 @@ httpclient 4.5.9 + + + org.jsoup + jsoup + 1.12.1 + + diff --git a/src/main/java/com/github/DeeJay0921/Main.java b/src/main/java/com/github/DeeJay0921/Main.java index 655cc36..ef0e7f6 100644 --- a/src/main/java/com/github/DeeJay0921/Main.java +++ b/src/main/java/com/github/DeeJay0921/Main.java @@ -4,25 +4,97 @@ import org.apache.http.HttpHost; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; +import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.DefaultProxyRoutePlanner; import org.apache.http.util.EntityUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import javax.net.ssl.SSLContext; import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; public class Main { - public static void main(String[] args) throws IOException { + public static void main(String[] args) { + ArrayList linkPool = new ArrayList<>(); // 未处理过的连接池 + linkPool.add("https://sina.cn"); + Set handledLinkPool = new HashSet<>(); // 已经处理过的链接池 + while (true) { + if (linkPool.isEmpty()) { + break; + } + String link = linkPool.get(0); + linkPool.remove(0); // 处理过后从未处理的连接池中删除该链接 + + if (handledLinkPool.contains(link)) { // 如果该链接已经处理过 跳出本次循环 + continue; + } + + if (!link.contains("sina.cn")) { // 如果不包含sina.cn 等关键字 说明不是新浪本站的页面 不做处理 直接跳过 + continue; + } else { // 合法页面 进行请求 + System.out.println("link = " + link); + String stringHtml = getStringHtml(link); + handledLinkPool.add(link); // 处理完成后加入已经处理的连接池 + Document document = Jsoup.parse(stringHtml); + Elements aLinks = document.select("a"); // 获取所有的a标签 + + // 将链接加入连接池 + for (Element alink : aLinks) { + String aLinkHref = alink.attr("href"); + if (aLinkHref != null && aLinkHref.contains("sina.cn")) { + linkPool.add(alink.attr("href")); + } + } + + // 对于新闻页做额外处理 + Elements articleTags = document.select("article"); + if (!articleTags.isEmpty()) { + for (Element articleTag : articleTags) { + String articleTitle = articleTag.child(0).text(); // 获取新闻文章标题 输出 之后改为入库 + System.out.println("articleTitle = " + articleTitle); + } + } + } + } + } + + private static String getStringHtml(String url) { CloseableHttpClient httpclient = getHttpClient(); - HttpGet httpGet = new HttpGet("http://sina.cn"); + HttpGet httpGet = new HttpGet(url); + httpGet.addHeader("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"); + String html = null; try (CloseableHttpResponse response1 = httpclient.execute(httpGet)) { System.out.println(response1.getStatusLine()); HttpEntity entity1 = response1.getEntity(); - System.out.println("EntityUtils.toString(entity1) = " + EntityUtils.toString(entity1)); + html = EntityUtils.toString(entity1); + } catch (IOException e) { + e.printStackTrace(); } + return html; } - private static CloseableHttpClient getHttpClient() throws IOException { + private static CloseableHttpClient getHttpClient() { + // 使httpClient信任所有证书 为了解决内网访问问题 + SSLContext sslcontext = null; //建立证书实体 + try { + sslcontext = SSLContext.getInstance("SSLv3"); + javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1]; + javax.net.ssl.TrustManager tm = new MiTM(); + trustAllCerts[0] = tm; + sslcontext.init(null, trustAllCerts, null); + } catch (NoSuchAlgorithmException | KeyManagementException e) { + e.printStackTrace(); + } + SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslcontext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER); // 先调用一次 判断是否为内网 如果内网则要做代理 CloseableHttpClient testClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet("http://sina.cn"); @@ -31,10 +103,19 @@ private static CloseableHttpClient getHttpClient() throws IOException { HttpHost proxy = new HttpHost("10.30.6.49", 9090); DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy); return HttpClients.custom() - .setRoutePlanner(routePlanner) - .build(); + .setRoutePlanner(routePlanner) + .setSSLSocketFactory(sslsf) + .build(); } + } catch (IOException e) { + e.printStackTrace(); } - return HttpClients.createDefault(); + return HttpClients.custom().setSSLSocketFactory(sslsf).build(); +// HttpHost proxy = new HttpHost("10.30.6.49", 9090); +// DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy); +// return HttpClients.custom() +// .setRoutePlanner(routePlanner) +// .setSSLSocketFactory(sslsf) +// .build(); } } diff --git a/src/main/java/com/github/DeeJay0921/MiTM.java b/src/main/java/com/github/DeeJay0921/MiTM.java new file mode 100644 index 0000000..3927bde --- /dev/null +++ b/src/main/java/com/github/DeeJay0921/MiTM.java @@ -0,0 +1,30 @@ +package com.github.DeeJay0921; + +public class MiTM implements javax.net.ssl.TrustManager, + javax.net.ssl.X509TrustManager { + public java.security.cert.X509Certificate[] getAcceptedIssuers() { + return null; + } + + public boolean isServerTrusted( + java.security.cert.X509Certificate[] certs) { + return true; + } + + public boolean isClientTrusted( + java.security.cert.X509Certificate[] certs) { + return true; + } + + public void checkServerTrusted( + java.security.cert.X509Certificate[] certs, String authType) + throws java.security.cert.CertificateException { + return; + } + + public void checkClientTrusted( + java.security.cert.X509Certificate[] certs, String authType) + throws java.security.cert.CertificateException { + return; + } +} \ No newline at end of file From 9c94d64b829795fc221106b3828362213f135871 Mon Sep 17 00:00:00 2001 From: DeeJay0921 <1018805743@qq.com> Date: Tue, 7 Jan 2020 15:53:05 +0800 Subject: [PATCH 2/2] =?UTF-8?q?#=20=E5=88=9D=E6=AD=A5=E5=AE=8C=E6=88=90?= =?UTF-8?q?=E4=BA=86=E5=9F=BA=E6=9C=AC=E7=AE=97=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 初步的开发完成了一个可以运行的算法,代码需要下次重构 且解决了公司内网使用httpClient的证书信任问题,默认信任全部证书,该逻辑后期会删除掉 --- src/main/java/com/github/DeeJay0921/Main.java | 6 ++-- src/main/java/com/github/DeeJay0921/MiTM.java | 28 ++++++++----------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/main/java/com/github/DeeJay0921/Main.java b/src/main/java/com/github/DeeJay0921/Main.java index ef0e7f6..3f7c3fa 100644 --- a/src/main/java/com/github/DeeJay0921/Main.java +++ b/src/main/java/com/github/DeeJay0921/Main.java @@ -103,9 +103,9 @@ private static CloseableHttpClient getHttpClient() { HttpHost proxy = new HttpHost("10.30.6.49", 9090); DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy); return HttpClients.custom() - .setRoutePlanner(routePlanner) - .setSSLSocketFactory(sslsf) - .build(); + .setRoutePlanner(routePlanner) + .setSSLSocketFactory(sslsf) + .build(); } } catch (IOException e) { e.printStackTrace(); diff --git a/src/main/java/com/github/DeeJay0921/MiTM.java b/src/main/java/com/github/DeeJay0921/MiTM.java index 3927bde..0563c67 100644 --- a/src/main/java/com/github/DeeJay0921/MiTM.java +++ b/src/main/java/com/github/DeeJay0921/MiTM.java @@ -1,30 +1,26 @@ package com.github.DeeJay0921; -public class MiTM implements javax.net.ssl.TrustManager, - javax.net.ssl.X509TrustManager { - public java.security.cert.X509Certificate[] getAcceptedIssuers() { +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import java.security.cert.X509Certificate; + +public class MiTM implements TrustManager, X509TrustManager { + public X509Certificate[] getAcceptedIssuers() { return null; } - public boolean isServerTrusted( - java.security.cert.X509Certificate[] certs) { + public boolean isServerTrusted(X509Certificate[] certs) { return true; } - public boolean isClientTrusted( - java.security.cert.X509Certificate[] certs) { + public boolean isClientTrusted(X509Certificate[] certs) { return true; } - public void checkServerTrusted( - java.security.cert.X509Certificate[] certs, String authType) - throws java.security.cert.CertificateException { - return; + public void checkServerTrusted(X509Certificate[] certs, String authType) { } - public void checkClientTrusted( - java.security.cert.X509Certificate[] certs, String authType) - throws java.security.cert.CertificateException { - return; + public void checkClientTrusted(X509Certificate[] certs, String authType) { } -} \ No newline at end of file +} +