From a6ad4609a35ad4143f279e7146f6f8e86ede1ea4 Mon Sep 17 00:00:00 2001
From: DeeJay0921 <1018805743@qq.com>
Date: Tue, 7 Jan 2020 15:37:40 +0800
Subject: [PATCH 1/2] =?UTF-8?q?#=20=E5=88=9D=E6=AD=A5=E5=AE=8C=E6=88=90?=
=?UTF-8?q?=E4=BA=86=E5=9F=BA=E6=9C=AC=E7=AE=97=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
初步的开发完成了一个可以运行的算法,代码需要下次重构
且解决了公司内网使用httpClient的证书信任问题,默认信任全部证书,该逻辑后期会删除掉
---
pom.xml | 7 ++
src/main/java/com/github/DeeJay0921/Main.java | 95 +++++++++++++++++--
src/main/java/com/github/DeeJay0921/MiTM.java | 30 ++++++
3 files changed, 125 insertions(+), 7 deletions(-)
create mode 100644 src/main/java/com/github/DeeJay0921/MiTM.java
diff --git a/pom.xml b/pom.xml
index 90db3ff..25d9531 100644
--- a/pom.xml
+++ b/pom.xml
@@ -39,6 +39,13 @@
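<artifactId>httpclient</artifactId>
<version>4.5.9</version>
+
+ <dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ <version>1.12.1</version>
+ </dependency>
+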
diff --git a/src/main/java/com/github/DeeJay0921/Main.java b/src/main/java/com/github/DeeJay0921/Main.java
index 655cc36..ef0e7f6 100644
--- a/src/main/java/com/github/DeeJay0921/Main.java
+++ b/src/main/java/com/github/DeeJay0921/Main.java
@@ -4,25 +4,97 @@
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
+import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import javax.net.ssl.SSLContext;
import java.io.IOException;
+import java.security.KeyManagementException;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
public class Main {
- public static void main(String[] args) throws IOException {
+ public static void main(String[] args) {
+ ArrayList linkPool = new ArrayList<>(); // 未处理过的连接池
+ linkPool.add("https://sina.cn");
+ Set handledLinkPool = new HashSet<>(); // 已经处理过的链接池
+ while (true) {
+ if (linkPool.isEmpty()) {
+ break;
+ }
+ String link = linkPool.get(0);
+ linkPool.remove(0); // 处理过后从未处理的连接池中删除该链接
+
+ if (handledLinkPool.contains(link)) { // 如果该链接已经处理过 跳出本次循环
+ continue;
+ }
+
+ if (!link.contains("sina.cn")) { // 如果不包含sina.cn 等关键字 说明不是新浪本站的页面 不做处理 直接跳过
+ continue;
+ } else { // 合法页面 进行请求
+ System.out.println("link = " + link);
+ String stringHtml = getStringHtml(link);
+ handledLinkPool.add(link); // 处理完成后加入已经处理的连接池
+ Document document = Jsoup.parse(stringHtml);
+ Elements aLinks = document.select("a"); // 获取所有的a标签
+
+ // 将链接加入连接池
+ for (Element alink : aLinks) {
+ String aLinkHref = alink.attr("href");
+ if (aLinkHref != null && aLinkHref.contains("sina.cn")) {
+ linkPool.add(alink.attr("href"));
+ }
+ }
+
+ // 对于新闻页做额外处理
+ Elements articleTags = document.select("article");
+ if (!articleTags.isEmpty()) {
+ for (Element articleTag : articleTags) {
+ String articleTitle = articleTag.child(0).text(); // 获取新闻文章标题 输出 之后改为入库
+ System.out.println("articleTitle = " + articleTitle);
+ }
+ }
+ }
+ }
+ }
+
+ private static String getStringHtml(String url) {
CloseableHttpClient httpclient = getHttpClient();
- HttpGet httpGet = new HttpGet("http://sina.cn");
+ HttpGet httpGet = new HttpGet(url);
+ httpGet.addHeader("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36");
+ String html = null;
try (CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
System.out.println(response1.getStatusLine());
HttpEntity entity1 = response1.getEntity();
- System.out.println("EntityUtils.toString(entity1) = " + EntityUtils.toString(entity1));
+ html = EntityUtils.toString(entity1);
+ } catch (IOException e) {
+ e.printStackTrace();
}
+ return html;
}
- private static CloseableHttpClient getHttpClient() throws IOException {
+ private static CloseableHttpClient getHttpClient() {
+ // 使httpClient信任所有证书 为了解决内网访问问题
+ SSLContext sslcontext = null; //建立证书实体
+ try {
+ sslcontext = SSLContext.getInstance("SSLv3");
+ javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
+ javax.net.ssl.TrustManager tm = new MiTM();
+ trustAllCerts[0] = tm;
+ sslcontext.init(null, trustAllCerts, null);
+ } catch (NoSuchAlgorithmException | KeyManagementException e) {
+ e.printStackTrace();
+ }
+ SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslcontext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
// 先调用一次 判断是否为内网 如果内网则要做代理
CloseableHttpClient testClient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet("http://sina.cn");
@@ -31,10 +103,19 @@ private static CloseableHttpClient getHttpClient() throws IOException {
HttpHost proxy = new HttpHost("10.30.6.49", 9090);
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
return HttpClients.custom()
- .setRoutePlanner(routePlanner)
- .build();
+ .setRoutePlanner(routePlanner)
+ .setSSLSocketFactory(sslsf)
+ .build();
}
+ } catch (IOException e) {
+ e.printStackTrace();
}
- return HttpClients.createDefault();
+ return HttpClients.custom().setSSLSocketFactory(sslsf).build();
+// HttpHost proxy = new HttpHost("10.30.6.49", 9090);
+// DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
+// return HttpClients.custom()
+// .setRoutePlanner(routePlanner)
+// .setSSLSocketFactory(sslsf)
+// .build();
}
}
diff --git a/src/main/java/com/github/DeeJay0921/MiTM.java b/src/main/java/com/github/DeeJay0921/MiTM.java
new file mode 100644
index 0000000..3927bde
--- /dev/null
+++ b/src/main/java/com/github/DeeJay0921/MiTM.java
@@ -0,0 +1,30 @@
+package com.github.DeeJay0921;
+
+public class MiTM implements javax.net.ssl.TrustManager,
+ javax.net.ssl.X509TrustManager {
+ public java.security.cert.X509Certificate[] getAcceptedIssuers() {
+ return null;
+ }
+
+ public boolean isServerTrusted(
+ java.security.cert.X509Certificate[] certs) {
+ return true;
+ }
+
+ public boolean isClientTrusted(
+ java.security.cert.X509Certificate[] certs) {
+ return true;
+ }
+
+ public void checkServerTrusted(
+ java.security.cert.X509Certificate[] certs, String authType)
+ throws java.security.cert.CertificateException {
+ return;
+ }
+
+ public void checkClientTrusted(
+ java.security.cert.X509Certificate[] certs, String authType)
+ throws java.security.cert.CertificateException {
+ return;
+ }
+}
\ No newline at end of file
From 9c94d64b829795fc221106b3828362213f135871 Mon Sep 17 00:00:00 2001
From: DeeJay0921 <1018805743@qq.com>
Date: Tue, 7 Jan 2020 15:53:05 +0800
Subject: [PATCH 2/2] =?UTF-8?q?#=20=E5=88=9D=E6=AD=A5=E5=AE=8C=E6=88=90?=
=?UTF-8?q?=E4=BA=86=E5=9F=BA=E6=9C=AC=E7=AE=97=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
初步的开发完成了一个可以运行的算法,代码需要下次重构
且解决了公司内网使用httpClient的证书信任问题,默认信任全部证书,该逻辑后期会删除掉
---
src/main/java/com/github/DeeJay0921/Main.java | 6 ++--
src/main/java/com/github/DeeJay0921/MiTM.java | 28 ++++++++-----------
2 files changed, 15 insertions(+), 19 deletions(-)
diff --git a/src/main/java/com/github/DeeJay0921/Main.java b/src/main/java/com/github/DeeJay0921/Main.java
index ef0e7f6..3f7c3fa 100644
--- a/src/main/java/com/github/DeeJay0921/Main.java
+++ b/src/main/java/com/github/DeeJay0921/Main.java
@@ -103,9 +103,9 @@ private static CloseableHttpClient getHttpClient() {
HttpHost proxy = new HttpHost("10.30.6.49", 9090);
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
return HttpClients.custom()
- .setRoutePlanner(routePlanner)
- .setSSLSocketFactory(sslsf)
- .build();
+ .setRoutePlanner(routePlanner)
+ .setSSLSocketFactory(sslsf)
+ .build();
}
} catch (IOException e) {
e.printStackTrace();
diff --git a/src/main/java/com/github/DeeJay0921/MiTM.java b/src/main/java/com/github/DeeJay0921/MiTM.java
index 3927bde..0563c67 100644
--- a/src/main/java/com/github/DeeJay0921/MiTM.java
+++ b/src/main/java/com/github/DeeJay0921/MiTM.java
@@ -1,30 +1,26 @@
package com.github.DeeJay0921;
-public class MiTM implements javax.net.ssl.TrustManager,
- javax.net.ssl.X509TrustManager {
- public java.security.cert.X509Certificate[] getAcceptedIssuers() {
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+import java.security.cert.X509Certificate;
+
+public class MiTM implements TrustManager, X509TrustManager {
+ public X509Certificate[] getAcceptedIssuers() {
return null;
}
- public boolean isServerTrusted(
- java.security.cert.X509Certificate[] certs) {
+ public boolean isServerTrusted(X509Certificate[] certs) {
return true;
}
- public boolean isClientTrusted(
- java.security.cert.X509Certificate[] certs) {
+ public boolean isClientTrusted(X509Certificate[] certs) {
return true;
}
- public void checkServerTrusted(
- java.security.cert.X509Certificate[] certs, String authType)
- throws java.security.cert.CertificateException {
- return;
+ public void checkServerTrusted(X509Certificate[] certs, String authType) {
}
- public void checkClientTrusted(
- java.security.cert.X509Certificate[] certs, String authType)
- throws java.security.cert.CertificateException {
- return;
+ public void checkClientTrusted(X509Certificate[] certs, String authType) {
}
-}
\ No newline at end of file
+}
+