Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: 给部分代码添加注释 #37

Merged
Merged 6 commits on Sep 6, 2021
Expand Up @@ -8,74 +8,103 @@
import java.util.List;
import java.util.Set;

/**
* 小作文比对工具类
*/
public class ArticleCompareUtil {
/**
 * Decides whether a similarity score counts as "highly similar" for an
 * article of the given length. Longer articles use a lower threshold.
 *
 * Thresholds (all strict, i.e. the score must exceed the threshold):
 *   textLength above 250  -> 0.5
 *   textLength above 150  -> 0.6
 *   otherwise             -> 0.7
 *
 * @param textLength length of the article
 * @param similarity similarity score to evaluate
 * @return true when similarity is strictly greater than the threshold
 *         selected by textLength
 */
public static boolean isHighSimilarity(int textLength, float similarity) {
    // Thresholds are float literals on purpose: comparing a float against a
    // double literal widens the float first, and 0.6f widens to a value
    // slightly above 0.6, which would make that tier accept a score of
    // exactly 0.6f even though the comparison is meant to be strict.
    final float threshold;
    if (textLength > 250) {
        threshold = 0.5f;
    } else if (textLength > 150) {
        threshold = 0.6f;
    } else {
        threshold = 0.7f;
    }
    return similarity > threshold;
}

public static String trim(String s) {
String stopWord = "[\\pP\\p{Punct}]";
s = s.replaceAll("\\s*", "");
/**
* 过滤空白字符、标点符号、零宽空格等特殊字符
*
* @param s 小作文
* @return 经过过滤后的小作文
*/
public static String trim(String s) {
String stopWord = "[\\pP\\p{Punct}]";
s = s.replaceAll("\\s*", "");
//去除控制字符
s = s.replaceAll("\\p{Cf}","");
s = s.replaceAll("\\p{Cf}", "");
//去除零宽空格等
s = s.replaceAll("/[\\u200b-\\u200f\\ufeff\\u202a-\\u202e]/g","");
s = s.replaceAll(stopWord, "");
return s;
}
s = s.replaceAll("/[\\u200b-\\u200f\\ufeff\\u202a-\\u202e]/g", "");
s = s.replaceAll(stopWord, "");
return s;
}

static List<String> getStringSegs(String s) {
int codePointCount = s.codePointCount(0, s.length());
if (codePointCount <= SummaryHash.DEFAULT_K) {
return Lists.newArrayList(s);
}
int startOffset = 0;
List<String> stringSegs = new ArrayList<>(codePointCount - SummaryHash.DEFAULT_K + 1);
for (int i = 0; i < codePointCount - SummaryHash.DEFAULT_K + 1; i++) {
String subString = unicodeSubString(s, startOffset, SummaryHash.DEFAULT_K);
startOffset = s.offsetByCodePoints(startOffset, 1);
stringSegs.add(subString);
}
return stringSegs;
}
static List<String> getStringSegs(String s) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

此函数使用滑动窗口将字符串分段以便比较,用到的地方就是两个文本段相同部分的标记(compareArticle函数)。可以换成新的算法来标记(前端用上了)。

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

注释加上了。

可以换成新的算法来标记(前端用上了)。

这个怎么说?

int codePointCount = s.codePointCount(0, s.length());
if (codePointCount <= SummaryHash.DEFAULT_K) {
return Lists.newArrayList(s);
}
int startOffset = 0;
List<String> stringSegs = new ArrayList<>(codePointCount - SummaryHash.DEFAULT_K + 1);
for (int i = 0; i < codePointCount - SummaryHash.DEFAULT_K + 1; i++) {
String subString = unicodeSubString(s, startOffset, SummaryHash.DEFAULT_K);
startOffset = s.offsetByCodePoints(startOffset, 1);
stringSegs.add(subString);
}
return stringSegs;
}

/**
 * Returns the substring that starts at char index idx and spans len code
 * points, so surrogate pairs are kept intact.
 *
 * @param str source string
 * @param idx char index at which the substring starts
 * @param len number of code points to include
 * @return the selected substring
 */
static String unicodeSubString(String str, int idx, int len) {
    final int endExclusive = str.offsetByCodePoints(idx, len);
    return str.substring(idx, endExclusive);
}

public static float compareArticle(String article1, String article2) {
int codePointsCount = article1.codePointCount(0, article1.length());
List<String> article1Segs = getStringSegs(article1);
Set<String> article2Segs = new HashSet<>(getStringSegs(article2));
float count = 0;
Set<Integer> redList = new HashSet<>();
for (int i = 0; i < codePointsCount - SummaryHash.DEFAULT_K + 1; i++) {
String seg = article1Segs.get(i);
if (article2Segs.contains(seg)) {
for (int j = 0; j < SummaryHash.DEFAULT_K; j++) {
redList.add(i + j);
}
}
}
for (int i = 0; i < codePointsCount; i++) {
if (redList.contains(i)) {
count += 1;
}
}
/**
* 获取两篇小作文的文本重复度
*
* @param article1
* @param article2
* @return 文本重复率
*/
public static float compareArticle(String article1, String article2) {
int codePointsCount = article1.codePointCount(0, article1.length());
List<String> article1Segs = getStringSegs(article1);
Set<String> article2Segs = new HashSet<>(getStringSegs(article2));
float count = 0;
Set<Integer> redList = new HashSet<>();
for (int i = 0; i < codePointsCount - SummaryHash.DEFAULT_K + 1; i++) {
String seg = article1Segs.get(i);
if (article2Segs.contains(seg)) {
for (int j = 0; j < SummaryHash.DEFAULT_K; j++) {
redList.add(i + j);
}
}
}
for (int i = 0; i < codePointsCount; i++) {
if (redList.contains(i)) {
count += 1;
}
}

return count / (float) codePointsCount;
}
return count / (float) codePointsCount;
}

/**
 * Returns the length of an article measured in Unicode code points.
 *
 * @param s the article text
 * @return 0 when StringUtils.isBlank(s) is true, otherwise the number of
 *         code points in s
 */
public static int textLength(String s) {
    return StringUtils.isBlank(s) ? 0 : s.codePointCount(0, s.length());
}
}