Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add (but don't use) question and comment scanning tasks
- Loading branch information
Showing
8 changed files
with
494 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
package net.zomis.duga | ||
|
||
import net.zomis.duga.chat.DugaPoster | ||
import net.zomis.duga.utils.stackexchange.StackExchangeApi | ||
import net.zomis.duga.utils.stackexchange.CommentsScanTask | ||
import net.zomis.duga.utils.stackexchange.ProgrammersClassification | ||
import net.zomis.duga.utils.stackexchange.QuestionScanTask | ||
import net.zomis.machlearn.text.TextClassification | ||
|
||
/**
 * Holds the question scan task and the comment scan task and exposes one suspend entry point for each.
 *
 * On construction the comment training set is read from the classpath, split into lines and handed to
 * [ProgrammersClassification.machineLearning]; the resulting [TextClassification] is passed on to the
 * [CommentsScanTask].
 *
 * @param poster passed to both scan tasks
 * @param stackApi passed to both scan tasks
 * @throws IllegalStateException if the training set resource cannot be found on the classpath
 */
class DugaTasks(private val poster: DugaPoster, private val stackApi: StackExchangeApi) {
    private val questionScanTask = QuestionScanTask(poster, stackApi, "codereview")
    private val programmersClassification: TextClassification
    private val commentsScanTask: CommentsScanTask

    init {
        val resourceName = "trainingset-programmers-comments.txt"
        // Fail fast with a clear message instead of silently passing null lines on to the training step.
        val trainingData = checkNotNull(this::class.java.classLoader.getResource(resourceName)) {
            "Training set resource '$resourceName' was not found on the classpath"
        }
        val lines = trainingData.readText().split("\n")
        this.programmersClassification = ProgrammersClassification.machineLearning(lines)
        this.commentsScanTask = CommentsScanTask(stackApi, programmersClassification, poster)
    }

    /** Runs the comment scan task once. */
    suspend fun commentScan() = commentsScanTask.run()

    /** Runs the question scan task once. */
    suspend fun answerInvalidation() = questionScanTask.run()

}
98 changes: 98 additions & 0 deletions
98
duga-ktor/src/utils/stackexchange/AnswerInvalidationCheck.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
package net.zomis.duga.utils.stackexchange | ||
|
||
import com.fasterxml.jackson.databind.JsonNode | ||
import net.zomis.duga.chat.DugaPoster | ||
import net.zomis.duga.utils.stackexchange.StackExchangeApi | ||
import org.apache.commons.text.StringEscapeUtils | ||
import org.slf4j.LoggerFactory | ||
import java.time.Instant | ||
|
||
/**
 * Detects edits to already-answered questions that changed the code inside the question body.
 *
 * A revision counts as a code change when the multi-line `<code>` sections of its `body` differ from those of
 * its `last_body` (see [stripNonCode]).
 */
object AnswerInvalidationCheck {

    private val logger = LoggerFactory.getLogger(AnswerInvalidationCheck::class.java)

    private const val CODE_OPEN = "<code>"
    private const val CODE_CLOSE = "</code>"

    // Hoisted so the patterns are compiled once rather than on every call.
    private val HORIZONTAL_WHITESPACE = Regex("[\\t ]")
    private val QUESTION_PATH = Regex("/questions/.*")

    /**
     * Walks the `items` of [result] and, for every question that was edited at or after [lastCheck] and has at
     * least one answer, fetches its revisions and posts a chat message when any of them changed code.
     *
     * @param poster used to post the status message (target "20298") and the invalidation notice (target "8595")
     * @param result the questions response; when null an error is logged and nothing else happens
     * @param lastCheck only edits at or after this instant are considered
     * @param stackExchangeAPI used to fetch the revisions of each candidate question
     * @throws IllegalStateException if the revisions of a question cannot be fetched
     */
    suspend fun perform(poster: DugaPoster, result: JsonNode?, lastCheck: Instant, stackExchangeAPI: StackExchangeApi) {
        if (result == null) {
            logger.error("No questions gathered for Answer Invalidation")
            return
        }
        logger.debug("Answer invalidation check")
        // A plain for-loop keeps the suspend calls below directly inside this suspend function.
        for (question in result.get("items")) {
            val edited = question.get("last_edit_date")?.asLong() ?: 0
            val questionLink = question.get("link").asText()
            val op = formatDisplayName(question.get("owner").get("display_name").asText())
            val questionId = question.get("question_id").asLong()
            if (edited < lastCheck.epochSecond || question.get("answer_count").asLong() <= 0) {
                continue
            }

            logger.info("edited: $questionId")
            val edits = stackExchangeAPI.apiCall(editCall(questionId), "codereview", "!9YdnS7lAD")
                ?: throw IllegalStateException("Unable to get edits for $questionId")
            // "${...}" is required here: "$edits.quota_remaining" would print the entire response JSON.
            poster.postMessage(
                "20298",
                "Edits fetched for $questionId: ${edits.get("items").size()}. " +
                    "quota remaining ${edits.get("quota_remaining")}"
            )

            val possibleInvalidations = codeChanges(edits, lastCheck)
            if (possibleInvalidations.isNotEmpty()) {
                val link = questionLink.replace(QUESTION_PATH, "/posts/$questionId/revisions")
                val editor = possibleInvalidations.joinToString(", ") {
                    formatDisplayName(it.get("user").get("display_name").asText())
                }
                poster.postMessage("8595", "*possible answer invalidation by $editor on question by $op:* $link")
            }
        }
    }

    /** Returns [displayName] with HTML entities unescaped. */
    fun formatDisplayName(displayName: String): String {
        return StringEscapeUtils.unescapeHtml4(displayName)
    }

    /** Returns true when [codeChanges] finds at least one revision for the given arguments. */
    fun codeChanged(edits: JsonNode, lastCheck: Instant): Boolean {
        return codeChanges(edits, lastCheck).isNotEmpty()
    }

    /**
     * Returns the revisions in the `items` of [edits] that changed code.
     *
     * A revision is skipped when it has no `last_body`, when its `creation_date` is before [lastCheck], or when
     * it is a rollback.
     *
     * @throws IllegalStateException if a body contains a `<code>` tag without a closing tag
     */
    fun codeChanges(edits: JsonNode, lastCheck: Instant): List<JsonNode> {
        return edits.get("items").filter { revision -> isCodeChange(revision, lastCheck) }
    }

    /** Decides whether a single revision is a non-rollback edit since [lastCheck] that altered code. */
    private fun isCodeChange(revision: JsonNode, lastCheck: Instant): Boolean {
        val lastBody = revision.get("last_body")
        if (lastBody == null || lastBody.isNull) {
            return false
        }
        if (revision.get("creation_date").asLong() < lastCheck.epochSecond) {
            return false
        }
        if (revision.get("is_rollback").asBoolean()) {
            return false
        }
        val code = stripNonCode(revision.get("body").asText())
        val codeBefore = stripNonCode(lastBody.asText())
        return code != codeBefore
    }

    /** Returns the API path used to fetch the revisions of post [id]. */
    fun editCall(id: Long) = "posts/$id/revisions"

    /**
     * Reduces a post body to the concatenated contents of its multi-line `<code>` sections.
     *
     * Tabs and spaces are removed first. Every `<code>…</code>` section whose content contains a newline
     * (a real one or the two-character sequence `\n`) is kept; single-line sections and all text outside
     * the tags are dropped.
     *
     * @throws IllegalStateException if a `<code>` tag has no closing `</code>` after it
     */
    fun stripNonCode(original: String): String {
        val post = original.replace(HORIZONTAL_WHITESPACE, "")
        val kept = StringBuilder()
        var cursor = 0
        while (true) {
            val open = post.indexOf(CODE_OPEN, cursor)
            if (open < 0) {
                break
            }
            val contentStart = open + CODE_OPEN.length
            // Search after the opening tag so a stray closing tag earlier in the text cannot be picked up.
            val close = post.indexOf(CODE_CLOSE, contentStart)
            check(close >= 0) { "Unclosed <code> tag in post body" }
            val code = post.substring(contentStart, close)
            if (code.contains("\\n") || code.contains('\n')) {
                kept.append(code)
            }
            cursor = close + CODE_CLOSE.length
        }
        return kept.toString()
    }

}
94 changes: 94 additions & 0 deletions
94
duga-ktor/src/utils/stackexchange/CommentClassification.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
package net.zomis.duga.utils.stackexchange | ||
|
||
import com.fasterxml.jackson.databind.JsonNode | ||
import org.slf4j.LoggerFactory | ||
import java.util.regex.Pattern | ||
|
||
/**
 * Keyword-based scoring of comments. Each scoring function starts from a base value and adds or subtracts
 * a fixed weight for every phrase that occurs in the comment text.
 */
object CommentClassification {

    private val logger = LoggerFactory.getLogger(CommentClassification::class.java)

    val ML_THRESHOLD = 0.3f
    val REAL = 0.49f
    val DEBUG = 0f

    /** Matches an anchor tag whose href is the softwareengineering.stackexchange.com root or a page under /help/. */
    val PROG_LINK = Pattern.compile(
        listOf(
            Pattern.quote("<a href=\"http"),
            "s?",
            Pattern.quote("://softwareengineering.stackexchange.com"),
            "(/|/help/.*)?",
            Pattern.quote("\">")
        ).joinToString("")
    )

    /** Substrings that short-circuit the programmers scoring to a fixed value. */
    private val ignoredFragments = arrayOf(
        "please stop using programmers.se as your toilet bowl",
        "/7265", // http://meta.programmers.stackexchange.com/questions/7265/when-is-a-software-licensing-question-on-topic
        "/7182" // http://meta.programmers.stackexchange.com/questions/7182/what-goes-on-programmers-se-a-guide-for-stack-overflow
    )

    /** Phrase weights for the programmers score, applied in this order. */
    private val programmersWeights: List<Pair<Float, String>> = listOf(
        0.3f to "better fit",
        0.3f to "better suited",
        0.3f to "better place",

        0.01f to "close",
        0.05f to "off-topic",
        0.05f to "design",
        0.05f to "whiteboard",
        0.05f to "this question",
        0.15f to "this site",
        0.2f to "programmers.se",
        0.07f to "help at",
        0.07f to "place to ask",
        0.15f to "migrate",
        0.1f to "belong",
        0.02f to "instead",
        0.03f to "the place for",

        0.03f to "try programmers",
        0.03f to "for programmers",
        0.03f to "on programmers",
        0.03f to "at programmers",
        0.03f to "to programmers"
    )

    /** Phrase weights added to the software recommendations score, applied in this order. */
    private val softwareRecsWeights: List<Pair<Float, String>> = listOf(
        0.3f to "software recommendations",
        0.3f to "softwarerecs"
    )

    /** Returns true when [body] contains a link matched by [PROG_LINK]. */
    fun bodyContainsProgrammersLink(body: String): Boolean = PROG_LINK.matcher(body).find()

    /**
     * Scores a comment node: 1.0 when its `body` contains a [PROG_LINK] match (otherwise 0),
     * plus the text score of its `body_markdown`.
     */
    fun calcInterestingLevelProgrammers(comment: JsonNode): Float {
        val linkBonus = when {
            bodyContainsProgrammersLink(comment.get("body").asText()) -> 1.0f
            else -> 0f
        }
        return linkBonus + calcInterestingLevelProgrammers(comment.get("body_markdown").asText())
    }

    /** Scores the `body_markdown` of a comment node for Software Recommendations. */
    fun calcInterestingLevelSoftwareRecs(comment: JsonNode): Float {
        val markdown = comment.get("body_markdown").asText()
        return calcInterestingLevelSoftwareRecs(markdown)
    }

    /**
     * Scores comment text for Software Recommendations: base 0.4, plus 0.3 for each matching phrase,
     * minus 0.55 when the text contains "/336". Matching is case-sensitive.
     */
    fun calcInterestingLevelSoftwareRecs(comment: String): Float {
        val gained = softwareRecsWeights.fold(0.4f) { total, (weight, phrase) ->
            total + score(weight, comment, phrase)
        }
        // Previously considered as separate, smaller penalties:
        // meta.softwarerecs.stackexchange.com/questions/336/ and meta.softwarerecs.stackexchange.com/q/336/
        return gained - score(0.55f, comment, "/336")
    }

    /**
     * Scores comment text for Programmers. The text is lower-cased first. Returns 0 when it does not
     * mention "programmers", 0.42 when it contains one of the ignored fragments, and otherwise 0.4 plus
     * the weight of every matching phrase.
     */
    fun calcInterestingLevelProgrammers(commentOriginal: String): Float {
        val lowered = commentOriginal.toLowerCase()
        return when {
            !lowered.contains("programmers") -> 0f
            isIgnored(lowered) -> 0.42f
            else -> programmersWeights.fold(0.4f) { total, (weight, phrase) ->
                total + score(weight, lowered, phrase)
            }
        }
    }

    /** True when [text] contains any of the ignored fragments. */
    private fun isIgnored(text: String): Boolean = ignoredFragments.any { fragment -> text.contains(fragment) }

    /** Returns [weight] (and logs the hit) when [text] contains [phrase]; otherwise returns 0. */
    private fun score(weight: Float, text: String, phrase: String): Float {
        if (!text.contains(phrase)) {
            return 0f
        }
        logger.info("$phrase --- $text --- $weight")
        return weight
    }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
package net.zomis.duga.utils.stackexchange | ||
|
||
import com.fasterxml.jackson.databind.JsonNode | ||
import kotlinx.coroutines.GlobalScope | ||
import kotlinx.coroutines.launch | ||
|
||
import java.time.Instant; | ||
|
||
import net.zomis.duga.chat.DugaPoster | ||
import net.zomis.machlearn.text.TextClassification | ||
import org.slf4j.LoggerFactory | ||
|
||
/**
 * Fetches recent comments for the "stackoverflow" site and posts links to the ones that look relevant
 * to the chat rooms obtained from [poster].
 *
 * The task keeps its own state between runs: the highest comment id and creation date seen so far,
 * the last reported API quota, and the earliest instant at which the next fetch is allowed.
 *
 * @param stackAPI used to fetch the comments
 * @param programmersClassification text classifier used by [programmersMLscore]
 * @param poster source of the chat rooms that messages are posted to
 */
class CommentsScanTask(val stackAPI: StackExchangeApi, val programmersClassification: TextClassification, val poster: DugaPoster) {
    private val logger = LoggerFactory.getLogger(CommentsScanTask::class.java)

    private var nextFetch = Instant.now()
    private var lastComment: Long = 0
    private var fromDate: Long = 0
    private var remainingQuota: Long = 0

    private val codeReview = poster.room("8595")
    private val debug = poster.room("20298")
    private val programmers = poster.room("21")
    private val softwareRecs = poster.room("20298")

    /** Returns true when the comment's `body_markdown` mentions "code review" or "codereview" (case-insensitive). */
    fun isInterestingCommentCR(comment: JsonNode): Boolean {
        val commentText = comment.get("body_markdown").asText().toLowerCase()
        return commentText.contains("code review") || commentText.contains("codereview")
    }

    /**
     * Performs one scan, unless the next allowed fetch time has not been reached yet.
     *
     * Comments are processed in reverse of the order returned; comments whose id is not greater than the
     * highest id seen in earlier runs are skipped. Any exception other than cancellation is logged and
     * reported to the debug room instead of being propagated.
     */
    suspend fun run() {
        if (!Instant.now().isAfter(nextFetch)) {
            return
        }

        try {
            val comments = stackAPI.fetchComments("stackoverflow", fromDate)
            if (comments == null) {
                logger.error("Unable to get comments from $fromDate")
                return
            }
            val currentQuota = comments.get("quota_remaining").asLong()
            if (currentQuota > remainingQuota && fromDate != 0L) {
                debug.post(Instant.now().toString() + " Quota has been reset. Was " +
                    remainingQuota + " is now " + currentQuota)
            }
            remainingQuota = currentQuota

            val backoff = comments.get("backoff")
            if (backoff != null) {
                nextFetch = Instant.now().plusSeconds(backoff.asLong() + 10)
                debug.post(Instant.now().toString() +
                    " Next fetch: " + nextFetch + " because of backoff " + backoff)
                return
            }

            val items = comments.get("items") ?: return
            if (items.size() >= 100) {
                debug.post(Instant.now().toString() + " Warning: Retrieved 100 comments. Might have missed some.")
            }

            // Compare against the id from before this run; lastComment itself is advanced inside the loop.
            val previousLastComment = lastComment
            for (comment in items.reversed()) {
                val commentId = comment.get("comment_id").asLong()
                if (commentId <= previousLastComment) {
                    continue
                }
                lastComment = maxOf(commentId, lastComment)
                fromDate = maxOf(comment.get("creation_date").asLong(), fromDate)
                if (isInterestingCommentCR(comment)) {
                    logComment(comment, "Code Review")
                    codeReview.post(comment.get("link").asText())
                }

                classifyProgrammers(comment)

                val softwareCertainty = CommentClassification.calcInterestingLevelSoftwareRecs(comment)
                if (softwareCertainty >= CommentClassification.REAL) {
                    softwareRecs.post(comment.get("link").asText())
                }
            }
        } catch (e: java.util.concurrent.CancellationException) {
            // Coroutine cancellation must propagate; swallowing it would keep a cancelled task alive.
            throw e
        } catch (e: Exception) {
            logger.error("Error retrieving comments", e)
            debug.post(Instant.now().toString() + " Exception in comment task " + e)
        }
    }

    /**
     * Logs a one-line summary of [comment]: id, post type and id, owner name and reputation, and markdown body.
     * Fields missing from the node are logged as "null".
     *
     * @param site label placed at the start of the log line
     */
    fun logComment(comment: JsonNode, site: String) {
        // Each field needs "${...}": "$comment.comment_id" would print the whole node followed by ".comment_id".
        val owner = comment.get("owner")
        logger.info("$site comment ${comment.get("comment_id")?.asText()} " +
            "on ${comment.get("post_type")?.asText()} ${comment.get("post_id")?.asText()} " +
            "posted by ${owner?.get("display_name")?.asText()} " +
            "with ${owner?.get("reputation")?.asText()} reputation: ${comment.get("body_markdown")?.asText()}")
    }

    /**
     * Scores [comment] with both the keyword classification and the ML classification. The link is posted
     * to the programmers room when the ML score reaches [CommentClassification.ML_THRESHOLD]; both scores
     * and the link are posted to the debug room when the ML score reaches [CommentClassification.DEBUG].
     */
    fun classifyProgrammers(comment: JsonNode) {
        val oldClassification = CommentClassification.calcInterestingLevelProgrammers(comment)
        val mlScore = programmersMLscore(comment)

        if (mlScore >= CommentClassification.ML_THRESHOLD) {
            programmers.postAsync(comment.get("link").asText())
        }

        if (mlScore >= CommentClassification.DEBUG) {
            logComment(comment, "Software Engineering (ML $mlScore old $oldClassification)")
            val certaintyLevelMessage =
                "ML Classification " + mlScore +
                    " (Old classification " + oldClassification + ")"
            // NOTE(review): GlobalScope is unstructured; kept because this function is not suspend and its
            // signature must stay unchanged. Consider passing in a lifecycle-bound scope.
            GlobalScope.launch {
                debug.post(certaintyLevelMessage)
                debug.post(comment.get("link").asText())
            }
        }
    }

    /**
     * Returns the ML classification score for the comment's `body_markdown`, or -1.0 without consulting the
     * classifier when the text mentions none of "programmers", "softwareeng" or "software eng".
     */
    fun programmersMLscore(comment: JsonNode): Double {
        val text = comment.get("body_markdown").asText()
        val lowered = text.toLowerCase()
        val mentionsSite = lowered.contains("programmers") ||
            lowered.contains("softwareeng") ||
            lowered.contains("software eng")
        if (!mentionsSite) {
            // No need to check with the Machine Learning system in this case
            return -1.0
        }
        return programmersClassification.score(text)
    }
}
Oops, something went wrong.