Merge pull request #6985 from Charcoal-SE/Mak-add-edit-watcher
Don't re-scan unchanged, recently-scanned posts; Add EditWatcher to detect grace period edits

autopull
makyen committed May 17, 2022
2 parents 6a8af47 + 4aec6ba commit a3fd27f
Showing 13 changed files with 1,466 additions and 284 deletions.
787 changes: 636 additions & 151 deletions bodyfetcher.py

Large diffs are not rendered by default.
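
Since the bodyfetcher.py diff is not shown here, the following is a rough, hypothetical sketch of the behavior the commit message describes: keep a small per-post record of the last scan and skip re-scanning posts whose content has not changed since a recent scan. The cache key, tuple layout, threshold, and function name below are illustrative assumptions, not the committed implementation.

import time

# Illustrative only: the real logic lives in the unrendered bodyfetcher.py changes.
RESCAN_MIN_AGE = 60 * 5  # assumed threshold; the actual value is not shown in this diff

recently_scanned_posts = {}  # (hostname, post_id) -> (last_scan_time, content_fingerprint)


def should_scan(hostname, post_id, body):
    """Return True when a post is new, changed, or hasn't been scanned recently."""
    key = (hostname, post_id)
    now = time.time()
    previous = recently_scanned_posts.get(key)
    if previous is not None:
        last_scan_time, fingerprint = previous
        if fingerprint == hash(body) and now - last_scan_time < RESCAN_MIN_AGE:
            return False  # unchanged and recently scanned: skip it
    recently_scanned_posts[key] = (now, hash(body))
    return True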

59 changes: 50 additions & 9 deletions chatcommands.py
@@ -938,17 +938,50 @@ def metasmoke(msg, alias_used):
" Auto status switch: **{}abled**.".format("dis" if forced else "en")


@command(aliases=["scan-stat", "statistics"])
@command(aliases=["scan-stat", "statistics", "stats"])
def stat():
""" Return post scan statistics. """
posts_scanned, scan_time, posts_per_second = GlobalVars.PostScanStat.get_stat()
stat_msg = "Posts scanned: {}; Scan time: {}".format(posts_scanned, scan_time)

rate_msg = ""
if posts_per_second:
rate_msg = "; Posts scanned per second: {}".format(posts_per_second)

return stat_msg + rate_msg
# As of Python 3.6+, dicts are iterated in insertion order.
report_order_with_defaults = {
'posts_scanned': 0,
'scan_time': 0,
'posts_per_second': 0,
'grace_period_edits': 0,
'unchanged_posts': 0,
'no_post_lock': 0,
'errors': 0,
'max_scan_time': 0,
'max_scan_time_post': '',
}
# posts_scanned, scan_time, posts_per_second = GlobalVars.PostScanStat.get_stat()
# get_stats() gets a copy of the stats, not the actual reference.
stats = GlobalVars.PostScanStat.get_stats()
# First, we deal with converting the max_scan_time_post into a Markdown link. We don't know if the post
# is an answer or question, and SE doesn't care wrt. the URL used.
site_post = stats.get('max_scan_time_post', None)
if site_post:
site, _, post_id = site_post.partition('/')
stats['max_scan_time_post'] = '[{}](//{}/q/{})'.format(site_post, site, post_id)
# posts_scanned, questions_scanned, answers_scanned = (stats.pop(key + '_scanned', 0)
posts_questions_answers_scanned = tuple(stats.pop(key + '_scanned', 0) for key in ['posts', 'questions', 'answers'])
stats['posts_scanned'] = '{}, Q({}), A({})'.format(*posts_questions_answers_scanned)
q_and_a_unchanged = tuple(stats.pop('unchanged_' + key, 0) for key in ['questions', 'answers'])
stats['unchanged_posts'] = '{}, Q({}), A({})'.format(sum(q_and_a_unchanged), *q_and_a_unchanged)
# Round all the floats to 2 digits after the decimal point
for key, value in stats.items():
if type(value) is float:
stats[key] = round(value, 2)
# For the stats we have a defined order; use that order.
# We're not using .capitalize() on the entire key, so any internal capitalization is preserved.
messages = ['{}: {}'.format(key[0].capitalize() + key[1:].replace('_', ' '), stats.get(key, default_value))
for key, default_value in report_order_with_defaults.items()]
for key in report_order_with_defaults.keys():
stats.pop(key, None)
# Add any additional stats we don't have in report_order_with_defaults
messages.extend(['{}: {}'.format(key[0].capitalize() + key[1:].replace('_', ' '), value)
for key, value in stats.items()])

return '; '.join(messages)
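
# Example (not part of the diff): how the code above turns max_scan_time_post into a
# Markdown chat link. The stored value has the form "<hostname>/<post id>"; the sample
# value below is made up for illustration.
site_post = 'stackoverflow.com/12345678'
site, _, post_id = site_post.partition('/')
link = '[{}](//{}/q/{})'.format(site_post, site, post_id)
# link == '[stackoverflow.com/12345678](//stackoverflow.com/q/12345678)'
# Per the comment above, SE resolves this URL form whether the post is a question or an answer.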


@command(aliases=["counter", "internal-counter", "ping-failure"])
@@ -2047,6 +2080,14 @@ def report_posts(urls, reported_by_owner, reported_in=None, blacklist_by=None, o
"It may already have been deleted.".format(index))
continue

# Watch for edits on the associated question
try:
if post_data.site and post_data.question_id:
Tasks.do(GlobalVars.edit_watcher.subscribe, hostname=post_data.site, question_id=post_data.question_id)
except AttributeError:
# This happens in some CI testing, because GlobalVars.edit_watcher isn't set up.
pass

if has_already_been_posted(post_data.site, post_data.post_id, post_data.title) and not is_false_positive(
(post_data.post_id, post_data.site)) and not is_forced:
# Don't re-report if the post wasn't marked as a false positive. If it was marked as a false positive,
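
The subscribe call added to report_posts above hands each reported question to the new EditWatcher so grace-period edits can be caught. The watcher itself is defined in the unrendered bodyfetcher.py changes; the class below is only a minimal sketch of that idea, assuming a keyword-argument subscribe matching the call site and an SE-style grace period. The other method names, storage, and timing are invented for illustration.

import threading
import time

ASSUMED_GRACE_PERIOD = 5 * 60  # assumption: roughly SE's five-minute edit grace period


class EditWatcherSketch:
    """Hypothetical stand-in for the real EditWatcher; only subscribe() mirrors the call in the diff."""

    def __init__(self):
        self._lock = threading.Lock()
        self._watched = {}  # (hostname, question_id) -> time the subscription started

    def subscribe(self, hostname=None, question_id=None):
        # Remember the question so edits made during the grace period can be re-scanned.
        with self._lock:
            self._watched[(hostname, question_id)] = time.time()

    def expired(self):
        # Posts whose grace period has passed and can be dropped from the watch list.
        now = time.time()
        with self._lock:
            return [key for key, started in self._watched.items()
                    if now - started > ASSUMED_GRACE_PERIOD]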
169 changes: 146 additions & 23 deletions datahandling.py
@@ -5,24 +5,38 @@
import zlib
import base64
from datetime import datetime
import metasmoke
import requests
import json
import time
import math
import threading

import requests
# noinspection PyCompatibility
import regex

from parsing import api_parameter_from_link, post_id_from_link
from globalvars import GlobalVars
import metasmoke
from parsing import api_parameter_from_link, post_id_from_link
import blacklists
from helpers import ErrorLogs, log, log_exception, redact_passwords
from tasks import Tasks

last_feedbacked = None
PICKLE_STORAGE = "pickles/"

queue_timings_data = list()
FLUSH_TIMINGS_THRES = 128
queue_timings_data_lock = threading.Lock()
FLUSH_TIMINGS_THRESHOLD = 128

SE_SITE_IDS_MAX_AGE_IN_SECONDS = 24 * 60 * 60
SE_SITE_IDS_MINIMUM_VALID_LENGTH = 200

bodyfetcher_max_ids_save_handle = None
bodyfetcher_max_ids_save_handle_lock = threading.Lock()
bodyfetcher_queue_save_handle = None
bodyfetcher_queue_save_handle_lock = threading.Lock()
recently_scanned_posts_save_handle = None
recently_scanned_posts_save_handle_lock = threading.Lock()


class Any:
@@ -137,6 +151,15 @@ def load_files():
with metasmoke.Metasmoke.ms_ajax_queue_lock:
metasmoke.Metasmoke.ms_ajax_queue = load_pickle("ms_ajax_queue.p")
log("debug", "Loaded {} entries into ms_ajax_queue".format(len(metasmoke.Metasmoke.ms_ajax_queue)))
if has_pickle("seSiteIds.p"):
with GlobalVars.site_id_dict_lock:
(GlobalVars.site_id_dict_timestamp,
GlobalVars.site_id_dict_issues_into_chat_timestamp,
GlobalVars.site_id_dict) = load_pickle("seSiteIds.p", encoding='utf-8')
fill_site_id_dict_by_id_from_site_id_dict()
if has_pickle("recentlyScannedPosts.p"):
with GlobalVars.recently_scanned_posts_lock:
GlobalVars.recently_scanned_posts = load_pickle("recentlyScannedPosts.p", encoding='utf-8')

blacklists.load_blacklists()

@@ -355,50 +378,95 @@ def clear_api_data():
dump_pickle("apiCalls.p", GlobalVars.api_calls_per_site)


def schedule_store_bodyfetcher_queue():
global bodyfetcher_queue_save_handle
with bodyfetcher_queue_save_handle_lock:
if bodyfetcher_queue_save_handle:
bodyfetcher_queue_save_handle.cancel()
bodyfetcher_queue_save_handle = Tasks.do(store_bodyfetcher_queue)


def store_bodyfetcher_queue():
dump_pickle("bodyfetcherQueue.p", GlobalVars.bodyfetcher.queue)
with GlobalVars.bodyfetcher.queue_lock:
dump_pickle("bodyfetcherQueue.p", GlobalVars.bodyfetcher.queue)


def schedule_store_bodyfetcher_max_ids():
global bodyfetcher_max_ids_save_handle
with bodyfetcher_max_ids_save_handle_lock:
if bodyfetcher_max_ids_save_handle:
bodyfetcher_max_ids_save_handle.cancel()
bodyfetcher_max_ids_save_handle = Tasks.do(store_bodyfetcher_max_ids)


def store_bodyfetcher_max_ids():
dump_pickle("bodyfetcherMaxIds.p", GlobalVars.bodyfetcher.previous_max_ids)
with bodyfetcher_max_ids_save_handle_lock:
if bodyfetcher_max_ids_save_handle:
bodyfetcher_max_ids_save_handle.cancel()
with GlobalVars.bodyfetcher.max_ids_lock:
max_ids_copy = GlobalVars.bodyfetcher.previous_max_ids.copy()
dump_pickle("bodyfetcherMaxIds.p", max_ids_copy)


def store_ms_ajax_queue():
with metasmoke.Metasmoke.ms_ajax_queue_lock:
dump_pickle("ms_ajax_queue.p", metasmoke.Metasmoke.ms_ajax_queue)


def add_queue_timing_data(site, time_in_queue):
def add_queue_timing_data(site, times_in_queue):
global queue_timings_data
queue_timings_data.append("{} {}".format(site, time_in_queue))
# time_in_queue comes first as it is an integer
# and hence won't contain any whitespace or trailing ones
if len(queue_timings_data) >= FLUSH_TIMINGS_THRES:
actually_add_queue_timings_data()
queue_timings_data = list()
new_times = ["{} {}".format(site, time_in_queue) for time_in_queue in times_in_queue]
with queue_timings_data_lock:
queue_timings_data.extend(new_times)
queue_length = len(queue_timings_data)
if queue_length >= FLUSH_TIMINGS_THRESHOLD:
flush_queue_timings_data()


def actually_add_queue_timings_data():
def flush_queue_timings_data():
global queue_timings_data
# Use .txt for cross platform compatibility
with open("pickles/bodyfetcherQueueTimings.txt", mode="a", encoding="utf-8") as stat_file:
stat_file.write("\n".join(queue_timings_data) + "\n")
with queue_timings_data_lock:
with open("pickles/bodyfetcherQueueTimings.txt", mode="a", encoding="utf-8") as stat_file:
stat_file.write("\n".join(queue_timings_data) + "\n")
queue_timings_data = list()
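
# Usage sketch (not part of the diff): the reworked helper takes an iterable of queue
# times per site, appends "site time" lines under the lock, and flushes them to
# pickles/bodyfetcherQueueTimings.txt once FLUSH_TIMINGS_THRESHOLD lines accumulate.
# The sample values below are made up for illustration.
add_queue_timing_data("stackoverflow.com", [12.4, 3.0, 7.9])
flush_queue_timings_data()  # may also be called directly to force a write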


def schedule_store_recently_scanned_posts():
global recently_scanned_posts_save_handle
with recently_scanned_posts_save_handle_lock:
if recently_scanned_posts_save_handle:
recently_scanned_posts_save_handle.cancel()
recently_scanned_posts_save_handle = Tasks.do(store_recently_scanned_posts)


def store_recently_scanned_posts():
# While using a copy to avoid holding the lock while storing is generally desired,
# the expectation is that this will only be stored when shutting down.
with GlobalVars.recently_scanned_posts_lock:
with recently_scanned_posts_save_handle_lock:
if recently_scanned_posts_save_handle:
recently_scanned_posts_save_handle.cancel()
dump_pickle("recentlyScannedPosts.p", GlobalVars.recently_scanned_posts)
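
# Sketch of the cancel-and-reschedule (debounce) pattern that the three schedule_store_*
# helpers above share. Illustrative only: the real scheduling goes through Tasks.do, whose
# internals are not shown in this diff, so threading.Timer stands in for it here.
import threading

_pending_save = None
_pending_save_lock = threading.Lock()


def schedule_save(save_function, delay_seconds=1.0):
    """Collapse bursts of updates into a single deferred call to save_function."""
    global _pending_save
    with _pending_save_lock:
        if _pending_save:
            _pending_save.cancel()
        _pending_save = threading.Timer(delay_seconds, save_function)
        _pending_save.start()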


# methods that help avoid reposting alerts:


def append_to_latest_questions(host, post_id, title):
GlobalVars.latest_questions.insert(0, (host, str(post_id), title))
if len(GlobalVars.latest_questions) > 50:
GlobalVars.latest_questions.pop()
with GlobalVars.latest_questions_lock:
GlobalVars.latest_questions.insert(0, (host, str(post_id), title))
if len(GlobalVars.latest_questions) > 50:
GlobalVars.latest_questions.pop()


# noinspection PyMissingTypeHints
def has_already_been_posted(host, post_id, title):
for post in GlobalVars.latest_questions:
if post[0] == host and post[1] == str(post_id):
return True
return False
with GlobalVars.latest_questions_lock:
for post in GlobalVars.latest_questions:
if post[0] == host and post[1] == str(post_id):
return True
return False


# method to get data from the error log:
@@ -722,3 +790,58 @@ def load(cls, s, merge=False):
raise Warning("Warning: " + ', '.join(warnings))
except (ValueError, zlib.error) as e:
raise ValueError(str(e)) from None


def store_site_id_dict():
with GlobalVars.site_id_dict_lock:
to_dump = (GlobalVars.site_id_dict_timestamp,
GlobalVars.site_id_dict_issues_into_chat_timestamp,
GlobalVars.site_id_dict.copy())
dump_pickle("seSiteIds.p", to_dump)


def fill_site_id_dict_by_id_from_site_id_dict():
GlobalVars.site_id_dict_by_id = {site_id: site for site, site_id in GlobalVars.site_id_dict.items()}


def refresh_site_id_dict():
message = requests.get('https://meta.stackexchange.com/topbar/site-switcher/all-pinnable-sites')
data = json.loads(message.text)
site_ids_dict = {entry['hostname']: entry['siteid'] for entry in data}
if len(site_ids_dict) >= SE_SITE_IDS_MINIMUM_VALID_LENGTH:
with GlobalVars.site_id_dict_lock:
GlobalVars.site_id_dict = site_ids_dict
fill_site_id_dict_by_id_from_site_id_dict()
GlobalVars.site_id_dict_timestamp = time.time()
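
# Illustration (not part of the diff): the all-pinnable-sites endpoint returns a JSON list
# of site descriptors, and the comprehensions below mirror refresh_site_id_dict() and
# fill_site_id_dict_by_id_from_site_id_dict(). The sample entries are made up.
sample_payload = [
    {'hostname': 'stackoverflow.com', 'siteid': 1},
    {'hostname': 'meta.stackexchange.com', 'siteid': 2},
]
site_ids = {entry['hostname']: entry['siteid'] for entry in sample_payload}
sites_by_id = {site_id: site for site, site_id in site_ids.items()}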


def is_se_site_id_list_length_valid():
with GlobalVars.site_id_dict_lock:
to_return = len(GlobalVars.site_id_dict) >= SE_SITE_IDS_MINIMUM_VALID_LENGTH
return to_return


def is_se_site_id_list_out_of_date():
return GlobalVars.site_id_dict_timestamp < time.time() - SE_SITE_IDS_MAX_AGE_IN_SECONDS


def refresh_site_id_dict_if_needed_and_get_issues():
issues = []
if not is_se_site_id_list_length_valid() or is_se_site_id_list_out_of_date():
try:
refresh_site_id_dict()
except Exception:
# We ignore any problems with getting or refreshing the list of SE sites, as we handle it by
# testing to see if we have valid data (i.e. SD doesn't need to fail for an exception here).
log_exception(*sys.exc_info())
issues.append("An exception occurred when trying to get the SE site ID list."
" See the error log for details.")
if is_se_site_id_list_length_valid():
store_site_id_dict()
if is_se_site_id_list_out_of_date():
issues.insert(0, "The site ID list is more than a day old.")
if not is_se_site_id_list_length_valid():
with GlobalVars.site_id_dict_lock:
issues.insert(0, "The SE site ID list has "
"{} entries, which isn't considered valid.".format(len(GlobalVars.site_id_dict)))
return issues
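
# Usage sketch (not part of the diff): a caller can surface any problems found while
# refreshing the site ID list. How SmokeDetector actually reports these issues is not
# shown here, so plain logging via helpers.log is used as a stand-in.
for issue in refresh_site_id_dict_if_needed_and_get_issues():
    log('debug', issue)  # 'debug' mirrors the level used elsewhere in this diff; real callers may differ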
