Refactor report storing, runtime performance gains
To move the session.flush() from after each report to after all
report files have been processed, a new helper method had to be
introduced: __add_report_context().

It handles the addition of notes and macro expansions in a separate
step, after the reports have been flushed to the db and have already
been assigned their primary keys.
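
Illustrative only: a minimal, self-contained sketch of that pattern with a
hypothetical two-table SQLAlchemy schema (Report and Note here are stand-ins,
not the actual CodeChecker models). Parent rows are flushed once as a batch,
after which child rows can reference the freshly assigned primary keys.

    from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class Report(Base):
        __tablename__ = "reports"
        id = Column(Integer, primary_key=True)
        message = Column(String)

    class Note(Base):
        __tablename__ = "notes"
        id = Column(Integer, primary_key=True)
        report_id = Column(Integer, ForeignKey("reports.id"))
        text = Column(String)

    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        added = []
        for msg in ("division by zero", "null dereference"):
            report = Report(message=msg)
            session.add(report)   # queued in the unit of work; no SQL emitted yet
            added.append(report)

        session.flush()           # one batch of INSERTs; primary keys now assigned

        for report in added:
            # Child rows can reference the parent's id only after the flush.
            session.add(Note(report_id=report.id, text="note for " + report.message))

        session.commit()

Deferring the flush trades one SQL round trip per stored report for a single
batched one, which is where the runtime gain comes from.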

Also added a helper class to handle review statuses stored in source
code.
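
That helper carries a FIXME about converting it to a dataclass once available;
a sketch of the equivalent declaration (an assumed follow-up, not code from
this commit):

    from dataclasses import dataclass
    from datetime import datetime
    from typing import Any

    @dataclass
    class SourceReviewStatus:
        """Review status collected from a source code comment."""
        status: str
        message: bytes
        bug_hash: Any
        author: str
        date: datetime
        in_source: bool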
vodorok committed Mar 8, 2023
1 parent b47b45c commit 8a5c6ec
Showing 1 changed file with 151 additions and 106 deletions.
257 changes: 151 additions & 106 deletions web/server/codechecker_server/api/mass_store_run.py
@@ -38,7 +38,8 @@
from ..database.database import DBSession
from ..database.run_db_model import AnalysisInfo, AnalyzerStatistic, \
BugPathEvent, BugReportPoint, ExtendedReportData, File, FileContent, \
Report as DBReport, ReviewStatus, Run, RunHistory, RunLock
Report as DBReport, ReviewStatus as ReviewStatusRule, Run, RunHistory, \
RunLock
from ..metadata import checker_is_unavailable, MetadataInfoParser

from .report_server import ThriftRequestHandler
@@ -195,6 +196,29 @@ def get_blame_file_data(
return blame_info, remote_url, tracking_branch


class SourceReviewStatus:
"""
Helper class for handling in-source review statuses.
Collects the same info as a review status rule.
FIXME Change this to dataclass when available
"""
def __init__(
self,
status: str,
message: bytes,
bug_hash: Any,
author: str,
date: datetime,
in_source: bool,
):
self.status = status
self.message = message
self.bug_hash = bug_hash
self.author = author
self.date = date
self.in_source = in_source


class MassStoreRun:
def __init__(
self,
@@ -226,6 +250,7 @@ def __init__(
self.__severity_map: Dict[str, int] = {}
self.__new_report_hashes: Dict[str, Tuple] = {}
self.__all_report_checkers: Set[str] = set()
self.__added_reports: List[Tuple[DBReport, Report]] = list()

@property
def __manager(self):
@@ -713,8 +738,7 @@ def __add_report(
run_id: int,
report: Report,
file_path_to_id: Dict[str, int],
review_status: Dict[str, Any],
review_status_is_in_source: bool,
review_status: SourceReviewStatus,
detection_status: str,
detection_time: datetime,
run_history_time: datetime,
@@ -739,58 +763,72 @@
run_id, report.report_hash, file_path_to_id[report.file.path],
report.message, checker_name or 'NOT FOUND',
report.category, report.type, report.line, report.column,
severity, review_status["status"], review_status["author"],
review_status["message"], run_history_time,
review_status_is_in_source,
severity, review_status.status, review_status.author,
review_status.message, run_history_time,
review_status.in_source,
detection_status, detection_time,
len(report.bug_path_events), analyzer_name)

db_report.fixed_at = fixed_at

session.add(db_report)
session.flush()

LOG.debug("Storing bug path positions.")
for i, p in enumerate(report.bug_path_positions):
session.add(BugReportPoint(
p.range.start_line, p.range.start_col,
p.range.end_line, p.range.end_col,
i, file_path_to_id[p.file.path], db_report.id))

LOG.debug("Storing bug path events.")
for i, event in enumerate(report.bug_path_events):
session.add(BugPathEvent(
event.range.start_line, event.range.start_col,
event.range.end_line, event.range.end_col,
i, event.message, file_path_to_id[event.file.path],
db_report.id))

LOG.debug("Storing notes.")
for note in report.notes:
data_type = report_extended_data_type_str(
ttypes.ExtendedReportDataType.NOTE)

session.add(ExtendedReportData(
note.range.start_line, note.range.start_col,
note.range.end_line, note.range.end_col,
note.message, file_path_to_id[note.file.path],
db_report.id, data_type))

LOG.debug("Storing macro expansions.")
for macro in report.macro_expansions:
data_type = report_extended_data_type_str(
ttypes.ExtendedReportDataType.MACRO)

session.add(ExtendedReportData(
macro.range.start_line, macro.range.start_col,
macro.range.end_line, macro.range.end_col,
macro.message, file_path_to_id[macro.file.path],
db_report.id, data_type))

if analysis_info:
db_report.analysis_info.append(analysis_info)

session.add(db_report)
self.__added_reports.append((db_report, report))

# The id is None at this point in time.
# Wondering if not returning anything would be better?
# The report is already handled in the lines above.
return db_report.id

except Exception as ex:
raise codechecker_api_shared.ttypes.RequestFailed(
codechecker_api_shared.ttypes.ErrorCode.GENERAL,
str(ex))

def __add_report_context(self, session, file_path_to_id):
try:
for db_report, report in self.__added_reports:
LOG.debug("Storing bug path positions.")
for i, p in enumerate(report.bug_path_positions):
session.add(BugReportPoint(
p.range.start_line, p.range.start_col,
p.range.end_line, p.range.end_col,
i, file_path_to_id[p.file.path], db_report.id))

LOG.debug("Storing bug path events.")
for i, event in enumerate(report.bug_path_events):
session.add(BugPathEvent(
event.range.start_line, event.range.start_col,
event.range.end_line, event.range.end_col,
i, event.message, file_path_to_id[event.file.path],
db_report.id))

LOG.debug("Storing notes.")
for note in report.notes:
data_type = report_extended_data_type_str(
ttypes.ExtendedReportDataType.NOTE)

session.add(ExtendedReportData(
note.range.start_line, note.range.start_col,
note.range.end_line, note.range.end_col,
note.message, file_path_to_id[note.file.path],
db_report.id, data_type))

LOG.debug("Storing macro expansions.")
for macro in report.macro_expansions:
data_type = report_extended_data_type_str(
ttypes.ExtendedReportDataType.MACRO)

session.add(ExtendedReportData(
macro.range.start_line, macro.range.start_col,
macro.range.end_line, macro.range.end_col,
macro.message, file_path_to_id[macro.file.path],
db_report.id, data_type))

session.flush()

except Exception as ex:
raise codechecker_api_shared.ttypes.RequestFailed(
codechecker_api_shared.ttypes.ErrorCode.GENERAL,
@@ -816,45 +854,40 @@ def __process_report_file(
return True

def get_review_status_from_source(
report: Report) -> Tuple[Dict[str, Any], bool]:
report: Report) -> SourceReviewStatus:
"""
Return the review status belonging to the given report and a
boolean value which indicates whether the review status comes from
Return the review status set in the source code belonging to
the given report whether the review status comes from
a source code comment.
- Return the review status and True based on the source code
comment.
- Return the review status if it is set in the source code.
- If the review status is ambiguous (i.e. multiple source code
comments belong to it) then (unreviewed, False) returns.
- If there is no source code comment then the default review status
and True returns based on review_statuses database table.
- If the report doesn't have a default review status then an
"unreviewed" review status and False returns.
comments belong to it) then (unreviewed, in_source=False)
returns.
"""
# The original file path is needed here, not the trimmed one, because
# the source files are extracted under the original file path.
source_file_name = os.path.realpath(os.path.join(
source_root, report.file.original_path.strip("/")))

review_status = {}
review_status["status"] = 'unreviewed'
review_status["message"] = b''
review_status["bug_hash"] = report.report_hash
review_status["author"] = self.user_name
review_status["date"] = run_history_time

if os.path.isfile(source_file_name):
src_comment_data = parse_codechecker_review_comment(
source_file_name, report.line, report.checker_name)

if len(src_comment_data) == 1:
data = src_comment_data[0]
rs = data.status, bytes(data.message, 'utf-8')

review_status["status"] = rs[0]
review_status["message"] = rs[1]

return review_status, True
status, message = data.status, bytes(data.message, 'utf-8')

review_status = SourceReviewStatus(
status=status,
message=message,
bug_hash=report.report_hash,
author=self.user_name,
date=run_history_time,
in_source=True
)

return review_status
elif len(src_comment_data) > 1:
LOG.warning(
"Multiple source code comment can be found "
@@ -866,7 +899,17 @@ def get_review_status_from_source(
f"{source_file_name}|{report.line}|"
f"{report.checker_name}")

return review_status, False
# A better way to handle reports where the review status is not
# set in the source would be to return None, and to set the review
# status info at report addition time.
return SourceReviewStatus(
status="unreviewed",
message=b'',
bug_hash=report.report_hash,
author=self.user_name,
date=run_history_time,
in_source=False
)

def get_missing_file_ids(report: Report) -> List[str]:
""" Returns file paths which database file id is missing. """
@@ -919,12 +962,14 @@ def get_missing_file_ids(report: Report) -> List[str]:
analyzer_name = mip.checker_to_analyzer.get(
report.checker_name, report.analyzer_name)

rs_from_source, scc = get_review_status_from_source(report)
rs_from_source = get_review_status_from_source(report)

# False positive and intentional reports are considered closed
# reports, which is indicated by a non-null "fixed_at" date.
fixed_at = None
if rs_from_source["status"] in ['false_positive', 'intentional']:
if rs_from_source.status in ['false_positive', 'intentional']:
# Keep in mind that now this is not handling review status
# rules, only review status source code comments
if old_report and old_report.review_status in \
['false_positive', 'intentional']:
fixed_at = old_report.review_status_date
Expand All @@ -933,11 +978,11 @@ def get_missing_file_ids(report: Report) -> List[str]:

report_id = self.__add_report(
session, run_id, report, file_path_to_id,
rs_from_source, scc, detection_status, detected_at,
rs_from_source, detection_status, detected_at,
run_history_time, analysis_info, analyzer_name, fixed_at)

self.__new_report_hashes[report.report_hash] = \
rs_from_source["status"]
rs_from_source.status
self.__already_added_report_hashes.add(report_path_hash)

LOG.debug("Storing report done. ID=%d", report_id)
@@ -986,20 +1031,15 @@ def get_skip_handler(
.all()

report_to_report_id = defaultdict(list)
for report in all_reports:
report_to_report_id[report.bug_id].append(report)
for db_report in all_reports:
report_to_report_id[db_report.bug_id].append(db_report)

enabled_checkers: Set[str] = set()
disabled_checkers: Set[str] = set()

# Processing analyzer result files.
processed_result_file_count = 0

for root_dir_path, _, report_file_paths in os.walk(report_dir):
for f in report_file_paths:
if not report_file.is_supported(f):
continue

for root_dir_path, _, report_file_paths in os.walk(report_dir):
LOG.debug("Get reports from '%s' directory", root_dir_path)

@@ -1022,36 +1062,41 @@ def get_skip_handler(
skip_handler, report_to_report_id)
processed_result_file_count += 1

session.flush()

self.__add_report_context(session, file_path_to_id)
# Get all relevant review_statuses for the newly stored reports
# TODO Call self.getReviewStatusRules instead of the beloew query
# CHECK: Call self.getReviewStatusRules instead of the below query,
# but first check the performance
reports_to_rs_rules = session.query(ReviewStatus, DBReport) \
.join(DBReport, DBReport.bug_id == ReviewStatus.bug_hash) \
reports_to_rs_rules = session.query(ReviewStatusRule, DBReport) \
.join(DBReport, DBReport.bug_id == ReviewStatusRule.bug_hash) \
.filter(sqlalchemy.and_(DBReport.run_id == run_id,
ReviewStatus.bug_hash.
ReviewStatusRule.bug_hash.
in_(self.__new_report_hashes)))

# Create the sqlalchemy mappings for the bulk update
review_status_change = []
for rs_info in reports_to_rs_rules:
print(rs_info[0].bug_hash,
rs_info[0].status,
rs_info[0].author,
rs_info[0].message,
rs_info[0].date)
review_status_change.append(
{
"id": rs_info[1].id,
"review_status": rs_info[0].status,
"review_status_author": rs_info[0].author,
"review_status_date": rs_info[0].date,
"review_status_is_in_source": False,
"review_status_message": rs_info[0].message,
"fixed_at": rs_info[1].review_status_date
}
)
# Update all newly stored reports if there are any rs-rules for them
session.bulk_update_mappings(DBReport, review_status_change)
# Set the newly stored reports
for review_status, db_report in reports_to_rs_rules:
old_report = None
if db_report.bug_id in report_to_report_id:
old_report = report_to_report_id[db_report.bug_id][0]
fixed_at = None
if review_status.status in ['false_positive', 'intentional']:
# Keep in mind that now this is not handling review status
# rules, only review status source code comments
if old_report and old_report.review_status in \
['false_positive', 'intentional']:
fixed_at = old_report.review_status_date
else:
fixed_at = run_history_time

db_report.review_status = review_status.status
db_report.review_status_author = review_status.author
db_report.review_status_message = review_status.message
db_report.review_status_date = review_status.date
db_report.fixed_at = fixed_at
db_report.review_status_is_in_source = False

session.flush()

LOG.info("[%s] Processed %d analyzer result file(s).", self.__name,
processed_result_file_count)
