In [120]:
import os, json
import esparto, tinycss2
import seaborn as sns
import pandas as pd
from datetime import date
from textwrap import shorten, dedent
from azure_notebook_reporting import KQL, BlobPath

# Load the notebook environment settings (expected to be a JSON object of
# name -> value pairs) into os.environ. The context manager closes the file
# as soon as it has been parsed instead of leaving the handle open.
with open(f"{os.environ['HOME']}/cloudfiles/code/nbenv.json") as nbenv_file:
    os.environ.update(json.load(nbenv_file))

# Take one date snapshot so both labels always describe the same day, even if
# the cell happens to run across midnight.
today = date.today()
report_date, month = today.strftime("%B %Y"), today.strftime("%b%Y")

# Two separately constructed KQL helpers over the same storage container and
# subscription: `kp` for the agency being reported on, `kp_sample` for the
# data set that stands in when a query has nothing to show.
kp = KQL(BlobPath(os.environ["AZURE_STORAGE_CONTAINER"], os.environ["AZURE_SUBSCRIPTION"])).set_agency("...")
kp_sample = KQL(BlobPath(os.environ["AZURE_STORAGE_CONTAINER"], os.environ["AZURE_SUBSCRIPTION"])).set_agency("...")
sample_only = False # if True, build report with only mock data
# if False, build report as usual, only substituting missing data with sample data
# sections should 'anonymise' sample data prior to rendering

# Plot defaults shared by every figure in the report.
sns.set_theme(style="darkgrid", context="paper", font_scale=0.7, rc={"figure.figsize": (7, 4), "figure.constrained_layout.use": True, "legend.loc": "upper right"})

# Path to background.svg inside the current working directory; it is spliced
# into the @page background rule of the stylesheet below.
background_svg = os.getcwd() + "/background.svg"
# Additional stylesheet rules, parsed into tinycss2 rule objects. The text is
# assembled by string concatenation so that the page footer can embed
# kp.agency_name and report_date and the page background can point at
# background_svg. It contains: print-only title/page-break/flex rules, base
# typography for `body > main`, an A4 portrait @page rule with a bottom-right
# footer showing page counters, and a transparent body background.
extra_css = tinycss2.parse_stylesheet("""
@media print {
    .es-page-title,
    .es-section-title,
    .es-row-title,
    .es-column-title {
        page-break-after: avoid;
        color: #CC5733;
        font-weight: bold;
    }

    .es-row-body,
    .es-column-body,
    .es-card {
        page-break-inside: avoid;
    }

    .es-column-body,
    .es-card,
    .es-card-body {
        flex: 1 !important;
    }
}

body > main {
    font-family: arial;
    font-size: 0.8em;
    color: #444;
}

@page {
    size: A4 portrait;
    font-family: arial;
    margin: 1.5cm 1cm;
    @bottom-right {
        font-size: 0.6em;
        line-height: 1.5em;
        margin-bottom: -0.2cm;
        margin-right: -0.5cm;
        color: #aaa;
        """ + f"content: 'WA SOC Sentinel Report ({kp.agency_name})\\a{report_date} | ' counter(page) ' / ' counter(pages)" + """;
        white-space: pre;
    }
    background: url('""" + background_svg + """');
    background-position: -1cm -1.5cm;
    background-size: 210mm 297mm;
}

html > body {
    background-color: transparent !important;
}
""")



def minitable(df):
    """Return ``df`` rendered as an HTML table inside a small-print wrapper.

    The table is produced by ``DataFrame.to_html`` with the CSS classes
    ``table``, ``table-hover`` and ``small``, then wrapped in a
    ``table-responsive es-table`` div and two nested ``<small>`` elements.
    """
    table_markup = df.to_html(classes=["table", "table-hover", "small"])
    return "".join(
        (
            "<div class='table-responsive es-table'><small><small>",
            table_markup,
            "</small></small></div>",
        )
    )


def dfago(df, timespan, col="TimeGenerated"):
    """Keep only the rows within ``timespan`` of the newest value in ``col``.

    Args:
        df: Frame to filter; it is copied first and never modified.
        timespan: Anything ``pd.to_timedelta`` accepts, e.g. ``"7D"``.
        col: Column holding the timestamps to compare.

    Returns:
        A new frame of the matching rows. ``reset_index()`` is called without
        ``drop``, so the previous index is kept as a leading column.
    """
    frame = df.copy(deep=True)
    cutoff = frame[col].max() - pd.to_timedelta(timespan)
    is_recent = frame[col] >= cutoff
    recent_rows = frame[is_recent]
    return recent_rows.reset_index()

In [94]:
%%capture
# Report data source name -> KQL file that produces it.
query_paths = {
    "Incident Details": "siemhealth/incidentdetail.kql",
    "Alert Details": "siemhealth/alertdetail.kql",
    "Local Admins": "siemhealth/localadminlogons.kql",
    "Email Delivery": "siemhealth/emaildelivery.kql",
    "External Files": "siemhealth/externaldownloads.kql",
    "Guest Tenants": "siemhealth/guestdownloads.kql",
    "On Premise Logons": "siemhealth/identitylogonevents.kql",
    "Azure AD Logons": "siemhealth/signins.kql",
    "Operating Systems": "siemhealth/operatingsystems.kql",
    "Ingestion Detail": "siemhealth/usage.kql",
}

# Run every query. `queries` ends up holding one DataFrame per data source and
# `querystats` one [rows, columns] pair per data source. Results are collected
# in fresh dicts rather than overwriting the path mapping while iterating it,
# so the loop can be re-run without first rebuilding its input.
queries, querystats = {}, {}
for key, kql in query_paths.items():
    if sample_only:
        # Read the query text and close the file before running it; the
        # appended "| take 0" makes the query return no real rows.
        with (kp.kql / kql).open() as kql_file:
            kql_text = kql_file.read()
        result = kp.kql2df(kql_text + "| take 0")
    else:
        result = kp.kql2df(kql)
    populated_rows = result.count().max()  # non-null count of the fullest column
    if populated_rows == 1:
        # NOTE(review): a one-row result is treated as "no data" - presumably
        # kql2df returns a single placeholder row in that case; confirm.
        # The placeholder's first column name and value are kept for the
        # summary table and the sample workspace supplies the data instead.
        querystats[key] = [0, f"{result.columns[0]} - {result.iloc[0,0]}"]
        result = kp_sample.kql2df(kql)
    else:
        querystats[key] = [populated_rows, len(result.columns)]
    queries[key] = result
querystats = pd.DataFrame(querystats).T.rename(columns={0: "Rows", 1: "Columns"}).sort_values("Rows")

In [95]:
%matplotlib agg
# Start the report page. The Executive Summary opens with the agency name and
# the report month in italics.
report = esparto.Page(title="WA SOC Sentinel Report (Preview)", table_of_contents=True)
section = "Executive Summary"
report[section] = f"*{kp.agency_name} - {report_date}*"

# Prefer a summary written for this agency and month; fall back to default.md.
exec_summary = kp.nbpath / "exec_summaries" / f"{kp.agency}-{month}.md"
if not exec_summary.exists():
    exec_summary = kp.nbpath / "exec_summaries" / "default.md"
# Read inside a context manager so the file handle is closed straight away.
with exec_summary.open() as summary_file:
    summary_text = summary_file.read()
report[section] += summary_text

# Introductory text for the per-query row/column summary table that follows.
report += dedent("""
    ### Data volume queries
    This report aggregates information from several backend tables across the past 30 days. Some data is aggregated, while other elements include a top N type analysis of key useful entries.
    Below is a summary of data visible to the WA SOC in this environment from least to most. If there are any rows with **No data in timespan** under "Columns" 
    please review the [WA SOC Microsoft Sentinel Connector Guidance](https://github.com/wagov/soc-onboarding/blob/main/Sentinel-Connector-Guidance.md) for details on how to ingest additional data.

    **Report Data Sources**
""")

report += minitable(querystats)

section = "Tactics and Rules"
report[section] = dedent("""
    The below charts show what tactics and rules are triggering your Sentinel incidents. An incident can include multiple alerts. 
    It's an aggregation of all the relevant evidence for a specific investigation. An incident is created based on [analytics rules](https://learn.microsoft.com/en-us/azure/sentinel/detect-threats-built-in) configured on the Analytics page. 
    The properties related to the alerts, such as severity and status, are set at the incident level. After you let Microsoft Sentinel know what kinds of threats you're looking for and how to find them, you can monitor detected threats by investigating incidents.
    For more information see [Using Sentinel to investigate Security Incidents](https://learn.microsoft.com/en-us/azure/sentinel/investigate-cases).
""")

# A recorded row count of 0 means the incident query produced no usable data
# and the sample data set was loaded in its place, so tell the reader.
if querystats.loc["Incident Details", "Rows"] == 0:
    report[section] += "*MISSING DATA: Please confirm there are [analytics rules](https://learn.microsoft.com/en-us/azure/sentinel/detect-threats-built-in) configured on the Analytics page.*"
    # The parenthesis is closed and the ** markers sit tight against the text,
    # matching the emphasis style of the notice above.
    report[section] += f"**SAMPLE DATASET BELOW (substituting for {querystats.loc['Incident Details', 'Columns']})**"

# Count distinct incident numbers per Severity / Tactics / Title row, with one
# column for each (Status, Classification) pair that has any data.
df = queries["Incident Details"].groupby(["Status", "Classification", "Severity", "Tactics", "Title"])
df = df["IncidentNumber"].nunique().unstack(level=[0,1]).dropna(axis=1, how="all").fillna(0)
df["Total"] = df.sum(numeric_only=True, axis=1)
# Show the five rows with the highest totals; zeros are blanked for readability.
df = df.sort_values("Total", ascending=False).replace({0: ""}).head(5)
report[section] += minitable(df)

# Both charts use the incident rows from the seven days leading up to the
# newest record, bucketed into three-hour intervals.
interval = "3H"
df = dfago(queries["Incident Details"], "7D")

# Distinct incident numbers per tactic in each interval.
tactics = (
    df.groupby("Tactics")
    .resample(interval, on="TimeGenerated")["IncidentNumber"]
    .nunique()
    .reset_index()
)
report[section] += KQL.df2fig(tactics, "Detections by Tactic", "TimeGenerated", "IncidentNumber", "Tactics")

# The same aggregation per rule title, with the column relabelled "Rule"
# before plotting.
rules = (
    df.groupby("Title")
    .resample(interval, on="TimeGenerated")["IncidentNumber"]
    .nunique()
    .reset_index()
    .rename(columns={"Title": "Rule"})
)
report[section] += KQL.df2fig(rules, "Detections by Rule", "TimeGenerated", "IncidentNumber", "Rule")

In [96]:
# Guidance text that introduces the ingestion chart in this section.
section = "Cost Optimisation"
report[section] = """
Microsoft Sentinel has builtin [queries to understand your data ingestion](https://docs.microsoft.com/en-us/azure/sentinel/billing-monitor-costs#run-queries-to-understand-your-data-ingestion) at a per table level. To get further granularity you can look at specific devices sending a lot of data using [additional usage queries](https://docs.microsoft.com/en-us/azure/azure-monitor/logs/log-analytics-workspace-insights-overview#additional-usage-queries) or directly run manual queries from [Investigate your Log Analytics usage](https://docs.microsoft.com/en-us/azure/azure-monitor/logs/manage-cost-storage#investigate-your-log-analytics-usage).

Once you have identified the high cost items, you can reduce the events generated at the source, using a [Logstash filter](https://docs.microsoft.com/en-us/azure/sentinel/connect-logstash) for a custom source or with configuration in Sentinel itself:

- [Ingestion time transformations](https://docs.microsoft.com/en-us/azure/azure-monitor/logs/ingestion-time-transformations) - should be used to eliminate low value logs before they are persisted within Log Analytics & Sentinel
- [Basic Logs](https://docs.microsoft.com/en-us/azure/azure-monitor/logs/basic-logs-configure?tabs=cli-1%2Cportal-1) - should be used for high volume tables that aren't queried regularly (approx 1/4 cost per GB ingested)

The below chart shows your ingestion usage by table over the past week. 
Typically your ingestion should be under 30MB per licensed user in your tenant (5MB of that is included for free under the 
[Microsoft Sentinel benefit for Microsoft 365 E5, A5, F5, and G5 customers](https://azure.microsoft.com/en-us/offers/sentinel-microsoft-365-offer/)).
"""
# Ingestion rows from the five days leading up to the newest record.
# NOTE(review): the section text says "past week" while this window is "5D" -
# confirm which one is intended.
df = dfago(queries["Ingestion Detail"], "5D")
# Scale IngestionVolume by 1/1000 for the GB axis (presumably the source
# values are in MB - confirm against the usage query).
df["GB"] = df["IngestionVolume"] / 1000
# Billable is compared with the string "True", so this assumes the query
# returns it as text rather than as a boolean - TODO confirm.
billable_rows = df[df["Billable"] == "True"]
report[section] += KQL.df2fig(billable_rows, "Billable Ingestion (GB) by Table", "TimeGenerated", "GB", "Table")

In [121]:
from IPython.display import IFrame

# First run in a kernel only: parse esparto's default stylesheet, drop its
# at-rules so the ones in extra_css can replace them, then point esparto at
# the generated pdf.css.
# NOTE: on later runs this branch is skipped and `base_css` is reused from the
# first run, so the cell depends on having run once in the current kernel.
if esparto.options.esparto_css != "pdf.css": # once off tweak default css
    with open(esparto.options.esparto_css) as default_css:
        base_css = tinycss2.parse_stylesheet(default_css.read())
    base_css = [r for r in base_css if not hasattr(r, "at_keyword")] # strip media/print styles so we can replace
    esparto.options.esparto_css = "pdf.css"

# Write the combined stylesheet: remaining default rules followed by the extras.
with open(esparto.options.esparto_css, "w") as pdf_css:
    for rule in (base_css + extra_css):
        pdf_css.write(rule.serialize())
# Publishing steps, currently disabled:
# report.save_html("report.html")
# (kp.reports / kp.agency / f"{kp.agency}-{month}-siemhealth.html").write_text(open("report.html").read())
# report.save_pdf("report.pdf")
# (kp.reports / kp.agency / f"{kp.agency}-{month}-siemhealth.pdf").write_bytes(open("report.pdf", "rb").read())

# Render the PDF and keep the HTML it was generated from next to it; the
# context manager makes sure that HTML is flushed and the handle closed.
html = report.save_pdf("report.pdf", return_html=True)
with open("report.pdf.html", "w") as html_file:
    html_file.write(html)
# Last expression of the cell: preview the generated PDF inline.
IFrame("report.pdf", width=1200, height=800)