In [None]:
import os, json
import esparto
import seaborn as sns
import pandas as pd
from datetime import date
from textwrap import shorten, dedent
from azure_notebook_reporting import KQL, BlobPath

# Merge the notebook settings stored in $HOME/cloudfiles/code/nbenv.json into
# os.environ (presumably this supplies the AZURE_* values read further down —
# confirm). The file is opened in a `with` block so the handle is closed as
# soon as the JSON has been parsed instead of being left to the garbage collector.
with open(f"{os.environ['HOME']}/cloudfiles/code/nbenv.json") as nbenv_file:
    os.environ.update(json.load(nbenv_file))

# Seaborn/matplotlib defaults shared by every chart in this notebook:
# small paper-context figures with constrained layout and a fixed legend corner.
chart_rc = {
    'figure.figsize': (4, 3),
    'figure.constrained_layout.use': True,
    'legend.loc': 'upper right',
}
sns.set_theme(style="white", context="paper", font_scale=0.7, rc=chart_rc)

# KQL helper built on a BlobPath for the configured container/subscription,
# then scoped to a single agency via set_agency().
blob_path = BlobPath(os.environ["AZURE_STORAGE_CONTAINER"], os.environ["AZURE_SUBSCRIPTION"])
kp = KQL(blob_path).set_agency("...")

In [None]:
# Display title -> KQL file whose result backs that part of the report.
query_files = {
    "Incident Details": "siemhealth/incidentdetail.kql",
    "Alert Details": "siemhealth/alertdetail.kql",
    "Local Admins": "siemhealth/localadminlogons.kql",
    "Email Delivery": "siemhealth/emaildelivery.kql",
    "External Files": "siemhealth/externaldownloads.kql",
    "Guest Tenants": "siemhealth/guestdownloads.kql",
    "On Premise Logons": "siemhealth/identitylogonevents.kql",
    "Azure AD Logons": "siemhealth/signins.kql",
    "Operating Systems": "siemhealth/operatingsystems.kql",
    "Ingestion Detail": "siemhealth/usage.kql",
}

# Run each query once, in the order listed above, and keep the results keyed by title.
queries = {title: kp.kql2df(kql_file) for title, kql_file in query_files.items()}

In [None]:
# Build a per-query summary: [row count, column count] for each result frame.
stats_by_query = {}
for title, frame in queries.items():
    # Largest non-null count across the frame's columns.
    populated_rows = frame.count().max()
    if populated_rows == 1:
        # Presumably the single-cell placeholder returned when a query has no data
        # (the report text refers to "No data in timespan") — confirm. Report zero
        # rows and show "<first column name> - <first cell>" in place of a column count.
        # NOTE(review): a genuine one-row result also takes this branch.
        stats_by_query[title] = [0, f"{frame.columns[0]} - {frame.iloc[0,0]}"]
    else:
        stats_by_query[title] = [populated_rows, len(frame.columns)]

# One row per query, ordered from least to most rows.
querystats = (
    pd.DataFrame(stats_by_query)
    .T
    .rename(columns={0: "Rows", 1: "Columns"})
    .sort_values("Rows")
)

In [None]:
# Read the clock once so the heading text and the month used in file names can
# never disagree (two separate date.today() calls could straddle a month rollover).
today = date.today()
report_date, month = today.strftime("%B %Y"), today.strftime("%b%Y")

# Use the agency's executive summary for this month when one exists,
# otherwise fall back to the shared default text.
exec_summary = kp.nbpath / "exec_summaries" / f"{kp.agency}-{month}.md"
if not exec_summary.exists():
    exec_summary = kp.nbpath / "exec_summaries" / "default.md"

def minitable(df):
    """Return ``df`` rendered as a compact HTML table string.

    The frame is converted with ``DataFrame.to_html`` using the CSS classes
    ``table``, ``table-hover`` and ``small``; the markup is then wrapped in a
    ``table-responsive es-table`` div with two nested ``<small>`` tags.
    """
    table_markup = df.to_html(classes=["table", "table-hover", "small"])
    return f"<div class='table-responsive es-table'><small><small>{table_markup}</small></small></div>"

# Start the page and add the heading (agency name and report month) followed by
# the executive summary text.
report = esparto.Page(title="SIEM Health Report", table_of_contents=True)
# Read the summary inside a `with` block so its file handle is closed promptly
# rather than leaked.
with exec_summary.open() as summary_file:
    summary_text = summary_file.read()
report += f"#### {kp.agency_name} - {report_date}\n{summary_text}"

# Introductory text for the data volume summary, followed by the per-query
# row/column table.
report += dedent("""
    ### Data volume queries
    This report aggregates information from several backend tables across the past 30 days. Some data is aggregated, while other elements include a top N type analysis of key useful entries.
    Below is a summary of data visible to the WA SOC in this environment from least to most. If there are any rows with **No data in timespan** under "Columns" 
    please review the [WA SOC Microsoft Sentinel Connector Guidance](https://github.com/wagov/soc-onboarding/blob/main/Sentinel-Connector-Guidance.md) for details on how to ingest additional data.
""")
report += minitable(querystats)
# Section title, used as the key for everything added to this part of the report.
section = "Mitre Tactics and Detection Rules"
report[section] = dedent("""
    The below charts show what tactics and rules are triggering your Sentinel incidents. An incident can include multiple alerts. 
    It's an aggregation of all the relevant evidence for a specific investigation. An incident is created based on [analytics rules](https://learn.microsoft.com/en-us/azure/sentinel/detect-threats-built-in) configured on the Analytics page. 
    The properties related to the alerts, such as severity and status, are set at the incident level. After you let Microsoft Sentinel know what kinds of threats you're looking for and how to find them, you can monitor detected threats by investigating incidents.
    For more information see [Using Sentinel to investigate Security Incidents](https://learn.microsoft.com/en-us/azure/sentinel/investigate-cases).
    """)

# Distinct incident numbers per Severity/Tactics/Title row, with the
# Status/Classification combinations spread across the columns. Columns that
# are entirely empty are dropped and remaining gaps become 0.
incident_counts = (
    queries["Incident Details"]
    .groupby(["Status", "Classification", "Severity", "Tactics", "Title"])["IncidentNumber"]
    .nunique()
    .unstack(level=[0, 1])
    .dropna(axis=1, how="all")
    .fillna(0)
)
incident_counts["Total"] = incident_counts.sum(numeric_only=True, axis=1)

# Keep the five rows with the highest totals; zeros are blanked for readability.
top_incidents = (
    incident_counts
    .sort_values("Total", ascending=False)
    .replace({0: ""})
    .head(5)
)
report[section] += minitable(top_incidents)

# Chart only the five days leading up to the newest incident record,
# counting distinct incidents per resampling interval.
interval = "1H"
incidents = queries["Incident Details"].copy(deep=True)
window_start = incidents["TimeGenerated"].max() - pd.to_timedelta("5D")
recent_incidents = incidents[incidents["TimeGenerated"] >= window_start].reset_index()

# Distinct incidents per tactic per interval.
tactics = (
    recent_incidents
    .groupby("Tactics")
    .resample(interval, on="TimeGenerated")["IncidentNumber"]
    .nunique()
    .reset_index()
)
report[section] += KQL.df2fig(tactics, "Detections by Tactic", "TimeGenerated", "IncidentNumber", "Tactics")

# Same aggregation per rule title, with the column relabelled "Rule" for the chart.
rules = (
    recent_incidents
    .groupby("Title")
    .resample(interval, on="TimeGenerated")["IncidentNumber"]
    .nunique()
    .reset_index()
    .rename(columns={"Title": "Rule"})
)
report[section] += KQL.df2fig(rules, "Detections by Rule", "TimeGenerated", "IncidentNumber", "Rule")

In [None]:
# Static markdown guidance that opens the "Cost Optimisation" section of the report.
# NOTE(review): the text below says the chart covers "the past week", while the
# chart code in this cell filters to the last 5 days ("5D") — confirm which is intended.
report["Cost Optimisation"] = """
Microsoft Sentinel has builtin [queries to understand your data ingestion](https://docs.microsoft.com/en-us/azure/sentinel/billing-monitor-costs#run-queries-to-understand-your-data-ingestion) at a per table level. To get further granularity you can look at specific devices sending a lot of data using [additional usage queries](https://docs.microsoft.com/en-us/azure/azure-monitor/logs/log-analytics-workspace-insights-overview#additional-usage-queries) or directly run manual queries from [Investigate your Log Analytics usage](https://docs.microsoft.com/en-us/azure/azure-monitor/logs/manage-cost-storage#investigate-your-log-analytics-usage).

Once you have identified the high cost items, you can reduce the events generated at the source, using a [Logstash filter](https://docs.microsoft.com/en-us/azure/sentinel/connect-logstash) for a custom source or with configuration in Sentinel itself:

- [Ingestion time transformations](https://docs.microsoft.com/en-us/azure/azure-monitor/logs/ingestion-time-transformations) - should be used to eliminate low value logs before they are persisted within Log Analytics & Sentinel
- [Basic Logs](https://docs.microsoft.com/en-us/azure/azure-monitor/logs/basic-logs-configure?tabs=cli-1%2Cportal-1) - should be used for high volume tables that aren't queried regularly (approx 1/4 cost per GB ingested)

The below chart shows your ingestion usage by table over the past week. 
Typically your ingestion should be under 30MB per licensed user in your tenant (5MB of that is included for free under the 
[Microsoft Sentinel benefit for Microsoft 365 E5, A5, F5, and G5 customers](https://azure.microsoft.com/en-us/offers/sentinel-microsoft-365-offer/)).
"""
# Restrict the usage data to the five days leading up to its newest record.
usage = queries["Ingestion Detail"].copy(deep=True)
usage_start = usage["TimeGenerated"].max() - pd.to_timedelta("5D")
usage = usage[usage["TimeGenerated"] >= usage_start].reset_index()

# Scale IngestionVolume down by 1000 for the "GB" axis
# (assumes the source values are in MB — TODO confirm against usage.kql).
usage["GB"] = usage["IngestionVolume"].map(lambda volume: volume / 1000)

# Only rows whose Billable value is the string "True" are charted.
# NOTE(review): this matches nothing if Billable is a real boolean column — confirm.
billable_usage = usage[usage["Billable"] == "True"]
report["Cost Optimisation"] += KQL.df2fig(billable_usage, "Billable Ingestion (GB) by Table", "TimeGenerated", "GB", "Table")

In [None]:
# Render the report to local files, then copy each one into this agency's
# reports folder. The local files are read inside `with` blocks so their
# handles are closed immediately instead of being leaked.
report.save_html("report.html")
with open("report.html") as html_file:
    (kp.reports / kp.agency / f"{kp.agency}-{month}-siemhealth.html").write_text(html_file.read())
report.save_pdf("report.pdf")
with open("report.pdf", "rb") as pdf_file:
    (kp.reports / kp.agency / f"{kp.agency}-{month}-siemhealth.pdf").write_bytes(pdf_file.read())
# Bare last expression so the notebook shows the report as the cell output.
report