In [1]:
# add default values for parameters here

# Guided Hunting - Anomaly detection with Isolation Forest on Windows Logon data

 <details>
     <summary>&nbsp;<u>Details...</u></summary>

__Notebook Version:__ 1.0<br>
__Python Version:__ Python 3.8 - AzureML<br>
__Required Packages:__  Msticpy, Msticnb, pandas, numpy, scikit-learn, seaborn, matplotlib, ipywidgets <br>
__Platforms Supported:__  Azure Machine Learning Notebooks
     
__Data Source Required:__ Yes

__Data Source:__ SecurityEvents

</details>

**Description**

In this sample guided scenario notebook, we will demonstrate how to hunt for anomalous user logon activity using an [Isolation Forest](https://en.wikipedia.org/wiki/Isolation_forest) model.
<br>We will start by reading historical Windows logon data from a Microsoft Sentinel workspace, then we will
preprocess the dataset using a series of data preparation steps such as aggregation, summarization, data type conversion, deriving new fields, etc. Then we will perform [Feature Engineering](https://en.wikipedia.org/wiki/Feature_engineering) and select a subset of features
from the data prepared in the previous steps to create the Isolation Forest model. Finally, we will run the model to score the results and identify the anomalies with the highest scores.

<br>The Isolation Forest algorithm splits the data into two parts based on a randomly chosen feature and a random threshold value. It recursively continues the splitting until each data point is isolated. Anomalies are then detected by how easily a data point can be isolated from the rest of the data.
To detect an anomaly, the Isolation Forest takes the average path length (the number of splits needed to isolate a sample) across all the trees
for a given instance and uses this to determine whether it is an anomaly (shorter average path lengths indicate anomalies).


***Please run the cells sequentially to avoid errors.
<br>Please do not use "run all cells".***

## Notebook initialization

- Checks for the correct Python version
- Checks versions and optionally installs required packages
- Imports the required packages into the notebook
- Sets a number of configuration options.

In [None]:
import subprocess
import sys

# List of required modules (pip distribution names)
required_modules = [
    'pandas', 'msticpy', 'ipywidgets', 'seaborn', 'matplotlib', 'scikit-learn'
]

# pip distribution names whose importable module name is different.
# Without this mapping, importing 'scikit-learn' always raises ImportError
# (the module is called 'sklearn'), so pip would be invoked on every run
# even when the package is already installed.
IMPORT_NAME_OVERRIDES = {
    'scikit-learn': 'sklearn',
}


# Function to check and install missing modules
def install_module(module_name):
    """Make sure a required package is importable, installing it with pip if needed.

    Args:
        module_name: pip distribution name of the package. The importable module
            name is looked up in IMPORT_NAME_OVERRIDES and defaults to the same
            string.

    Side effects:
        Prints progress messages, may run ``pip install`` in a subprocess using
        the current interpreter, and calls ``sys.exit(1)`` if pip fails.
    """
    import importlib

    import_name = IMPORT_NAME_OVERRIDES.get(module_name, module_name)
    try:
        importlib.import_module(import_name)  # Attempt to import the module
    except ImportError:
        print(f"Module '{module_name}' not found. Attempting to install...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", module_name])  # Install the module using pip
            # Let the import system notice packages installed while this process is running
            importlib.invalidate_caches()
            print(f"Successfully installed '{module_name}'.")
        except subprocess.CalledProcessError as e:
            print(f"Failed to install '{module_name}'. Error: {e}")
            sys.exit(1)  # Exit if installation fails


# Check and install required modules
for module in required_modules:
    install_module(module)

# After ensuring all modules are installed, import them
try:
    import pandas as pd
except ImportError as e:
    print(f"Error importing 'pandas': {e}")
    sys.exit(1)

try:
    import msticpy as mp
except ImportError as e:
    print(f"Error importing 'msticpy': {e}")
    sys.exit(1)

try:
    import ipywidgets as widgets
except ImportError as e:
    print(f"Error importing 'ipywidgets': {e}")
    sys.exit(1)

try:
    import seaborn as sns
except ImportError as e:
    print(f"Error importing 'seaborn': {e}")
    sys.exit(1)

try:
    import numpy as np
except ImportError as e:
    print(f"Error importing 'numpy': {e}")
    sys.exit(1)

try:
    import matplotlib.pyplot as plt
except ImportError as e:
    print(f"Error importing 'matplotlib': {e}")
    sys.exit(1)

try:
    from sklearn.decomposition import PCA
except ImportError as e:
    print(f"Error importing 'PCA' from 'sklearn.decomposition': {e}")
    sys.exit(1)

try:
    from sklearn.preprocessing import StandardScaler
except ImportError as e:
    print(f"Error importing 'StandardScaler' from 'sklearn.preprocessing': {e}")
    sys.exit(1)

try:
    from mpl_toolkits.mplot3d import Axes3D
except ImportError as e:
    print(f"Error importing 'Axes3D' from 'mpl_toolkits.mplot3d': {e}")
    sys.exit(1)

try:
    from sklearn.ensemble import IsolationForest
except ImportError as e:
    print(f"Error importing 'IsolationForest' from 'sklearn.ensemble': {e}")
    sys.exit(1)

# Additional setup: initialise MSTICPy for this notebook session.
try:
    # NOTE(review): these two version tuples are not referenced anywhere else in the
    # visible notebook — presumably intended for a version check; confirm or remove.
    REQ_PYTHON_VER = (3, 11)
    REQ_MSTICPY_VER = (1, 7, 0)

    # print() emits this string verbatim, so the <h3> tags appear as literal text
    print("<h3>Starting Notebook setup...</h3>")

    # NOTE(review): passes this notebook's global namespace to MSTICPy — presumably so it
    # can inject its standard imports/helpers; confirm against the msticpy documentation.
    mp.init_notebook(namespace=globals())
except Exception as e:
    print(f"Error during MSTICPy initialization: {e}")
    sys.exit(1)

# Setting widget defaults: shared layout/style keyword arguments for ipywidgets controls
try:
    WIDGET_DEFAULTS = {
        "layout": widgets.Layout(width="95%"),
        "style": {"description_width": "initial"},
    }
except Exception as e:
    print(f"Error setting widget defaults: {e}")
    sys.exit(1)


### Authentication to LA Workspace
 <details>
     <summary>&nbsp;<u>Details...</u></summary>
If you are using user/device authentication, run the following cell.
- Click the 'Copy code to clipboard and authenticate' button.
- This will pop up an Azure Active Directory authentication dialog (in a new tab or browser window). The device code will have been copied to the clipboard.
- Select the text box and paste (Ctrl-V/Cmd-V) the copied value.
- You should then be redirected to a user authentication page where you should authenticate with a user account that has permission to query your Log Analytics workspace.

Use the following syntax if you are authenticating using an Azure Active Directory AppId and Secret:
```
%kql loganalytics://tenant(aad_tenant).workspace(WORKSPACE_ID).clientid(client_id).clientsecret(client_secret)
```
instead of
```
%kql loganalytics://code().workspace(WORKSPACE_ID)
```

Note: you may occasionally see a JavaScript error displayed at the end of the authentication - you can safely ignore this.<br>
On successful authentication you should see a ```popup schema``` button.
To find your Workspace Id go to [Log Analytics](https://ms.portal.azure.com/#blade/HubsExtension/Resources/resourceType/Microsoft.OperationalInsights%2Fworkspaces). Look at the workspace properties to find the ID.
</details>

In [3]:
# See if we have a Microsoft Sentinel Workspace defined in our config file.
# If not, let the user specify Workspace and Tenant IDs

#ws_config = WorkspaceConfig(workspace="Default")
#if not ws_config.config_loaded:
    #ws_config.prompt_for_ws()

#qry_prov = QueryProvider(data_environment="MSSentinel")
# Authentication
#qry_prov.connect(ws_config)
#table_index = qry_prov.schema_tables

## Data Preparation
In this step, we will prepare the Windows logon events and do some preprocessing before we do data modelling. For this case, we are primarily considering logon event ids 4624, 4625 with specific logon type.

Events 4624 and 4625 correspond to successful and failed sign-ins, respectively. You can read more about these event IDs in the links below.

- [4624(S): An account was successfully logged on.](https://docs.microsoft.com/windows/security/threat-protection/auditing/event-4624)
- [4625(F): An account failed to log on.](https://docs.microsoft.com/windows/security/threat-protection/auditing/event-4625)

### Historical Data Processing
For this model, we can consider up to 21 days of historical data.
For this demo, the next cell loads a provided demo dataset so that you can test the notebook without connecting to your workspace. To run against your own data, enable the authentication cell above and retrieve the data from the original table in your workspace instead.

In [None]:
# Load the demo dataset (a CSV hosted in the Azure-Sentinel-Notebooks GitHub repo) for testing
dataset_url = "https://raw.githubusercontent.com/Azure/Azure-Sentinel-Notebooks/master/src/data/iforest-demo-data.csv"
win_agg_df = pd.read_csv(dataset_url)
# Show the loaded DataFrame as the cell output (no profile report is generated here)
win_agg_df

## Feature Engineering

In this step, we are creating additional features/columns.

We have selected 4 columns(features) with numeric data points
- FailedLogons
- SuccessfulLogons
- ComputersSuccessfulAccess
- SrcIpSuccessfulAccess

We also derive additional columns by calculating the mean, standard deviation and z-score of each of them. Converting to z-scores is not strictly necessary for numerical features, as Isolation Forest is scale invariant, but this pre-processing is done so that these features can be used later in visualizations such as PCA. We also apply log scaling as part of the pre-processing steps; it is not required, but based on various data studies in production environments we have seen that it gives finer results. You can skip or add this step based on your own data study and analysis of the results.

In [None]:
def get_zscore(value, mean, std):
    """Return a log-dampened, one-sided z-score for ``value``.

    When no usable baseline exists (``std`` is 0, None or a NaN/none/null token,
    or ``mean`` is None) the result is ``log10(value + 1)`` (0.0 for a zero value).
    Otherwise the z-score ``(value - mean) / std`` is floored at 0, so only
    increases count, and then passed through ``log10(z + 1)``.
    """
    missing_tokens = ("nan", "none", "null")
    no_baseline = (
        std == 0
        or std is None
        or str(std).lower() in missing_tokens
        or mean is None
    )
    if no_baseline:
        if value == 0.0:
            return 0.0
        if value != 0:
            return np.log10(value + 1)

    # number of standard deviations away from the mean
    deviation = (value - mean) / std
    # ignore decreases, then take the log to dampen large values
    positive_part = max(0.0, deviation)
    return float(np.log10(positive_part + 1))


# Work on a copy so the loaded frame (win_agg_df) is left untouched
data = win_agg_df.copy()

# Numeric metrics that get a per-user mean, standard deviation and z-score
zscore_columns = [
    "FailedLogons",
    "SuccessfulLogons",
    "ComputersSuccessfulAccess",
    "SrcIpSuccessfulAccess",
]
# Derived column names: <metric>_mean, <metric>_std, <metric>_zscore
means = [x + "_mean" for x in zscore_columns]
stds = [x + "_std" for x in zscore_columns]
zscores = [x + "_zscore" for x in zscore_columns]

# Identifying columns carried alongside the metrics
ind = ["DstDomain", "DstUser", "Date"]

# Missing values are replaced with 0 before the statistics are computed
zscore = data[zscore_columns + ind]
zscore = zscore.fillna(0)

# mean of each metric per (DstDomain, DstUser) group, broadcast to every row of the group
zscore[means] = zscore.groupby(["DstDomain", "DstUser"])[zscore_columns].transform(
    "mean"
)

# sample standard deviation (ddof=1) of each metric per (DstDomain, DstUser) group;
# a group with a single row yields NaN here
zscore[stds] = zscore.groupby(["DstDomain", "DstUser"])[zscore_columns].transform(
    "std", ddof=1
)

# every row of a group carries the same statistics, so keep one row per (DstDomain, DstUser)
zscore = zscore.drop_duplicates(["DstDomain", "DstUser"])

# keep only the statistics and the join keys
zscore = zscore[means + stds + ["DstDomain", "DstUser"]]

# attach the per-user statistics to every original row
data = data.merge(zscore, how="left", on=["DstDomain", "DstUser"])

# Calculating z scores row by row with get_zscore
for column in zscore_columns:
    data[f"{column}_zscore"] = data.apply(
        lambda row: get_zscore(
            row[f"{column}"], row[f"{column}_mean"], row[f"{column}_std"]
        ),
        axis=1,
    )

print("Operations on the dataframe succesful , and here is a sample of the  newly created data.")
data.head()

## Data Modelling
In this step we will specify features to be modelled and run isolation forest algorithm against the data.

###  Isolation Forest Anomaly detection

In this step, we will select subset of features generated from previous step and use it for data modelling. We will also use Isolation Forest model on the data with selected features and calculate the anomalies.

In [None]:
def apply_isolation_forest(df, n_estimators, contamination=0.01, max_features=6):
    """Fit an Isolation Forest on ``df`` and score every row of it.

    Args:
        df: DataFrame holding only the feature columns to model, one row per sample.
        n_estimators: number of trees in the forest.
        contamination: expected proportion of outliers, passed to IsolationForest.
        max_features: number of features drawn to train each tree. The default of 6
            matches the previous hard-coded value. A value larger than the number of
            columns in ``df`` is reduced to that number, because scikit-learn raises
            a ValueError when ``max_features`` exceeds the number of features.

    Returns:
        Tuple ``(clf, pred, scores)``: the fitted model, the output of ``clf.predict``
        (-1 marks an anomaly) and the output of ``clf.decision_function``.
    """
    n_features = df.shape[1]
    if max_features > n_features:
        max_features = n_features

    clf = IsolationForest(
        n_estimators=n_estimators,
        max_samples="auto",
        contamination=contamination,
        max_features=max_features,
        bootstrap=False,
        n_jobs=-1,
        random_state=42,  # fixed seed so repeated runs give the same forest
        verbose=0,
    )
    values = df.values
    clf.fit(values)
    pred = clf.predict(values)
    scores = clf.decision_function(values)
    return clf, pred, scores

# Feature columns fed to the Isolation Forest
features = [
    "FailedLogons_zscore",
    "SuccessfulLogons_zscore",
    "ComputersSuccessfulAccess_zscore",
    "ComputerDomainsSuccessfulAccess",
    "SrcIpSuccessfulAccess_zscore",
    "SrcHostNameSuccessfulAccess",
]

# Missing feature values are treated as 0
data[features] = data[features].fillna(0)

X = data[features].copy()
# Small datasets (< 500 rows) get a tree count that grows with the data; otherwise use 100 trees
n_estimators = len(features) * 4 + X.shape[0] * 2 if X.shape[0] < 500 else 100

clf, pred, scores = apply_isolation_forest(X, n_estimators, contamination=0.01)
data["anomaly"] = pred
# flip the sign so that a higher score means more anomalous
data["score"] = -scores
# excluding users who do not have any successful logon history.
data = data[data["SuccessfulLogons"] > 0]
outliers = data[data["anomaly"] == -1]

# Intermediate statistics columns that are hidden from the displayed result
columns_to_drop = [
    f"{metric}_{stat}"
    for stat in ("mean", "std", "zscore")
    for metric in (
        "FailedLogons",
        "SuccessfulLogons",
        "ComputersSuccessfulAccess",
        "SrcIpSuccessfulAccess",
    )
]
data_to_display = outliers.drop(columns=columns_to_drop)

print("Model trained succesfuly\nHere are its predictions")
print("Number of outliers:", len(outliers))
print("Top anomalies by score")
# most anomalous rows first
data_to_display = data_to_display.sort_values(by=["score"], ascending=False)
data_to_display.head()






## Data Visualization
In this step, we will explore various ways we can visualize the outliers identified from previous step.

### Reducing the dimensionality of the data

In [7]:
# Project the model features onto three principal components for plotting
pca3 = PCA(n_components=3)  # Reduce to k=3 dimensions
scaler = StandardScaler()
# normalize the metrics before running PCA
X = scaler.fit_transform(data[features])
X_reduce = pca3.fit_transform(X)


# Add destination user to label: anomalous rows are labelled "<DstDomain> <DstUser>",
# every other row shares the label "non-anomalous".
# `data` comes from a boolean row filter, so adding a column to it in place can raise
# pandas' SettingWithCopyWarning; `assign` returns a new frame with the column instead.
data = data.assign(
    labels=np.where(
        data["anomaly"] == -1,
        data["DstDomain"].astype("str") + " " + data["DstUser"].astype("str"),
        "non-anomalous",
    )
)




### 2D ScatterPlot

In [None]:
# 2D view with seaborn: first two columns of the PCA projection, coloured by label
fig, ax = plt.subplots(figsize=(10, 8))
first_component = X_reduce[:, 0]
second_component = X_reduce[:, 1]
sns.scatterplot(
    x=first_component,
    y=second_component,
    hue=data['labels'],
    palette='viridis',
    ax=ax,
)

# Title and legend
ax.set_title('Distribution of data points in 2-dimensional space based on the model predictions')
ax.legend(loc='best')
plt.show()

### 3D ScatterPlot

In [None]:
# Create a 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# X_reduce is a NumPy array addressed by row position, while `data` still carries its
# original index labels after rows were filtered out. Using `data.index` labels as
# positions would plot the wrong points or raise IndexError, so rows are selected with
# boolean masks, which line up with X_reduce row for row.
if len(data) != X_reduce.shape[0]:
    raise ValueError("X_reduce must contain exactly one row per row of `data`")
anomaly_mask = (data["anomaly"] == -1).to_numpy()
normal_mask = (data["anomaly"] != -1).to_numpy()

# Scatter plot for non-anomalous data
non_anomalous = data[normal_mask]
ax.scatter(
    X_reduce[normal_mask, 0],
    X_reduce[normal_mask, 1],
    X_reduce[normal_mask, 2],
    c='blue', label='Non-anomalous'
)

# Scatter plot for anomalous data
anomalous = data[anomaly_mask]
sc = ax.scatter(
    X_reduce[anomaly_mask, 0],
    X_reduce[anomaly_mask, 1],
    X_reduce[anomaly_mask, 2],
    c='red', label='Anomalous'
)

# Annotate the anomalies with detailed labels, slightly offset from each point
offset = 0.1
label_values = data["labels"].to_numpy()
for pos in np.flatnonzero(anomaly_mask):
    ax.text(
        X_reduce[pos, 0] + offset,
        X_reduce[pos, 1] + offset,
        X_reduce[pos, 2] + offset,
        label_values[pos],
        color='red',
        fontsize=8
    )


# Set titles and labels
ax.set_title('Distribution of data points in 3-dimensional space based on the model predictions')
# Create legend
ax.legend(loc='best')
plt.show()

### Conclusion
This notebook has identified and visualized anomalous logon activities using the Isolation Forest model. SOC analysts can use the provided summaries and visualizations to investigate and respond to potential security threats.
