# Isolation Forest

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.linear_model import SGDClassifier


In [2]:
# Load the per-process samples (pid, cpu, ram, ...) from a CSV path that is
# relative to the notebook's working directory.
df = pd.read_csv("../data/all_data.csv")  # Replace with your actual data source

# -----------------------------------------------------------
# 1. Convert timestamp columns to pandas datetime64 values
#    (both columns are overwritten in place on `df`)
# -----------------------------------------------------------
df["timestamp_parsed"] = pd.to_datetime(df["timestamp_parsed"])
df["last_seen"] = pd.to_datetime(df["last_seen"])

# -----------------------------------------------------------
# 2. Feature Engineering (timestamps → numeric features)
#    hour / day-of-month / weekday come from `timestamp_parsed`;
#    time_alive is the gap between `timestamp_parsed` and `last_seen`
#    expressed in seconds.
# -----------------------------------------------------------
df["hour"] = df["timestamp_parsed"].dt.hour
df["day"] = df["timestamp_parsed"].dt.day
df["weekday"] = df["timestamp_parsed"].dt.weekday
df["time_alive"] = (df["last_seen"] - df["timestamp_parsed"]).dt.total_seconds()

# -----------------------------------------------------------
# 3. Select numeric features only
#    The order of this list is the feature order every model and every
#    realtime helper in this notebook relies on.
#    NOTE(review): `pid` is an identifier rather than a measurement —
#    confirm it is really meant to be a model feature.
# -----------------------------------------------------------
numeric_cols = ["pid", "cpu", "ram", "occurrence_count", "hour", "day", "weekday", "time_alive"]

X = df[numeric_cols]

# -----------------------------------------------------------
# 4. Scale numeric features
#    The scaler is fitted on a DataFrame, so it records the column names
#    and is fitted on *all* rows.
# -----------------------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optional: supervised mode (if Class exists); `y` is None otherwise.
y = df["Class"] if "Class" in df.columns else None

# -----------------------------------------------------------
# 5. Train-test split (only if supervised)
#    NOTE(review): the scaler above was fitted before this split, so in
#    supervised mode the test rows have already influenced the scaling
#    statistics. The split variables are only defined when `y` exists.
# -----------------------------------------------------------
if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )

In [3]:
# Unsupervised anomaly detector, trained on every scaled row.
iso_model = IsolationForest(
    n_estimators=200,       # number of isolation trees in the ensemble
    contamination=0.01,     # expected proportion of anomalies (0.01 = 1% of rows)
    random_state=42,        # fixed seed so the fit is reproducible
    bootstrap=True          # each tree is fit on rows sampled with replacement
)

# Last expression of the cell, so the fitted estimator is also displayed.
iso_model.fit(X_scaled)


0,1,2
,n_estimators,200
,max_samples,'auto'
,contamination,0.01
,max_features,1.0
,bootstrap,True
,n_jobs,
,random_state,42
,verbose,0
,warm_start,False


In [4]:
# Score the same rows the model was trained on and keep the label on `df`.
df["anomaly"] = iso_model.predict(X_scaled)

# IsolationForest.predict returns:
# -1 → anomaly
#  1 → normal


In [5]:
import joblib
import os

# Create the output directory if it is missing (no error when it exists).
os.makedirs("../models", exist_ok=True)

# Persist the detector together with the scaler that produced its training
# matrix, so inference can apply the same scaling before calling predict.
joblib.dump(iso_model, "../models/iso_model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

print("Model + scaler saved!")


Model + scaler saved!


In [6]:
import joblib
import numpy as np

# Reload the persisted artefacts, rebinding the notebook-level names.
# joblib files are pickle-based: only load files you produced yourself.
iso_model = joblib.load("../models/iso_model.pkl")
scaler = joblib.load("../models/scaler.pkl")

def predict_realtime(pid, cpu, ram, occurrence_count, hour, day, weekday, time_alive):
    """Classify one process observation with the saved Isolation Forest.

    The eight arguments are the raw (unscaled) feature values. They are
    scaled with the notebook-level ``scaler`` and scored with the
    notebook-level ``iso_model``.

    Returns
    -------
    str
        ``"ANOMALY"`` when the model predicts -1, otherwise ``"NORMAL"``.
    """
    # The scaler was fitted on a DataFrame, so it remembers its column names.
    # Handing it a bare ndarray makes scikit-learn warn about missing feature
    # names on every call and prevents it from checking the column order, so
    # build a one-row frame with the training column names instead.
    feature_names = [
        "pid", "cpu", "ram", "occurrence_count",
        "hour", "day", "weekday", "time_alive",
    ]
    sample = pd.DataFrame(
        [[pid, cpu, ram, occurrence_count, hour, day, weekday, time_alive]],
        columns=feature_names,
    )
    sample_scaled = scaler.transform(sample)
    result = iso_model.predict(sample_scaled)[0]
    return "ANOMALY" if result == -1 else "NORMAL"


In [7]:
# Smoke test: score one hand-written sample; the returned label is the
# cell's displayed output.
predict_realtime(
    pid=450,
    cpu=12.3,
    ram=50,
    occurrence_count=10,
    hour=16,
    day=1,
    weekday=0,
    time_alive=120.0
)




'ANOMALY'

# DBSCAN

In [8]:
# Density-based clustering over the same scaled feature matrix.
dbscan_model = DBSCAN(
    eps=0.7,           # distance threshold → tune this carefully
    min_samples=5,     # how many points needed for a dense cluster
    metric='euclidean'
)

# fit_predict returns one cluster label per row; -1 marks noise points.
dbscan_labels = dbscan_model.fit_predict(X_scaled)

In [9]:
# Binary flag: 1 where DBSCAN labelled the row as noise (-1), otherwise 0.
df["dbscan_anomaly"] = (dbscan_labels == -1).astype(int)

In [10]:
import joblib
import os

# Create the output directory if it is missing (no error when it exists).
os.makedirs("../models", exist_ok=True)

# Persist the fitted DBSCAN model, and write whatever is currently bound to
# `scaler` under a DBSCAN-specific filename.
joblib.dump(dbscan_model, "../models/dbscan_model.pkl")
joblib.dump(scaler, "../models/dbscan_scaler.pkl")

print("DBSCAN model saved!")

DBSCAN model saved!


In [11]:
import numpy as np
import joblib

# Reload the persisted artefacts, rebinding the notebook-level names.
# joblib files are pickle-based: only load files you produced yourself.
dbscan_model = joblib.load("../models/dbscan_model.pkl")
scaler = joblib.load("../models/dbscan_scaler.pkl")

def dbscan_predict_realtime(pid, cpu, ram, occurrence_count, hour, day, weekday, time_alive):
    """Classify one process observation against the fitted DBSCAN clusters.

    DBSCAN has no ``predict`` method, so membership is approximated: the
    sample is scaled with the notebook-level ``scaler`` and compared, by
    Euclidean distance, with every core sample of the notebook-level
    ``dbscan_model``.

    Returns
    -------
    str
        ``"NORMAL"`` when at least one core sample lies within ``eps`` of the
        scaled sample, otherwise ``"ANOMALY"``.
    """
    # The scaler was fitted on a DataFrame, so it remembers its column names.
    # Handing it a bare ndarray makes scikit-learn warn about missing feature
    # names on every call and prevents it from checking the column order, so
    # build a one-row frame with the training column names instead.
    feature_names = [
        "pid", "cpu", "ram", "occurrence_count",
        "hour", "day", "weekday", "time_alive",
    ]
    sample = pd.DataFrame(
        [[pid, cpu, ram, occurrence_count, hour, day, weekday, time_alive]],
        columns=feature_names,
    )
    sample_scaled = scaler.transform(sample)

    # distance from sample to all core samples
    core_samples = dbscan_model.components_
    distances = np.linalg.norm(core_samples - sample_scaled, axis=1)

    # within eps of any core sample → normal; otherwise (including when the
    # model has no core samples at all) → anomaly
    return "NORMAL" if np.any(distances <= dbscan_model.eps) else "ANOMALY"


In [12]:
# Smoke test: score one hand-written sample; the returned label is the
# cell's displayed output.
dbscan_predict_realtime(
    pid=450,
    cpu=22.5,
    ram=40,
    occurrence_count=5,
    hour=18,
    day=3,
    weekday=2,
    time_alive=200.0
)




'ANOMALY'

In [13]:

# Use the Isolation Forest predictions (-1 = anomaly, 1 = normal) as the
# training target. These are pseudo-labels produced by another model, not
# ground truth, and this rebinds the `y` assigned earlier in the notebook.
y = df['anomaly']  # required for AdaBoost (supervised)

In [14]:
# Base estimator that supports partial_fit
# (log_loss makes the SGD model a logistic-regression classifier).
base = SGDClassifier(
    loss="log_loss",
    learning_rate="optimal",
    random_state=42
)

# AdaBoost with online base learner
# NOTE(review): newer scikit-learn releases deprecate the `algorithm`
# argument — confirm against the installed version before upgrading.
ada = AdaBoostClassifier(
    estimator=base,
    n_estimators=50,
    learning_rate=0.1,
    algorithm="SAMME",
    random_state=42
)

# Initial training (full batch) on the Isolation Forest pseudo-labels in `y`.
# Last expression of the cell, so the fitted estimator is also displayed.
ada.fit(X_scaled, y)



0,1,2
,estimator,SGDClassifier...ndom_state=42)
,n_estimators,50
,learning_rate,0.1
,algorithm,'SAMME'
,random_state,42

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [15]:
# Update base estimator inside AdaBoost
def adaboost_partial_fit(model, X_new, y_new, classes):
    """Incrementally update every fitted base learner of an ensemble.

    Each object in ``model.estimators_`` receives one
    ``partial_fit(X_new, y_new, classes=classes)`` call, in order. Nothing
    else on ``model`` is touched, so ensemble-level state is left exactly as
    the initial fit produced it.

    NOTE(review): every learner sees the same unweighted batch — confirm this
    is the intended online-update strategy for a boosted ensemble.

    Parameters
    ----------
    model : object exposing an iterable ``estimators_`` attribute.
    X_new, y_new : the new batch, forwarded unchanged to each learner.
    classes : forwarded as the ``classes`` keyword of ``partial_fit``.

    Returns
    -------
    The same ``model`` object, updated in place.
    """
    weak_learners = model.estimators_
    for learner in weak_learners:
        learner.partial_fit(X_new, y_new, classes=classes)
    return model


In [16]:
# Every label value present in the training target; forwarded to partial_fit.
classes = y.unique()

# Demo batch: the first 10 training rows stand in for freshly arrived data.
X_new = X_scaled[:10]      # new incoming realtime samples
y_new = y[:10]

# The helper updates the ensemble in place and returns the same object.
ada = adaboost_partial_fit(ada, X_new, y_new, classes)

In [17]:
import joblib
# Persist the updated ensemble and whatever is currently bound to `scaler`.
# Assumes the ../models directory already exists (this cell does not create it).
# The second dump is the cell's last expression, so its return value (the
# list of written paths) is displayed.
joblib.dump(ada, "../models/ada_model.pkl")
joblib.dump(scaler, "../models/ada_scaler.pkl")

['../models/ada_scaler.pkl']

In [18]:
import numpy as np
import joblib

# Reload the persisted artefacts, rebinding the notebook-level names.
# joblib files are pickle-based: only load files you produced yourself.
ada = joblib.load("../models/ada_model.pkl")
scaler = joblib.load("../models/ada_scaler.pkl")

def adaboost_predict_realtime(pid, cpu, ram, occurrence_count, hour, day, weekday, time_alive):
    """Predict the label of one process observation with the saved AdaBoost model.

    The eight arguments are the raw (unscaled) feature values. They are
    scaled with the notebook-level ``scaler`` and scored with the
    notebook-level ``ada`` ensemble.

    Returns
    -------
    The raw predicted label exactly as ``ada.predict`` produces it (the model
    in this notebook is trained on -1 / 1 targets). Unlike the other realtime
    helpers in this notebook, the label is not mapped to a string.
    """
    # The scaler was fitted on a DataFrame, so it remembers its column names.
    # Handing it a bare ndarray makes scikit-learn warn about missing feature
    # names on every call and prevents it from checking the column order, so
    # build a one-row frame with the training column names instead.
    feature_names = [
        "pid", "cpu", "ram", "occurrence_count",
        "hour", "day", "weekday", "time_alive",
    ]
    x = pd.DataFrame(
        [[pid, cpu, ram, occurrence_count, hour, day, weekday, time_alive]],
        columns=feature_names,
    )
    x_scaled = scaler.transform(x)
    pred = ada.predict(x_scaled)[0]
    return pred

In [19]:
# Summarise column dtypes and non-null counts after the columns added above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42826 entries, 0 to 42825
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   id                42826 non-null  object             
 1   timestamp         42826 non-null  object             
 2   pid               42826 non-null  int64              
 3   name              42826 non-null  object             
 4   cpu               42826 non-null  float64            
 5   ram               42826 non-null  float64            
 6   process_key       42826 non-null  object             
 7   occurrence_count  42826 non-null  int64              
 8   timestamp_parsed  42826 non-null  datetime64[ns, UTC]
 9   last_seen         42826 non-null  datetime64[ns, UTC]
 10  hour              42826 non-null  int32              
 11  day               42826 non-null  int32              
 12  weekday           42826 non-null  int32              
 13  t

In [20]:
# Display the frame; pandas elides the middle rows of a frame this long.
df

Unnamed: 0,id,timestamp,pid,name,cpu,ram,process_key,occurrence_count,timestamp_parsed,last_seen,hour,day,weekday,time_alive,anomaly,dbscan_anomaly
0,407bda89-3ae7-4d5c-9c68-dcd21989c0bd,2025-11-30T21:45:47.342014+00:00,226,loginwindow,1.3,37.437500,226||loginwindow,82,2025-11-30 21:45:47.342014+00:00,2025-12-01 16:33:36.828203+00:00,21,30,6,67669.486189,1,0
1,e8d1c65e-a41c-4d58-aa0f-1cdeffd7e728,2025-11-30T21:45:47.342014+00:00,454,distnoted,0.1,3.121094,454||distnoted,82,2025-11-30 21:45:47.342014+00:00,2025-12-01 16:33:36.828203+00:00,21,30,6,67669.486189,1,0
2,4dddc5d9-0745-4ce6-ba25-aec89983820e,2025-11-30T21:45:47.342014+00:00,455,cfprefsd,0.0,5.687500,455||cfprefsd,82,2025-11-30 21:45:47.342014+00:00,2025-12-01 16:33:36.828203+00:00,21,30,6,67669.486189,1,0
3,ac64f4ab-ec7d-436c-b34d-b8e81e9dda09,2025-11-30T21:45:47.342014+00:00,456,UserEventAgent,0.0,9.820312,456||UserEventAgent,82,2025-11-30 21:45:47.342014+00:00,2025-12-01 16:33:36.828203+00:00,21,30,6,67669.486189,1,0
4,0fdaadfd-3b53-412f-8fe2-ee392e5015e5,2025-11-30T21:45:47.342014+00:00,461,neagent,0.0,4.515625,461||neagent,82,2025-11-30 21:45:47.342014+00:00,2025-12-01 16:33:36.828203+00:00,21,30,6,67669.486189,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42821,a7d35935-71d2-4d80-b89c-a65480302ca4,2025-12-01T16:33:36.828203+00:00,95982,MTLCompilerService,0.0,0.089844,95982||MTLCompilerService,82,2025-12-01 16:33:36.828203+00:00,2025-12-01 16:33:36.828203+00:00,16,1,0,0.000000,1,0
42822,c0f486ac-5f44-4b1d-b3d5-5b775c8b146a,2025-12-01T16:33:36.828203+00:00,96067,screencaptureui,0.0,25.210938,96067||screencaptureui,82,2025-12-01 16:33:36.828203+00:00,2025-12-01 16:33:36.828203+00:00,16,1,0,0.000000,1,0
42823,149f6591-73fe-4a24-a45d-23fead345427,2025-12-01T16:33:36.828203+00:00,96906,com.apple.WebKit.WebContent,0.0,25.855469,96906||com.apple.WebKit.WebContent,82,2025-12-01 16:33:36.828203+00:00,2025-12-01 16:33:36.828203+00:00,16,1,0,0.000000,1,0
42824,4d6ebaa3-e9c6-45ce-92eb-eddd94d94bf6,2025-12-01T16:33:36.828203+00:00,97130,com.apple.WebKit.WebContent,0.0,106.296875,97130||com.apple.WebKit.WebContent,55,2025-12-01 16:33:36.828203+00:00,2025-12-01 16:33:36.828203+00:00,16,1,0,0.000000,1,0


In [21]:
# Sanity check: list the distinct values of the DBSCAN flag (1 = noise point).
df['dbscan_anomaly'].unique()

array([0, 1])

In [22]:
# Sanity check: list the distinct Isolation Forest labels (-1 = anomaly, 1 = normal).
df['anomaly'].unique()

array([ 1, -1])