In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os

print(os.listdir("/kaggle/input"))


['spi-dataset']


In [2]:
print(os.listdir("/kaggle/input/spi-dataset"))


['LDAP', 'logon.csv', 'device.csv', 'psychometric.csv', 'http.csv', 'email.csv', 'file.csv']


In [25]:
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from functools import reduce

BASE_PATH = "/kaggle/input/spi-dataset/"
CHUNK = 300_000


In [26]:
print("Processing logon.csv ...")

logon_list = []

for c in pd.read_csv(BASE_PATH + "logon.csv", chunksize=CHUNK):
    c['date'] = pd.to_datetime(c['date'])
    
    daily = c.groupby(
        [c['user'], c['date'].dt.date]
    ).agg(
        logon_count=('activity', lambda x: (x == 'Logon').sum()),
        logoff_count=('activity', lambda x: (x == 'Logoff').sum()),
        unique_pcs=('pc', 'nunique')
    ).reset_index()
    
    daily.columns = ['user', 'date', 'logon_count', 'logoff_count', 'unique_pcs']
    logon_list.append(daily)

logon_features = pd.concat(logon_list)
del logon_list; gc.collect()

print("Logon features shape:", logon_features.shape)


Processing logon.csv ...
Logon features shape: (331649, 5)


In [27]:
print("Processing http.csv ...")

http_list = []

for c in pd.read_csv(BASE_PATH + "http.csv", chunksize=CHUNK):
    c['date'] = pd.to_datetime(c['date']).dt.date
    
    daily = c.groupby(['user', 'date']).agg(
        http_requests=('id', 'count'),
        unique_urls=('url', 'nunique')
    ).reset_index()
    
    http_list.append(daily)

http_features = pd.concat(http_list)
del http_list; gc.collect()

print("HTTP features shape:", http_features.shape)


Processing http.csv ...
HTTP features shape: (401551, 4)


In [17]:
import pandas as pd

sample = pd.read_csv(
    "/kaggle/input/spi-dataset/file.csv",
    nrows=5
)

print(sample.columns.tolist())
sample.head()


['id', 'date', 'user', 'pc', 'filename', 'content']


Unnamed: 0,id,date,user,pc,filename,content
0,{L9G8-J9QE34VM-2834VDPB},01/02/2010 07:23:14,MOH0273,PC-6699,EYPC9Y08.doc,D0-CF-11-E0-A1-B1-1A-E1 during difficulty over...
1,{H0W6-L4FG38XG-9897XTEN},01/02/2010 07:26:19,MOH0273,PC-6699,N3LTSU3O.pdf,25-50-44-46-2D carpenters 25 landed strait dis...
2,{M3Z0-O2KK89OX-5716MBIM},01/02/2010 08:12:03,HPH0075,PC-2417,D3D3WC9W.doc,D0-CF-11-E0-A1-B1-1A-E1 union 24 declined impo...
3,{E1I4-S4QS61TG-3652YHKR},01/02/2010 08:17:00,HPH0075,PC-2417,QCSW62YS.doc,D0-CF-11-E0-A1-B1-1A-E1 becoming period begin ...
4,{D4R7-E7JL45UX-0067XALT},01/02/2010 08:24:57,HSB0196,PC-8001,AU75JV6U.jpg,FF-D8


In [18]:
print("Processing file.csv ...")

file_list = []

for c in pd.read_csv(BASE_PATH + "file.csv", chunksize=CHUNK):
    c['date'] = pd.to_datetime(c['date']).dt.date

    daily = c.groupby(['user', 'date']).agg(
        file_events=('id', 'count'),
        unique_files=('filename', 'nunique'),
        avg_filename_length=('filename', lambda x: x.astype(str).str.len().mean())
    ).reset_index()

    file_list.append(daily)

file_features = pd.concat(file_list)
del file_list
import gc; gc.collect()

print("File features shape:", file_features.shape)


Processing file.csv ...
File features shape: (45950, 5)


In [19]:
print("Processing email.csv ...")

email_list = []

for c in pd.read_csv(BASE_PATH + "email.csv", chunksize=CHUNK):
    c['date'] = pd.to_datetime(c['date']).dt.date
    
    daily = c.groupby(['user', 'date']).agg(
        emails_sent=('id', 'count'),
        avg_email_size=('size', 'mean'),
        attachments=('attachments', lambda x: (x > 0).sum()),
        external_mails=('to', lambda x: x.str.contains("@", na=False).sum())
    ).reset_index()
    
    email_list.append(daily)

email_features = pd.concat(email_list)
del email_list; gc.collect()

print("Email features shape:", email_features.shape)


Processing email.csv ...
Email features shape: (329654, 6)


In [20]:
print("Processing device.csv ...")

device_list = []

for c in pd.read_csv(BASE_PATH + "device.csv", chunksize=CHUNK):
    c['date'] = pd.to_datetime(c['date']).dt.date
    
    daily = c.groupby(['user', 'date']).agg(
        usb_connect=('activity', lambda x: (x == 'Connect').sum()),
        usb_disconnect=('activity', lambda x: (x == 'Disconnect').sum())
    ).reset_index()
    
    device_list.append(daily)

device_features = pd.concat(device_list)
del device_list; gc.collect()

print("Device features shape:", device_features.shape)


Processing device.csv ...
Device features shape: (55754, 4)


In [22]:
print("Processing psychometric.csv ...")

psych = pd.read_csv(BASE_PATH + "psychometric.csv")
psych = psych.rename(columns={'user_id': 'user'})


Processing psychometric.csv ...


In [23]:
print(psych.shape)
psych.head()


(1000, 7)


Unnamed: 0,employee_name,user,O,C,E,A,N
0,Calvin Edan Love,CEL0561,40,39,36,19,40
1,Christine Reagan Deleon,CRD0624,26,22,17,39,32
2,Jade Felicia Caldwell,JFC0557,22,16,23,40,33
3,Aquila Stewart Dejesus,ASD0577,40,48,36,14,37
4,Micah Abdul Rojas,MAR0955,36,44,23,44,25


In [28]:
print("Merging all features ...")

from functools import reduce

dfs = [
    logon_features,
    file_features,
    email_features,
    device_features,
    http_features
]

final_df = reduce(
    lambda l, r: pd.merge(l, r, on=['user', 'date'], how='left'),
    dfs
)

final_df = pd.merge(final_df, psych, on='user', how='left')
final_df.fillna(0, inplace=True)

print("Final dataset shape:", final_df.shape)
final_df.head()


Merging all features ...
Final dataset shape: (407112, 22)


Unnamed: 0,user,date,logon_count,logoff_count,unique_pcs,file_events,unique_files,avg_filename_length,emails_sent,avg_email_size,...,usb_connect,usb_disconnect,http_requests,unique_urls,employee_name,O,C,E,A,N
0,AAE0190,2010-01-04,1,1,1,0.0,0.0,0.0,14.0,31523.428571,...,0.0,0.0,143.0,54.0,August Armando Evans,36,30,14,50,29
1,AAE0190,2010-01-05,1,1,1,0.0,0.0,0.0,13.0,27350.153846,...,0.0,0.0,143.0,53.0,August Armando Evans,36,30,14,50,29
2,AAE0190,2010-01-06,1,1,1,0.0,0.0,0.0,14.0,38046.214286,...,0.0,0.0,143.0,51.0,August Armando Evans,36,30,14,50,29
3,AAE0190,2010-01-07,1,1,1,0.0,0.0,0.0,14.0,33902.214286,...,0.0,0.0,43.0,24.0,August Armando Evans,36,30,14,50,29
4,AAE0190,2010-01-07,1,1,1,0.0,0.0,0.0,14.0,33902.214286,...,0.0,0.0,100.0,39.0,August Armando Evans,36,30,14,50,29


In [29]:
import pandas as pd
import numpy as np


In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import pandas as pd

# Step 1: Drop non-feature columns
X = final_df.drop(columns=['date'])  # Keep 'user' for encoding

# Step 2: Automatically one-hot encode all categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Step 3: Scale numeric data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Step 4: Train Isolation Forest
model = IsolationForest(
    n_estimators=300,
    contamination=0.05,
    random_state=42,
    n_jobs=-1
)
model.fit(X_scaled)

# Step 5: Predict anomalies
final_df['anomaly'] = model.predict(X_scaled)

print("Training completed")
print(final_df[['user', 'date', 'anomaly']].head())


Training completed
      user        date  anomaly
0  AAE0190  2010-01-04        1
1  AAE0190  2010-01-05        1
2  AAE0190  2010-01-06        1
3  AAE0190  2010-01-07        1
4  AAE0190  2010-01-07        1


In [33]:
print(final_df['anomaly'].value_counts())


anomaly
 1    386756
-1     20356
Name: count, dtype: int64


In [34]:
final_df.to_csv("/kaggle/working/spi_features_with_anomalies.csv", index=False)
print("Saved spi_features_with_anomalies.csv")


Saved spi_features_with_anomalies.csv


In [35]:
import joblib

# Save model
joblib.dump(model, '/kaggle/working/spi_isolation_forest_model.pkl')
print("Model saved as spi_isolation_forest_model.pkl")

# Load it later
# loaded_model = joblib.load('/kaggle/working/spi_isolation_forest_model.pkl')


Model saved as spi_isolation_forest_model.pkl
