In [26]:
import pandas as pd
import dask.dataframe as dd

df = dd.read_csv("C:/Users/myaka/Desktop/Final_Year_project/data/device.csv")

In [24]:
print(df.head())


                         id                 date     user       pc  \
0  {Z2Q8-K3AV28BE-9353JIRT}  01/02/2010 07:17:18  SDH2394  PC-5849   
1  {C7F1-G7LE60RU-2483DAXS}  01/02/2010 07:22:42  JKS2444  PC-6961   
2  {T9A4-D4RV69OF-1704NINW}  01/02/2010 07:31:42  CBA1023  PC-1570   
3  {S8L0-O6QQ15NL-0636OYNV}  01/02/2010 07:33:28  GNT0221  PC-6427   
4  {U0F1-R1FX27FM-6954TTVU}  01/02/2010 07:33:55  JKS2444  PC-6961   

                                           file_tree    activity  
0                          R:\;R:\22B5gX4;R:\SDH2394     Connect  
1                                     R:\;R:\JKS2444     Connect  
2  R:\;R:\42gY283;R:\48rr4y2;R:\59ntt61;R:\76xCQG...     Connect  
3                                     R:\;R:\GNT0221     Connect  
4                                               <NA>  Disconnect  


In [25]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask_ml.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from datetime import datetime


# Convert date to datetime format
df["date"] = dd.to_datetime(df["date"])

# Extract features from timestamp
df["hour_of_day"] = df["date"].dt.hour
df["day_of_week"] = df["date"].dt.dayofweek  # 0 = Monday, 6 = Sunday
df["is_weekend"] = df["day_of_week"].apply(lambda x: 1 if x >= 5 else 0, meta=('x', 'int64'))
df["is_midnight_activity"] = df["hour_of_day"].apply(lambda x: 1 if x < 5 else 0, meta=('x', 'int64'))

# Compute final dataset before sorting
df = df.compute()

# Sort by user and timestamp
df = df.sort_values(["id", "date"])

# Compute time since last activity
df["time_since_last_activity"] = df.groupby("id")["date"].diff().dt.total_seconds().fillna(0)

# Apply log transformation to time_since_last_activity
df["log_time_since_last_activity"] = np.log1p(df["time_since_last_activity"])

# Encode activity types
encoder = LabelEncoder()
df["activity_encoded"] = encoder.fit_transform(df["activity"])

# Select relevant features for anomaly detection
features = ["hour_of_day", "day_of_week", "is_weekend", "is_midnight_activity", "log_time_since_last_activity", "activity_encoded"]
X = df[features]

# Train Isolation Forest for anomaly detection
model = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
model.fit(X)  # Fit the model before making predictions
df["anomaly_score_raw"] = model.decision_function(X)

# Normalize anomaly score using standardization
# Normalize anomaly score to a scale of 0 to 10
df["anomaly_score"] = ((df["anomaly_score_raw"] - df["anomaly_score_raw"].min()) / 
                         (df["anomaly_score_raw"].max() - df["anomaly_score_raw"].min())) * 10

# Flag anomalies (1: normal, -1: suspicious)
df["is_anomalous"] = df["anomaly_score_raw"].map(lambda x: 1 if x < -0.1 else 0)

# Print flagged anomalies with scores
print(df[df["is_anomalous"] == 1][["id", "date", "activity", "anomaly_score"]])


                              id                date    activity  \
289140  {A0A0-C8SQ42CK-2477BXSG} 2010-12-12 12:54:34  Disconnect   
653171  {A0A0-F4VU94RC-0487BQGH} 2010-07-28 03:03:48  Disconnect   
689437  {A0A1-D9FE11AT-7598DJHA} 2011-04-30 08:36:26     Connect   
473999  {A0A1-O0ON57PK-3129JVST} 2010-05-29 15:41:58     Connect   
591049  {A0A2-S5IT49VW-4618HHFG} 2011-03-26 10:08:48  Disconnect   
...                          ...                 ...         ...   
324368  {Z9Z7-U3AI43IF-4930VPJF} 2010-12-25 15:36:03  Disconnect   
383917  {Z9Z7-Y8TU02JE-5072UKER} 2010-05-01 15:56:59     Connect   
73828   {Z9Z8-K6AS68QI-5899APZK} 2010-09-30 02:11:44  Disconnect   
754475  {Z9Z9-C9ME83QV-8500ANHZ} 2010-08-30 01:17:25     Connect   
559764  {Z9Z9-Y1ZB92YK-2297YZAC} 2010-06-27 08:22:10     Connect   

        anomaly_score  
289140       2.579664  
653171       2.818373  
689437       3.245137  
473999       3.261617  
591049       3.138433  
...               ...  
324368       3.

In [28]:
import pickle

with open("anomaly_device.pkl", "wb") as model_file:
    pickle.dump(model, model_file)