In [1]:
import pandas as pd 
import numpy as np
from scipy.stats import entropy
# Load the synthetic authentication log and convert the "timestamp" column
# to datetime values in a single chained expression.
df = pd.read_csv("synthetic_auth_logs.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)
print(df.head())

                               event_id                  timestamp  \
0  daa36c67-e78d-4770-a9d9-c9e6ec20140e 2026-01-27 06:58:09.395577   
1  e4520439-23bd-4be3-b5f7-92afbe8f6a00 2026-01-27 07:49:45.395577   
2  94a0b46a-087a-461b-84f9-b61933111d5d 2026-01-27 08:39:49.395577   
3  75d1c0b5-bedb-45b0-ae2d-6af6d6d73826 2026-01-27 06:17:52.395577   
4  0c140d61-fec3-46fa-93c3-e348e034b183 2026-01-27 08:29:19.395577   

                user country  device     resource  success auth_method  \
0  user1@example.com  Canada  Laptop  payroll.csv        1    password   
1  user1@example.com  Canada  Laptop        email        1    password   
2  user1@example.com  Canada  Laptop    code_repo        1         mfa   
3  user1@example.com  Canada  Laptop        email        1         mfa   
4  user1@example.com  Canada  Laptop        email        1    password   

   is_attack  
0          0  
1          0  
2          0  
3          0  
4          0  


In [2]:
# Hour of day (0-23) of each event, taken from its timestamp.
df["hour"] = df["timestamp"].dt.hour

# Every per-user feature below is an aggregate over this grouping.
grouped = df.groupby("user")

# NOTE(review): hour-of-day is circular, so a plain mean/std can be misleading
# for users whose activity straddles midnight — confirm this is acceptable.
avg_login_hour = grouped["hour"].mean()
# Sample standard deviation (pandas default ddof=1): NaN for a single-event user.
std_login_hour = grouped["hour"].std()

# Number of distinct countries / devices seen per user.
unique_countries = grouped["country"].nunique()
unique_devices = grouped["device"].nunique()

# Failure rate = 1 - success rate (assumes `success` is a 0/1 flag).
# Uses the built-in vectorized group mean instead of a per-group Python lambda.
failed_login_rate = 1 - grouped["success"].mean()


In [3]:
def resource_entropy(series):
    """Return the Shannon entropy of the value distribution in ``series``.

    The relative frequency of each distinct value is computed with
    ``value_counts(normalize=True)`` and passed to ``scipy.stats.entropy``
    (called with its default arguments, i.e. natural logarithm).
    """
    return entropy(series.value_counts(normalize=True))

# Per-user entropy of accessed resources: resource_entropy is applied to each
# user's "resource" values, giving one entropy value per user.
resource_entropy_val = grouped["resource"].apply(resource_entropy)


In [4]:
# Ground truth (for evaluation only): the maximum of "is_attack" over each
# user's events (i.e. 1 if any event is flagged, assuming a 0/1 column).
attack_label = grouped["is_attack"].max()

# One row per user, one column per behavioural feature; the label is
# attached as the last column.
features = pd.DataFrame(
    {
        "avg_login_hour": avg_login_hour,
        "std_login_hour": std_login_hour,
        "unique_countries": unique_countries,
        "unique_devices": unique_devices,
        "failed_login_rate": failed_login_rate,
        "resource_entropy": resource_entropy_val,
    }
).assign(is_attack=attack_label)

print(features)


                   avg_login_hour  std_login_hour  unique_countries  \
user                                                                  
user1@example.com        7.023256        1.299970                 2   
user2@example.com        8.261905        0.912235                 1   
user3@example.com        6.952381        0.935802                 1   
user4@example.com       17.023256        2.815674                 1   
user5@example.com       12.095238        0.957882                 1   
user6@example.com        7.404762        0.989198                 1   
user7@example.com       17.142857        0.751305                 1   
user8@example.com       16.952381        0.730933                 1   

                   unique_devices  failed_login_rate  resource_entropy  \
user                                                                     
user1@example.com               2           0.000000          1.780865   
user2@example.com               1           0.000000          1.734