In [3]:
import polars as pl

# Convert each CSV to Parquet via a lazy scan that is sunk directly to disk
# (no full load in memory).
for csv_path, parquet_path in (
    ("data/GUIDE_Train.csv", "data/GUIDE_Train.parquet"),
    ("data/GUIDE_Test.csv", "data/GUIDE_Test.parquet"),
):
    pl.scan_csv(csv_path).sink_parquet(parquet_path)

In [4]:
import pandas as pd

In [1]:
import pandas as pd

# Load the train and test Parquet files in full (no sampling is applied here).
df_train, df_test = (
    pd.read_parquet("data/GUIDE_Train.parquet"),
    pd.read_parquet("data/GUIDE_Test.parquet"),
)

# Report the (rows, columns) shape of each split.
print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)

# List every train column together with its dtype.
print("\nTrain columns:", df_train.dtypes, sep="\n")


Train shape: (9516837, 45)
Test shape: (4147992, 46)

Train columns:
Id                      int64
OrgId                   int64
IncidentId              int64
AlertId                 int64
Timestamp              object
DetectorId              int64
AlertTitle              int64
Category               object
MitreTechniques        object
IncidentGrade          object
ActionGrouped          object
ActionGranular         object
EntityType             object
EvidenceRole           object
DeviceId                int64
Sha256                  int64
IpAddress               int64
Url                     int64
AccountSid              int64
AccountUpn              int64
AccountObjectId         int64
AccountName             int64
DeviceName              int64
NetworkMessageId        int64
EmailClusterId        float64
RegistryKey             int64
RegistryValueName       int64
RegistryValueData       int64
ApplicationId           int64
ApplicationName         int64
OAuthApplicationId      int64
T

In [2]:
# The 20 columns with the most missing entries, largest count first
print(
    df_train.isna()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)

# describe() statistics, transposed so that each column becomes a row
print(df_train.describe().transpose())

# The 20 columns with the fewest distinct values (for categorical detection)
print(
    df_train.nunique()
    .sort_values()
    .head(20)
)


ResourceType         9509762
ActionGrouped        9460773
ActionGranular       9460773
ThreatFamily         9441956
EmailClusterId       9420025
AntispamDirection    9339535
Roles                9298686
SuspicionLevel       8072708
LastVerdict          7282572
MitreTechniques      5468386
IncidentGrade          51340
Timestamp                  0
Id                         0
EntityType                 0
DetectorId                 0
AlertTitle                 0
Category                   0
OrgId                      0
IncidentId                 0
AlertId                    0
dtype: int64
                        count          mean           std       min  \
Id                  9516837.0  8.425494e+11  4.962499e+11       0.0   
OrgId               9516837.0  1.815800e+02  3.867784e+02       0.0   
IncidentId          9516837.0  7.066349e+04  1.208369e+05       0.0   
AlertId             9516837.0  4.065188e+05  4.592827e+05       0.0   
DetectorId          9516837.0  1.106724e+02  4.35103

In [1]:
import json
import math

import pandas as pd

def _json_safe_float(value):
    """Return ``value`` as a float, or ``None`` when it is NaN or infinite."""
    number = float(value)
    return number if math.isfinite(number) else None


def summarize_dataframe(df, name="dataset"):
    """Build a JSON-serialisable summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str, default "dataset"
        Label stored under the ``"name"`` key of the result.

    Returns
    -------
    dict
        ``{"name", "shape": {"rows", "columns"}, "columns": {...}}``. Each
        entry of ``"columns"`` holds ``"dtype"``, ``"missing_values"`` and
        ``"unique_values"``, plus either ``"stats"`` (numeric columns: mean,
        std, min, 25%, 50%, 75%, max) or ``"top_values"`` (all other columns:
        the five most frequent non-null values, keyed by their ``str()``).

    Notes
    -----
    A statistic is ``None`` when the column has no non-null values or when
    the computed value is not finite (for example the sample standard
    deviation of a single observation), so the result can be written as
    strict JSON without ``NaN``/``Infinity`` tokens.
    """
    summary = {
        "name": name,
        "shape": {"rows": df.shape[0], "columns": df.shape[1]},
        "columns": {},
    }
    stat_names = ("mean", "std", "min", "25%", "50%", "75%", "max")

    for col in df.columns:
        series = df[col]
        # Count missing entries once; the count also tells us whether the
        # column is entirely null, avoiding repeated full scans.
        missing = int(series.isnull().sum())
        col_summary = {
            "dtype": str(series.dtype),
            "missing_values": missing,
            "unique_values": int(series.nunique()),
        }

        if pd.api.types.is_numeric_dtype(series):
            if missing == len(series):
                # No non-null values (this includes an empty column).
                stats = dict.fromkeys(stat_names, None)
            else:
                stats = {
                    "mean": _json_safe_float(series.mean()),
                    "std": _json_safe_float(series.std()),
                    "min": _json_safe_float(series.min()),
                    "25%": _json_safe_float(series.quantile(0.25)),
                    "50%": _json_safe_float(series.median()),
                    "75%": _json_safe_float(series.quantile(0.75)),
                    "max": _json_safe_float(series.max()),
                }
            col_summary["stats"] = stats
        else:
            # Add top categories (useful for LLMs)
            top_vals = series.value_counts(dropna=True).head(5).to_dict()
            col_summary["top_values"] = {str(k): int(v) for k, v in top_vals.items()}

        summary["columns"][col] = col_summary

    return summary


# Read both splits from their Parquet files
train = pd.read_parquet("data/GUIDE_Train.parquet")
test = pd.read_parquet("data/GUIDE_Test.parquet")

# Build one summary per split, keyed by the split's label
summary = {
    label: summarize_dataframe(frame, label)
    for label, frame in (("train", train), ("test", test))
}

# Serialise the combined summary as indented JSON and write it to disk
with open("data/dataset_summary.json", "w") as f:
    f.write(json.dumps(summary, indent=4))
