In [55]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.feature_extraction import FeatureHasher
from pathlib import Path

# Single place that points the notebook at the dataset directory: change this
# one line when running on another machine instead of editing three paths.
DATA_DIR = Path(r"D:\Project\MachineLearningAlgorithm\forestMachineLearning\data")

# The three processes_* CSV splits used throughout the notebook.
df_train = pd.read_csv(DATA_DIR / "processes_train.csv")
df_test = pd.read_csv(DATA_DIR / "processes_test.csv")
df_valid = pd.read_csv(DATA_DIR / "processes_valid.csv")

# Utils

In [56]:
# 把字符串列哈希为数值特征
def _hash_string_series_to_df(s: pd.Series, n_features: int, prefix: str) -> pd.DataFrame:
    """
    Encode a string column into ``n_features`` numeric columns with FeatureHasher.

    Each value is turned into the token ``"{prefix}={value}"`` so that equal
    values coming from different source columns do not share a token.

    Parameters
    ----------
    s : column to encode; values are stringified, missing values become "".
    n_features : width of the hashed output (values below 1 are raised to 1).
    prefix : source column name; used in the token and in the output column names.

    Returns
    -------
    float64 DataFrame indexed like ``s`` with columns
    ``{prefix}_h0 .. {prefix}_h{n_features-1}``.

    NOTE: a later cell of this notebook defines a function with the same name;
    after a top-to-bottom run that later definition is the one in effect.
    """
    n_features = int(max(1, n_features))
    # Blank out missing values BEFORE stringifying. Doing astype(str) first can
    # turn NaN/None into the literal text "nan"/"None", after which fillna("")
    # has nothing left to fill.
    s = s.astype(object).where(s.notna().to_numpy(), "").astype(str)
    tokens = prefix + "=" + s  # prefix keeps tokens of different columns apart
    hasher = FeatureHasher(n_features=n_features, input_type="string", alternate_sign=False)
    # With input_type="string" every sample must itself be an iterable of
    # tokens; a bare string per sample is rejected ("Samples can not be a
    # single string"), so wrap each token in a one-element list.
    X = hasher.transform([[tok] for tok in tokens]).toarray()
    cols = [f"{prefix}_h{i}" for i in range(n_features)]
    return pd.DataFrame(X, index=s.index, columns=cols, dtype=np.float64)

# Data preprocessing

## 哈希编码分布

In [57]:
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Union
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelEncoder

# =========================
# 工具：哈希 + LabelEncoder
# =========================

def _hash_string_series_to_df(s: pd.Series, n_features: int, prefix: str) -> pd.DataFrame:
    """
    Hash a string column into ``n_features`` numeric columns (stateless, no fitting).

    Every row is passed to FeatureHasher as a one-entry dict
    ``{"{prefix}={value}": 1.0}``; using ``input_type='dict'`` avoids the
    "Samples can not be a single string" error raised for bare strings.

    Parameters
    ----------
    s : column to encode; values are stringified, missing values become "".
    n_features : width of the hashed output (values below 1 are raised to 1).
    prefix : source column name; used in the hashed key and the output column names.

    Returns
    -------
    float64 DataFrame indexed like ``s`` with columns
    ``{prefix}_h0 .. {prefix}_h{n_features-1}``.
    """
    n_features = int(max(1, n_features))
    # Blank out missing values BEFORE stringifying. Doing astype(str) first can
    # turn NaN/None into the literal text "nan"/"None", after which fillna("")
    # has nothing left to fill.
    s = s.astype(object).where(s.notna().to_numpy(), "").astype(str)
    raw = [{f"{prefix}={val}": 1.0} for val in s]  # one dict per sample
    hasher = FeatureHasher(n_features=n_features, input_type="dict", alternate_sign=False)
    X = hasher.transform(raw).toarray()
    cols = [f"{prefix}_h{i}" for i in range(n_features)]
    return pd.DataFrame(X, index=s.index, columns=cols, dtype=np.float64)

def _clean_cat_series(s: pd.Series) -> pd.Series:
    """将各种缺失写法统一为空串，避免 LE 报错；保持为字符串。"""
    s = s.astype(str)
    return s.replace({np.nan: "", "nan": "", "NaN": "", "<NA>": "", "None": "", "NaT": ""}).fillna("")

def fit_label_encoders(df_list: List[pd.DataFrame],
                       cat_cols: Optional[List[str]] = None) -> Dict[str, LabelEncoder]:
    """
    Fit one LabelEncoder per categorical column over the union of the given frames.

    Parameters
    ----------
    df_list : frames to pool (e.g. train / valid / test); ``None`` entries and
        frames lacking a column are skipped for that column.
    cat_cols : columns to encode. Defaults to
        ``["processName", "hostName", "eventName"]``.

    Returns
    -------
    Dict mapping column name -> fitted LabelEncoder. A column that is absent
    from every frame gets no entry.
    """
    # None sentinel instead of a list literal in the signature, so the default
    # is not a single list object shared by every call.
    if cat_cols is None:
        cat_cols = ["processName", "hostName", "eventName"]

    encoders: Dict[str, LabelEncoder] = {}
    for c in cat_cols:
        ser = [_clean_cat_series(d[c]) for d in df_list if d is not None and c in d.columns]
        if not ser:  # column missing from every frame
            continue
        all_vals = pd.concat(ser, axis=0)
        encoders[c] = LabelEncoder().fit(all_vals)
    return encoders

# =========================
# 三个“同名”处理函数（哈希/LE 二合一）
#   - method='hash'：返回 [DataFrame]（多列哈希）
#   - method='label'：返回 [Series]（单列整数编码，列名 *_le）
#   - 仍可直接 feats += process_xxx(...)
# =========================

def _encode_cat_column(df: pd.DataFrame,
                       column: str,
                       n_features: int,
                       method: str,
                       encoders: Optional[Dict[str, LabelEncoder]]) -> List[Union[pd.DataFrame, pd.Series]]:
    """
    Shared implementation behind the three process_<column> wrappers.

    Parameters
    ----------
    df : input frame; if ``column`` is absent an all-"" column is used instead.
    column : name of the categorical column to encode.
    n_features : hashed width (only used when ``method == "hash"``).
    method : ``"hash"`` or ``"label"``.
    encoders : fitted LabelEncoders keyed by column name (required for ``"label"``).

    Returns
    -------
    A one-element list so callers can write ``feats += ...``:
      - ``"hash"``  -> ``[DataFrame]`` with ``n_features`` hashed columns;
      - ``"label"`` -> ``[Series]`` of int64 codes named ``{column}_le``.

    Raises
    ------
    ValueError : unknown ``method``, or ``"label"`` without an encoder for ``column``.
    """
    s = df.get(column, pd.Series("", index=df.index))
    if method == "hash":
        return [_hash_string_series_to_df(s, n_features=n_features, prefix=column)]
    if method == "label":
        if encoders is None or column not in encoders:
            raise ValueError(f"encoders['{column}'] 未提供（method='label' 需要预先拟合的编码器）")
        le = encoders[column]
        # Class -> integer code lookup; values the encoder never saw get the
        # extra code len(classes_) rather than raising.
        m = {cls: i for i, cls in enumerate(le.classes_)}
        unk = len(le.classes_)
        s_clean = _clean_cat_series(s)
        ser = s_clean.map(lambda v: m.get(v, unk)).astype("int64").rename(f"{column}_le")
        return [ser]
    raise ValueError("method 必须是 'hash' 或 'label'")


def process_processName(df: pd.DataFrame,
                        n_features: int = 32,
                        method: str = "hash",
                        encoders: Optional[Dict[str, LabelEncoder]] = None):
    """Encode the ``processName`` column by hashing or label encoding (see _encode_cat_column)."""
    return _encode_cat_column(df, "processName", n_features, method, encoders)


def process_hostName(df: pd.DataFrame,
                     n_features: int = 32,
                     method: str = "hash",
                     encoders: Optional[Dict[str, LabelEncoder]] = None):
    """Encode the ``hostName`` column by hashing or label encoding (see _encode_cat_column)."""
    return _encode_cat_column(df, "hostName", n_features, method, encoders)


def process_eventName(df: pd.DataFrame,
                      n_features: int = 32,
                      method: str = "hash",
                      encoders: Optional[Dict[str, LabelEncoder]] = None):
    """Encode the ``eventName`` column by hashing or label encoding (see _encode_cat_column)."""
    return _encode_cat_column(df, "eventName", n_features, method, encoders)

In [58]:
# 将 timestamp 转为相对时间（相对于每个 processId 的最小 timestamp）
def make_timestamp_relative(df: pd.DataFrame) -> pd.Series:
    """
    Express each row's timestamp relative to the earliest timestamp of its processId.

    If either ``timestamp`` or ``processId`` is missing, the raw ``timestamp``
    column is returned unchanged when it exists, otherwise a column of 0.0.
    """
    needed = {"timestamp", "processId"}
    if not needed.issubset(df.columns):
        return df.get("timestamp", pd.Series(0.0, index=df.index))

    stamps = df["timestamp"].astype("float64")
    # Per-row copy of the minimum timestamp within the row's processId group.
    first_seen = stamps.groupby(df["processId"]).transform("min")
    return stamps - first_seen

def process_userId(df: pd.DataFrame) -> tuple[pd.Series, pd.Series]:
    """
    userId features: a binary flag for ids >= 1000 plus a log-scaled strength.

    Returns
    -------
    (flag, strength), both indexed like ``df``:
      - ``userId_flag``: 1 where userId >= 1000, else 0;
      - ``userId_log_strength``: log1p(userId - 999) where userId >= 1000, else 0.
    When ``userId`` is absent both are all-zero, so callers can always unpack
    two Series.
    """
    if 'userId' not in df.columns:
        return (pd.Series(0, index=df.index, name="userId_flag"),
                pd.Series(0.0, index=df.index, name="userId_log_strength"))

    s = df['userId']
    is_external = s >= 1000
    flag = is_external.astype(int).rename("userId_flag")
    # Zero the offset on rows below the cutoff BEFORE taking the log: log1p is
    # then never evaluated on values <= -1 (which emits RuntimeWarnings), and
    # log1p(0) == 0 yields the intended 0 for those rows.
    excess = (s - 999).where(is_external, 0)
    # Stay in pandas so the result keeps df.index; a bare ndarray would be
    # re-indexed 0..n-1 and misalign on concat when df has a non-default index.
    strength = np.log1p(excess).astype("float64").rename("userId_log_strength")
    return flag, strength

def process_argsNum(df: pd.DataFrame) -> pd.Series:
    """
    argsNum feature: log1p of the argument count, with missing values treated as 0.

    Returns a Series named ``argsNum_log`` indexed like ``df``. When the
    ``argsNum`` column is absent the result is all zeros under the same name,
    so the output column exists regardless of the input schema.
    """
    if 'argsNum' not in df.columns:
        # Named fallback: an unnamed Series would not surface as "argsNum_log"
        # after concatenation.
        return pd.Series(0.0, index=df.index, name="argsNum_log")
    s = df['argsNum'].fillna(0)
    return np.log1p(s).rename("argsNum_log")

def process_stackAddresses(df: pd.DataFrame) -> pd.Series:
    """
    stackAddresses feature: log1p of the stack depth (number of addresses).

    Each cell may be a list/tuple (depth = its length) or a string holding
    addresses separated by '|', ',', ';' or whitespace, optionally wrapped in
    brackets such as "[a, b]". Any other value (e.g. NaN) counts as depth 0.

    Returns a Series named ``stack_depth_log`` indexed like ``df``; all zeros
    under the same name when the column is absent.
    """
    if 'stackAddresses' not in df.columns:
        # Named fallback: an unnamed Series would not surface as
        # "stack_depth_log" after concatenation.
        return pd.Series(0.0, index=df.index, name="stack_depth_log")

    def count_depth(x):
        if isinstance(x, (list, tuple)):
            return len(x)
        if isinstance(x, str):
            # Drop enclosing brackets so a stringified empty list ("[]") is
            # depth 0 instead of being counted as one whitespace-free token.
            s = x.strip().strip("[]()").strip()
            if not s:
                return 0
            for sep in ('|', ',', ';'):
                if sep in s:
                    # Ignore empty / whitespace-only fragments (trailing separators etc.).
                    return sum(1 for p in s.split(sep) if p.strip())
            return len(s.split())
        return 0

    depth = df['stackAddresses'].apply(count_depth)
    return np.log1p(depth).rename("stack_depth_log")


def process_parentProcessId(df: pd.DataFrame) -> pd.Series:
    """
    parentProcessId feature: 1 when the parent pid is one of 0, 1, 2, else 0.

    Returns a Series named ``parentPID_sys_flag`` indexed like ``df``; all
    zeros under the same name when the column is absent.
    """
    if 'parentProcessId' not in df.columns:
        # Named fallback so the output column exists under its usual name.
        return pd.Series(0, index=df.index, name="parentPID_sys_flag")
    return df['parentProcessId'].isin([0, 1, 2]).astype(int).rename("parentPID_sys_flag")

def process_processId(df: pd.DataFrame) -> pd.Series:
    """
    processId feature: 1 when the pid is one of 0, 1, 2, else 0.

    Returns a Series named ``process_sys_flag`` indexed like ``df``; all zeros
    under the same name when the column is absent.
    """
    if 'processId' not in df.columns:
        # Named fallback so the output column exists under its usual name.
        return pd.Series(0, index=df.index, name="process_sys_flag")
    return df['processId'].isin([0, 1, 2]).astype(int).rename("process_sys_flag")

def basic_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build simple flag features, one output row per input row (same index).

    Output columns, each set to 0 when its source column is absent:
      - processId_is_os / parentProcessId_is_os : 1 if the id is 0, 1 or 2
      - userId_is_os                            : 1 if userId < 1000
      - mountNamespace_is_default               : 1 if mountNamespace == 4026531840
      - returnValue_mapped                      : sign of returnValue (-1 / 0 / 1);
                                                  non-numeric or missing -> 0

    eventId and argsNum are not emitted by this function.
    """
    out = pd.DataFrame(index=df.index)

    # The two pid-style columns share the same "is one of 0/1/2" rule.
    os_level_ids = [0, 1, 2]
    for id_col in ("processId", "parentProcessId"):
        flag_col = f"{id_col}_is_os"
        if id_col in df.columns:
            out[flag_col] = df[id_col].isin(os_level_ids).astype(int)
        else:
            out[flag_col] = 0

    out["userId_is_os"] = (
        (df["userId"] < 1000).astype(int) if "userId" in df.columns else 0
    )
    out["mountNamespace_is_default"] = (
        (df["mountNamespace"] == 4026531840).astype(int) if "mountNamespace" in df.columns else 0
    )

    if "returnValue" not in df.columns:
        out["returnValue_mapped"] = 0
    else:
        rv = pd.to_numeric(df["returnValue"], errors="coerce").fillna(0)
        # Negative -> -1, positive -> 1, everything else (exactly 0) -> 0.
        out["returnValue_mapped"] = np.select([rv < 0, rv > 0], [-1, 1], default=0)

    # Final pass: force every column to a numeric dtype with no missing values.
    for name in list(out.columns):
        out[name] = pd.to_numeric(out[name], errors="coerce").fillna(0)

    return out

# ------------------------------------------------------
# 2️⃣  主控制函数
# ------------------------------------------------------

def preprocess(df: pd.DataFrame,
               n_hash_features: int = 32,
               cat_encoding: str = "hash",           # 'hash' or 'label'
               encoders: Optional[Dict[str, LabelEncoder]] = None
               ) -> pd.DataFrame:
    """
    Assemble the full numeric feature matrix for one data split.

    Feature groups, in output order:
      1. ``timestamp_rel`` from make_timestamp_relative
      2. userId flag / strength, argsNum, stack depth, parent-pid and pid flags
      3. the columns produced by basic_features
      4. the three categorical columns (processName, hostName, eventName):
           - ``cat_encoding='hash'``  : each hashed to ``max(1, n_hash_features // 3)`` columns
           - ``cat_encoding='label'`` : one integer column each (``*_le``), using ``encoders``

    Missing values in the concatenated result are filled with 0 and only
    numeric columns are returned.

    Raises ValueError when ``cat_encoding`` is neither 'hash' nor 'label'.
    """
    timestamp_rel = make_timestamp_relative(df).rename("timestamp_rel")
    user_flag, user_strength = process_userId(df)

    pieces = [
        timestamp_rel,
        user_flag,
        user_strength,
        process_argsNum(df),
        process_stackAddresses(df),
        process_parentProcessId(df),
        process_processId(df),
        basic_features(df),
    ]

    # Each encoder returns a one-element list, hence extend() rather than append().
    categorical_encoders = (process_processName, process_hostName, process_eventName)
    if cat_encoding == "hash":
        width = max(1, n_hash_features // 3)
        for encode in categorical_encoders:
            pieces.extend(encode(df, n_features=width, method="hash"))
    elif cat_encoding == "label":
        for encode in categorical_encoders:
            pieces.extend(encode(df, method="label", encoders=encoders))
    else:
        raise ValueError("cat_encoding 只能是 'hash' 或 'label'")

    combined = pd.concat(pieces, axis=1).fillna(0)
    return combined.select_dtypes(include=[np.number])


In [59]:
# Fit one LabelEncoder per categorical column over all three splits, then build
# the label-encoded feature matrix of every split with those shared encoders.
# NOTE(review): the original note here referred to basic_clean /
# ensure_req_features steps that are not defined in the visible notebook —
# confirm whether they are still expected to run first.
encoders = fit_label_encoders(
    [df_train, df_valid, df_test],
    ["processName", "hostName", "eventName"],
)

df_train_feat, df_valid_feat, df_test_feat = (
    preprocess(split, cat_encoding="label", encoders=encoders)
    for split in (df_train, df_valid, df_test)
)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [60]:
# Keep an independent copy of the test-set Id column and preview it.
test_ids = df_test.loc[:, "Id"].copy()
test_ids.head()

0    0
1    1
2    2
3    3
4    4
Name: Id, dtype: int64

In [61]:
# Preview the first rows of the engineered test-set feature matrix.
df_test_feat.head()

Unnamed: 0,timestamp_rel,userId_flag,userId_log_strength,argsNum_log,stack_depth_log,parentPID_sys_flag,process_sys_flag,processId_is_os,parentProcessId_is_os,userId_is_os,mountNamespace_is_default,returnValue_mapped,processName_le,hostName_le,eventName_le
0,2659.699107,0,0.0,1.098612,0.693147,0,0,0,0,1,1,0,72,8,41
1,1166.446252,0,0.0,0.693147,1.098612,0,0,0,0,1,1,0,72,6,8
2,3806.862472,0,0.0,0.693147,0.693147,1,1,1,1,1,1,0,86,7,8
3,1479.45483,0,0.0,0.693147,0.693147,0,0,0,0,1,1,0,72,8,8
4,732.280369,0,0.0,1.609438,0.693147,0,0,0,0,1,1,1,72,3,28


# 模型训练 + 验证 + CSV

In [62]:
from sklearn.ensemble import IsolationForest
# Isolation Forest fitted on the training feature matrix.
#
# Alternative settings kept for reference
# (n_estimators / contamination / max_features / max_samples;
#  all with random_state=42, n_jobs=-1):
#   100 / 0.005 / 0.6 / 0.6
#   150 / 0.04  / 1.0 / 20000
model = IsolationForest(
    random_state=42,
    n_jobs=-1,
    n_estimators=100,
    max_samples=0.8,
    max_features=0.8,
    contamination=0.015,
)
model.fit(df_train_feat)

0,1,2
,n_estimators,100
,max_samples,0.8
,contamination,0.015
,max_features,0.8
,bootstrap,False
,n_jobs,-1
,random_state,42
,verbose,0
,warm_start,False


In [63]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, average_precision_score, precision_recall_fscore_support
# ========== 5. Model evaluation on the validation split ==========
print("\nMaking predictions on validation data...")
y_valid_target = df_valid['target']

# Anomaly score: decision_function is negated so that larger = more anomalous.
scores = -model.decision_function(df_valid_feat)
# Min-max scale the scores to [0, 1].
scores_norm = (scores - scores.min()) / np.ptp(scores)

# Threshold-free metric: average precision of the continuous score.
ap = average_precision_score(y_valid_target, scores_norm)

# Thresholded metrics: flag everything at or above the 97th score percentile.
thr = np.percentile(scores_norm, 97)
y_pred = (scores_norm >= thr).astype(int)
p, r, f1, _ = precision_recall_fscore_support(y_valid_target, y_pred, average="binary")

print(f"\nModel Evaluation:precision: {p:.4f}, recall: {r:.4f}, F1-score: {f1:.4f}, AP: {ap:.4f}")

# Optional extra diagnostics (disabled):
#print("Accuracy:", accuracy_score(y_valid_target, y_pred))
#print("\nConfusion Matrix:")
#print(confusion_matrix(y_valid_target, y_pred))
#print("\nClassification Report:")
#print(classification_report(y_valid_target, y_pred, zero_division=0))


Making predictions on validation data...

Model Evaluation:precision: 0.9925, recall: 0.7810, F1-score: 0.8741, AP: 0.9327


In [65]:
# Leftover log line from an earlier hyper-parameter search:
# Evaluating combination 1/54: {'n_estimators': 100, 'max_samples': 0.6, 'max_features': 0.6, 'contamination': 0.005, 'random_state': 42}
# ========== Test-set scoring -> CSV file for the Kaggle submission ==========
print("\nMaking predictions on test data...")

# 1) Anomaly score of every test row (negated so that larger = more anomalous).
score_test = -model.decision_function(df_test_feat)

# 2) Min-max scale the scores to [0, 1]. No percentile threshold is applied
#    here: the continuous score itself is what gets written out.
score_norm = (score_test - score_test.min()) / np.ptp(score_test)

# 3) Submission frame: Id taken from the test CSV, score under the column name "target".
submission = pd.DataFrame({
    "Id": df_test["Id"],
    "target": score_norm,
})

# 4) Write the CSV without the index column.
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv 已生成，可直接上传至 Kaggle！")


Making predictions on test data...
✅ submission.csv 已生成，可直接上传至 Kaggle！
