In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

N_SAMPLES = 10000

permissions = [
    "perm_internet", "perm_access_network_state", "perm_access_wifi_state",

    "perm_read_sms", "perm_receive_sms", "perm_send_sms",
    "perm_read_phone_numbers", "perm_read_phone_state",

    "perm_read_basic_phone_state", "perm_read_privileged_phone_state",
    "perm_get_accounts",

    "perm_read_external_storage", "perm_write_external_storage",
    "perm_read_media_images", "perm_read_media_video", "perm_read_media_audio",

    "perm_camera", "perm_record_audio",

    "perm_access_fine_location", "perm_access_coarse_location",
    "perm_access_background_location",

    "perm_read_contacts", "perm_write_contacts",
    "perm_read_call_log", "perm_write_call_log",

    "perm_system_alert_window", "perm_draw_over_other_apps",
    "perm_bind_accessibility_service",

    "perm_request_install_packages", "perm_delete_packages",
    "perm_query_all_packages", "perm_package_usage_stats",

    "perm_foreground_service", "perm_wake_lock",
    "perm_receive_boot_completed", "perm_disable_keyguard",
    "perm_use_full_screen_intent",

    "perm_use_biometric", "perm_use_fingerprint", "perm_keyguard_service",

    "perm_read_logs", "perm_write_settings",
    "perm_request_ignore_battery_optimizations"
]

def bernoulli(p, size):
    return np.random.binomial(1, p, size)

def clipped_normal(mean, std, size, low=0.0, high=1.0):
    return np.clip(np.random.normal(mean, std, size), low, high)

df = pd.DataFrame()

for perm in permissions:
    df[perm] = bernoulli(np.random.uniform(0.18, 0.45), N_SAMPLES)

permission_density = df[permissions].mean(axis=1)

# -----------------------------
# Metadata features
# -----------------------------
df["app_size_mb"] = np.clip(np.random.normal(24, 9, N_SAMPLES), 6, 110)
df["min_sdk"] = np.random.randint(19, 28, N_SAMPLES)
df["target_sdk"] = np.random.randint(26, 34, N_SAMPLES)
df["activity_count"] = np.random.poisson(6, N_SAMPLES)
df["certificate_age_days"] = np.clip(np.random.normal(900, 600, N_SAMPLES), 30, 4000)
df["debuggable_flag"] = bernoulli(0.07, N_SAMPLES)
df["obfuscation_score"] = clipped_normal(0.46, 0.18, N_SAMPLES)

df["network_activity_density_score"] = clipped_normal(0.46, 0.15, N_SAMPLES)
df["background_presence_score"] = clipped_normal(0.42, 0.17, N_SAMPLES)
df["event_dependency_score"] = clipped_normal(0.36, 0.15, N_SAMPLES)
df["communication_regularness_score"] = clipped_normal(0.44, 0.16, N_SAMPLES)
df["endpoint_stability_score"] = clipped_normal(0.56, 0.17, N_SAMPLES)

df["permission_behavior_alignment_score"] = np.clip(
    1.0 - np.abs(permission_density - df["background_presence_score"]),
    0.0, 1.0
)

df["metadata_behavior_consistency_score"] = np.clip(
    1.0 - np.abs(df["obfuscation_score"] - df["communication_regularness_score"]),
    0.0, 1.0
)

df["api_permission_coherence_score"] = clipped_normal(0.56, 0.18, N_SAMPLES)
df["temporal_stability_index"] = clipped_normal(0.52, 0.14, N_SAMPLES)

latent_risk = (
    0.22 * permission_density +
    0.20 * df["background_presence_score"] +
    0.18 * df["obfuscation_score"] +
    0.15 * (1 - df["permission_behavior_alignment_score"]) +
    0.15 * (1 - df["metadata_behavior_consistency_score"]) +
    0.10 * (1 - df["endpoint_stability_score"])
)

risk_bias = np.random.beta(2.2, 4.5, N_SAMPLES)
latent_risk = np.clip(latent_risk + 0.18 * risk_bias, 0, 1)

df["label"] = (
    latent_risk > np.random.uniform(0.40, 0.55, N_SAMPLES)
).astype(int)

OUTPUT_FILE = "synthetic_hidden_pattern_dataset_v2.csv"
df.to_csv(OUTPUT_FILE, index=False)

print("Dataset generated:", OUTPUT_FILE)
print("Shape:", df.shape)
print(df["label"].value_counts(normalize=True))


Dataset generated: synthetic_hidden_pattern_dataset_v2.csv
Shape: (10000, 60)
label
0    0.8399
1    0.1601
Name: proportion, dtype: float64


In [7]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score
)

df = pd.read_csv("synthetic_hidden_pattern_dataset_v2.csv")

X = df.drop(columns=["label"])
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

model = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=400,
    learning_rate=0.03,
    max_depth=-1,
    num_leaves=31,
    min_child_samples=40,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    class_weight={0: 1, 1: pos_weight},
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nROC-AUC:", roc_auc_score(y_test, y_prob))
print("PR-AUC:", average_precision_score(y_test, y_prob))

[LightGBM] [Info] Number of positive: 1281, number of negative: 6719
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3185
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 59
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Confusion Matrix:
[[1488  192]
 [  96  224]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9394    0.8857    0.9118      1680
           1     0.5385    0.7000    0.6087       320

    accuracy                         0.8560      2000
   macro avg     0.7389    0.7929    0.7602      2000
weighted avg     0.8752    0.8560    0.8633      2000


ROC-AUC: 0.8925204613095238
PR-AUC: 0.662183796119

In [8]:
pip install -q lightgbm onnxmltools onnx onnxruntime


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m352.5/352.5 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType

n_features = X_train.shape[1]

onnx_model = onnxmltools.convert_lightgbm(
    model,
    initial_types=[("input", FloatTensorType([None, n_features]))]
)

with open("lightgbm_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

print("ONNX conversion successful: lightgbm_model.onnx")


ONNX conversion successful: lightgbm_model.onnx


In [14]:
import onnxruntime as ort
import numpy as np

sess = ort.InferenceSession("lightgbm_model.onnx")
input_name = sess.get_inputs()[0].name

X_sample = X_test.values.astype(np.float32)
onnx_outputs = sess.run(None, {input_name: X_sample})
onnx_probs = np.array([d[1] for d in onnx_outputs[1]])

print("ONNX inference OK, sample outputs:", onnx_probs[:5])

ONNX inference OK, sample outputs: [0.00214791 0.00400317 0.01130736 0.34930706 0.01210207]


In [15]:
model.booster_.save_model("lightgbm_model.txt")


<lightgbm.basic.Booster at 0x7c21c966a330>

In [16]:
import lightgbm as lgb

booster = lgb.Booster(model_file="lightgbm_model.txt")

onnx_model = onnxmltools.convert_lightgbm(
    booster,
    initial_types=[("input", FloatTensorType([None, n_features]))]
)

with open("lightgbm_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())
