In [None]:
import numpy as np
import pandas as pd

# Single seed for NumPy's global RNG, applied immediately.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Feature-name groups.
# NOTE(review): these names differ from the column names produced by the
# dataset generator cells further down (e.g. "network_activity_density_score");
# confirm whether these lists are still consumed anywhere.

# Permission identifiers.
PERMISSIONS = [
    "INTERNET",
    "READ_SMS",
    "RECEIVE_SMS",
    "SEND_SMS",
    "SYSTEM_ALERT_WINDOW",
    "BIND_ACCESSIBILITY_SERVICE",
    "REQUEST_INSTALL_PACKAGES",
    "READ_CONTACTS",
]

# Score-type feature names.
RISK_SCORES = [
    "network_activity_score", "background_persistence_score",
    "api_anomaly_score", "permission_behavior_alignment_score",
    "infrastructure_stability_score",
]

# App metadata feature names.
METADATA = [
    "app_size_mb", "min_sdk", "target_sdk",
    "certificate_age_days", "obfuscation_score",
]


In [5]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score
)

# -----------------------------
# Load dataset
# -----------------------------
df = pd.read_csv("synthetic_hidden_pattern_dataset_v2.csv")

# Target column on one side, every remaining column as a feature on the other.
y = df["label"]
X = df.drop(columns=["label"])

# -----------------------------
# 80-20 stratified split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------
# Class imbalance handling
# -----------------------------
# Ratio of negative to positive rows in the training fold.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive

# -----------------------------
# XGBoost model
# -----------------------------
xgb_params = {
    "objective": "binary:logistic",
    "max_depth": 6,
    "learning_rate": 0.05,
    "n_estimators": 200,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": scale_pos_weight,
    "eval_metric": "aucpr",
    "random_state": 42,
}
model = xgb.XGBClassifier(**xgb_params)

# -----------------------------
# Train
# -----------------------------
model.fit(X_train, y_train)

# -----------------------------
# Evaluate
# -----------------------------
# Positive-class probabilities feed the ranking metrics; hard predictions
# feed the report and the confusion matrix.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

for metric_label, metric_fn in (
    ("ROC-AUC:", roc_auc_score),
    ("PR-AUC:", average_precision_score),
):
    print(metric_label, metric_fn(y_test, y_proba))


Classification Report:
              precision    recall  f1-score   support

           0     0.9314    0.8810    0.9055      1680
           1     0.5134    0.6594    0.5773       320

    accuracy                         0.8455      2000
   macro avg     0.7224    0.7702    0.7414      2000
weighted avg     0.8645    0.8455    0.8530      2000

Confusion Matrix:
[[1480  200]
 [ 109  211]]
ROC-AUC: 0.8906733630952381
PR-AUC: 0.6589980715711659


In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the generated dataset is repeatable.
np.random.seed(42)

# -----------------------------
# Dataset size
# -----------------------------
N_SAMPLES = 10_000

# -----------------------------
# Permission list (binary)
# One 0/1 column is generated per name; groups follow the name prefixes.
# -----------------------------
permissions = [
    # network
    "perm_internet",
    "perm_access_network_state",
    "perm_access_wifi_state",
    # SMS / phone
    "perm_read_sms",
    "perm_receive_sms",
    "perm_send_sms",
    "perm_read_phone_numbers",
    "perm_read_phone_state",
    # phone state / accounts
    "perm_read_basic_phone_state",
    "perm_read_privileged_phone_state",
    "perm_get_accounts",
    # storage / media
    "perm_read_external_storage",
    "perm_write_external_storage",
    "perm_read_media_images",
    "perm_read_media_video",
    "perm_read_media_audio",
    # camera / microphone
    "perm_camera",
    "perm_record_audio",
    # location
    "perm_access_fine_location",
    "perm_access_coarse_location",
    "perm_access_background_location",
    # contacts / call log
    "perm_read_contacts",
    "perm_write_contacts",
    "perm_read_call_log",
    "perm_write_call_log",
    # overlays / accessibility
    "perm_system_alert_window",
    "perm_draw_over_other_apps",
    "perm_bind_accessibility_service",
    # package management
    "perm_request_install_packages",
    "perm_delete_packages",
    "perm_query_all_packages",
    "perm_package_usage_stats",
    # services / boot / lock screen
    "perm_foreground_service",
    "perm_wake_lock",
    "perm_receive_boot_completed",
    "perm_disable_keyguard",
    "perm_use_full_screen_intent",
    # biometrics / keyguard
    "perm_use_biometric",
    "perm_use_fingerprint",
    "perm_keyguard_service",
    # logs / settings / battery
    "perm_read_logs",
    "perm_write_settings",
    "perm_request_ignore_battery_optimizations",
]

# -----------------------------
# Helper functions
# -----------------------------
def bernoulli(p, size):
    """Draw `size` 0/1 samples, each equal to 1 with probability `p`.

    Implemented as a single-trial binomial on NumPy's global RNG, so the
    result depends on the current `np.random` seed/state.
    """
    draws = np.random.binomial(n=1, p=p, size=size)
    return draws

def clipped_normal(mean, std, size, low=0.0, high=1.0):
    """Sample `size` normal values and clamp them into [`low`, `high`].

    Values are drawn from N(`mean`, `std`) on NumPy's global RNG; anything
    outside the bounds is set to the nearest bound (not resampled).
    """
    samples = np.random.normal(loc=mean, scale=std, size=size)
    return np.clip(samples, low, high)

# -----------------------------
# Initialize dataframe
# -----------------------------
# Start from an empty frame; every feature column below is appended to it.
# All draws come from NumPy's global RNG, so the order of the statements
# below determines the generated values for a given seed.
df = pd.DataFrame()

# -----------------------------
# Permission generation
# Overlapping probabilities (no single decisive permission)
# -----------------------------
for perm in permissions:
    # Each permission gets its own prevalence, drawn once from [0.15, 0.45).
    base_prob = np.random.uniform(0.15, 0.45)
    df[perm] = bernoulli(base_prob, N_SAMPLES)

# -----------------------------
# Metadata features
# -----------------------------
df["app_size_mb"] = np.clip(np.random.normal(22, 8, N_SAMPLES), 5, 100)
# randint's upper bound is exclusive: min_sdk is 19..27, target_sdk is 26..33.
df["min_sdk"] = np.random.randint(19, 28, N_SAMPLES)
df["target_sdk"] = np.random.randint(26, 34, N_SAMPLES)
df["activity_count"] = np.random.poisson(6, N_SAMPLES)
df["certificate_age_days"] = np.clip(np.random.normal(900, 600, N_SAMPLES), 30, 4000)
df["debuggable_flag"] = bernoulli(0.08, N_SAMPLES)
df["obfuscation_score"] = clipped_normal(0.45, 0.2, N_SAMPLES)

# -----------------------------
# Abstract network & behavior indicators (safe)
# -----------------------------
# Independent clipped-normal scores in [0, 1].
df["network_activity_density_score"] = clipped_normal(0.45, 0.15, N_SAMPLES)
df["background_presence_score"] = clipped_normal(0.40, 0.18, N_SAMPLES)
df["event_dependency_score"] = clipped_normal(0.35, 0.15, N_SAMPLES)
df["communication_regularness_score"] = clipped_normal(0.42, 0.16, N_SAMPLES)
df["endpoint_stability_score"] = clipped_normal(0.55, 0.18, N_SAMPLES)

# -----------------------------
# Consistency / alignment features (key for hidden patterns)
# -----------------------------
# Per-row mean of the binary permission columns, i.e. the fraction of the
# listed permissions that are set for that row.
permission_density = df[permissions].mean(axis=1)

# 1 minus the absolute gap between permission density and background presence.
df["permission_behavior_alignment_score"] = np.clip(
    1.0 - np.abs(permission_density - df["background_presence_score"]),
    0.0, 1.0
)

# 1 minus the absolute gap between obfuscation and communication regularity.
df["metadata_behavior_consistency_score"] = np.clip(
    1.0 - np.abs(df["obfuscation_score"] - df["communication_regularness_score"]),
    0.0, 1.0
)

# Drawn independently; neither column enters latent_risk below.
df["api_permission_coherence_score"] = clipped_normal(0.55, 0.2, N_SAMPLES)
df["temporal_stability_index"] = clipped_normal(0.50, 0.15, N_SAMPLES)

# -----------------------------
# Label generation (probabilistic, non-rule-based)
# -----------------------------
# Weighted sum of six terms; the weights add up to 1.0.
latent_risk = (
    0.25 * permission_density +
    0.20 * df["background_presence_score"] +
    0.15 * (1 - df["permission_behavior_alignment_score"]) +
    0.15 * (1 - df["metadata_behavior_consistency_score"]) +
    0.10 * (1 - df["endpoint_stability_score"]) +
    0.15 * df["obfuscation_score"]
)

# Add zero-mean Gaussian noise (std 0.05) and keep the result in [0, 1].
latent_risk = np.clip(latent_risk + np.random.normal(0, 0.05, N_SAMPLES), 0, 1)

# Label is 1 where latent risk exceeds a per-row threshold drawn uniformly
# from [0.45, 0.55).
df["label"] = (latent_risk > np.random.uniform(0.45, 0.55, N_SAMPLES)).astype(int)

# -----------------------------
# Save to CSV
# -----------------------------
# Written to the current working directory without the index column.
OUTPUT_FILE = "synthetic_hidden_pattern_dataset.csv"
df.to_csv(OUTPUT_FILE, index=False)

# Report what was written and the absolute class counts.
print(f"Dataset generated: {OUTPUT_FILE}")
print("Shape:", df.shape)
print(df["label"].value_counts())

# NOTE(review): the import below was fused onto the end of the print line
# above, which made the whole cell a SyntaxError. It begins a second (v2)
# generator script pasted into this same cell.
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the generated dataset is repeatable.
np.random.seed(42)

# -----------------------------
# Dataset size
# -----------------------------
N_SAMPLES = 10_000

# -----------------------------
# Permission list (binary)
# One 0/1 column is generated per name; groups follow the name prefixes.
# -----------------------------
permissions = [
    # network
    "perm_internet",
    "perm_access_network_state",
    "perm_access_wifi_state",
    # SMS / phone
    "perm_read_sms",
    "perm_receive_sms",
    "perm_send_sms",
    "perm_read_phone_numbers",
    "perm_read_phone_state",
    # phone state / accounts
    "perm_read_basic_phone_state",
    "perm_read_privileged_phone_state",
    "perm_get_accounts",
    # storage / media
    "perm_read_external_storage",
    "perm_write_external_storage",
    "perm_read_media_images",
    "perm_read_media_video",
    "perm_read_media_audio",
    # camera / microphone
    "perm_camera",
    "perm_record_audio",
    # location
    "perm_access_fine_location",
    "perm_access_coarse_location",
    "perm_access_background_location",
    # contacts / call log
    "perm_read_contacts",
    "perm_write_contacts",
    "perm_read_call_log",
    "perm_write_call_log",
    # overlays / accessibility
    "perm_system_alert_window",
    "perm_draw_over_other_apps",
    "perm_bind_accessibility_service",
    # package management
    "perm_request_install_packages",
    "perm_delete_packages",
    "perm_query_all_packages",
    "perm_package_usage_stats",
    # services / boot / lock screen
    "perm_foreground_service",
    "perm_wake_lock",
    "perm_receive_boot_completed",
    "perm_disable_keyguard",
    "perm_use_full_screen_intent",
    # biometrics / keyguard
    "perm_use_biometric",
    "perm_use_fingerprint",
    "perm_keyguard_service",
    # logs / settings / battery
    "perm_read_logs",
    "perm_write_settings",
    "perm_request_ignore_battery_optimizations",
]

def bernoulli(p, size):
    """Return `size` 0/1 draws where each draw is 1 with probability `p`.

    A one-trial binomial on NumPy's global RNG; output depends on the
    current `np.random` state.
    """
    outcomes = np.random.binomial(n=1, p=p, size=size)
    return outcomes

def clipped_normal(mean, std, size, low=0.0, high=1.0):
    """Draw `size` values from N(`mean`, `std`) and clamp to [`low`, `high`].

    Out-of-range draws are set to the nearest bound rather than resampled.
    Uses NumPy's global RNG.
    """
    raw = np.random.normal(loc=mean, scale=std, size=size)
    return np.clip(raw, low, high)

# Start from an empty frame; every feature column below is appended to it.
# All draws come from NumPy's global RNG, so the order of the statements
# below determines the generated values for a given seed.
df = pd.DataFrame()

# -----------------------------
# Permissions (overlapping)
# -----------------------------
# Each permission column gets its own prevalence, drawn once from [0.18, 0.45).
for perm in permissions:
    df[perm] = bernoulli(np.random.uniform(0.18, 0.45), N_SAMPLES)

# Per-row mean of the binary permission columns, i.e. the fraction of the
# listed permissions that are set for that row.
permission_density = df[permissions].mean(axis=1)

# -----------------------------
# Metadata features
# -----------------------------
df["app_size_mb"] = np.clip(np.random.normal(24, 9, N_SAMPLES), 6, 110)
# randint's upper bound is exclusive: min_sdk is 19..27, target_sdk is 26..33.
df["min_sdk"] = np.random.randint(19, 28, N_SAMPLES)
df["target_sdk"] = np.random.randint(26, 34, N_SAMPLES)
df["activity_count"] = np.random.poisson(6, N_SAMPLES)
df["certificate_age_days"] = np.clip(np.random.normal(900, 600, N_SAMPLES), 30, 4000)
df["debuggable_flag"] = bernoulli(0.07, N_SAMPLES)
df["obfuscation_score"] = clipped_normal(0.46, 0.18, N_SAMPLES)

# -----------------------------
# Abstract behavior indicators (safe)
# -----------------------------
# Independent clipped-normal scores in [0, 1].
df["network_activity_density_score"] = clipped_normal(0.46, 0.15, N_SAMPLES)
df["background_presence_score"] = clipped_normal(0.42, 0.17, N_SAMPLES)
df["event_dependency_score"] = clipped_normal(0.36, 0.15, N_SAMPLES)
df["communication_regularness_score"] = clipped_normal(0.44, 0.16, N_SAMPLES)
df["endpoint_stability_score"] = clipped_normal(0.56, 0.17, N_SAMPLES)

# -----------------------------
# Alignment / consistency (hidden patterns)
# -----------------------------
# 1 minus the absolute gap between permission density and background presence.
df["permission_behavior_alignment_score"] = np.clip(
    1.0 - np.abs(permission_density - df["background_presence_score"]),
    0.0, 1.0
)

# 1 minus the absolute gap between obfuscation and communication regularity.
df["metadata_behavior_consistency_score"] = np.clip(
    1.0 - np.abs(df["obfuscation_score"] - df["communication_regularness_score"]),
    0.0, 1.0
)

# Drawn independently; neither column enters latent_risk below.
df["api_permission_coherence_score"] = clipped_normal(0.56, 0.18, N_SAMPLES)
df["temporal_stability_index"] = clipped_normal(0.52, 0.14, N_SAMPLES)

# -----------------------------
# Latent anomaly likelihood
# -----------------------------
# Weighted sum of six terms; the weights add up to 1.0.
latent_risk = (
    0.22 * permission_density +
    0.20 * df["background_presence_score"] +
    0.18 * df["obfuscation_score"] +
    0.15 * (1 - df["permission_behavior_alignment_score"]) +
    0.15 * (1 - df["metadata_behavior_consistency_score"]) +
    0.10 * (1 - df["endpoint_stability_score"])
)

# Controlled density shaping (no rules)
# Beta(2.2, 4.5) draws lie in [0, 1], so this adds a non-negative shift of
# at most 0.18 per row before clipping to [0, 1].
risk_bias = np.random.beta(2.2, 4.5, N_SAMPLES)
latent_risk = np.clip(latent_risk + 0.18 * risk_bias, 0, 1)

# -----------------------------
# Label assignment (probabilistic)
# -----------------------------
# Label is 1 where latent risk exceeds a per-row threshold drawn uniformly
# from [0.40, 0.55).
df["label"] = (
    latent_risk > np.random.uniform(0.40, 0.55, N_SAMPLES)
).astype(int)

# -----------------------------
# Save dataset
# -----------------------------
# Written to the current working directory without the index column.
OUTPUT_FILE = "synthetic_hidden_pattern_dataset_v2.csv"
df.to_csv(OUTPUT_FILE, index=False)

# Report what was written; value_counts(normalize=True) prints class proportions.
print("Dataset generated:", OUTPUT_FILE)
print("Shape:", df.shape)
print(df["label"].value_counts(normalize=True))



Dataset generated: synthetic_hidden_pattern_dataset.csv
Shape: (10000, 60)
label
0    9702
1     298
Name: count, dtype: int64


In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the generated dataset is repeatable.
np.random.seed(42)

# -----------------------------
# Dataset size
# -----------------------------
N_SAMPLES = 10_000

# -----------------------------
# Permission list (binary)
# One 0/1 column is generated per name; groups follow the name prefixes.
# -----------------------------
permissions = [
    # network
    "perm_internet",
    "perm_access_network_state",
    "perm_access_wifi_state",
    # SMS / phone
    "perm_read_sms",
    "perm_receive_sms",
    "perm_send_sms",
    "perm_read_phone_numbers",
    "perm_read_phone_state",
    # phone state / accounts
    "perm_read_basic_phone_state",
    "perm_read_privileged_phone_state",
    "perm_get_accounts",
    # storage / media
    "perm_read_external_storage",
    "perm_write_external_storage",
    "perm_read_media_images",
    "perm_read_media_video",
    "perm_read_media_audio",
    # camera / microphone
    "perm_camera",
    "perm_record_audio",
    # location
    "perm_access_fine_location",
    "perm_access_coarse_location",
    "perm_access_background_location",
    # contacts / call log
    "perm_read_contacts",
    "perm_write_contacts",
    "perm_read_call_log",
    "perm_write_call_log",
    # overlays / accessibility
    "perm_system_alert_window",
    "perm_draw_over_other_apps",
    "perm_bind_accessibility_service",
    # package management
    "perm_request_install_packages",
    "perm_delete_packages",
    "perm_query_all_packages",
    "perm_package_usage_stats",
    # services / boot / lock screen
    "perm_foreground_service",
    "perm_wake_lock",
    "perm_receive_boot_completed",
    "perm_disable_keyguard",
    "perm_use_full_screen_intent",
    # biometrics / keyguard
    "perm_use_biometric",
    "perm_use_fingerprint",
    "perm_keyguard_service",
    # logs / settings / battery
    "perm_read_logs",
    "perm_write_settings",
    "perm_request_ignore_battery_optimizations",
]

def bernoulli(p, size):
    """Sample `size` binary values; each is 1 with probability `p`, else 0.

    Backed by a one-trial binomial on NumPy's global RNG, so results follow
    the current `np.random` state.
    """
    flags = np.random.binomial(n=1, p=p, size=size)
    return flags

def clipped_normal(mean, std, size, low=0.0, high=1.0):
    """Normal samples of length `size`, clamped into [`low`, `high`].

    Draws from N(`mean`, `std`) on NumPy's global RNG; values beyond a bound
    are replaced by that bound (no resampling).
    """
    unclipped = np.random.normal(loc=mean, scale=std, size=size)
    return np.clip(unclipped, low, high)

# Start from an empty frame; every feature column below is appended to it.
# All draws come from NumPy's global RNG, so the order of the statements
# below determines the generated values for a given seed.
df = pd.DataFrame()

# -----------------------------
# Permissions (overlapping)
# -----------------------------
# Each permission column gets its own prevalence, drawn once from [0.18, 0.45).
for perm in permissions:
    df[perm] = bernoulli(np.random.uniform(0.18, 0.45), N_SAMPLES)

# Per-row mean of the binary permission columns, i.e. the fraction of the
# listed permissions that are set for that row.
permission_density = df[permissions].mean(axis=1)

# -----------------------------
# Metadata features
# -----------------------------
df["app_size_mb"] = np.clip(np.random.normal(24, 9, N_SAMPLES), 6, 110)
# randint's upper bound is exclusive: min_sdk is 19..27, target_sdk is 26..33.
df["min_sdk"] = np.random.randint(19, 28, N_SAMPLES)
df["target_sdk"] = np.random.randint(26, 34, N_SAMPLES)
df["activity_count"] = np.random.poisson(6, N_SAMPLES)
df["certificate_age_days"] = np.clip(np.random.normal(900, 600, N_SAMPLES), 30, 4000)
df["debuggable_flag"] = bernoulli(0.07, N_SAMPLES)
df["obfuscation_score"] = clipped_normal(0.46, 0.18, N_SAMPLES)

# -----------------------------
# Abstract behavior indicators (safe)
# -----------------------------
# Independent clipped-normal scores in [0, 1].
df["network_activity_density_score"] = clipped_normal(0.46, 0.15, N_SAMPLES)
df["background_presence_score"] = clipped_normal(0.42, 0.17, N_SAMPLES)
df["event_dependency_score"] = clipped_normal(0.36, 0.15, N_SAMPLES)
df["communication_regularness_score"] = clipped_normal(0.44, 0.16, N_SAMPLES)
df["endpoint_stability_score"] = clipped_normal(0.56, 0.17, N_SAMPLES)

# -----------------------------
# Alignment / consistency (hidden patterns)
# -----------------------------
# 1 minus the absolute gap between permission density and background presence.
df["permission_behavior_alignment_score"] = np.clip(
    1.0 - np.abs(permission_density - df["background_presence_score"]),
    0.0, 1.0
)

# 1 minus the absolute gap between obfuscation and communication regularity.
df["metadata_behavior_consistency_score"] = np.clip(
    1.0 - np.abs(df["obfuscation_score"] - df["communication_regularness_score"]),
    0.0, 1.0
)

# Drawn independently; neither column enters latent_risk below.
df["api_permission_coherence_score"] = clipped_normal(0.56, 0.18, N_SAMPLES)
df["temporal_stability_index"] = clipped_normal(0.52, 0.14, N_SAMPLES)

# -----------------------------
# Latent anomaly likelihood
# -----------------------------
# Weighted sum of six terms; the weights add up to 1.0.
latent_risk = (
    0.22 * permission_density +
    0.20 * df["background_presence_score"] +
    0.18 * df["obfuscation_score"] +
    0.15 * (1 - df["permission_behavior_alignment_score"]) +
    0.15 * (1 - df["metadata_behavior_consistency_score"]) +
    0.10 * (1 - df["endpoint_stability_score"])
)

# Controlled density shaping (no rules)
# Beta(2.2, 4.5) draws lie in [0, 1], so this adds a non-negative shift of
# at most 0.18 per row before clipping to [0, 1].
risk_bias = np.random.beta(2.2, 4.5, N_SAMPLES)
latent_risk = np.clip(latent_risk + 0.18 * risk_bias, 0, 1)

# -----------------------------
# Label assignment (probabilistic)
# -----------------------------
# Label is 1 where latent risk exceeds a per-row threshold drawn uniformly
# from [0.40, 0.55).
df["label"] = (
    latent_risk > np.random.uniform(0.40, 0.55, N_SAMPLES)
).astype(int)

# -----------------------------
# Save dataset
# -----------------------------
# Written to the current working directory without the index column.
OUTPUT_FILE = "synthetic_hidden_pattern_dataset_v2.csv"
df.to_csv(OUTPUT_FILE, index=False)

# Report what was written; value_counts(normalize=True) prints class proportions.
print("Dataset generated:", OUTPUT_FILE)
print("Shape:", df.shape)
print(df["label"].value_counts(normalize=True))


Dataset generated: synthetic_hidden_pattern_dataset_v2.csv
Shape: (10000, 60)
label
0    0.8399
1    0.1601
Name: proportion, dtype: float64
