In [1]:
import numpy as np
import pandas as pd

# =========================================================
# Configuration
# =========================================================
# Number of rows to generate; used as the size of every random draw below.
N_SAMPLES = 10_000
# Path of the CSV file the finished dataset is written to.
OUTPUT_FILE = "synthetic_hidden_pattern_dataset_final.csv"

# Seed NumPy's global RNG so repeated runs produce the same draws.
np.random.seed(42)

# =========================================================
# Permission List
# =========================================================
# Every permission flag that becomes a 0/1 column. The order matters: it
# fixes both the column order and the order of the random draws.
permissions = [
    # Network
    "perm_internet",
    "perm_access_network_state",
    "perm_access_wifi_state",
    # SMS
    "perm_read_sms",
    "perm_receive_sms",
    "perm_send_sms",
    # Phone numbers / phone state
    "perm_read_phone_numbers",
    "perm_read_phone_state",
    "perm_read_basic_phone_state",
    "perm_read_privileged_phone_state",
    # Accounts
    "perm_get_accounts",
    # Storage and media
    "perm_read_external_storage",
    "perm_write_external_storage",
    "perm_read_media_images",
    "perm_read_media_video",
    "perm_read_media_audio",
    # Camera and microphone
    "perm_camera",
    "perm_record_audio",
    # Location
    "perm_access_fine_location",
    "perm_access_coarse_location",
    "perm_access_background_location",
    # Contacts and call log
    "perm_read_contacts",
    "perm_write_contacts",
    "perm_read_call_log",
    "perm_write_call_log",
    # Overlays and accessibility
    "perm_system_alert_window",
    "perm_draw_over_other_apps",
    "perm_bind_accessibility_service",
    # Package management and usage
    "perm_request_install_packages",
    "perm_delete_packages",
    "perm_query_all_packages",
    "perm_package_usage_stats",
    # Services, wake locks, boot, keyguard, full-screen intents
    "perm_foreground_service",
    "perm_wake_lock",
    "perm_receive_boot_completed",
    "perm_disable_keyguard",
    "perm_use_full_screen_intent",
    # Biometrics and keyguard service
    "perm_use_biometric",
    "perm_use_fingerprint",
    "perm_keyguard_service",
    # Logs, settings, battery optimizations
    "perm_read_logs",
    "perm_write_settings",
    "perm_request_ignore_battery_optimizations",
]

# Subset of `permissions` counted in the numerator of
# "sensitive_permission_ratio".
sensitive_permissions = [
    "perm_read_sms",
    "perm_receive_sms",
    "perm_send_sms",
    "perm_bind_accessibility_service",
    "perm_system_alert_window",
    "perm_request_install_packages",
    "perm_read_contacts",
    "perm_read_call_log",
]

# =========================================================
# Helper Functions
# =========================================================
def bernoulli(p, size):
    """Draw `size` 0/1 samples, each equal to 1 with probability `p`.

    Implemented as a single-trial binomial draw from NumPy's global RNG,
    so the result depends on the current `np.random` state.
    """
    single_trial = 1
    return np.random.binomial(n=single_trial, p=p, size=size)

def clipped_normal(mean, std, size, low=0.0, high=1.0):
    """Sample `size` normal values and clamp them to the range [low, high].

    Values are drawn from NumPy's global RNG with the given `mean` and
    `std`; anything below `low` or above `high` is replaced by that bound.
    """
    draws = np.random.normal(loc=mean, scale=std, size=size)
    return np.clip(draws, low, high)

# =========================================================
# Initialize DataFrame
# =========================================================
# Start from an empty frame; every assignment below appends one column.
df = pd.DataFrame()

# =========================================================
# Permissions (Overlapping, Non-Deterministic)
# =========================================================
# Each permission first draws its own rate uniformly from [0.18, 0.45),
# then one 0/1 flag per sample at that rate.
for perm in permissions:
    grant_rate = np.random.uniform(0.18, 0.45)
    df[perm] = bernoulli(grant_rate, N_SAMPLES)

# Per-row mean of the 0/1 permission flags, i.e. the fraction that are set.
permission_density = df.loc[:, permissions].mean(axis=1)

# =========================================================
# Metadata Features
# =========================================================
# Normal draw clamped to [6, 110].
df["app_size_mb"] = clipped_normal(24, 9, N_SAMPLES, low=6, high=110)
# randint's upper bound is exclusive: min_sdk is 19..27, target_sdk is 26..33.
# NOTE: the two are drawn independently and the ranges overlap, so a row can
# end up with min_sdk greater than target_sdk.
df["min_sdk"] = np.random.randint(low=19, high=28, size=N_SAMPLES)
df["target_sdk"] = np.random.randint(low=26, high=34, size=N_SAMPLES)
df["activity_count"] = np.random.poisson(lam=6, size=N_SAMPLES)
# Normal draw clamped to [30, 4000].
df["certificate_age_days"] = clipped_normal(
    900, 600, N_SAMPLES, low=30, high=4000
)
df["debuggable_flag"] = bernoulli(0.07, N_SAMPLES)
df["obfuscation_score"] = clipped_normal(0.46, 0.18, N_SAMPLES)

# =========================================================
# Abstract Behavioral Indicators (Safe)
# =========================================================
# (column name, mean, std) for each behavioral score. All are normal draws
# clamped to [0, 1]. Row order is the draw order, so keep it stable.
behavior_score_params = [
    ("network_activity_density_score", 0.46, 0.15),
    ("background_presence_score", 0.42, 0.17),
    ("event_dependency_score", 0.36, 0.15),
    ("communication_regularness_score", 0.44, 0.16),
    ("endpoint_stability_score", 0.56, 0.17),
]
for score_name, score_mean, score_std in behavior_score_params:
    df[score_name] = clipped_normal(score_mean, score_std, N_SAMPLES)

# =========================================================
# Alignment & Consistency Features
# =========================================================
# 1 minus the absolute gap between permission density and background
# presence, kept within [0, 1].
density_gap = (permission_density - df["background_presence_score"]).abs()
df["permission_behavior_alignment_score"] = (1.0 - density_gap).clip(0.0, 1.0)

# Same construction for obfuscation vs. communication regularness.
obfuscation_gap = (
    df["obfuscation_score"] - df["communication_regularness_score"]
).abs()
df["metadata_behavior_consistency_score"] = (1.0 - obfuscation_gap).clip(
    0.0, 1.0
)

# Independent clamped-normal scores; not derived from other columns.
df["api_permission_coherence_score"] = clipped_normal(0.56, 0.18, N_SAMPLES)
df["temporal_stability_index"] = clipped_normal(0.52, 0.14, N_SAMPLES)

# =========================================================
# Derived Ratio-Based Features
# =========================================================
# Sensitive flags set, divided by (all flags set + 1). The +1 keeps the
# denominator at least 1.
sensitive_count = df[sensitive_permissions].sum(axis=1)
granted_count = df[permissions].sum(axis=1)
df["sensitive_permission_ratio"] = sensitive_count / (granted_count + 1)

df["background_to_activity_ratio"] = df["background_presence_score"].div(
    df["activity_count"] + 1
)

df["obfuscation_alignment_gap"] = (
    df["obfuscation_score"] - df["metadata_behavior_consistency_score"]
).abs()

# Row-wise standard deviation of the 0/1 flags (pandas default ddof=1),
# limited to [0, 1].
df["permission_entropy_proxy"] = df[permissions].std(axis=1).clip(0, 1)

# Row-wise population variance (ddof=0, matching np.var) across the five
# behavioral scores.
behavior_score_columns = [
    "network_activity_density_score",
    "background_presence_score",
    "event_dependency_score",
    "communication_regularness_score",
    "endpoint_stability_score",
]
df["behavior_variance_score"] = df[behavior_score_columns].var(axis=1, ddof=0)

# =========================================================
# Feature Group Balance Indicators
# =========================================================
# Number of permission flags set, relative to (activity count + 1).
df["permission_to_metadata_ratio"] = df[permissions].sum(axis=1).div(
    df["activity_count"] + 1
)

# Background presence divided by (certificate age in thousands of days + 1).
cert_age_scaled = df["certificate_age_days"] / 1000 + 1
df["behavior_to_metadata_ratio"] = (
    df["background_presence_score"] / cert_age_scaled
)

# =========================================================
# Profile Coherence Score
# =========================================================
# Weighted penalty (0.4 / 0.3 / 0.3) built from the obfuscation gap, the
# behavior variance and the permission/behavior misalignment.
incoherence_penalty = (
    0.4 * df["obfuscation_alignment_gap"]
    + 0.3 * df["behavior_variance_score"]
    + 0.3 * (1 - df["permission_behavior_alignment_score"])
)
# Coherence is 1 minus the penalty, kept within [0, 1].
df["profile_coherence_score"] = (1.0 - incoherence_penalty).clip(0.0, 1.0)

# =========================================================
# Latent Anomaly Likelihood (No Rules)
# =========================================================
# Complements of the "good" scores: a higher value means more risk.
misalignment = 1 - df["permission_behavior_alignment_score"]
inconsistency = 1 - df["metadata_behavior_consistency_score"]
endpoint_instability = 1 - df["endpoint_stability_score"]

# Weighted blend of six risk components; the weights sum to 1.0.
latent_risk = (
    0.20 * permission_density
    + 0.18 * df["background_presence_score"]
    + 0.18 * df["obfuscation_score"]
    + 0.15 * misalignment
    + 0.15 * inconsistency
    + 0.14 * endpoint_instability
)

# Add a per-sample Beta(2.2, 4.5) offset scaled by 0.18, then keep the
# result within [0, 1].
risk_bias = np.random.beta(2.2, 4.5, N_SAMPLES)
latent_risk = (latent_risk + 0.18 * risk_bias).clip(0, 1)

# =========================================================
# Label Assignment (≈20% Label-1)
# =========================================================
# Each sample gets its own cut-off drawn uniformly from [0.40, 0.55);
# the label is 1 when the latent risk exceeds that cut-off.
label_thresholds = np.random.uniform(0.40, 0.55, N_SAMPLES)
df["label"] = (latent_risk > label_thresholds).astype(int)

# =========================================================
# Controlled Numeric Noise (Final Step)
# =========================================================
# Add N(0, 0.01) noise to every numeric feature column; the target column
# "label" is excluded.
#
# Columns are selected with the generic "number" dtype rather than the
# literal "float64"/"int64" names. Where NumPy's default integer is 32-bit
# (e.g. Windows with NumPy < 2), the integer columns produced by
# binomial/randint/poisson are int32 and would otherwise silently skip this
# step, giving a different dataset per platform.
#
# NOTE: integer-valued columns (permission flags, SDK levels, counts) become
# floats here, and scores previously clamped to [0, 1] can end up slightly
# outside that range.
numeric_cols = df.select_dtypes(include="number").columns
numeric_cols = numeric_cols.drop("label", errors="ignore")

noise = np.random.normal(0, 0.01, size=df[numeric_cols].shape)
# Explicit assignment (instead of `+=` on a column slice) makes the
# int -> float dtype change of the affected columns obvious.
df[numeric_cols] = df[numeric_cols] + noise

# =========================================================
# Save Dataset
# =========================================================
# Write the frame without the index column.
df.to_csv(OUTPUT_FILE, index=False)

# Short run summary: output path, frame shape, and the share of each label.
print("Final dataset generated:", OUTPUT_FILE)
print("Shape:", df.shape)
label_shares = df["label"].value_counts(normalize=True)
print("Label distribution:", label_shares, sep="\n")


Final dataset generated: synthetic_hidden_pattern_dataset_final.csv
Shape: (10000, 68)
Label distribution:
label
0    0.8328
1    0.1672
Name: proportion, dtype: float64
