In [39]:
import numpy as np
import pandas as pd

np.random.seed(42)

N_SAMPLES = 10000

permissions = [
    "perm_internet", "perm_access_network_state", "perm_access_wifi_state",

    "perm_read_sms", "perm_receive_sms", "perm_send_sms",
    "perm_read_phone_numbers", "perm_read_phone_state",

    "perm_read_basic_phone_state", "perm_read_privileged_phone_state",
    "perm_get_accounts",

    "perm_read_external_storage", "perm_write_external_storage",
    "perm_read_media_images", "perm_read_media_video", "perm_read_media_audio",

    "perm_camera", "perm_record_audio",

    "perm_access_fine_location", "perm_access_coarse_location",
    "perm_access_background_location",

    "perm_read_contacts", "perm_write_contacts",
    "perm_read_call_log", "perm_write_call_log",

    "perm_system_alert_window", "perm_draw_over_other_apps",
    "perm_bind_accessibility_service",

    "perm_request_install_packages", "perm_delete_packages",
    "perm_query_all_packages", "perm_package_usage_stats",

    "perm_foreground_service", "perm_wake_lock",
    "perm_receive_boot_completed", "perm_disable_keyguard",
    "perm_use_full_screen_intent",

    "perm_use_biometric", "perm_use_fingerprint", "perm_keyguard_service",

    "perm_read_logs", "perm_write_settings",
    "perm_request_ignore_battery_optimizations"
]

def bernoulli(p, size):
    return np.random.binomial(1, p, size)

def clipped_normal(mean, std, size, low=0.0, high=1.0):
    return np.clip(np.random.normal(mean, std, size), low, high)

df = pd.DataFrame()

for perm in permissions:
    df[perm] = bernoulli(np.random.uniform(0.18, 0.45), N_SAMPLES)

permission_density = df[permissions].mean(axis=1)

# -----------------------------
# Metadata features
# -----------------------------
df["app_size_mb"] = np.clip(np.random.normal(24, 9, N_SAMPLES), 6, 110)
df["min_sdk"] = np.random.randint(19, 28, N_SAMPLES)
df["target_sdk"] = np.random.randint(26, 34, N_SAMPLES)
df["activity_count"] = np.random.poisson(6, N_SAMPLES)
df["certificate_age_days"] = np.clip(np.random.normal(900, 600, N_SAMPLES), 30, 4000)
df["debuggable_flag"] = bernoulli(0.07, N_SAMPLES)
df["obfuscation_score"] = clipped_normal(0.46, 0.18, N_SAMPLES)

df["network_activity_density_score"] = clipped_normal(0.46, 0.15, N_SAMPLES)
df["background_presence_score"] = clipped_normal(0.42, 0.17, N_SAMPLES)
df["event_dependency_score"] = clipped_normal(0.36, 0.15, N_SAMPLES)
df["communication_regularness_score"] = clipped_normal(0.44, 0.16, N_SAMPLES)
df["endpoint_stability_score"] = clipped_normal(0.56, 0.17, N_SAMPLES)

df["permission_behavior_alignment_score"] = np.clip(
    1.0 - np.abs(permission_density - df["background_presence_score"]),
    0.0, 1.0
)

df["metadata_behavior_consistency_score"] = np.clip(
    1.0 - np.abs(df["obfuscation_score"] - df["communication_regularness_score"]),
    0.0, 1.0
)

df["api_permission_coherence_score"] = clipped_normal(0.56, 0.18, N_SAMPLES)
df["temporal_stability_index"] = clipped_normal(0.52, 0.14, N_SAMPLES)

latent_risk = (
    0.22 * permission_density +
    0.20 * df["background_presence_score"] +
    0.18 * df["obfuscation_score"] +
    0.15 * (1 - df["permission_behavior_alignment_score"]) +
    0.15 * (1 - df["metadata_behavior_consistency_score"]) +
    0.10 * (1 - df["endpoint_stability_score"])
)

risk_bias = np.random.beta(2.2, 4.5, N_SAMPLES)
latent_risk = np.clip(latent_risk + 0.18 * risk_bias, 0, 1)

df["label"] = (
    latent_risk > np.random.uniform(0.40, 0.55, N_SAMPLES)
).astype(int)

OUTPUT_FILE = "synthetic_hidden_pattern_dataset_v2.csv"
df.to_csv(OUTPUT_FILE, index=False)

print("Dataset generated:", OUTPUT_FILE)
print("Shape:", df.shape)
print(df["label"].value_counts(normalize=True))


Dataset generated: synthetic_hidden_pattern_dataset_v2.csv
Shape: (10000, 60)
label
0    0.8399
1    0.1601
Name: proportion, dtype: float64


In [47]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score
)

# -------------------------------------------------
# Load dataset
# -------------------------------------------------
df = pd.read_csv("synthetic_hidden_pattern_dataset_v2.csv")

X = df.drop(columns=["label"])
y = df["label"]

# -------------------------------------------------
# Train / Test split (stratified)
# -------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# -------------------------------------------------
# Class imbalance handling
# -------------------------------------------------
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# -------------------------------------------------
# Optimized XGBoost configuration
# -------------------------------------------------
model = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=300,
    learning_rate=0.04,
    max_depth=6,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    scale_pos_weight=pos_weight,
    eval_metric="aucpr",
    # Removed base_score parameter to avoid onnxmltools parsing issue, relying on XGBoost default (0.5)
    random_state=42,
    tree_method="hist"
)

# -------------------------------------------------
# Train model
# -------------------------------------------------
model.fit(X_train, y_train)

# -------------------------------------------------
# Evaluation
# -------------------------------------------------
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nROC-AUC:", roc_auc_score(y_test, y_prob))
print("PR-AUC:", average_precision_score(y_test, y_prob))

Confusion Matrix:
[[1481  199]
 [ 108  212]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9320    0.8815    0.9061      1680
           1     0.5158    0.6625    0.5800       320

    accuracy                         0.8465      2000
   macro avg     0.7239    0.7720    0.7431      2000
weighted avg     0.8654    0.8465    0.8539      2000


ROC-AUC: 0.8921261160714287
PR-AUC: 0.6591162116355656


In [42]:
import pickle

# Define the output file name for the pickle file
output_pickle_file = "xgboost_model.pkl"

# Save the trained XGBoost model to a .pkl file
with open(output_pickle_file, 'wb') as f:
    pickle.dump(model, f)

print(f"Model saved to {output_pickle_file}")

Model saved to xgboost_model.pkl


In [43]:
import xgboost as xgb
import pickle

# Define the output file name for the pickle file
output_pickle_file = "xgboost_model.pkl"

# Load the trained XGBoost model from the .pkl file
with open(output_pickle_file, 'rb') as f:
    loaded_model = pickle.load(f)

print(f"Model loaded from {output_pickle_file}")
# You can now use loaded_model for predictions or further processing
# For example, to check the type:
# print(type(loaded_model))

Model loaded from xgboost_model.pkl


In [49]:
# Define the output file name for the JSON model
output_json_file = "xgboost_model.json"

# Save the loaded XGBoost model to a JSON file
loaded_model.save_model(output_json_file)

print(f"Model saved to {output_json_file}")

Model saved to xgboost_model.json


In [34]:
!pip install onnxmltools



In [53]:
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType
import pickle
import xgboost as xgb # Needed for Booster type hints and set_param

n_features = X_train.shape[1]

# Load the trained XGBoost model
output_pickle_file = "xgboost_model.pkl"
with open(output_pickle_file, 'rb') as f:
    loaded_model = pickle.load(f)

# Get the booster object from the loaded_model
booster = loaded_model.get_booster()

# HACKY WORKAROUND: Explicitly set base_score on the booster to ensure it's a clean float
# This tries to override any problematic internal string representation that onnxmltools might find.
booster.set_param('base_score', 0.5)

onnx_model = onnxmltools.convert_xgboost(
    booster, # Use the modified booster
    initial_types=[("input", FloatTensorType([None, n_features]))]
)

with open("xgb_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

print("ONNX conversion successful")

ValueError: could not convert string to float: '[5E-1]'

# Task
Modify cell_8772e6a0 to remove the explicit `booster.set_param('base_score', 0.5)` call.

## modify_cell_8772e6a0

### Subtask:
Remove the explicit `booster.set_param('base_score', 0.5)` call to let `onnxmltools` handle the `base_score` implicitly from the `XGBClassifier`.


## Summary:

### Data Analysis Key Findings
*   The explicit setting of `booster.set_param('base_score', 0.5)` was removed from the model configuration. This change was implemented to allow `onnxmltools` to implicitly handle the `base_score` parameter directly from the `XGBClassifier` model.

### Insights or Next Steps
*   This modification simplifies the model conversion process by allowing `onnxmltools` to infer the `base_score` parameter automatically, potentially reducing redundancy and ensuring consistent behavior with the original `XGBClassifier` settings.
*   The next step should involve verifying that the `onnxmltools` conversion accurately reflects the `base_score` from the `XGBClassifier` without the explicit setting, and that the converted model performs as expected.
