In [1]:
# ============================================================
# Setup: imports and project configuration
# ============================================================

import sys
sys.path.append("../src")

from src.crispml.config.project.factory import make_config
from src.crispml.config.enums.enums_config import ProblemType
from src.crispml.phases.phase2_data_understanding import run_phase2
from src.crispml.phases.phase3_data_preparation import run_phase3
from src.crispml.phases.phase4_modeling import run_phase4
from src.crispml.phases.phase5_evaluation import run_phase5
import pandas as pd

# Build the project configuration for an unsupervised clustering run.
# The problem type has to be CLUSTERING: no target column is supplied, and the
# modeling/evaluation cells of this notebook (and their recorded outputs) are
# clustering runs on the global cybersecurity threats dataset.
config = make_config(
    name="CyberClustering",
    problem_type=ProblemType.CLUSTERING,
    dataset_path="../data/Global_Cybersecurity_Threats_2015-2024.csv",
    target_col=None,  # clustering is unsupervised, so no target column is used
)

# Show the resulting configuration object as the cell output.
config



[2025-12-03 16:24:03] [INFO] crispml - CRISP-ML logging initialized. Log file: K:\00_Code\Project_Big_Data_Analytics_and_Machine_Learning\src\crispml\out\logs\crispml.log
[2025-12-03 16:24:06] [INFO] crispml.src.crispml.config.base_config - [base_config] CRISP-ML base configuration module loaded.
[2025-12-03 16:24:06] [INFO] crispml.src.crispml.config.base_config - [base_config] CRISP-ML Base API ready for use.
[2025-12-03 16:24:06] [INFO] crispml.src.crispml.config.project.factory - [factory] Creating ProjectConfig 'CyberClustering' | type=CLUSTERING | dataset=../data/Global_Cybersecurity_Threats_2015-2024.csv
[2025-12-03 16:24:06] [INFO] crispml.src.crispml.config.dataset.dataset_config - [DatasetConfig] source=CSV | path=../data/Global_Cybersecurity_Threats_2015-2024.csv | task=CLUSTERING | target=None | time=None | id_cols=[]
[2025-12-03 16:24:06] [INFO] crispml.src.crispml.config.dataset.feature_config - [FeatureConfig] mode=AUTO | include=[] | exclude=[] | max_features=50 | max_u

ProjectConfig(name='CyberClustering', dataset=DatasetConfig(source_type=<DataSourceType.CSV: 1>, path_or_conn='../data/Global_Cybersecurity_Threats_2015-2024.csv', problem_type=<ProblemType.CLUSTERING: 1>, target_col=None, time_col=None, id_cols=[]), features=FeatureConfig(mode=<FeatureSelectionMode.AUTO: 1>, include=[], exclude=[], max_features=50, max_unique_for_cat=50), bigdata=BigDataConfig(sample_rows_for_eda=50000), modeling=ModelingConfig(clustering_algos=['kmeans', 'dbscan'], classification_algos=[], regression_algos=[], ts_algos=[], hyperparams={'kmeans': {'n_clusters': [3, 4, 5]}, 'dbscan': {'eps': [0.3, 0.5], 'min_samples': [5, 10]}}), evaluation=EvaluationConfig(main_metric=None), techniques=TechniquesConfig(phase2=Phase2Techniques(describe=Phase2DescribeConfig(describe_stats=True, freq_tables=True, histograms=True, boxplots=True, barplots=True, scatterplots=True, corr_matrix=True), quality=Phase2QualityConfig(missing_analysis=True, outlier_detection=True, duplicates_check=

In [2]:
# ============================================================
# Phase 2: Data Understanding
# ============================================================
# Run the data-understanding phase with the project configuration and keep its
# return value as `df_eda` (the recorded log shows the dataset being loaded in
# EDA mode with a row limit applied).
df_eda = run_phase2(config)
# Preview the first rows as the cell output.
df_eda.head()


[2025-12-03 16:24:13] [INFO] crispml.src.crispml.phases.phase2_data_understanding - === START PHASE 2 – DATA UNDERSTANDING (CyberClustering) ===
[2025-12-03 16:24:13] [INFO] crispml.src.crispml.phases.phase2_data_understanding - [FASE2][2.1] Loading initial dataset...
[2025-12-03 16:24:13] [INFO] crispml.src.crispml.common.io.io_utils - [io] Starting dataset load (EDA mode: True)
[2025-12-03 16:24:13] [INFO] crispml.src.crispml.common.io.io_utils - [io] Applying EDA sampling: limiting to 50000 rows
[2025-12-03 16:24:13] [INFO] crispml.src.crispml.common.io.io_utils - [io] Loading CSV from ../data/Global_Cybersecurity_Threats_2015-2024.csv (nrows=50000)
[2025-12-03 16:24:13] [INFO] crispml.src.crispml.common.io.io_utils - [io] CSV loaded successfully with shape: (3000, 10)
[2025-12-03 16:24:13] [INFO] crispml.src.crispml.common.io.io_utils - [io] Dataset successfully loaded.
[2025-12-03 16:24:13] [INFO] crispml.src.crispml.phases.phase2_data_understanding - [FASE2] Dataset loaded: shape

Unnamed: 0,Country,Year,Attack Type,Target Industry,Financial Loss (in Million $),Number of Affected Users,Attack Source,Security Vulnerability Type,Defense Mechanism Used,Incident Resolution Time (in Hours)
0,China,2019,Phishing,Education,80.53,773169,Hacker Group,Unpatched Software,VPN,63
1,China,2019,Ransomware,Retail,62.19,295961,Hacker Group,Unpatched Software,Firewall,71
2,India,2017,Man-in-the-Middle,IT,38.65,605895,Hacker Group,Weak Passwords,VPN,20
3,UK,2024,Ransomware,Telecommunications,41.44,659320,Nation-state,Social Engineering,AI-based Detection,7
4,Germany,2018,Man-in-the-Middle,IT,74.41,810682,Insider,Social Engineering,VPN,68


In [3]:
# ============================================================
# Phase 3: Data Preparation
# ============================================================
# Phase 3 returns two values: a mapping holding the data partitions and a
# second object that is kept as `df_prepared` for the evaluation phase.
splits, df_prepared = run_phase3(config)

# Pull every partition out of `splits` under its own name:
# feature partitions first, then the corresponding target partitions.
X_train, X_val, X_test = splits["X_train"], splits["X_val"], splits["X_test"]
y_train, y_val, y_test = splits["y_train"], splits["y_val"], splits["y_test"]

# Peek at the first five training rows as the cell output.
X_train[:5]


[2025-12-03 16:24:27] [INFO] crispml.src.crispml.phases.phase3_data_preparation - === START PHASE 3 – DATA PREPARATION (CyberClustering) ===
[2025-12-03 16:24:27] [INFO] crispml.src.crispml.phases.phase3_data_preparation - [FASE3][3.1] Loading full dataset...
[2025-12-03 16:24:27] [INFO] crispml.src.crispml.common.io.io_utils - [io] Starting dataset load (EDA mode: False)
[2025-12-03 16:24:27] [INFO] crispml.src.crispml.common.io.io_utils - [io] Loading CSV from ../data/Global_Cybersecurity_Threats_2015-2024.csv (nrows=None)
[2025-12-03 16:24:27] [INFO] crispml.src.crispml.common.io.io_utils - [io] CSV loaded successfully with shape: (3000, 10)
[2025-12-03 16:24:27] [INFO] crispml.src.crispml.common.io.io_utils - [io] Dataset successfully loaded.
[2025-12-03 16:24:27] [INFO] crispml.src.crispml.common.feature_selection.feature_selection_utils - [feature_selection] Starting feature selection...
[2025-12-03 16:24:27] [INFO] crispml.src.crispml.common.feature_selection.feature_selection_u

array([[ 0.50032855,  1.09589226,  1.1162939 ,  1.33823828, -0.33947258,
        -0.321476  , -0.33641127, -0.32774947, -0.33825002, -0.33641127,
        -0.33023825, -0.34615145, -0.32524906, -0.43913886,  2.35285948,
        -0.46269144, -0.44345167, -0.4488226 , -0.4029146 , -0.39392773,
        -0.40848641, -0.43535287, -0.40514732, -0.39392773,  1.72897707,
        -0.59993956, -0.58658846, -0.57119169,  1.76340269, -0.59531656,
         1.94346857, -0.49582979, -0.49217479, -0.50624244],
       [ 0.85029022, -0.30649556,  1.52197075,  1.43547983, -0.33947258,
        -0.321476  , -0.33641127, -0.32774947, -0.33825002, -0.33641127,
        -0.33023825, -0.34615145, -0.32524906,  2.27718401, -0.42501476,
        -0.46269144, -0.44345167, -0.4488226 , -0.4029146 , -0.39392773,
        -0.40848641,  2.29698729, -0.40514732, -0.39392773,  1.72897707,
        -0.59993956, -0.58658846, -0.57119169,  1.76340269, -0.59531656,
        -0.51454395, -0.49582979,  2.0317985 , -0.50624244],
  

In [4]:
# ===============================================
# PHASE 4 – MODELING (Clustering)
# ===============================================
# Run the modeling phase on the Phase 3 splits. The call returns two values:
# `models` (passed on to the evaluation phase) and `model_outputs`.
# NOTE(review): the recorded log shows kmeans and dbscan variants being trained;
# the exact structure of both return values is not visible here — confirm in run_phase4.
models, model_outputs = run_phase4(config, splits)



[2025-12-03 16:24:37] [INFO] crispml.src.crispml.phases.phase4_modeling - === START FASE 4 – MODELING (CyberClustering) ===
[2025-12-03 16:24:37] [INFO] crispml.src.crispml.phases.phase4_modeling - [FASE4][4.1] ProblemType rilevato: CLUSTERING
[2025-12-03 16:24:37] [INFO] crispml.src.crispml.phases.phase4_modeling - [FASE4][CLUSTERING] Algoritmi: ['kmeans', 'dbscan']
[2025-12-03 16:24:40] [INFO] crispml.src.crispml.common.modeling.clustering_models - [CLUSTERING] Trained kmeans_k3 (k=3)
[2025-12-03 16:24:40] [INFO] crispml.src.crispml.common.modeling.clustering_models - [CLUSTERING] Trained kmeans_k4 (k=4)
[2025-12-03 16:24:40] [INFO] crispml.src.crispml.common.modeling.clustering_models - [CLUSTERING] Trained kmeans_k5 (k=5)
[2025-12-03 16:24:41] [INFO] crispml.src.crispml.common.modeling.clustering_models - [CLUSTERING] Trained dbscan_eps0.3_ms5 (eps=0.300, min_samples=5)
[2025-12-03 16:24:41] [INFO] crispml.src.crispml.common.modeling.clustering_models - [CLUSTERING] Trained dbscan_

In [5]:
# ============================
# PHASE 5 – EVALUATION
# ============================

# Run the evaluation phase on the models from Phase 4, together with the
# Phase 3 splits and prepared data; the result is kept as `evaluation_results`.
# All arguments are passed by keyword so the call does not depend on parameter order.
evaluation_results = run_phase5(
    config=config,
    models=models,
    splits=splits,
    df_prepared=df_prepared
)



[2025-12-03 16:24:49] [INFO] crispml.src.crispml.phases.phase5_evaluation - === START FASE 5 – EVALUATION (CyberClustering) ===
[2025-12-03 16:24:49] [INFO] crispml.src.crispml.phases.phase5_evaluation - [FASE5][CLUSTERING] Calcolo metriche clustering...
[2025-12-03 16:24:49] [INFO] crispml.src.crispml.common.evaluation.metrics_clustering - [eval] Clustering kmeans_k3: silhouette=0.5931 DB=0.4997 inertia=27786792205192.08
[2025-12-03 16:24:49] [INFO] crispml.src.crispml.common.evaluation.metrics_clustering - [eval] Clustering kmeans_k4: silhouette=0.5735 DB=0.5009 inertia=15634707746556.348
[2025-12-03 16:24:50] [INFO] crispml.src.crispml.common.evaluation.metrics_clustering - [eval] Clustering kmeans_k5: silhouette=0.5585 DB=0.5065 inertia=10168970527657.223
[2025-12-03 16:24:50] [INFO] crispml.src.crispml.common.evaluation.metrics_clustering - [eval] Clustering dbscan_eps0.3_ms5: silhouette=nan DB=nan inertia=nan
[2025-12-03 16:24:50] [INFO] crispml.src.crispml.common.evaluation.metr