In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.neighbors import NearestNeighbors
from scipy.cluster.hierarchy import dendrogram, linkage

import warnings
# Session-wide configuration: silence every warning and use seaborn's
# "whitegrid" style for all figures created afterwards.
# NOTE(review): the blanket 'ignore' filter also hides deprecation and
# convergence warnings raised by the libraries used below — consider
# restricting it to specific warning categories.
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")


In [None]:
# Load the prepared transactions table.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
trans_final = pd.read_pickle('./trans_final.pkl')
# Keep only the rows whose 'fraud' value is known.
trans_final.dropna(subset=['fraud'], inplace=True)

# Encode the categorical variables as integer codes.
# A dedicated encoder is fitted per column and stored in `encoders`, so each
# column's codes can be mapped back to the original categories later
# (e.g. encoders['gender'].inverse_transform(...)). Re-fitting one shared
# encoder would only retain the classes of the last column processed.
# `le` is still bound to the last encoder fitted, as before.
categorical_columns = ['mcc_description', 'merchant_city', 'merchant_state', 'use_chip',
                       'card_brand', 'card_type', 'gender', 'has_chip']
encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    trans_final[column] = le.fit_transform(trans_final[column])
    encoders[column] = le

# Remove the columns that are not used by the analysis.
to_drop = ['id_trans','client_id_trans','card_id','client_id_card','id','retirement_age','address','expires','day','time']
trans_final = trans_final.drop(columns=to_drop)

# Downcast every column to the smallest integer / float dtype that can hold it.
trans_final = trans_final.apply(pd.to_numeric, downcast='integer')
trans_final = trans_final.apply(pd.to_numeric, downcast='float')

In [None]:

# Two-dimensional analysis: current_age vs yearly_income.
# Rows missing either feature are discarded; `x` keeps the original values
# so that plots can be drawn in the original units.
feature_pair = ['current_age', 'yearly_income']
data = trans_final.loc[:, feature_pair].dropna()
x = data.values

# Standardise both features before clustering.
scaler = StandardScaler()
x_scaled = scaler.fit(x).transform(x)


In [None]:

# Elbow method: fit KMeans for k = 1..10 on the standardised 2D data and
# record the inertia of each fitted model.
K = range(1, 11)
distortions = [
    KMeans(n_clusters=n_k, random_state=0).fit(x_scaled).inertia_
    for n_k in K
]

# Inertia as a function of k; the "elbow" suggests a reasonable cluster count.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method (2D)')
plt.show()


In [None]:
import matplotlib.ticker as mtick

# 1. KMeans clustering (3 clusters) on the standardised (age, income) pairs.
kmeans = KMeans(n_clusters=3, random_state=0)
labels_kmeans = kmeans.fit_predict(x_scaled)

# One colour and one legend description per cluster index.
# NOTE(review): KMeans cluster indices carry no intrinsic order — confirm that
# each description really matches the cluster index it is attached to.
colors_kmeans = ['yellow', 'magenta', 'pink']
labels_desc = {
    0: "Giovani a basso reddito",
    1: "Adulti a reddito medio",
    2: "Professionisti affermati"
}

fig, ax = plt.subplots(figsize=(8, 6))
for cluster_id, cluster_color in enumerate(colors_kmeans):
    members = labels_kmeans == cluster_id
    ax.scatter(
        x[members, 0],  # age, original units
        x[members, 1],  # income, original units
        s=20,
        c=cluster_color,
        alpha=0.7,
        label=labels_desc[cluster_id]
    )

ax.set_title("KMeans Clustering: Età vs Reddito", fontsize=14)
ax.set_xlabel("Età (anni)", fontsize=12)
ax.set_ylabel("Reddito ($)", fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, linestyle=':', alpha=0.7)

# Show the y axis as dollar amounts with thousands separators.
ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))

fig.tight_layout()
plt.show()


In [None]:
# Silhouette analysis of the 2D KMeans clustering on a random subsample.
# The subsample is drawn WITHOUT replacement: resample() bootstraps by default,
# and duplicated rows (at distance 0 from each other) would bias the
# silhouette coefficients.
sample_size = min(100000, len(x_scaled))
x_sampled, labels_sampled = resample(
    x_scaled, labels_kmeans,
    n_samples=sample_size, replace=False, random_state=42
)

# Cluster ids actually present in the subsample. Iterating over these (rather
# than over range(n)) stays correct even if a cluster id is absent.
cluster_ids = np.unique(labels_sampled)
n_clusters = len(cluster_ids)

if n_clusters > 1:
    silhouette_vals = silhouette_samples(x_sampled, labels_sampled)
    # The overall score is the mean of the per-sample coefficients, so the
    # expensive pairwise-distance pass is not repeated via silhouette_score().
    silhouette_avg = np.mean(silhouette_vals)
    print(f"Silhouette Score (2D): {silhouette_avg:.3f}")

    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10

    # Pastel palette (Pastel1 for <= 9 clusters, nipy_spectral otherwise).
    cmap = plt.cm.Pastel1 if n_clusters <= 9 else plt.cm.nipy_spectral

    for position, cluster_id in enumerate(cluster_ids):
        # Sorted coefficients of this cluster form one horizontal band.
        ith_cluster_silhouette_values = np.sort(silhouette_vals[labels_sampled == cluster_id])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # Soft colour with transparency.
        color = cmap(float(position) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.6)

        # Cluster label, vertically centred on its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, f'Cluster {cluster_id}',
                fontsize=10, va='center', ha='right')

        # Leave a 10-row gap before the next band.
        y_lower = y_upper + 10

    # Vertical line marking the average silhouette coefficient.
    avg_line = ax.axvline(x=silhouette_avg, color="#ff6b6b", linestyle="--",
                          linewidth=1.5, label=f'Media: {silhouette_avg:.2f}')

    # Title and axis labels.
    ax.set_title("Analisi Silhouette - Clustering KMeans",
                 fontsize=14, pad=20, fontweight='medium')
    ax.set_xlabel("Coefficiente Silhouette", fontsize=12)
    ax.set_ylabel("Cluster", fontsize=12)

    # Axis limits; y ticks are hidden because the y position is only a row index.
    ax.set_xlim([-0.5, 1])
    ax.set_ylim([0, y_upper + 20])
    ax.set_yticks([])
    ax.grid(axis='x', linestyle='--', alpha=0.3)

    # Hide the spines that carry no information.
    for spine in ['top', 'right', 'left']:
        ax.spines[spine].set_visible(False)

    # Legend with a semi-transparent background.
    legend = ax.legend(loc='upper right', framealpha=0.8)
    legend.get_frame().set_edgecolor('#dddddd')

    plt.tight_layout()
    plt.show()
else:
    print("Silhouette plot non generato: identificato un solo cluster.")

In [None]:
# 2. Hierarchical (Ward) clustering, run on a random subsample of the
# standardised (age, income) data.
hc = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
# Cap the sample at the number of available rows: drawing without replacement
# raises if more rows are requested than exist.
sample_size = min(20_000, len(x_scaled))
np.random.seed(42)
idx = np.random.choice(len(x_scaled), sample_size, replace=False)
x_sample = x_scaled[idx]           # standardised values, used for clustering
x_sample_original = x[idx]         # original units, used for plotting
y_hc = hc.fit_predict(x_sample)

# One colour and one legend description per cluster index.
# NOTE(review): agglomerative cluster indices are arbitrary — confirm that
# each description really matches the cluster index it is attached to.
colors_hc = ['yellow', 'magenta', 'pink']
labels_hc = {
    0: "Giovani a basso reddito",
    1: "Adulti a reddito medio",
    2: "Professionisti affermati",
}

fig, ax = plt.subplots(figsize=(8, 6))
for i in range(3):
    ax.scatter(
        x_sample_original[y_hc == i, 0], x_sample_original[y_hc == i, 1],
        s=20, c=colors_hc[i],
        alpha=0.7,
        label=labels_hc[i]
    )
# The sample size is interpolated into the title (the placeholder was missing).
ax.set_title(f'Clustering Gerarchico (campione: {sample_size} punti)', fontsize=14)
ax.set_xlabel('Età (anni)', fontsize=12)
# Same y label as the KMeans figure. The former set_ticklabels([]) call was
# dropped: the dollar formatter below replaced it anyway.
ax.set_ylabel('Reddito ($)', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, linestyle=':', alpha=0.7)

fig.suptitle('Comparazione Metodi di Clustering: Età vs Reddito', fontsize=16, y=1.02)
fig.tight_layout()

# Show the y axis as dollar amounts with thousands separators.
ax.yaxis.set_major_formatter('${x:,.0f}')

plt.show()

In [None]:
# Standardise every remaining column of the full feature table.
# NOTE(review): this rebinds `scaler` and `x_scaled`, which until this cell
# held the 2D (age, income) versions; re-running earlier cells after this one
# would pick up the full-feature matrix instead.
# NOTE(review): `trans_final` still contains the 'fraud' column, so it is part
# of the features being projected and clustered — confirm this is intended.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(trans_final)

# PCA down to 2 components. random_state is fixed so the projection does not
# depend on the global NumPy random state when a randomized SVD solver is used.
pca = PCA(n_components=2, random_state=0)
x_pca = pca.fit_transform(x_scaled)

# Elbow method on the PCA projection: inertia for k = 1..10.
# NOTE(review): the loop rebinds `kmeans`; after it, `kmeans` is the k=10 model.
distortions_pca = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(x_pca)
    distortions_pca.append(kmeans.inertia_)

plt.figure(figsize=(6, 4))
plt.plot(K, distortions_pca, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method (PCA)')
plt.show()


In [None]:
# KMeans (3 clusters) on the 2-component PCA projection, plotted in PCA space.
kmeans_pca = KMeans(n_clusters=3, random_state=0)
labels_pca = kmeans_pca.fit_predict(x_pca)

# One colour and one generic legend entry per cluster index.
colors_pca = ['yellow', 'magenta', 'pink']
labels_desc = {
    0: "Cluster 1",
    1: "Cluster 2",
    2: "Cluster 3",
}

fig, ax = plt.subplots(figsize=(8, 6))
for cluster_id, cluster_color in enumerate(colors_pca):
    members = labels_pca == cluster_id
    ax.scatter(
        x_pca[members, 0],
        x_pca[members, 1],
        s=20,
        c=cluster_color,
        alpha=0.7,
        label=labels_desc[cluster_id]
    )

ax.set_title("KMeans su PCA (riduzione dimensionale)", fontsize=14)
ax.set_xlabel("Componente Principale 1", fontsize=12)
ax.set_ylabel("Componente Principale 2", fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, linestyle=':', alpha=0.7)

fig.tight_layout()
plt.show()


In [None]:
# Silhouette analysis of the KMeans-on-PCA clustering on a random subsample.
# The subsample size is defined here instead of reusing `sample_size` left
# over from the hierarchical-clustering cell (20_000), and is capped at the
# number of available rows.
pca_sample_size = min(20_000, len(x_pca))
# Drawn WITHOUT replacement: resample() bootstraps by default, and duplicated
# rows (at distance 0 from each other) would bias the silhouette coefficients.
x_pca_sampled, labels_pca_sampled = resample(
    x_pca, labels_pca,
    n_samples=pca_sample_size, replace=False, random_state=42
)

# Cluster ids actually present in the subsample. Iterating over these (rather
# than over range(n)) stays correct even if a cluster id is absent.
cluster_ids = np.unique(labels_pca_sampled)
n_clusters = len(cluster_ids)

if n_clusters > 1:
    silhouette_vals = silhouette_samples(x_pca_sampled, labels_pca_sampled)
    # The overall score is the mean of the per-sample coefficients, so the
    # expensive pairwise-distance pass is not repeated via silhouette_score().
    silhouette_avg = np.mean(silhouette_vals)
    print(f"Silhouette Score (PCA): {silhouette_avg:.3f}")

    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10

    # Pastel palette (Pastel1 for <= 9 clusters, nipy_spectral otherwise).
    cmap = plt.cm.Pastel1 if n_clusters <= 9 else plt.cm.nipy_spectral

    for position, cluster_id in enumerate(cluster_ids):
        # Sorted coefficients of this cluster form one horizontal band.
        ith_cluster_silhouette_values = np.sort(silhouette_vals[labels_pca_sampled == cluster_id])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # Soft colour with transparency.
        color = cmap(float(position) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.6)

        # Cluster label, vertically centred on its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, f'Cluster {cluster_id}',
                fontsize=10, va='center', ha='right')

        # Leave a 10-row gap before the next band.
        y_lower = y_upper + 10

    # Vertical line marking the average silhouette coefficient.
    avg_line = ax.axvline(x=silhouette_avg, color="#ff6b6b", linestyle="--",
                          linewidth=1.5, label=f'Media: {silhouette_avg:.2f}')

    # Title and axis labels.
    ax.set_title("Analisi Silhouette - Clustering PCA",
                 fontsize=14, pad=20, fontweight='medium')
    ax.set_xlabel("Coefficiente Silhouette", fontsize=12)
    ax.set_ylabel("Cluster", fontsize=12)

    # Axis limits; y ticks are hidden because the y position is only a row index.
    ax.set_xlim([-0.5, 1])
    ax.set_ylim([0, y_upper + 20])
    ax.set_yticks([])
    ax.grid(axis='x', linestyle='--', alpha=0.3)

    # Hide the spines that carry no information.
    for spine in ['top', 'right', 'left']:
        ax.spines[spine].set_visible(False)

    # Legend with a semi-transparent background.
    legend = ax.legend(loc='upper right', framealpha=0.8)
    legend.get_frame().set_edgecolor('#dddddd')

    plt.tight_layout()
    plt.show()
else:
    print("Silhouette plot non generato: identificato un solo cluster.")

In [None]:
# k-distance graph used to pick DBSCAN's eps on a subsample of the PCA projection.
# k matches the min_samples value passed to DBSCAN in the next cell (8), so the
# curve shows the radius at which each point would become a core point.
# NOTE(review): re-check the eps chosen for DBSCAN against this curve.
k_neighbors = 8
# Sample without replacement (resample() bootstraps by default, and duplicated
# rows have distance 0 to each other), capped at the number of available rows.
n_knn_sample = min(5000, len(x_pca))
x_pca_sampled = resample(x_pca, n_samples=n_knn_sample, replace=False, random_state=42)
neigh = NearestNeighbors(n_neighbors=k_neighbors)
nbrs = neigh.fit(x_pca_sampled)
distances, indices = nbrs.kneighbors(x_pca_sampled)
# The query set is the fitted set, so each point counts as its own neighbour
# (distance 0); the last column is the distance to the farthest of the k.
distances = np.sort(distances[:, -1])
plt.figure(figsize=(12, 6))
plt.plot(distances)
plt.title("K-distance Graph DBSCAN")
plt.ylabel("Distanza")
plt.xlabel("Punti ordinati")
plt.grid(True)
plt.show()


In [None]:
# DBSCAN clustering of the PCA subsample; label -1 marks noise points.
dbscan = DBSCAN(eps=0.2, min_samples=8)
db_labels = dbscan.fit_predict(x_pca_sampled)

unique_labels = np.unique(db_labels)

fig, ax = plt.subplots(figsize=(10, 7))

for label in unique_labels:
    is_noise = label == -1
    members = db_labels == label
    ax.scatter(
        x_pca_sampled[members, 0],
        x_pca_sampled[members, 1],
        # Noise is drawn smaller and in black; clusters cycle through tab20 (20 colours).
        s=10 if is_noise else 30,
        c=["black" if is_noise else plt.cm.tab20(label % 20)],
        edgecolors='k',           # thin black outline keeps the points crisp
        linewidths=0.2,
        label="Rumore" if is_noise else f"Cluster {label + 1}",
        alpha=0.9
    )

ax.set_title("DBSCAN su PCA (campione di 5000)", fontsize=14)
ax.set_xlabel("Componente Principale 1", fontsize=12)
ax.set_ylabel("Componente Principale 2", fontsize=12)
ax.grid(True, linestyle=':', alpha=0.6)

# Legend in the upper-right corner with a semi-transparent frame.
ax.legend(
    loc='upper right',
    fontsize=10,
    frameon=True,
    framealpha=0.8
)

fig.tight_layout()
plt.show()
