In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.neighbors import NearestNeighbors
from scipy.cluster.hierarchy import dendrogram, linkage

import warnings
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

# Load the prepared transactions table and discard rows with no `fraud` value.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
trans_final = pd.read_pickle('./trans_final.pkl')
trans_final.dropna(subset=['fraud'], inplace=True)

# Replace every categorical column with integer codes.
# The same encoder is refit for each column, so after the loop `le` only
# holds the classes of the last column processed.
categorical_columns = [
    'mcc_description', 'merchant_city', 'merchant_state', 'use_chip',
    'card_brand', 'card_type', 'gender', 'has_chip',
]
le = LabelEncoder()
for column in categorical_columns:
    trans_final[column] = le.fit_transform(trans_final[column])

# Remove columns that are not used by the analysis.
to_drop = [
    'id_trans', 'client_id_trans', 'card_id', 'client_id_card', 'id',
    'retirement_age', 'address', 'expires', 'day', 'time',
]
trans_final = trans_final.drop(columns=to_drop)

# Coerce every column to numeric, downcasting integers first and floats second.
trans_final = (
    trans_final
    .apply(pd.to_numeric, downcast='integer')
    .apply(pd.to_numeric, downcast='float')
)


In [None]:

# Two-dimensional analysis: current_age vs yearly_income.
# Only rows where both columns are present are kept.
data = trans_final.loc[:, ['current_age', 'yearly_income']].dropna()
x = data.to_numpy()

# Standardise both features; `scaler` keeps the fitted statistics.
scaler = StandardScaler()
x_scaled = scaler.fit(x).transform(x)


In [None]:

# Elbow method: collect the KMeans inertia for k = 1..10 on the scaled 2D data.
# NOTE(review): n_init is left at the library default, which is not the same
# across scikit-learn releases - confirm against the installed version.
K = range(1, 11)
distortions = []
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(x_scaled)
    distortions.append(kmeanModel.inertia_)

# Inertia as a function of k; the "elbow" suggests a reasonable cluster count.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(K, distortions, 'bx-')
ax.set(xlabel='k', ylabel='Distortion', title='The Elbow Method (2D)')
plt.show()


In [None]:

# KMeans with 3 clusters on the scaled (age, income) data.
kmeans = KMeans(n_clusters=3, random_state=0).fit(x_scaled)
labels_kmeans = kmeans.labels_

# Scatter of the two scaled features, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(x_scaled[:, 0], x_scaled[:, 1], c=labels_kmeans, cmap='viridis')
ax.set(
    title="KMeans Clustering (2D)",
    xlabel="Age (scaled)",
    ylabel="Income (scaled)",
)
plt.show()


In [None]:

# Mean silhouette coefficient of the 2D KMeans labelling.
silhouette_avg = silhouette_score(X=x_scaled, labels=labels_kmeans)
print("Silhouette Score:", silhouette_avg)


In [None]:

# Hierarchical clustering: Ward linkage on the scaled 2D data.
linked = linkage(x_scaled, method='ward')

# Dendrogram truncated to the top 5 levels of the merge tree.
plt.figure(figsize=(10, 5))
dendrogram(linked, truncate_mode='level', p=5)
ax = plt.gca()
ax.set_title('Gerarchico - Dendrogramma')
ax.set_xlabel('Sample index')
ax.set_ylabel('Distance')
plt.show()


In [None]:

# PCA with 2 components on every standardised column except the `fraud` target.
# random_state is pinned because PCA's default solver selection may pick a
# randomized SVD, which would make the projection (and every result derived
# from x_pca) vary between runs.
features_scaled = StandardScaler().fit_transform(trans_final.drop(columns='fraud'))
pca = PCA(n_components=2, random_state=0)
x_pca = pca.fit_transform(features_scaled)

# Elbow method on the PCA projection: KMeans inertia for k = 1..10.
distortions_pca = []
K = range(1, 11)
for k in K:
    # Use a dedicated name for the per-k model so the fitted 2D model stored
    # in `kmeans` is not overwritten by this sweep.
    kmeans_k = KMeans(n_clusters=k, random_state=0).fit(x_pca)
    distortions_pca.append(kmeans_k.inertia_)

plt.figure(figsize=(6, 4))
plt.plot(K, distortions_pca, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method (PCA)')
plt.show()


In [None]:

# KMeans with 3 clusters on the 2-component PCA projection.
kmeans_pca = KMeans(n_clusters=3, random_state=0).fit(x_pca)
labels_pca = kmeans_pca.labels_

# Scatter of the two principal components, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(x_pca[:, 0], x_pca[:, 1], c=labels_pca, cmap='plasma')
ax.set(title='KMeans su PCA', xlabel='PC1', ylabel='PC2')
plt.show()


In [None]:

# Mean silhouette coefficient of the KMeans labelling in PCA space.
silhouette_pca = silhouette_score(X=x_pca, labels=labels_pca)
print("Silhouette Score (PCA):", silhouette_pca)


In [None]:

# Estimate DBSCAN's epsilon from a k-distance graph in PCA space.
neighbors = NearestNeighbors(n_neighbors=5)
neighbors_fit = neighbors.fit(x_pca)
distances, indices = neighbors_fit.kneighbors(x_pca)

# Keep, per point, the farthest of the 5 returned neighbours, sorted ascending.
# NOTE(review): the query set is the fitted set, so each point's own zero
# distance is expected among the 5 results - i.e. this is the distance to the
# 4th *other* point. Confirm this matches the intended DBSCAN min_samples.
distances = np.sort(distances[:, -1])

# The "knee" of this curve is a candidate value for eps.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(distances)
ax.set(
    title="K-distance Graph (DBSCAN)",
    xlabel="Points sorted by distance",
    ylabel="5-NN distance",
)
plt.show()


In [None]:

# DBSCAN on the PCA projection.
# NOTE(review): eps=1.5 is presumably read off the k-distance graph - confirm.
dbscan = DBSCAN(eps=1.5, min_samples=5).fit(x_pca)
db_labels = dbscan.labels_

# Scatter of the two principal components, coloured by DBSCAN label.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(x_pca[:, 0], x_pca[:, 1], c=db_labels, cmap='Spectral')
ax.set(title="DBSCAN su PCA", xlabel="PC1", ylabel="PC2")
plt.show()
