In [None]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
import opendatasets as od

# Fetch the Kaggle credit-card-fraud dataset; the call prompts interactively
# for Kaggle credentials before downloading.
dataset_url = 'https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud'
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

#  Import Libraries

In [None]:
# Numerical / tabular data handling.
# NOTE: numpy must be bound to `np` -- later cells call np.array / np.sum.
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Clustering, dimensionality reduction and evaluation metrics
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    davies_bouldin_score,
    silhouette_score,
)

#  Load Dataset

In [None]:
# Read the CSV from the folder created by the download cell. The path is
# relative to the working directory, so it resolves to the same file on Colab
# (cwd = /content) and also works when the notebook runs anywhere else.
data = pd.read_csv('creditcardfraud/creditcard.csv')

# Preview the first rows to check the columns loaded as expected.
data.head()

# Dataset Shape, Value Counts, and Column Sums

In [None]:
# Dimensions of the full dataset as a (rows, columns) tuple.
data.shape

In [None]:
# Frequency of each distinct *row* (all columns taken together).
# NOTE(review): this is not the class balance -- `data['Class'].value_counts()`
# (run further below) gives the fraud / legit split; confirm which was intended.
data.value_counts()

In [None]:
# Column-wise totals. Assuming 'Class' is coded 0/1, its total is the number
# of fraud rows -- TODO confirm the coding.
data.sum()

# Building the Curated Dataset

In [None]:
# Every transaction labelled as fraud (Class == 1).
Ones = data.loc[data['Class'].eq(1)]

In [None]:
# First 500 legitimate (Class == 0) transactions.
# Filtering on the label *before* slicing guarantees that no fraud row ends up
# in this sample (a plain data[:500] takes the first rows whatever their class,
# which would mislabel them and duplicate rows already present in `Ones`).
Zeroes = data[data['Class'] == 0].head(500)

In [None]:
# (rows, columns) of the fraud subset.
Ones.shape

In [None]:
# (rows, columns) of the `Zeroes` subset.
Zeroes.shape

In [None]:
# Stack the two subsets row-wise; the original row index of `data` is kept,
# which later cells rely on to look the true labels up again.
CuratedData = pd.concat(objs=[Ones, Zeroes], axis=0)

In [None]:
# Class balance of the curated sample, followed by a preview of its rows.
class_counts = CuratedData['Class'].value_counts()
print(class_counts)
CuratedData.head()

In [None]:
# Keep only the feature columns: the label ('Class') must not be visible to
# the clustering, and 'Amount' / 'Time' are left out of the feature set.
CuratedData = CuratedData.drop(columns=['Amount', 'Class', 'Time'])

In [None]:
# Feature matrix as a plain NumPy array (one row per transaction).
reduced_data = CuratedData.to_numpy()

In [None]:
# (samples, features) of the feature matrix.
reduced_data.shape

#  Keep the true labels for later evaluation

In [None]:
# Ground-truth classes for the curated rows only, aligned with CuratedData by
# row index. The full data["Class"] column has one entry per row of the whole
# dataset and therefore cannot be compared with the curated sample's clusters.
true_labels = data.loc[CuratedData.index, "Class"]


In [None]:
# Build the PCA input from CuratedData every time instead of transforming
# `reduced_data` in place: re-running this cell then always starts from the
# original features rather than from an already-reduced matrix.
# A 'Cluster' column is added to CuratedData by a later cell; it is not a
# feature, so it is excluded here (errors="ignore" covers the first run, when
# the column does not exist yet).
pca_input = CuratedData.drop(columns=["Cluster"], errors="ignore").to_numpy()

# Reducing to 10 dimensions for DBSCAN. random_state pins the result when
# scikit-learn picks its randomized SVD solver.
pca = PCA(n_components=10, random_state=42)
reduced_data = pca.fit_transform(pca_input)

#  DBSCAN Clustering

In [None]:
# Cluster the PCA-reduced features. Points that DBSCAN cannot attach to any
# cluster receive the label -1 (noise).
dbscan = DBSCAN(eps=2.0, min_samples=5)  # eps may need tuning
dbscan.fit(reduced_data)
cluster_labels = dbscan.labels_


# Add cluster labels to dataframe

In [None]:
# Attach each row's DBSCAN label (-1 = noise) as a new 'Cluster' column.
CuratedData = CuratedData.assign(Cluster=cluster_labels)

#  Evaluation (Unsupervised Metrics)

In [None]:
# Silhouette and Davies–Bouldin are computed on clustered points only, so
# DBSCAN noise (label -1) is removed first.
# NOTE: despite its name, this mask selects every non-noise point (core and
# border points alike), not just DBSCAN core samples.
core_samples_mask = cluster_labels != -1
clustered_points = reduced_data[core_samples_mask]
clustered_labels = cluster_labels[core_samples_mask]

# Both metrics require 2 <= n_clusters <= n_points - 1. The clusters must be
# counted on the noise-free labels: "one cluster + noise" has two distinct
# labels overall but only one real cluster, and scoring it raises ValueError.
n_clusters = len(set(clustered_labels))
if 2 <= n_clusters < len(clustered_labels):
    sil_score = silhouette_score(clustered_points, clustered_labels)
    db_index = davies_bouldin_score(clustered_points, clustered_labels)
    print(f"Silhouette Score: {sil_score:.4f}")
    print(f"Davies–Bouldin Index: {db_index:.4f}")
else:
    print("Not enough core points for silhouette or DB score.")


# Evaluate DBSCAN noise points (label = -1) as predicted fraud

In [None]:
# Turn the DBSCAN output into a binary prediction: noise (-1) -> 1 (fraud),
# any real cluster -> 0 (legit).
is_noise = cluster_labels == -1
detected_frauds = is_noise.astype(int)

# Ground-truth classes for exactly the rows kept in the curated sample,
# looked up in the full dataset through the preserved row index.
true_labels_curated = data.loc[CuratedData.index, 'Class']

print("\n🔍 Classification Report (Treating DBSCAN noise as predicted fraud):")
report = classification_report(true_labels_curated, detected_frauds, target_names=["Legit", "Fraud"])
print(report)

print("\n📊 Confusion Matrix:")
cm = confusion_matrix(true_labels_curated, detected_frauds)
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='coolwarm',
    xticklabels=["Pred Legit", "Pred Fraud"],
    yticklabels=["True Legit", "True Fraud"],
    ax=ax,
)
plt.show()


# Optional: 2D Visualization

In [None]:
# Project the PCA-reduced features down to two components purely for
# plotting; the clustering itself was fitted on `reduced_data`.
# random_state keeps the projection (and so the figure) identical across
# re-runs when scikit-learn picks its randomized SVD solver.
pca_2d = PCA(n_components=2, random_state=42)
vis_data = pca_2d.fit_transform(reduced_data)

# One dot per transaction, coloured by DBSCAN label (-1 = noise).
plt.figure(figsize=(10, 6))
sns.scatterplot(x=vis_data[:, 0], y=vis_data[:, 1], hue=cluster_labels, palette='tab10', legend='full', s=10)
plt.title("DBSCAN Clustering on Credit Card Data")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend(title="Cluster")
plt.show()


In [None]:
# Silhouette of the ground-truth legit / fraud split in the PCA space: it
# measures how well separated the *true* classes are, independently of the
# DBSCAN clustering. The value is printed so the cell shows its result.
Score = silhouette_score(reduced_data, true_labels_curated)
print(f"Silhouette Score (true labels): {Score:.4f}")

In [None]:
# Class balance of the full dataset (number of rows per 'Class' value).
data['Class'].value_counts()

In [None]:
# Bar chart of how many transactions DBSCAN put into each label (-1 = noise).
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=CuratedData['Cluster'], palette='Set2', ax=ax)
ax.set_title("Cluster Counts (DBSCAN Output)")
ax.set_xlabel("Cluster Label")
ax.set_ylabel("Number of Transactions")
plt.show()


In [None]:
# Share of transactions predicted legit (assigned to a cluster) versus
# predicted fraud (flagged as noise).
labels = ['Legit (Cluster)', 'Fraud (Noise)']
n_pred_legit = (detected_frauds == 0).sum()
n_pred_fraud = (detected_frauds == 1).sum()
sizes = [n_pred_legit, n_pred_fraud]
colors = ['skyblue', 'salmon']

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors)
ax.set_title("DBSCAN Predicted Fraud vs Legit Transactions")
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# Label the reduced columns PC1..PCk and plot their pairwise correlations.
component_names = [f'PC{k}' for k in range(1, reduced_data.shape[1] + 1)]
pca_df = pd.DataFrame(reduced_data, columns=component_names)
component_corr = pca_df.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(component_corr, cmap='coolwarm', annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of PCA Components")
plt.show()