In [None]:
#IMPORTING LIBRARIES FOR HANDLING AND GENERATING FAKE DATAFRAME
import numpy as np
import pandas as pd
import random
from faker import Faker

In [1]:
#GENERATING FAKE DATA
# Seed every random source used below (stdlib `random`, NumPy's global RNG and
# Faker's shared generator) so that Restart & Run All rebuilds the same draws.
# NOTE(review): the dates are requested relative to 'today', so they presumably
# still shift with the day the notebook is executed — confirm if that matters.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)
fake = Faker()

num_customers = 2500
max_transactions_per_customer = 20

# Customer ids look like CUST000001 ... CUST002500.
customers = [f"CUST{str(i).zfill(6)}" for i in range(1, num_customers + 1)]

# Collect plain dict records first and build the DataFrame once at the end;
# the list gets its own name so `transaction_data` is only ever a DataFrame.
transaction_records = []
for customer_id in customers:
    # Each customer gets between 1 and max_transactions_per_customer rows.
    num_transactions = random.randint(1, max_transactions_per_customer)
    for _ in range(num_transactions):
        transaction_records.append({
            "customer_id": customer_id,
            # Running counter over all records keeps transaction ids unique.
            "transaction_id": f"TRANS{str(len(transaction_records) + 1).zfill(6)}",
            "transaction_amount": np.random.uniform(10, 1000),
            "transaction_type": random.choice(["deposit", "withdrawal"]),
            "transaction_date": fake.date_between(start_date='-1y', end_date='today'),
        })

transaction_data = pd.DataFrame(transaction_records)


In [None]:
#DISPLAY THE DATAFRAME
# Preview the first rows of the generated transactions as a quick sanity check.
transaction_data.head()

In [None]:
#CHECKING THE TOTAL TRANSACTIONS AND COUNTING THE UNIQUE CUSTOMERS
# Compute both figures first, then report them.
row_count = len(transaction_data)
customer_count = transaction_data["customer_id"].nunique()

print(f"Total transactions: {row_count}")
print(f"Unique customers: {customer_count}")

In [None]:
#CHECKING THE NULL VALUES
# Count the missing entries in each column of the raw transactions.
transaction_data.isna().sum()

In [None]:
#CHECKING DUPLICATE VALUES
# Count the rows flagged by DataFrame.duplicated(), i.e. full-row repeats.
transaction_data.duplicated().sum()

In [None]:
#CHECK THE INFO OF DATAFRAME
# Print the structural summary of the frame (columns, non-null counts, dtypes).
transaction_data.info()

In [None]:
#DESCRIBE THE DATAFRAME
# Summary statistics of the transactions via DataFrame.describe().
transaction_data.describe()

In [None]:
#CHECKING THE COUNT OF EVERY COLUMN
# Number of non-null entries in each column.
transaction_data.count()

In [None]:
#CHECK THE DATATYPE OF THE DATAFRAME
# Inspect column dtypes before any conversion is applied.
transaction_data.dtypes

In [None]:
#CHECK THE VALUE COUNTS IN THE TRANSACTION TYPE COLUMN
# How many rows carry each transaction_type value.
transaction_data['transaction_type'].value_counts()

In [11]:
#IMPORTING LIBRARIES FOR EDA
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
#CHECKING THE OUTLIERS
# Select the numeric columns explicitly rather than depending on how seaborn
# treats the id / type / date columns when it is handed the whole frame.
numeric_transactions = transaction_data.select_dtypes(include="number")

plt.figure(figsize=(6, 6))
sns.boxplot(data=numeric_transactions)
# Title and axis label so the figure can be read on its own.
plt.title("Outlier check: numeric transaction columns")
plt.ylabel("Value")
# plt.show() ends the cell so the Axes repr is not echoed as cell output.
plt.show()

In [14]:
#CONVERTING THE COLUMN TO CORRECT DATATYPE
# Parse transaction_date with pd.to_datetime so the column holds pandas
# datetimes instead of the values produced by Faker in the generation cell.
transaction_data['transaction_date'] = pd.to_datetime(transaction_data['transaction_date'])

In [16]:
#CONVERTING THE RUPEES TO DOLLAR
# Fixed divisor applied to every amount (treated as rupees per dollar here).
INR_PER_USD = 86.66

# Keep the unconverted amounts the first time this cell runs. Re-running the
# cell then converts from those originals again instead of dividing an
# already-converted column a second time.
if "transaction_amount_inr" not in transaction_data.columns:
    transaction_data["transaction_amount_inr"] = transaction_data["transaction_amount"]

transaction_data["transaction_amount"] = transaction_data["transaction_amount_inr"] / INR_PER_USD


In [17]:
#ROUNDING THE COLUMN VALUES
# Round the amounts to one decimal place, then store them back as floats.
rounded_amount = transaction_data["transaction_amount"].round(decimals=1)
transaction_data["transaction_amount"] = rounded_amount.astype(float)

In [None]:
#DISPLAY THE DATAFRAME
# Preview the frame again after the date conversion, currency conversion and rounding.
transaction_data.head()

In [19]:
#CREATING NEW COLUMNS
# One amount column per transaction type: withdrawal_Value and deposit_Value.
elements = ['withdrawal', 'deposit']

for element in elements:
    # True on the rows whose transaction_type equals the current element.
    matches_element = transaction_data['transaction_type'] == element
    # Keep the amount on matching rows and write 0.0 everywhere else, so the
    # new columns never contain NaN and can be summed per customer directly.
    transaction_data[f'{element}_Value'] = transaction_data['transaction_amount'].where(matches_element, 0.0)

In [20]:
#FILLING THE NULLS WITH 0 SO THE VALUES CAN BE SUMMED
# Fill only the two derived amount columns. A frame-wide fill would also write
# 0 into any missing customer id, transaction type or date and hide the gap.
value_columns = ["withdrawal_Value", "deposit_Value"]
transaction_data[value_columns] = transaction_data[value_columns].fillna(0)

In [None]:
#GROUPING THE TRANSACTIONS PER CUSTOMER TO BUILD THE MODEL FEATURES
# One row per customer_id: transaction count, total amount, number of deposits
# and withdrawals, and the summed amount of each type.
per_customer = transaction_data.groupby("customer_id")

customer_data = per_customer.agg(
    total_transactions=pd.NamedAgg(column="transaction_id", aggfunc="count"),
    total_amount=pd.NamedAgg(column="transaction_amount", aggfunc="sum"),
    num_deposits=pd.NamedAgg(
        column="transaction_type",
        aggfunc=lambda kinds: kinds.eq("deposit").sum(),
    ),
    num_withdrawals=pd.NamedAgg(
        column="transaction_type",
        aggfunc=lambda kinds: kinds.eq("withdrawal").sum(),
    ),
    withdrawals_amount=pd.NamedAgg(column="withdrawal_Value", aggfunc="sum"),
    deposits_amount=pd.NamedAgg(column="deposit_Value", aggfunc="sum"),
)
# Turn the customer_id index back into a regular column.
customer_data = customer_data.reset_index()
customer_data.head()

In [None]:
#CHECK THE DATATYPES AGAIN AFTER GROUPING
# Inspect the dtypes of the per-customer aggregate columns.
customer_data.dtypes

In [None]:
#CHECKING THE NULL VALUES AFTER GROUPING
# Count the missing entries per column of the aggregated frame.
customer_data.isna().sum()

In [None]:
# SELECT TOP 10 CUSTOMERS BASED ON TOTAL AMOUNT IN DESCENDING ORDER
ranking_columns = ["customer_id", "total_transactions", "total_amount"]
top_10 = (
    customer_data[ranking_columns]
    .sort_values(by="total_amount", ascending=False)
    .head(10)
)

# Horizontal bars: one bar per customer, bar length is the total amount.
plt.figure(figsize=(13, 5))
ax = sns.barplot(data=top_10, x="total_amount", y="customer_id", orient='h')
ax.set_xlabel("Total Amount")
ax.set_ylabel("Customer ID")
ax.set_title("Top 10 Customers by Total Amount")
plt.show()

In [None]:
#SELECT TOP 10 CUSTOMERS BASED ON WITHDRAWALS AMOUNT
amount_columns = ["customer_id", "withdrawals_amount", "deposits_amount"]
top_10 = (
    customer_data[amount_columns]
    .sort_values(by="withdrawals_amount", ascending=False)
    .head(10)
)

plt.figure(figsize=(13, 5))
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue` — confirm against the installed version before changing the call.
ax = sns.barplot(data=top_10, x="customer_id", y="withdrawals_amount", palette="coolwarm")
ax.set_title("Top 10 Customers by Withdrawals Amount", fontsize=16, fontweight="bold")
ax.set_xlabel("Customer ID", fontsize=14)
ax.set_ylabel("Withdrawals Amount", fontsize=14)
plt.show()

In [None]:
# CREATE A DISTRIBUTION PLOT FOR TOTAL AMOUNT WITH KDE
# Distribution of each customer's total_amount with a KDE curve overlaid.
sns.displot(customer_data, x="total_amount", kde=True)
plt.show()


In [None]:
# SELECT TOP 10 CUSTOMERS BASED ON DEPOSITS AMOUNT
amount_columns = ["customer_id", "withdrawals_amount", "deposits_amount"]
top_10 = (
    customer_data[amount_columns]
    .sort_values(by="deposits_amount", ascending=False)
    .head(10)
)

plt.figure(figsize=(13, 5))
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue` — confirm against the installed version before changing the call.
ax = sns.barplot(data=top_10, x="customer_id", y="deposits_amount", palette="Greens")
ax.set_title("Top 10 Customers by Deposits Amount", fontsize=16, fontweight="bold")
ax.set_xlabel("Customer ID", fontsize=14)
ax.set_ylabel("Deposits Amount", fontsize=14)
plt.show()

In [None]:
#CHECKING OUTLIERS FOR ALL THE NUMERIC COLUMNS
# Later cells add a numeric `cluster` label column to customer_data. Drop it
# (when present) so re-running this cell does not plot cluster ids as if they
# were a feature; on a fresh run the column does not exist yet.
numeric_customer_columns = (
    customer_data
    .select_dtypes(include="number")
    .drop(columns=["cluster"], errors="ignore")
)

plt.figure(figsize=(16, 5))
sns.boxplot(data=numeric_customer_columns)
# Title and axis label so the figure can be read on its own.
plt.title("Outlier check: per-customer features")
plt.ylabel("Value")
# plt.show() ends the cell so the Axes repr is not echoed as cell output.
plt.show()

In [None]:
#CHECKING THE CORRELATION FOR NUMERIC COLUMNS USING HEATMAP
corr_columns = [
    "total_transactions",
    "total_amount",
    "num_deposits",
    "num_withdrawals",
    "withdrawals_amount",
    "deposits_amount",
]
# Pairwise correlation between the per-customer features.
correlation_matrix = customer_data[corr_columns].corr()

plt.figure(figsize=(10, 8))
# Annotate every cell with its coefficient formatted to two decimals.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix")
plt.show()

In [None]:
#IMPORTING ALL THE LIBRARIES FOR TRAINING AND EVALUATING THE MODELS
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA


In [None]:
#SCALING THE TRAINING FEATURE COLUMNS
# Columns that are fed to the clustering models.
features = [
    "total_transactions",
    "total_amount",
    "num_deposits",
    "num_withdrawals",
    "withdrawals_amount",
    "deposits_amount",
]

# Fit the StandardScaler on the feature columns, then transform the same data.
feature_frame = customer_data[features]
scaler = StandardScaler().fit(feature_frame)
scaled_features = scaler.transform(feature_frame)

# Same values as scaled_features, wrapped in a DataFrame with the feature names.
scaled_customer_data = pd.DataFrame(data=scaled_features, columns=features)
scaled_customer_data.head()

In [None]:
#TRAINING THE MODEL USING KMEANS ALGORITHM
# Baseline model with two clusters; fit first, then read the labels it assigned.
kmeans = KMeans(n_clusters=2, random_state=42).fit(scaled_customer_data)
customer_data["cluster"] = kmeans.labels_

customer_data.head()

In [None]:
#HYPERTUNING THE CENTROID BASED K MEANS CLUSTERING MODEL
def _silhouette_scorer(estimator, X, y=None):
    """Score a fitted clusterer by the silhouette of the labels it predicts for X.

    GridSearchCV calls this with the estimator fitted on the training folds and
    X set to the held-out fold. Without an explicit scorer the search falls back
    to KMeans.score, which keeps improving as n_clusters grows and therefore
    tends to pick the largest value in the grid.

    Returns the silhouette score (higher is better), or -1.0 when fewer than two
    distinct labels are predicted and the silhouette is undefined.
    """
    labels = estimator.predict(X)
    if len(np.unique(labels)) < 2:
        return -1.0
    return silhouette_score(X, labels)


kmeans = KMeans(random_state=42)

param_grid = {
    'n_clusters': np.arange(2, 11),
    'init': ['k-means++', 'random'],
    'max_iter': [300, 500, 1000],
    'n_init': [10, 15],
    'tol': [1e-4, 1e-3],
}

# Search on the feature columns only, so a `cluster` column that a later cell
# adds to scaled_customer_data cannot leak in when this cell is re-run.
grid_search = GridSearchCV(
    kmeans,
    param_grid,
    scoring=_silhouette_scorer,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(scaled_customer_data[features])

print("Best Parameters:", grid_search.best_params_)

# GridSearchCV refits the best configuration on the full data (refit=True by
# default), so the tuned, fitted model is available as best_estimator_.
best_kmeans = grid_search.best_estimator_
customer_data["cluster"] = best_kmeans.labels_

# GridSearchCV only fits clones, so the `kmeans` object created above is still
# unfitted here. Point the name at the tuned model so later cells that read
# `kmeans.cluster_centers_` or persist `kmeans` get a fitted estimator.
kmeans = best_kmeans



In [None]:
# DISPLAY THE DATAFRAME AFTER CLUSTERING
# Preview the per-customer frame, now including the `cluster` label column.
customer_data.head()

In [None]:
#CHECK THE SCORE FOR KMEANS MODEL
# Both metrics compare the scaled features with the labels currently stored
# in customer_data["cluster"].
kmeans_labels = customer_data["cluster"]

silhouette = silhouette_score(scaled_features, kmeans_labels)
db_index = davies_bouldin_score(scaled_features, kmeans_labels)

print(f"Silhouette Score: {silhouette}")
print(f"Davies-Bouldin Index: {db_index}")

In [None]:
#DISPLAY THE PCA PLOT KMEANS MODEL
# Project the scaled features onto two principal components for plotting.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(scaled_features)

# The centroids live in the same scaled feature space as the data, so they go
# through the same PCA projection before being drawn; their first two raw
# coordinates are not PCA coordinates. They are taken from best_kmeans, the
# model that produced the labels in customer_data["cluster"].
centers_pca = pca.transform(best_kmeans.cluster_centers_)

plt.scatter(data_pca[:, 0], data_pca[:, 1], c=customer_data["cluster"], cmap='viridis', s=50)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='X', s=200)
plt.title('Cluster Visualization (PCA)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

In [30]:
#IMPORTING JOBLIB TO SAVE THE MODEL
import joblib

In [None]:
#SAVE THE MODEL THAT WAS SCORED
# Persist best_kmeans, the fitted model whose labels were evaluated above.
# After the tuning cell `kmeans` refers to the unfitted search template, so
# dumping it would store a model that cannot predict.
# NOTE(review): the destination is a hard-coded absolute Windows path — confirm
# it exists on the machine running the notebook.
joblib.dump(best_kmeans, r"d:\BANK PROJECT\kmeans.pkl")

In [None]:
#SAVE THE SCALED DATA
# Persist the scaled feature DataFrame with joblib.
# NOTE(review): the destination is a hard-coded absolute Windows path, and the
# fitted `scaler` itself is not saved in the visible cells — confirm that is intended.
joblib.dump(scaled_customer_data,r"d:\BANK PROJECT\scaled d.pkl")

In [84]:
#TRAINING THE DENSITY BASED DBSCAN MODEL
# Fit DBSCAN on the scaled features, then store the labels it assigned.
# This overwrites the labels previously held in customer_data["cluster"].
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(scaled_features)
customer_data["cluster"] = dbscan.labels_

In [None]:
#CHECK THE SCORE FOR DBSCAN MODEL
# DBSCAN labels noise points as -1. Noise is not a cluster, so it is left out
# both when counting clusters and when computing the metrics; otherwise a
# result of "one cluster plus noise" would be scored as if it had two clusters.
dbscan_labels = customer_data["cluster"].to_numpy()
is_clustered = dbscan_labels != -1
n_dbscan_clusters = len(set(dbscan_labels[is_clustered]))
print(f"Clusters found: {n_dbscan_clusters}, noise points: {int((~is_clustered).sum())}")

if n_dbscan_clusters > 1:
    silhouette = silhouette_score(scaled_features[is_clustered], dbscan_labels[is_clustered])
    db_index = davies_bouldin_score(scaled_features[is_clustered], dbscan_labels[is_clustered])
    print(f"Silhouette Score: {silhouette}")
    print(f"Davies-Bouldin Index: {db_index}")
else:
    print("DBSCAN found less than two clusters; Silhouette Score is not defined.")

In [None]:
#DISPLAY THE PCA PLOT FOR DBSCAN MODEL
# Reduce the scaled features to two principal components for plotting.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(scaled_features)
component_1, component_2 = data_pca[:, 0], data_pca[:, 1]

# Colour every customer by the label stored in customer_data["cluster"].
plt.scatter(component_1, component_2, c=customer_data["cluster"], cmap='viridis', s=50)
plt.title('Cluster Visualization (DBSCAN, PCA)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

In [None]:
#TRAINING THE AGGLOMERATIVE CLUSTERING
hc = AgglomerativeClustering(n_clusters=2)
# Fit on the feature columns only. This cell adds a `cluster` column to
# scaled_customer_data, so fitting on the whole frame would feed the previous
# labels back in as an input feature whenever the cell is run again.
scaled_customer_data['cluster'] = hc.fit_predict(scaled_customer_data[features])

# Ward-linkage dendrogram of the same features. Leaf labels are hidden because
# there is one leaf per customer; plt.show() ends the cell so the dictionary
# returned by dendrogram() is not printed as cell output.
sch.dendrogram(sch.linkage(scaled_customer_data[features], method='ward'), no_labels=True)
plt.title('Dendrogram (Ward linkage)')
plt.xlabel('Customers')
plt.ylabel('Distance')
plt.show()

In [None]:
#CHECK THE SCORE FOR AGGLOMERATIVE CLUSTERING
# The agglomerative labels are stored on scaled_customer_data, not customer_data.
agglomerative_labels = scaled_customer_data["cluster"]

silhouette = silhouette_score(scaled_features, agglomerative_labels)
db_index = davies_bouldin_score(scaled_features, agglomerative_labels)

print(f"Silhouette Score: {silhouette}")
print(f"Davies-Bouldin Index: {db_index}")

In [None]:
#DISPLAY THE PCA PLOT FOR AGGLOMERATIVE CLUSTERING
# Reduce the scaled features to two principal components for plotting.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(scaled_features)
component_1, component_2 = data_pca[:, 0], data_pca[:, 1]

# Colour every customer by the agglomerative label stored on scaled_customer_data.
plt.scatter(component_1, component_2, c=scaled_customer_data["cluster"], cmap='viridis', s=50)
plt.title('Cluster Visualization (PCA)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()