# Customer Segmentation of Credit Card Users Using Machine Learning


****In this project, I use machine learning clustering algorithms to analyze the spending behavior of credit card users. By applying algorithms such as K-Means and Hierarchical Clustering, I segment customers based on transaction patterns, purchase frequency, cash advances, and credit usage. The resulting clusters reveal distinct customer groups, providing insights for better risk assessment and targeted financial strategies.****

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

from sklearn.metrics import silhouette_score


In [None]:
# Load the raw credit card dataset from the Kaggle input directory.
csv_path = "/kaggle/input/ccdata/CC GENERAL.csv"
df = pd.read_csv(csv_path)

# Show the first ten rows as a quick sanity check of the load.
df.head(10)

# Exploratory Data Analysis
 

In [None]:
# Structural overview: column labels, a separator line, then (rows, columns).
for summary in (df.columns, "-------------------------------------------", df.shape):
    print(summary)

# Per-column dtype and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().transpose()


In [None]:
# Number of distinct (non-null) values in each column, in column order.
for name, n_distinct in df.nunique().items():
    print(name, ":", n_distinct)

In [None]:
# Missing-value count per column, largest first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)


In [None]:
# Discover the outliers: count the values of each non-constant numeric
# feature that fall outside the 1.5 * IQR fences.

numerical_cols = [
    name
    for name in df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if df[name].nunique() > 1
]

outlier_summary = []
total_rows = len(df)

for col in numerical_cols:
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    spread = q3 - q1
    below_fence = df[col] < q1 - 1.5 * spread
    above_fence = df[col] > q3 + 1.5 * spread
    # Count the flagged rows straight from the boolean mask.
    n_flagged = int((below_fence | above_fence).sum())
    outlier_summary.append({
        'Feature': col,
        'Outliers Count': n_flagged,
        'Outliers %': (n_flagged / total_rows) * 100,
    })

# One row per feature, highest outlier share first.
outlier_df = pd.DataFrame(outlier_summary).sort_values(by='Outliers %', ascending=False)
outlier_df

In [None]:
# Box plot of every numeric feature on a three-column grid.
# list() yields the column names whether numerical_cols is a list of names
# or a DataFrame, so the count below is always the number of features.
feature_names = list(numerical_cols)
grid_cols = 3
# Keep the original 7-row layout for up to 21 features and add rows beyond
# that, so plt.subplot is never asked for an index outside the grid.
grid_rows = max(7, -(-len(feature_names) // grid_cols))

plt.figure(figsize=(15, 25))
for i, col in enumerate(feature_names, 1):
    plt.subplot(grid_rows, grid_cols, i)
    sns.boxplot(x=df[col])
    plt.title(col)
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the int64/float64 columns, drawn as a heatmap.
num_df = df.select_dtypes(include=['int64', 'float64'])
correlations = num_df.corr()

plt.figure(figsize=(14, 10))
sns.heatmap(correlations, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()

# preprocessing the data

In [None]:
# Handle missing values in MINIMUM_PAYMENTS and CREDIT_LIMIT.

# MINIMUM_PAYMENTS: fill gaps with the column median.
min_payments_median = df['MINIMUM_PAYMENTS'].median()
df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(min_payments_median)

# CREDIT_LIMIT: fill gaps with the first value returned by mode().
credit_limit_mode = df['CREDIT_LIMIT'].mode()[0]
df['CREDIT_LIMIT'] = df['CREDIT_LIMIT'].fillna(credit_limit_mode)

# Re-count the missing values per column.
df.isnull().sum()


In [None]:
# Handle the outliers: cap every numeric feature at its 1.5 * IQR fences.

# Column names only; the loop below reads and writes the columns through df.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # With a zero IQR both fences equal Q1, so clipping would overwrite
        # every value with that single number and turn the feature into a
        # constant. Leave such columns untouched.
        continue
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower_bound, upper_bound)


In [None]:
# Discover the outliers again: repeat the 1.5 * IQR count on every
# non-constant numeric feature of the current df.

numerical_cols = [
    name
    for name in df.select_dtypes(include=['int64', 'float64'])
    if df[name].nunique() > 1
]

outlier_summary = []
total_rows = len(df)

for col in numerical_cols:
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    low_fence, high_fence = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Rows strictly outside the fences, counted from the boolean mask.
    n_flagged = int(((values < low_fence) | (values > high_fence)).sum())
    outlier_summary.append({
        'Feature': col,
        'Outliers Count': n_flagged,
        'Outliers %': (n_flagged / total_rows) * 100,
    })

# One row per feature, highest outlier share first.
outlier_df = pd.DataFrame(outlier_summary).sort_values(by='Outliers %', ascending=False)
outlier_df

In [None]:
# Drop the CUST_ID column when it is present; errors='ignore' makes the call
# a no-op if the column has already been removed.
df = df.drop(columns=['CUST_ID'], errors='ignore')




In [None]:
# Scaling the data: standardise every column of df with StandardScaler and
# rebuild a DataFrame that carries the same column labels.

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df)

df_scaled = pd.DataFrame(scaled_values, columns=df.columns)

df_scaled.head()


# Dimensionality reduction

In [None]:
# Principal Component Analysis (PCA): project the scaled features onto
# three components labelled PC1..PC3.

n_components = 3
component_labels = [f'PC{idx}' for idx in range(1, n_components + 1)]

pca = PCA(n_components=n_components)
projected = pca.fit_transform(df_scaled)
df_pca = pd.DataFrame(projected, columns=component_labels)

df_pca.head()


# K-Means Clustering

In [None]:
# Elbow Method: fit K-Means for k = 1..10 on the PCA projection and plot the
# inertia of each fit against k.

k_values = range(1, 11)
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init='auto').fit(df_pca).inertia_
    for k in k_values
]

elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(k_values, inertia, marker='o')
elbow_ax.set_xlabel("Number of Clusters (k)")
elbow_ax.set_ylabel("Inertia")
elbow_ax.set_title("Elbow Method")
plt.show()


In [None]:
# Print the silhouette score of a K-Means fit for each k from 2 to 9.

for k in range(2, 10):
    labels = KMeans(n_clusters=k, n_init='auto', random_state=42).fit_predict(df_pca)
    score = silhouette_score(df_pca, labels)
    print(f"k={k}, Silhouette Score={score}")


In [None]:
# Predict clusters: fit the three-cluster K-Means model on the PCA projection
# and label every row.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_clusters = kmeans.fit_predict(df_pca)

# Number of rows assigned to each cluster label.
unique, counts = np.unique(kmeans_clusters, return_counts=True)
cluster_sizes = dict(zip(unique, counts))
print("Cluster distribution:", cluster_sizes)

# Silhouette score of the labelling.
score = silhouette_score(df_pca, kmeans_clusters)
print("Silhouette Score:", score)


In [None]:
# 3D scatter of the three principal components, coloured by K-Means label.

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Columns 0, 1, 2 of df_pca are used as the x, y and z coordinates.
x, y, z = (df_pca.iloc[:, axis] for axis in range(3))
scatter = ax.scatter(x, y, z, c=kmeans_clusters, cmap='viridis', s=50, alpha=0.8)

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
ax.set_title("3D Visualization of Clusters using PCA")

# One legend entry per distinct cluster colour.
handles, labels = scatter.legend_elements()
legend = ax.legend(handles, labels, title="Clusters")
ax.add_artist(legend)

plt.show()


In [None]:
# Cluster distribution and silhouette score of the K-Means labelling.
unique, counts = np.unique(kmeans_clusters, return_counts=True)
cluster_sizes = dict(zip(unique, counts))
print("Cluster distribution:", cluster_sizes)

score = silhouette_score(df_pca, kmeans_clusters)
print("Silhouette Score:", score)


In [None]:
# K-Means clustering results: attach the labels to df and profile each cluster.

df['KMeans Cluster'] = kmeans_clusters

# Mean of every feature per K-Means cluster. The 'Hierarchical Cluster' label
# column is dropped first when it exists (e.g. this cell is re-run after the
# hierarchical cell): averaging cluster ids is meaningless and would
# otherwise appear in the profile as if it were a feature.
cluster_profile1 = (
    df.drop(columns=['Hierarchical Cluster'], errors='ignore')
    .groupby('KMeans Cluster')
    .mean()
)
cluster_profile1




In [None]:

# Grouped bar chart of the K-Means profile: one group of bars per cluster.
profile_ax = cluster_profile1.plot.bar(figsize=(12, 6))
profile_ax.set_title("Cluster Profiling -  Bar Chart")
profile_ax.set_xlabel("Clusters")
profile_ax.set_ylabel("Average Value")
plt.tight_layout()
plt.show()



# Description of the most important features






****BALANCE: The amount of money a credit card user still owes to the bank. It increases with purchases or cash withdrawals and decreases when the user makes payments. It reflects the unpaid portion of the credit card at the end of the billing cycle.****



****PURCHASES: The total amount spent by the credit card user on purchases during a given period. It includes only purchases made with the card and excludes cash withdrawals or fees.****



****CASH_ADVANCE: The amount of cash a credit card user withdraws from their card. It increases the outstanding balance (BALANCE) on the card. The user receives cash to use as they wish.****



****CREDIT_LIMIT: The maximum amount a bank allows a credit card user to spend or withdraw using their card. It represents the upper limit for purchases and cash advances.
It does not mean the user has spent this amount, only the maximum allowed usage.****



****PAYMENTS: The amount the user pays to the bank to settle their card balance. It reduces the outstanding balance (BALANCE) and represents the actual repayment of the debt.****



# K-Means Profiling



****Cluster 0 represents **High Spend Active Users**. These customers make large purchase amounts and use their credit cards frequently. Their overall risk level is **low**.****

****Cluster 1 represents **Low Activity Customers**. They use their credit cards occasionally and have generally low transaction levels. Their risk level is also **low**.****

****Cluster 2 represents **Cash Advance Heavy Users**. These customers rely heavily on cash withdrawals and make very few purchases. They are considered **very high-risk** customers.****
 

# Hierarchical Clustering

In [None]:
# Hierarchical Clustering Dendrogram: Ward linkage on the PCA projection,
# drawn truncated with truncate_mode='level' and p=5.

linked = linkage(df_pca, method='ward')

dendro_fig, dendro_ax = plt.subplots(figsize=(12, 6))
dendrogram(linked, truncate_mode='level', p=5, ax=dendro_ax)
dendro_ax.set_title('Hierarchical Clustering Dendrogram')
dendro_ax.set_xlabel('Samples')
dendro_ax.set_ylabel('Distance')
plt.show()




In [None]:
# Agglomerative clustering (three clusters, Ward linkage, Euclidean metric)
# on the PCA projection, followed by cluster sizes and silhouette score.

hc_model = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
hc_clusters = hc_model.fit_predict(df_pca)

# Number of rows assigned to each cluster label.
unique, counts = np.unique(hc_clusters, return_counts=True)
cluster_sizes = dict(zip(unique, counts))
print("Cluster distribution:", cluster_sizes)

score = silhouette_score(df_pca, hc_clusters)
print("Silhouette Score:", score)


In [None]:
# Features and clustering results: store both label vectors as df columns.
label_columns = (
    ('KMeans Cluster', kmeans_clusters),
    ('Hierarchical Cluster', hc_clusters),
)
for column_name, labels in label_columns:
    df[column_name] = labels

df.head()

In [None]:
# Hierarchical clustering results: mean of every feature per cluster.

# df also carries the 'KMeans Cluster' label column at this point. It is
# dropped before grouping, because the average of cluster ids is not a
# feature and would otherwise appear in the profile and in any chart drawn
# from it.
cluster_profile2 = (
    df.drop(columns=['KMeans Cluster'], errors='ignore')
    .groupby('Hierarchical Cluster')
    .mean()
)
cluster_profile2




In [None]:

# Grouped bar chart of the hierarchical profile: one group of bars per cluster.
profile_ax = cluster_profile2.plot.bar(figsize=(12, 6))
profile_ax.set_title("Cluster Profiling -  Bar Chart")
profile_ax.set_xlabel("Clusters")
profile_ax.set_ylabel("Average Value")
plt.tight_layout()
plt.show()



# Description of the most important features






****BALANCE: The amount of money a credit card user still owes to the bank. It increases with purchases or cash withdrawals and decreases when the user makes payments. It reflects the unpaid portion of the credit card at the end of the billing cycle.****



****PURCHASES: The total amount spent by the credit card user on purchases during a given period. It includes only purchases made with the card and excludes cash withdrawals or fees.****



****CASH_ADVANCE: The amount of cash a credit card user withdraws from their card. It increases the outstanding balance (BALANCE) on the card. The user receives cash to use as they wish.****



****CREDIT_LIMIT: The maximum amount a bank allows a credit card user to spend or withdraw using their card. It represents the upper limit for purchases and cash advances.
It does not mean the user has spent this amount, only the maximum allowed usage.****



****PAYMENTS: The amount the user pays to the bank to settle their card balance. It reduces the outstanding balance (BALANCE) and represents the actual repayment of the debt.****



# Hierarchical Clustering Profiling


****Cluster 0 represents Moderate Spend Users. These customers have moderate purchase amounts, perform occasional one-off and installment purchases, and rarely withdraw cash. Their overall risk level is low.****

****Cluster 1 represents High Spend Active Users. These customers make large purchases, frequently use their credit cards, and have a high credit limit. They rarely withdraw cash. Their risk level is moderate.****

****Cluster 2 represents Cash Advance Heavy Users. These customers rely heavily on cash withdrawals, make very few purchases, and have a high credit limit. They are considered very high-risk customers.****

 # Conclusion

 
****K-Means produces more balanced and homogeneous clusters, reflected by a higher Silhouette Score (0.361). It is better for statistical consistency and clustering performance.****

****Hierarchical Clustering creates clusters that are slightly less homogeneous (Silhouette Score 0.307) but provides clearer distinctions between customer behaviors, such as separating Moderate Spend from High Spend users, which is valuable for financial analysis and risk assessment.****

****Therefore, Hierarchical Clustering is preferred for this dataset because its clusters offer more meaningful insights into customer behavior and risk, even if its Silhouette Score is slightly lower.****