Data preparation and loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the source CSV; the file is decoded as ISO-8859-1.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
csv_path = "C://Users/user/Downloads/Customer Segmentation/data.csv"
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Preview the first rows.
df.head()


Data structure

In [None]:
# Print the (rows, columns) shape explicitly: a bare `df.shape` that is not the
# last expression of a cell is evaluated but never displayed.
print(df.shape)

# Column names, non-null counts and dtypes (info() prints its own report).
df.info()


Checking for missing values

In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
missing_per_column


Data Cleaning

Delete rows without CustomerID

In [None]:
# Keep only the rows that carry a CustomerID; rows where it is missing are dropped.
has_customer_id = df['CustomerID'].notna()
df = df[has_customer_id]


Delete rows with non-positive Quantity or UnitPrice (e.g. reversals)

In [None]:
# Retain rows where both the quantity and the unit price are strictly positive.
positive_quantity = df['Quantity'] > 0
positive_price = df['UnitPrice'] > 0
df = df[positive_quantity & positive_price]


Feature Engineering

Creating a TotalPrice column

In [None]:
# Line total = quantity x unit price.
# assign() returns a new frame instead of writing a column into a frame that
# was produced by row filtering (the pattern behind pandas'
# SettingWithCopyWarning), and re-running the cell simply overwrites the column.
df = df.assign(TotalPrice=df['Quantity'] * df['UnitPrice'])


Customer-level aggregation

In [None]:
# Collapse the transaction rows into one row per CustomerID.
customer_metrics = {
    'InvoiceNo': 'nunique',   # distinct invoices per customer (Frequency)
    'TotalPrice': 'sum',      # summed line totals per customer (Total Purchase)
}
per_customer = df.groupby('CustomerID')
customer_df = per_customer.agg(customer_metrics).reset_index()


Rename columns

In [None]:
# Relabel the aggregated columns by name rather than by position, so a change
# in column order can never attach the wrong label to a column. Keys that are
# not present are ignored, which keeps the cell safe to re-run.
customer_df = customer_df.rename(columns={
    'InvoiceNo': 'Frequency',
    'TotalPrice': 'TotalPurchase',
})
customer_df.head()


Initial customer analysis

In [None]:
# describe() summary of the per-customer table.
customer_stats = customer_df.describe()
customer_stats


Visualization

In [None]:
# Frequency against total purchase, one point per row of customer_df.
fig, ax = plt.subplots()
sns.scatterplot(data=customer_df, x='Frequency', y='TotalPurchase', ax=ax)
plt.show()


Data normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the two clustering features with a default StandardScaler.
feature_columns = ['Frequency', 'TotalPurchase']
scaler = StandardScaler()
scaled_data = scaler.fit_transform(customer_df[feature_columns])


PCA

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto two principal components.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Share of the total variance carried by each component.
variance_share = pca.explained_variance_ratio_
print("Explained Variance Ratio:", variance_share)


In [None]:
# Wrap the component scores in a labelled frame, one column per component.
component_names = ['PC1', 'PC2']
pca_df = pd.DataFrame(data=pca_data, columns=component_names)


Plot of the PCA-transformed data

In [None]:
# Scatter of the two component scores on an explicit axes object.
fig, ax = plt.subplots()
ax.scatter(pca_df['PC1'], pca_df['PC2'])
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()


Elbow Method

In [None]:
from sklearn.cluster import KMeans

# Fit on the component columns only. Selecting them explicitly keeps this cell
# correct when it is re-run after a 'Cluster' column has been added to pca_df;
# passing the whole frame would then feed the labels back in as a feature.
pca_features = pca_df[['PC1', 'PC2']]

# Within-cluster sum of squares (KMeans inertia) for k = 1..10.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — consider pinning it for reproducible curves.
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(pca_features)
    wcss.append(kmeans.inertia_)


In [None]:
# Elbow curve: WCSS for each candidate number of clusters (1 through 10).
fig, ax = plt.subplots()
ax.plot(range(1, 11), wcss, marker='o')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
plt.show()


Clustering with K-Means

In [None]:
# Final model with four clusters.
# Fit on the component columns only: pca_df gains a 'Cluster' column below, so
# passing the whole frame would include the previous labels as a feature
# whenever this cell is re-run.
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(pca_df[['PC1', 'PC2']])

# Attach the labels to both frames; the array is assigned by position.
pca_df['Cluster'] = clusters
customer_df['Cluster'] = clusters


Cluster visualization

In [None]:
# Component scores coloured by assigned cluster.
fig, ax = plt.subplots()
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Cluster', palette='Set2', ax=ax)
plt.show()


Analysis of each cluster (Customer Profiling)

In [None]:
# Profile each cluster on the behavioural features only; averaging CustomerID
# (an identifier) would only add a meaningless column.
by_cluster = customer_df.groupby('Cluster')
cluster_analysis = by_cluster[['Frequency', 'TotalPurchase']].mean()

# Cluster sizes, so each mean can be read against the number of customers behind it.
cluster_analysis['Customers'] = by_cluster.size()
cluster_analysis
