# Customer Segmentation using K-means Clustering

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [27]:
df = pd.read_excel("Online Retail.xlsx")

In [28]:
print(df.head())          
print(df.info())          
print(df.describe())      
print("Missing values:\n", df.isnull().sum())

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       -----------

In [29]:
df = df.dropna()

df = df.drop_duplicates()
if 'InvoiceDate' in df.columns:
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
if 'Quantity' in df.columns and 'UnitPrice' in df.columns:
    df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]

In [30]:
num_cols = df.select_dtypes(include=['int64','float64']).columns
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

cat_cols = df.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col].astype(str))

In [31]:
if 'Quantity' in df.columns and 'UnitPrice' in df.columns:
    df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

selected_features = df[num_cols.tolist() + ['TotalPrice']] if 'TotalPrice' in df.columns else df[num_cols]

print("Final dataset shape:", df.shape)
print("Selected features:\n", selected_features.head())

Final dataset shape: (392692, 9)
Selected features:
    Quantity  UnitPrice  CustomerID  TotalPrice
0 -0.039446  -0.025893    1.495244    0.001021
1 -0.039446   0.011873    1.495244   -0.000468
2 -0.028365  -0.016901    1.495244    0.000479
3 -0.039446   0.011873    1.495244   -0.000468
4 -0.039446   0.011873    1.495244   -0.000468
