# Customer Segmentation using K-means Clustering

Starter notebook template. Replace `OnlineRetail.csv` with your dataset file.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Notebook display options: show up to 50 columns and allow 200-character-wide
# output so wide DataFrames are not truncated or wrapped when previewed.
pd.options.display.max_columns = 50
pd.options.display.width = 200

In [None]:
# Read the raw transactions into `data`.
# Change the filename below if your dataset is stored under a different name.
try:
    data = pd.read_csv('OnlineRetail.csv', encoding='ISO-8859-1')
except FileNotFoundError:
    # Fall back to an empty frame so the `data.empty` guards in the following
    # cells skip their work instead of failing on an undefined variable.
    data = pd.DataFrame()
    print('Place your dataset file named OnlineRetail.csv in the same folder as this notebook or change the filename here.')

# Report the dimensions of whatever was loaded (an empty frame reports (0, 0)).
print('Rows, Columns:', data.shape)

# Preview the first five rows only when there is something to show.
if not data.empty:
    display(data.head(5))

In [None]:
# Basic info and missing values
if not data.empty:
    # DataFrame.info() prints its summary itself and returns None, so it is
    # called directly; wrapping it in display() would render a stray "None".
    data.info()
    # Summary statistics for every column (numeric and non-numeric alike).
    display(data.describe(include='all'))
    # Number of missing values per column.
    display(data.isnull().sum())

In [None]:
# Cleaning steps (examples)
# Every column-specific rule is guarded by a column-existence check so the cell
# keeps working when the template is pointed at a dataset with a different schema.
if not data.empty:
    # Drop rows with no CustomerID (they cannot be attributed to a customer).
    if 'CustomerID' in data.columns:
        data = data.dropna(subset=['CustomerID'])
    else:
        print('Column CustomerID not found; skipping the missing-CustomerID filter.')
    # Remove cancelled orders if InvoiceNo starts with 'C'
    if 'InvoiceNo' in data.columns:
        data = data[~data['InvoiceNo'].astype(str).str.startswith('C')]
    # Remove duplicates. The explicit copy makes `data` an independent frame, so
    # the column assignments below do not write into a filtered slice of the
    # original (which can raise SettingWithCopyWarning).
    data = data.drop_duplicates().copy()
    # Correct data types; unparseable dates become NaT instead of raising.
    if 'InvoiceDate' in data.columns:
        data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
    # Example engineered feature: monetary value of each transaction line.
    if {'Quantity', 'UnitPrice'}.issubset(data.columns):
        data['TotalAmount'] = data['Quantity'] * data['UnitPrice']
    display(data.head())
    print('Cleaned shape:', data.shape)

In [None]:
# Feature engineering / aggregation (example: RFM features)
# Builds one row per customer with:
#   Recency   - days between the snapshot date and the customer's latest invoice
#   Frequency - number of distinct invoices
#   Monetary  - sum of TotalAmount
# `rfm` is only created when every required column is present; the later cells
# check for its existence and report when it is unavailable.
if not data.empty:
    rfm_required_cols = {'CustomerID', 'InvoiceDate', 'InvoiceNo', 'TotalAmount'}
    rfm_missing_cols = rfm_required_cols - set(data.columns)
    if rfm_missing_cols:
        # Fail soft with a clear message instead of a KeyError inside the groupby.
        print('Cannot compute RFM features; missing columns:', sorted(rfm_missing_cols))
    else:
        # Convert CustomerID to string. IDs read from CSV are often floats
        # (the column contained NaN), which would stringify as '17850.0';
        # cast whole-number floats to int first so the label is '17850'.
        customer_ids = data['CustomerID']
        if (
            customer_ids.dtype.kind == 'f'
            and customer_ids.notna().all()
            and (customer_ids % 1 == 0).all()
        ):
            customer_ids = customer_ids.astype('int64')
        # assign() returns a new frame, so this never writes into a filtered slice.
        data = data.assign(CustomerID=customer_ids.astype(str))

        # Snapshot date: the day after the latest invoice, so the most recent
        # customer gets Recency 1. Falls back to today if every date is missing.
        latest_invoice = data['InvoiceDate'].max()
        if pd.notna(latest_invoice):
            snapshot_date = latest_invoice + pd.Timedelta(days=1)
        else:
            snapshot_date = pd.Timestamp.today()

        # Compute RFM with named aggregation so output columns are labelled
        # explicitly rather than renamed by position afterwards.
        rfm = data.groupby('CustomerID').agg(
            Recency=('InvoiceDate', lambda x: (snapshot_date - x.max()).days),
            Frequency=('InvoiceNo', 'nunique'),
            Monetary=('TotalAmount', 'sum'),
        ).reset_index()
        display(rfm.head())
        print('RFM shape:', rfm.shape)

In [None]:
# Standardize the three RFM columns into `rfm_scaled`, leaving `rfm` untouched.
# CustomerID is carried over unchanged; only the numeric features are rescaled.
if 'rfm' not in globals() or rfm.empty:
    # The RFM cell did not run or produced nothing.
    print('RFM not available to scale (check previous steps)')
else:
    rfm_feature_cols = ['Recency','Frequency','Monetary']
    scaler = StandardScaler()
    # Learn the scaling parameters from the features, then apply them to a copy.
    scaler.fit(rfm[rfm_feature_cols])
    rfm_scaled = rfm.copy()
    rfm_scaled[rfm_feature_cols] = scaler.transform(rfm[rfm_feature_cols])
    display(rfm_scaled.head())

In [None]:
# Persist the results to CSV: the per-customer RFM table when it exists,
# otherwise the cleaned transaction-level data. Nothing is written when no
# data was loaded.
output_csv = 'CustomerSegmentation_Features_AdityaRaj.csv'
if data.empty:
    print('No data to save.')
elif 'rfm' in globals() and not rfm.empty:
    # Preferred output: engineered RFM features, without the index column.
    rfm.to_csv(output_csv, index=False)
    print('Saved RFM features to', output_csv)
else:
    # Fallback output: the cleaned dataset itself.
    data.to_csv('Cleaned_OnlineRetail_AdityaRaj.csv', index=False)
    print('Saved cleaned dataset to Cleaned_OnlineRetail_AdityaRaj.csv')

## Notes and Next Steps

- Replace the placeholder filename (`OnlineRetail.csv`) in the data-loading cell with the name of your own dataset file.
- Run cells sequentially, inspect outputs, and modify cleaning rules as needed.
- After creating features and scaling, apply K-means from scikit-learn to the scaled features (`rfm_scaled[['Recency','Frequency','Monetary']]`) and choose the number of clusters using the elbow method and the silhouette score.
- Before submission: Clear all outputs (Kernel -> Restart & Clear Output) to reduce file size, then save the notebook.
- Compress the notebook file (zip) if necessary, and make sure the submitted file is smaller than 10 MB.
