<a href="https://colab.research.google.com/github/Ayan1311819/Machine_Learning/blob/main/segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import seaborn as sns
import pandas as pd

In [None]:
#OBJECTIVE
#We want to understand our customers better so we can run more targeted campaigns and increase sales. Can you help us segment our users?

In [None]:
# Read the raw transaction workbook into a DataFrame.
DATA_PATH = '/content/Online Retail.xlsx'
df = pd.read_excel(DATA_PATH)

In [None]:
# Peek at the first three rows to see what a transaction record looks like.
df.head(3)

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics; useful for spotting extreme or negative values.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Non-null count per column before any filtering is applied.
df.count()

In [None]:
# Largest quantity found on any single row.
maxq = df['Quantity'].max()

# Per-column non-null counts for the rows whose quantity is negative.
negative_quantity_rows = df.loc[df['Quantity'] < 0]
negative_quantity_rows.count()

In [None]:
# Keep only rows with a positive quantity below 80000.
# .copy() turns the filtered result into an independent frame, so columns
# assigned to it later (df['TotalPrice'] = ...) do not trigger pandas'
# SettingWithCopyWarning for writing to a slice of the raw data.
df = df[(df['Quantity'] > 0) & (df['Quantity'] < 80000)].copy()


In [None]:
# Non-null count per column after the quantity filter.
df.count()

In [None]:
# Understanding the relationship between InvoiceNo and CustomerID.
# Author's note: an invoice represents a basket of products, and each row
# holds a single product of that basket.
# Expectation to verify: a customer can have many invoices, but an invoice
# should belong to at most one customer.
customers_per_invoice = df.groupby('InvoiceNo')['CustomerID'].nunique()
invoices_per_customer = df.groupby('CustomerID')['InvoiceNo'].nunique()

# Only the last expression of a cell is rendered, so the invoice-side check is
# printed explicitly; a value greater than 1 would contradict the expectation.
print("Max customers per invoice:", customers_per_invoice.max())
invoices_per_customer

In [None]:
# Distinct invoices and distinct customers in the filtered data.
n_unique_invoices = df['InvoiceNo'].nunique()
n_unique_customers = df['CustomerID'].nunique()
print("Unique Invoices", n_unique_invoices)
print("Unique CustomerID", n_unique_customers)

In [None]:
# Non-null row counts for the two identifier columns.
n_invoice_rows = df['InvoiceNo'].count()
n_customer_rows = df['CustomerID'].count()
print("All Invoices", n_invoice_rows)
print("All CustomerID", n_customer_rows)

In [None]:
# Exploration set: a seeded 5% random sample of the rows.
df_sample = df.sample(frac=0.05, random_state=42)

# Number of distinct customers that made it into the sample.
sampled_customer_ids = df_sample["CustomerID"]
sampled_customer_ids.nunique()

In [None]:
# Pairwise relationships between the columns of the exploration sample.
sns.pairplot(data=df_sample)

In [None]:
# Number of distinct stock codes.
df['StockCode'].nunique()

In [None]:
# Number of rows with a non-null stock code.
df['StockCode'].count()

In [None]:
#Segmentation inputs: what a particular customer bought (Description, StockCode); recency, derived from InvoiceDate; and the customer's Country.

In [None]:
from datetime import datetime

# Build the per-customer RFM (Recency, Frequency, Monetary) table.

# Reference date: one day after the most recent InvoiceDate, so the most
# recent buyer gets a Recency of 1 rather than 0.
latest_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# Line revenue. assign() returns a new frame instead of writing a column into
# a possibly-sliced frame, which avoids SettingWithCopyWarning.
df = df.assign(TotalPrice=df['Quantity'] * df['UnitPrice'])

# Aggregate per customer with built-in reducers only, then derive Recency with
# one vectorised subtraction (no Python lambda executed once per group).
# Named aggregation fixes each output column's name at the point where it is
# computed, rather than relying on a positional rename afterwards.
rfm = (
    df.groupby('CustomerID')
      .agg(
          LastPurchase=('InvoiceDate', 'max'),
          Frequency=('InvoiceNo', 'nunique'),
          Monetary=('TotalPrice', 'sum'),
      )
      .assign(Recency=lambda per_customer: (latest_date - per_customer['LastPurchase']).dt.days)
      .reset_index()
      # reindex selects and orders the final columns and returns a new frame.
      .reindex(columns=['CustomerID', 'Recency', 'Frequency', 'Monetary'])
)

In [None]:
# First three rows of the RFM table.
rfm.head(3)

In [None]:
# Show the five highest-spending customers as well as the summary statistics.
# A notebook cell renders only its last expression, so the first table has to
# be displayed explicitly or it is computed and thrown away.
display(rfm.sort_values(by='Monetary', ascending=False).head())
rfm.describe()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Standardise the three RFM features before clustering.
rfm_features = rfm[['Recency', 'Frequency', 'Monetary']]
scaler = StandardScaler()
rfms = scaler.fit_transform(rfm_features)

In [None]:
# Determining K with the elbow method: fit K-Means for every candidate K and
# record the inertia of each fit.
k_values = range(1, 12)  # single definition shared by the loop and the plot
inertia = []
for i in k_values:
    # n_init is set explicitly because scikit-learn's default changed between
    # releases (10 -> 'auto'); pinning it keeps the curve comparable across
    # versions and avoids the FutureWarning emitted by intermediate releases.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(rfms)
    inertia.append(kmeans.inertia_)

plt.plot(k_values, inertia, marker='o')
plt.title('Elbow method')
plt.xlabel('K value')
plt.ylabel('inertia')
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()


In [None]:
# Final model with five clusters.
# n_init is set explicitly because scikit-learn's default changed between
# releases (10 -> 'auto'); pinning it makes the cluster assignment
# reproducible regardless of the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfms)

In [None]:
import seaborn as sns
# Recency vs. Frequency, with each customer coloured by assigned cluster.
ax = sns.scatterplot(
    data=rfm,
    x='Recency',
    y='Frequency',
    hue='Cluster',
    palette='Set1',
)
ax.set(title='Customer Segments', xlabel='Recency', ylabel='Frequency')
plt.show()


In [None]:
#Defining custom labels and using them for evaluation/training can work, and it gives you full control and interpretability.
#However, it shifts you away from pure clustering into a more supervised or constrained workflow, which may or may not be what you want.
#It also requires strong domain knowledge and more time and manual work, and it stops making sense as the number of features increases.

In [None]:
# Per-cluster profile: mean and median of each RFM feature plus cluster size.
cluster_stats = {feature: ['mean', 'median'] for feature in ('Recency', 'Frequency', 'Monetary')}
cluster_stats['CustomerID'] = 'count'

cluster_profile = rfm.groupby('Cluster').agg(cluster_stats)
cluster_profile.round(1)