---
#### Practice: Unsupervised Learning: Anomaly Detection, Clustering

##### Date: 2026-01-26

Dataset: OnlineRetail 

In [None]:
import pandas as pd

# Load the Online Retail workbook into a DataFrame.
df = pd.read_excel('Online Retail.xlsx') # !! replace with your own path !!

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

---
Task 1. EDA 

In [None]:
# EDA overview.
# A notebook cell renders only its LAST bare expression, so every intermediate
# summary is passed to display() (a built-in in Jupyter kernels); otherwise the
# head and the shape would be computed and silently discarded.
display(df.head())
df.info()  # info() prints its report itself, no display() needed
display(df.shape)
display(df.describe())
# Missing values per column: this is what motivates dropping rows without a
# CustomerID in the preprocessing step.
display(df.isnull().sum())

In [None]:
# Rows whose unit price is below zero.
df.loc[df['UnitPrice'].lt(0)]

In [None]:
# Rows whose quantity is below zero.
df.loc[df['Quantity'].lt(0)]

In [None]:
# Number of rows flagged by DataFrame.duplicated().
# (Disabled alternative check: missing values per column.)
#df.isnull().sum()
df.duplicated().sum()

In [None]:
# Distinct values present in the Country column.
df['Country'].unique()

---
Task 2. Preprocessing: clean the data and drop rows with NaN, create the TotalPrice column, and create snapshot_date (the maximum invoice date)

In [None]:
# Rows without a CustomerID cannot be attributed to any customer, so they are
# removed before building per-customer features.
# .copy() makes the result an independent frame: adding columns to it later
# (e.g. TotalPrice) then cannot trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['CustomerID']).copy()

In [None]:

# Line-level revenue: quantity multiplied by unit price.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

# Reference point for recency: the latest invoice timestamp in the data.
snapshot_date = df['InvoiceDate'].max()

In [None]:
# Show the reference date used for the recency calculation.
snapshot_date

---
Task 3. Create RFM

In [None]:
# Build the per-customer RFM table with named aggregation:
#   Recency   - whole days between snapshot_date and the customer's last invoice
#   Frequency - number of distinct invoices
#   Monetary  - sum of TotalPrice
rfm = df.groupby("CustomerID").agg(
    Recency=("InvoiceDate", lambda dates: (snapshot_date - dates.max()).days),
    Frequency=("InvoiceNo", "nunique"),
    Monetary=("TotalPrice", "sum"),
)


In [None]:
# Display the RFM table (one row per CustomerID).
rfm

---
Task 4. Scaling data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the RFM features before clustering.
RFM_FEATURES = ["Recency", "Frequency", "Monetary"]
scaler = StandardScaler()

# Select the feature columns explicitly: a later cell adds a "cluster" column
# to `rfm`, and re-running this cell must not feed those labels back into the
# scaler as a fourth feature.
rfm_scaled = scaler.fit_transform(rfm[RFM_FEATURES])


---
Task 5. Tune K-Means using the Elbow Method and Silhouette Score


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-Means for every k from 2 to 10 (inclusive) and record
# the inertia of each fit.
inertias = []

# range() excludes its upper bound, so 11 is required to actually include k=10.
K_range = range(2, 11)
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(rfm_scaled)
    inertias.append(km.inertia_)

# Plot inertia against k; the "elbow" is read off this curve.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(list(K_range), inertias, marker="o")
ax.set(xlabel="Number of clusters (k)", ylabel="Inertia", title="Elbow Method")
plt.show()

In [None]:
# Inertia values collected for each k in K_range.
inertias

In [None]:
#kmeans = KMeans(n_clusters=4, random_state=42)
#df["cluster"] = kmeans.fit_predict(rfm_scaled)

In [None]:
# Silhouette Score
from sklearn.metrics import silhouette_score

# For every candidate k in K_range, cluster the scaled RFM matrix and print
# k together with the silhouette score of the resulting labelling.
for k in K_range:
    model = KMeans(n_clusters=k, random_state=42)
    cluster_labels = model.fit_predict(rfm_scaled)
    score = silhouette_score(rfm_scaled, cluster_labels)
    print(k, score)

---
Task 6. Create final model. Add visualization

In [None]:
# Final segmentation model: 4 clusters with a fixed seed, fitted on the scaled
# RFM matrix; the resulting labels are stored next to each customer's features.
kmeans = KMeans(n_clusters=4, random_state=42).fit(rfm_scaled)
rfm["cluster"] = kmeans.labels_

In [None]:
import seaborn as sns

# Frequency vs. Monetary scatter, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=rfm,
    x='Frequency',
    y='Monetary',
    hue='cluster',
    palette='viridis',
    ax=ax,
)
ax.set_title('Візуалізація сегментів клієнтів (k=4)')
plt.show()

---
Task 7. Final result and proposed solution

In [None]:
# Per-cluster profile: mean of every RFM column within each cluster.
rfm.groupby('cluster').agg('mean')

---
Task 8. Anomaly detection. Scaling data

In [None]:
# Transaction-level columns used as inputs for anomaly detection.
ANOMALY_COLUMNS = ["Quantity", "UnitPrice", "TotalPrice"]
features = df.loc[:, ANOMALY_COLUMNS]

In [None]:
# Standardize the transaction-level features with a DEDICATED scaler.
# Re-fitting the shared `scaler` here would overwrite the mean/scale it learned
# on the RFM table, so it could no longer be used to transform or
# inverse-transform RFM values (e.g. cluster centroids).
anomaly_scaler = StandardScaler()
X_scaled = anomaly_scaler.fit_transform(features)


---
Task 9. Isolation Forest. Create model with contamination = 0.01

In [None]:
from sklearn.ensemble import IsolationForest
# you model
# Isolation Forest with contamination = 0.01, as the task requires.
# The hyper-parameters are written as literals: the original referenced the
# undefined names n_estimators / contamination / random_state (NameError).
iso = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,  # fixed seed for reproducible results
    n_jobs=-1,
)

# The model must be fitted before it can score anything. fit_predict returns
# -1 for anomalies and 1 for inliers; the labels are stored on `df` because
# the final-result cells read df["IF_Anomaly"].
df["IF_Anomaly"] = iso.fit_predict(X_scaled)

# Continuous anomaly score (lower = more abnormal), kept alongside the label.
df["IF_Score"] = iso.decision_function(X_scaled)

---
Task 10. LOF. Create model with n_neighbors = 20 and contamination=0.01

In [None]:
from sklearn.neighbors import LocalOutlierFactor
# Local Outlier Factor with n_neighbors = 20 and contamination = 0.01.
# novelty is left at its default (False): with novelty=True the estimator does
# not expose fit_predict and is meant for scoring NEW data only, whereas here
# the outliers are looked for inside the training data itself.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)

# fit_predict returns -1 for outliers and 1 for inliers; the labels are stored
# on `df` because the final-result cells read df["LOF_Anomaly"].
df["LOF_Anomaly"] = lof.fit_predict(X_scaled)

---
Task 11. Final result

In [None]:
# Plot a random subset of transactions coloured by the Isolation Forest label.
# - random_state pins the subset so the figure is identical on every run;
# - min(...) guards against a frame with fewer than 5000 rows, for which
#   sample(5000) would raise.
plot_sample = df.sample(min(5000, len(df)), random_state=42)
sns.scatterplot(data=plot_sample, x="UnitPrice", y="Quantity", hue="IF_Anomaly")
plt.title("Isolation Forest Anomalies")
plt.show()


In [None]:
# Share of rows labelled -1 by Isolation Forest.
df["IF_Anomaly"].eq(-1).mean()

In [None]:
# Number of rows labelled -1 by BOTH Isolation Forest and LOF.
flagged_by_if = df["IF_Anomaly"].eq(-1)
flagged_by_lof = df["LOF_Anomaly"].eq(-1)
(flagged_by_if & flagged_by_lof).sum()

In [None]:
# Sensitivity check: for each contamination setting, print the setting and the
# share of rows the fitted Isolation Forest labels as -1.
for c in (0.005, 0.01, 0.02):
    iso = IsolationForest(contamination=c, random_state=42)
    preds = iso.fit_predict(X_scaled)
    flagged_share = (preds == -1).mean()
    print(c, flagged_share)


In [None]:
# Sensitivity check: for each neighbourhood size, print the size and the share
# of rows LOF labels as -1 (contamination fixed at 0.01).
for k in (10, 20, 50):
    lof = LocalOutlierFactor(n_neighbors=k, contamination=0.01)
    preds = lof.fit_predict(X_scaled)
    flagged_share = (preds == -1).mean()
    print(k, flagged_share)


In [None]:
# Number of rows flagged by Isolation Forest as a function of contamination.
scores = []
conts = [0.005, 0.01, 0.02]

for c in conts:
    # random_state is fixed so the counts are reproducible and consistent with
    # the other Isolation Forest runs in this notebook, which also use seed 42.
    iso = IsolationForest(contamination=c, random_state=42)
    preds = iso.fit_predict(X_scaled)
    scores.append((preds == -1).sum())

plt.plot(conts, scores, marker="o")
plt.xlabel("Contamination")
plt.ylabel("Number of flagged rows")
plt.title("Anomalies vs Contamination")
plt.show()
