In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Folder holding the cleaned CSV exports, expressed relative to the kernel's
# working directory instead of one user's absolute Windows path.
# NOTE(review): assumes the notebook runs from the project's notebooks/ folder
# (the saved-model path printed by the churn cell points there) - confirm.
DATA_DIR = Path('..') / 'data' / 'clean'

sales = pd.read_csv(DATA_DIR / 'sales_clean.csv')
customers = pd.read_csv(DATA_DIR / 'customers_clean.csv')

# Convert dates; values that cannot be parsed become NaT instead of raising.
sales['invoice_date'] = pd.to_datetime(sales['invoice_date'], errors='coerce')

# Preview the customer table. The original `customers.head` lacked the call
# parentheses and sat mid-cell, so it displayed nothing.
customers.head()


In [3]:
# Reference date: the most recent invoice date present in the sales data.
max_date = sales['invoice_date'].max()

# Build one row per customer. Each metric is produced by a built-in group
# reduction and named where it is computed, so the result no longer depends on
# a positional rename of the aggregated columns.
by_customer = sales.groupby('customer_id')
last_purchase = by_customer['invoice_date'].max()

rfm = pd.DataFrame({
    # Recency: whole days between the reference date and the last purchase,
    # computed for the whole column at once rather than per group.
    'recency': (max_date - last_purchase).dt.days,
    # Frequency: number of distinct invoices.
    'frequency': by_customer['invoice_no'].nunique(),
    # Monetary: sum of the `total` column.
    'monetary': by_customer['total'].sum(),
}).reset_index()   # columns: customer_id, recency, frequency, monetary


Customer Segmentation 

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Columns fed to the clustering model.
FEATURE_COLS = ['recency', 'frequency', 'monetary']

# Number of segments used for the final model.
# NOTE(review): 4 was hard-coded in the original; confirm it against the
# elbow curve built from `inertia` below.
N_CLUSTERS = 4

# Drop rows with a missing RFM value. The explicit .copy() makes rfm_clean an
# independent frame, so adding the 'cluster' column further down cannot raise
# pandas' SettingWithCopyWarning.
rfm_clean = rfm[FEATURE_COLS].dropna().copy()

# Scale features so that no single metric dominates the distance computation.
scaler = StandardScaler()
X = scaler.fit_transform(rfm_clean[FEATURE_COLS])

# Inertia for k = 1..7, the input for the elbow method.
K_range = range(1, 8)
inertia = [KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in K_range]

# Fit KMeans with the chosen number of clusters.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42).fit(X)

# Assign cluster labels back to rfm_clean (its row order is the row order of X).
rfm_clean['cluster'] = kmeans.labels_

# Save the clustered dataset. customer_id is re-attached through the shared
# row index so every cluster label can be traced back to a customer; the
# original file contained only the three features and the label.
clusters_out = rfm.loc[rfm_clean.index, ['customer_id']].join(rfm_clean)

# Relative to the notebook's working directory rather than a user-specific
# absolute path.
# NOTE(review): assumes the kernel runs from the notebooks/ folder - confirm.
clusters_out.to_csv(Path('..') / 'data' / 'customers_clusters.csv', index=False)


Churn Prediction

In [6]:
import os
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Step 1: Prepare data
# -----------------------------
# Requires rfm_clean (recency / frequency / monetary) from the segmentation cell.
CHURN_RECENCY_DAYS = 90   # recency above this many days is labelled as churn

features_df = rfm_clean.copy()

# Example churn label: 1 if recency > 90 days, else 0 (vectorised comparison).
features_df['churn'] = (features_df['recency'] > CHURN_RECENCY_DAYS).astype('int64')

# WARNING(review): the label is a deterministic function of `recency`, which is
# also a model feature, so the classifier only has to recover the 90-day
# threshold. The perfect scores in the classification report reflect that
# leakage, not predictive power. The feature set is kept unchanged so the saved
# model and scaler keep their three-feature input.
# TODO: derive the label from purchases made after a cutoff date and compute
# the RFM features only from data before that date.

# Features and target
X = features_df[['recency', 'frequency', 'monetary']]
y = features_df['churn']

# -----------------------------
# Step 2: Train/test split
# -----------------------------
# Split the raw features first; stratify=y keeps the share of the rare class
# the same in the training and test parts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, so test-set statistics do not leak
# into preprocessing, then apply the same transformation everywhere.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)   # kept for any later cell that expects this name

# -----------------------------
# Step 3: Train Logistic Regression
# -----------------------------
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

# -----------------------------
# Step 4: Predict & evaluate
# -----------------------------
y_pred = clf.predict(X_test)
print("Classification Report:\n")
print(classification_report(y_test, y_pred))

# -----------------------------
# Step 5: Save the model safely
# -----------------------------
# 'models' folder next to the notebook. The original r'.\models' is only a
# subfolder on Windows; elsewhere the backslash is part of the name.
model_folder = 'models'
os.makedirs(model_folder, exist_ok=True)

# Save both the model and the scaler (the scaler is needed to prepare inputs
# the same way at prediction time).
joblib.dump(clf, os.path.join(model_folder, 'churn_model.pkl'))
joblib.dump(scaler, os.path.join(model_folder, 'scaler.pkl'))

print(f"Model and scaler saved successfully in folder: {os.path.abspath(model_folder)}")


Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       206
           1       1.00      1.00      1.00     11803

    accuracy                           1.00     12009
   macro avg       1.00      1.00      1.00     12009
weighted avg       1.00      1.00      1.00     12009

Model and scaler saved successfully in folder: c:\Users\Chaim\Desktop\Master M2\Smart_Sales_Dashboard\Smart_Sales_Dashboard\notebooks\models
