<a href="https://colab.research.google.com/github/Dee-Nwanjah/SQL-Database-Fundamental-Projects/blob/main/2.)Customer_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setups

In [46]:
# Import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
from google.colab import files
import sqlite3

RFM Analysis Setup

In [50]:
# Calculate RFM metrics using pandas
from datetime import datetime

# Define the analysis date
analysis_date = datetime(2024, 1, 31)

# Calculate Recency, Frequency, and Monetary values
rfm_df = orders_df.groupby('customer_id').agg(
    last_order_date=('order_date', 'max'),
    frequency=('order_id', 'nunique'),
    monetary=('total_amount', 'sum')
).reset_index()

# Calculate Recency in days
rfm_df['recency'] = (analysis_date - rfm_df['last_order_date']).dt.days

# Merge with customer details
rfm_df = rfm_df.merge(customers_df[['customer_id', 'first_name', 'city']], on='customer_id', how='left')

# Calculate percentile ranks
rfm_df['recency_rank'] = rfm_df['recency'].rank(pct=True, ascending=False)
rfm_df['frequency_rank'] = rfm_df['frequency'].rank(pct=True, ascending=True)
rfm_df['monetary_rank'] = rfm_df['monetary'].rank(pct=True, ascending=True)

# Display the first few rows of the RFM DataFrame
display(rfm_df.head(10))

Unnamed: 0,customer_id,last_order_date,frequency,monetary,recency,first_name,city,recency_rank,frequency_rank,monetary_rank
0,1,2024-01-22,5,1817.69,9,Customer_1,Houston,0.88191,0.51809,0.315578
1,2,2023-07-25,5,3095.58,190,Customer_2,Phoenix,0.093467,0.51809,0.680402
2,3,2023-07-21,3,1340.27,194,Customer_3,Los Angeles,0.08995,0.197487,0.18593
3,4,2024-01-07,3,977.92,24,Customer_4,Los Angeles,0.725126,0.197487,0.102513
4,5,2023-12-26,7,3696.17,36,Customer_5,Houston,0.625628,0.81608,0.807035
5,6,2023-08-06,3,667.32,178,Customer_6,Houston,0.109548,0.197487,0.048241
6,7,2023-07-31,6,2538.94,184,Customer_7,New York,0.103015,0.683417,0.534673
7,8,2024-01-22,5,4112.9,9,Customer_8,New York,0.88191,0.51809,0.859296
8,9,2023-10-17,8,3259.2,106,Customer_9,New York,0.275377,0.901005,0.725628
9,10,2023-12-01,9,4421.27,61,Customer_10,Chicago,0.461809,0.950754,0.903518


Advanced Window Functions

In [51]:
# Convert 'order_date' to datetime objects if they aren't already
orders_df['order_date'] = pd.to_datetime(orders_df['order_date'])

# Sort data by order_date for window functions
orders_df_sorted = orders_df.sort_values(by='order_date')

# Calculate Running totals and moving averages using pandas
orders_df_sorted['running_total'] = orders_df_sorted['total_amount'].cumsum()
orders_df_sorted['moving_avg_7day'] = orders_df_sorted['total_amount'].rolling(window=7).mean()

# Calculate Daily Rank (requires grouping by date first)
orders_df_sorted['daily_rank'] = orders_df_sorted.groupby(orders_df_sorted['order_date'].dt.date)['total_amount'].rank(method='dense', ascending=False)

# Display results for Running totals and moving averages
print("Running Totals, Moving Averages, and Daily Rank:")
display(orders_df_sorted[['order_date', 'total_amount', 'running_total', 'moving_avg_7day', 'daily_rank']].head(10))

# Customer lifecycle analysis using pandas
# Sort data by customer_id and order_date
customer_orders_sorted = orders_df.sort_values(by=['customer_id', 'order_date'])

# Calculate the previous order date for each customer
customer_orders_sorted['previous_order_date'] = customer_orders_sorted.groupby('customer_id')['order_date'].shift(1)

# Calculate the difference in days between the current and previous order
customer_orders_sorted['days_since_previous_order'] = (customer_orders_sorted['order_date'] - customer_orders_sorted['previous_order_date']).dt.days

# Determine customer status based on days since previous order
def get_customer_status(row):
    if pd.isna(row['previous_order_date']):
        return 'First Order'
    elif row['days_since_previous_order'] <= 30:
        return 'Regular'
    elif row['days_since_previous_order'] <= 90:
        return 'Returning'
    else:
        return 'Reactivated'

customer_orders_sorted['customer_status'] = customer_orders_sorted.apply(get_customer_status, axis=1)

# Add order sequence for each customer
customer_orders_sorted['order_sequence'] = customer_orders_sorted.groupby('customer_id').cumcount() + 1


# Display results for Customer lifecycle analysis
print("\nCustomer Lifecycle Analysis:")
display(customer_orders_sorted[['customer_id', 'order_sequence', 'order_date', 'previous_order_date', 'customer_status']].head(10))

Running Totals, Moving Averages, and Daily Rank:


Unnamed: 0,order_date,total_amount,running_total,moving_avg_7day,daily_rank
1695,2023-01-01,931.26,931.26,,1.0
3635,2023-01-01,744.19,1675.45,,5.0
2054,2023-01-01,323.12,1998.57,,7.0
2325,2023-01-01,862.78,2861.35,,3.0
567,2023-01-01,360.95,3222.3,,6.0
3623,2023-01-01,790.97,4013.27,,4.0
2875,2023-01-01,877.58,4890.85,698.692857,2.0
4403,2023-01-02,150.1,5040.95,587.098571,11.0
4404,2023-01-02,553.96,5594.91,559.922857,4.0
2832,2023-01-02,339.51,5934.42,562.264286,9.0



Customer Lifecycle Analysis:


Unnamed: 0,customer_id,order_sequence,order_date,previous_order_date,customer_status
214,1,1,2023-01-23,NaT,First Order
1661,1,2,2023-08-05,2023-01-23,Reactivated
2847,1,3,2023-09-05,2023-08-05,Returning
4048,1,4,2024-01-12,2023-09-05,Reactivated
2694,1,5,2024-01-22,2024-01-12,Regular
2646,2,1,2023-01-18,NaT,First Order
3434,2,2,2023-01-30,2023-01-18,Regular
3916,2,3,2023-02-18,2023-01-30,Regular
3722,2,4,2023-05-09,2023-02-18,Returning
3905,2,5,2023-07-25,2023-05-09,Returning


Customer Segmentation

In [49]:
# Create customer segments based on RFM scores using pandas

# Define segmentation rules based on percentile ranks
def segment_customer(row):
    if row['recency_rank'] >= 0.8 and row['frequency_rank'] >= 0.8 and row['monetary_rank'] >= 0.8:
        return 'Champions'
    elif row['recency_rank'] >= 0.6 and row['frequency_rank'] >= 0.6:
        return 'Loyal Customers'
    elif row['recency_rank'] >= 0.8 and row['frequency_rank'] <= 0.2:
        return 'New Customers'
    elif row['recency_rank'] <= 0.2 and row['frequency_rank'] >= 0.6:
        return 'At Risk'
    elif row['recency_rank'] <= 0.2 and row['frequency_rank'] <= 0.2:
        return 'Lost Customers'
    else:
        return 'Other'

# Apply the segmentation function to the rfm_df DataFrame
rfm_df['customer_segment'] = rfm_df.apply(segment_customer, axis=1)

# Display the RFM DataFrame with the new customer_segment column
display(rfm_df.head(10))

Unnamed: 0,customer_id,last_order_date,frequency,monetary,recency,first_name,city,recency_rank,frequency_rank,monetary_rank,customer_segment
0,1,2024-01-22,5,1817.69,9,Customer_1,Houston,0.88191,0.51809,0.315578,Other
1,2,2023-07-25,5,3095.58,190,Customer_2,Phoenix,0.093467,0.51809,0.680402,Other
2,3,2023-07-21,3,1340.27,194,Customer_3,Los Angeles,0.08995,0.197487,0.18593,Lost Customers
3,4,2024-01-07,3,977.92,24,Customer_4,Los Angeles,0.725126,0.197487,0.102513,Other
4,5,2023-12-26,7,3696.17,36,Customer_5,Houston,0.625628,0.81608,0.807035,Loyal Customers
5,6,2023-08-06,3,667.32,178,Customer_6,Houston,0.109548,0.197487,0.048241,Lost Customers
6,7,2023-07-31,6,2538.94,184,Customer_7,New York,0.103015,0.683417,0.534673,At Risk
7,8,2024-01-22,5,4112.9,9,Customer_8,New York,0.88191,0.51809,0.859296,Other
8,9,2023-10-17,8,3259.2,106,Customer_9,New York,0.275377,0.901005,0.725628,Other
9,10,2023-12-01,9,4421.27,61,Customer_10,Chicago,0.461809,0.950754,0.903518,Other
