In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from datetime import datetime

In [5]:
def create_rfm_features(df):
    """
    Calculate RFM (Recency, Frequency, Monetary) metrics for each customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data with the columns ``CustomerId``,
        ``TransactionId``, ``TransactionStartTime`` and ``Amount``.
        The frame is read only; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``CustomerId`` with the columns
        ``['CustomerId', 'Recency', 'Frequency', 'Monetary']`` where

        * ``Recency`` is the number of whole days between the latest
          transaction in the data and the customer's latest transaction,
        * ``Frequency`` is the count of non-null ``TransactionId`` values,
        * ``Monetary`` is the sum of ``Amount``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Parse timestamps into a working frame instead of writing them back into
    # ``df``: the caller's data must not change as a side effect of this call.
    transactions = df[['CustomerId', 'TransactionId', 'Amount']].assign(
        TransactionStartTime=pd.to_datetime(df['TransactionStartTime'])
    )

    # Snapshot date is the most recent transaction in the data
    snapshot_date = transactions['TransactionStartTime'].max()

    # Named aggregation ties each output column to its source explicitly
    # rather than relying on the positional order of an aggregation dict.
    per_customer = transactions.groupby('CustomerId').agg(
        LastTransaction=('TransactionStartTime', 'max'),
        Frequency=('TransactionId', 'count'),
        Monetary=('Amount', 'sum'),
    )

    # Vectorised recency: one subtraction over all customers instead of a
    # Python lambda evaluated once per group.
    per_customer['Recency'] = (snapshot_date - per_customer['LastTransaction']).dt.days

    return per_customer[['Recency', 'Frequency', 'Monetary']].reset_index()


In [6]:

def add_high_risk_label(df, n_clusters=3, random_state=42):
    """
    Add a binary ``is_high_risk`` column to the transactions using RFM clustering.

    Customers are summarised with ``create_rfm_features``; Frequency and
    Monetary are log-transformed, all three features are standardised, and
    the customers are clustered with K-Means. The cluster whose mean profile
    ranks worst (highest Recency, lowest Frequency, lowest Monetary) is
    flagged as high risk, and the flag is merged back onto every row of the
    corresponding customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data containing ``CustomerId`` and the columns
        read by ``create_rfm_features``. An existing ``is_high_risk`` column
        is replaced rather than duplicated.
    n_clusters : int, default 3
        Number of K-Means clusters.
    random_state : int, default 42
        Seed passed to K-Means.

    Returns
    -------
    pandas.DataFrame
        Left merge of ``df`` with the per-customer label. ``is_high_risk``
        is an integer column holding 1 for customers in the high-risk
        cluster and 0 otherwise.
    """
    features = ['Recency', 'Frequency', 'Monetary']

    # Calculate RFM metrics and work on a copy so the raw values stay intact
    rfm = create_rfm_features(df)
    rfm_processed = rfm.copy()

    # A net-negative Monetary total is floored at zero so log1p is defined;
    # log1p then compresses the long right tail of both count and amount.
    rfm_processed['Monetary'] = np.log1p(rfm_processed['Monetary'].clip(lower=0))
    rfm_processed['Frequency'] = np.log1p(rfm_processed['Frequency'])

    # Standardize features so no single metric dominates the distance
    rfm_scaled = StandardScaler().fit_transform(rfm_processed[features])

    # n_init is pinned because its default differs between scikit-learn
    # releases; leaving it implicit makes the labels version-dependent.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    rfm_processed['Cluster'] = kmeans.fit_predict(rfm_scaled)

    # Rank the cluster means so that a higher score means "worse" on each
    # axis: highest Recency, lowest Frequency, lowest Monetary.
    cluster_means = rfm_processed.groupby('Cluster')[features].mean()
    risk_score = (
        cluster_means['Recency'].rank(ascending=True)
        + cluster_means['Frequency'].rank(ascending=False)
        + cluster_means['Monetary'].rank(ascending=False)
    )
    high_risk_cluster = risk_score.idxmax()

    rfm_processed['is_high_risk'] = (
        rfm_processed['Cluster'] == high_risk_cluster
    ).astype(int)

    # Drop a label left over from a previous run; otherwise the merge would
    # emit is_high_risk_x / is_high_risk_y and the lookup below would fail.
    base = df.drop(columns=['is_high_risk'], errors='ignore')

    df_with_risk = pd.merge(
        base,
        rfm_processed[['CustomerId', 'is_high_risk']],
        on='CustomerId',
        how='left'
    )

    # Rows without a matching customer (e.g. a missing CustomerId, which the
    # group-by skips) get 0. The cast restores the integer dtype that the
    # NaN introduced by the left merge would otherwise turn into float.
    df_with_risk['is_high_risk'] = df_with_risk['is_high_risk'].fillna(0).astype(int)

    return df_with_risk


In [7]:

# Load the raw transactions, attach the proxy risk label, and save the result.
RAW_DATA_PATH = '../Data/raw/data.csv'
PROXY_OUTPUT_PATH = "Proxy.csv"

df = pd.read_csv(RAW_DATA_PATH)
df_with_risk = add_high_risk_label(df)

df_with_risk.to_csv(PROXY_OUTPUT_PATH)