In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [7]:
# Read the processed EDA output that the per-customer features are built from
df = pd.read_csv("../data/processed/eda_data.csv")

# Feature engineering: collapse transactions to one row per customer.
# Mapping is source column -> statistic(s) computed within each CustomerId group.
agg_spec = {
    'Amount': ['sum', 'mean', 'std', 'max', 'min', 'count'],
    'Value': ['sum', 'mean', 'std', 'max', 'min'],
    'TransactionHour': 'nunique',
    'TransactionDay': 'nunique',
    'TransactionMonth': 'nunique',
}
agg_df = df.groupby('CustomerId').agg(agg_spec).reset_index()

# The aggregation yields two-level column labels (column, statistic);
# collapse every label after the id column into a single "column_statistic" name.
flat_names = ['CustomerId']
for level_pair in agg_df.columns[1:]:
    flat_names.append('_'.join(level_pair).strip())
agg_df.columns = flat_names

customer_ids = agg_df['CustomerId']

# Model matrix: every aggregated feature, without the identifier
X = agg_df.drop(columns=['CustomerId'])

# Names of the int64 / float64 feature columns (the identifier is already excluded from X)
num_features = [col for col in X.columns if X[col].dtype in ('int64', 'float64')]

# All-zero placeholder target, one entry per customer
y = np.zeros(len(X))

print("X columns:", X.columns.tolist())

# Fill missing values with the column median, then standardise each feature
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Fit the pipeline on X and transform it in one pass
X_scaled = numeric_pipeline.fit_transform(X)

print("Transformed X shape:", X_scaled.shape)

print("Feature extraction complete. Processed data ready for modeling.")

X columns: ['Amount_sum', 'Amount_mean', 'Amount_std', 'Amount_max', 'Amount_min', 'Amount_count', 'Value_sum', 'Value_mean', 'Value_std', 'Value_max', 'Value_min', 'TransactionHour_nunique', 'TransactionDay_nunique', 'TransactionMonth_nunique']
Transformed X shape: (3742, 14)
Feature extraction complete. Processed data ready for modeling.


In [8]:
# Task 4: proxy label creation
# Builds a binary `is_high_risk` proxy target: customers are clustered on
# Recency / Frequency / Monetary (RFM) and the cluster with the lowest mean
# Frequency is flagged as high risk. The label is merged back onto `df` and
# the labelled frame is written to PROCESSED_DATA_PATH.
from sklearn.cluster import KMeans
import seaborn as sns

# Unparseable timestamps are coerced to NaT rather than raising here
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'], errors='coerce')

# Snapshot date for Recency: one day after the latest transaction, so the
# most recently active customer gets Recency == 1 rather than 0
snapshot_date = df['TransactionStartTime'].max() + pd.Timedelta(days=1)

# RFM per CustomerId (named aggregation keeps output names next to their definitions)
rfm = df.groupby('CustomerId').agg(
    Recency=('TransactionStartTime', lambda x: (snapshot_date - x.max()).days),
    Frequency=('TransactionId', 'count'),
    Monetary=('Value', 'sum'),
).reset_index()

# A customer whose timestamps were all coerced to NaT ends up with a NaN
# Recency; fail here with a clear message instead of deep inside KMeans.
if rfm['Recency'].isna().any():
    raise ValueError(
        "Recency could not be computed for "
        f"{int(rfm['Recency'].isna().sum())} customer(s): "
        "TransactionStartTime has no parseable value for them."
    )

# Scale RFM so no single dimension dominates the Euclidean distances
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency', 'Frequency', 'Monetary']])

# KMeans clustering (3 segments). n_init is pinned explicitly because the
# library default changed between scikit-learn releases; without it the
# clusters (and therefore the proxy label) depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# Per-cluster RFM means, ordered by ascending mean Frequency.
# NOTE: only Frequency drives the selection below; Recency and Monetary are
# kept in the summary for inspection but do not influence the choice.
cluster_summary = (
    rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']]
    .mean()
    .sort_values(by='Frequency')
)

high_risk_cluster = cluster_summary.index[0]  # assume lowest freq is highest risk
rfm['is_high_risk'] = (rfm['Cluster'] == high_risk_cluster).astype(int)

# Keep only the label and attach it to every transaction of the customer.
# Any `is_high_risk` column left over from a previous run of this cell is
# dropped first, so re-running does not create is_high_risk_x / _y columns.
rfm = rfm[['CustomerId', 'is_high_risk']]
df = (
    df.drop(columns=['is_high_risk'], errors='ignore')
    .merge(rfm, on='CustomerId', how='left', validate='many_to_one')
)

# Save the processed data with proxy labels
PROCESSED_DATA_PATH = "../data/processed/eda_data_with_proxy_labels.csv"
df.to_csv(PROCESSED_DATA_PATH, index=False)

print("Proxy labels created. High-risk cluster:", high_risk_cluster)
print(rfm['is_high_risk'].value_counts())

Proxy labels created. High-risk cluster: 0
is_high_risk
0    2307
1    1435
Name: count, dtype: int64
