In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


In [3]:
# Load the dataset
df = pd.read_csv("../data/processed/eda_data.csv")

In [5]:
# Feature Engineering
agg_df = df.groupby('CustomerId').agg({
    'Amount': ['sum', 'mean', 'std', 'max', 'min', 'count'],
    'Value': ['sum', 'mean', 'std', 'max', 'min'],
    'TransactionHour': 'nunique',
    'TransactionDay': 'nunique',
    'TransactionMonth': 'nunique'
}).reset_index()

In [6]:
# Flatten column names
agg_df.columns = ['CustomerId'] + ['_'.join(col).strip() for col in agg_df.columns[1:]]

In [7]:
num_features = [col for col in agg_df.columns if agg_df[col].dtype in ['int64', 'float64'] and col != 'CustomerId']

In [None]:

X = agg_df.drop(columns=['CustomerId'])
y = np.zeros(X.shape[0]) 

print("X columns:", X.columns.tolist())

X columns: ['Amount_sum', 'Amount_mean', 'Amount_std', 'Amount_max', 'Amount_min', 'Amount_count', 'Value_sum', 'Value_mean', 'Value_std', 'Value_max', 'Value_min', 'TransactionHour_nunique', 'TransactionDay_nunique', 'TransactionMonth_nunique']


In [9]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [None]:
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_features)
])

# Apply transformations
X_scaled = numeric_pipeline.fit_transform(X)

In [15]:
# proxy labels creation

from sklearn.cluster import KMeans
import seaborn as sns

df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'], errors='coerce')

# 📅 Define snapshot date for Recency calculation
snapshot_date = df['TransactionStartTime'].max() + pd.Timedelta(days=1)

# 🧮 Calculate RFM per CustomerId
rfm = df.groupby('CustomerId').agg({
    'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,
    'TransactionId': 'count',
    'Value': 'sum'
}).reset_index()
rfm.columns = ['CustomerId', 'Recency', 'Frequency', 'Monetary']

# 📊 Visual check (optional)
# sns.pairplot(rfm[['Recency', 'Frequency', 'Monetary']])

# 🧼 Scale RFM for clustering
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency', 'Frequency', 'Monetary']])

# 🚀 KMeans Clustering (3 segments)
kmeans = KMeans(n_clusters=3, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# 🔍 Determine high-risk cluster: lowest Frequency + Monetary, highest Recency
cluster_summary = rfm.groupby('Cluster').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': 'mean'
}).sort_values(by='Frequency')

high_risk_cluster = cluster_summary.index[0]  # assume lowest freq is highest risk
rfm['is_high_risk'] = (rfm['Cluster'] == high_risk_cluster).astype(int)

# 🧩 Save or merge this label with processed features
# e.g. rfm[['CustomerId', 'is_high_risk']] 
#merge it with df
rfm = rfm[['CustomerId', 'is_high_risk']]
df = df.merge(rfm, on='CustomerId', how='left')

# Save the processed data with proxy labels
PROCESSED_DATA_PATH = "../data/processed/eda_data_with_proxy_labels.csv"
df.to_csv(PROCESSED_DATA_PATH, index=False)

print("✅ Proxy labels created. High-risk cluster:", high_risk_cluster)
print(rfm['is_high_risk'].value_counts())

✅ Proxy labels created. High-risk cluster: 0
is_high_risk
0    2307
1    1435
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import mlflow
import mlflow.sklearn

df = pd.read_csv("../data/processed/eda_data_with_proxy_labels.csv")  # Adjust path as needed

df_final = pd.DataFrame(X_scaled)
df_final['CustomerId'] = customer_ids.values
df_final = df_final.merge(rfm[['CustomerId', 'is_high_risk']], on='CustomerId')

y = df_final['is_high_risk']
X_final = df_final.drop(columns=['CustomerId', 'is_high_risk'])

# 🔀 Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, stratify=y, random_state=42)

# 🤖 Train and Evaluate (Task 5)
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

mlflow.set_experiment("credit-risk-model")

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds)
        rec = recall_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        roc = roc_auc_score(y_test, probs)

        mlflow.log_param("model", name)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1,
            "roc_auc": roc
        })

        mlflow.sklearn.log_model(model, "model", registered_model_name=name.replace(" ", "_"))

        print(f"✅ {name} Results")
        print(classification_report(y_test, preds))

ValueError: Found input variables with inconsistent numbers of samples: [3742, 95662]