# KYC Funnel Analysis: Data Cleaning & Exploratory Analysis

Overview:
- Purpose: clean raw KYC exports, create binary flags for common failure modes, and prepare features for modeling and cohort analysis.
- Steps: load data, apply heuristics for fraud/quality/tech failures, impute missing categorical fields, create combined features, and build a simple baseline model.
- Notes: keep transformations transparent — these flags are useful both for analysis and as model inputs.

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, TargetEncoder

from sklearn.metrics import roc_auc_score, classification_report

from sklearn.inspection import permutation_importance

In [None]:
# Read both KYC exports, then left-join the detail rows onto the summary rows
# by user_reference so every summary row is kept even without a matching detail.
kyc_summary = pd.read_csv('KYC_summary.csv')
kyc_details = pd.read_csv('KYC_details.csv')

df = kyc_summary.merge(kyc_details, how='left', on='user_reference')

In [None]:
# Parse the date column into datetimes (overwrites the column in place).
df['date_'] = pd.to_datetime(df['date_'])

# Binary target: 1 when decision_type is PASSED or APPROVED, 0 otherwise.
passing_decisions = ['PASSED', 'APPROVED']
df['is_pass'] = df['decision_type'].isin(passing_decisions).astype(int)

In [None]:
# Anomalies: rows whose decision is PASSED/APPROVED although no watchlist
# screening decision was recorded. They are removed from the working copy.
passed_mask = df['decision_type'].isin(['PASSED', 'APPROVED'])
no_watchlist_mask = df['watchlist_screening_decision'].isna()

anomalies = df.loc[passed_mask & no_watchlist_mask]
df_clean = df.drop(index=anomalies.index).copy(deep=True)

In [None]:
# Keywords whose presence in the image-check details marks a row as fraud.
fraud_keywords = [
    'DIGITAL_COPY',
    'MANIPULATED',
    'FAKE',
    'PUNCHED',
    'MISMATCH_FRONT_BACK',
    'PHOTOCOPY',
]
fraud_pattern = '|'.join(fraud_keywords)

# Case-insensitive substring search on the stringified detail columns.
image_check_text = df_clean['image_checks_decision_details'].astype(str)
usability_text = df_clean['usability_decision_details'].astype(str)

fraud_in_image_checks = image_check_text.str.contains(fraud_pattern, case=False, na=False)
photocopy_in_usability = usability_text.str.contains('PHOTOCOPY', case=False, na=False)

# 1 when either column matches, 0 otherwise.
df_clean['is_confirmed_fraud'] = (fraud_in_image_checks | photocopy_in_usability).astype(int)

In [None]:
# Keywords that indicate an image/capture quality problem.
quality_keywords = [
    'GLARE',
    'BLURRED',
    'MISSING_PAGE',
    'NOT_UPLOADED',
    'DAMAGED_DOCUMENT',
    'PART_OF_DOCUMENT_MISSING',
    'PART_OF_DOCUMENT_HIDDEN',
    'BAD_QUALITY',
    'FACE_NOT_FULLY_VISIBLE',
    'liveness_UNDETERMINED',
]
quality_pattern = '|'.join(quality_keywords)

# The same case-insensitive pattern is searched in both the usability and
# the liveness detail columns; a hit in either one sets the flag to 1.
quality_hit_usability = (
    df_clean['usability_decision_details']
    .astype(str)
    .str.contains(quality_pattern, case=False, na=False)
)
quality_hit_liveness = (
    df_clean['liveness_decision_details']
    .astype(str)
    .str.contains(quality_pattern, case=False, na=False)
)
df_clean['is_quality_fail'] = (quality_hit_usability | quality_hit_liveness).astype(int)

In [None]:
# Exact-value flags (equality / membership, not substring search).

# 1 when the similarity check detail is exactly 'NO_MATCH'.
face_no_match = df_clean['similarity_decision_details'].eq('NO_MATCH')
df_clean['is_face_mismatch'] = face_no_match.astype(int)

# 1 when the usability detail is exactly one of the unsupported-document reasons.
unsupported_doc_reasons = ['UNSUPPORTED_DOCUMENT_TYPE', 'NOT_A_DOCUMENT']
df_clean['is_unsupported_doc'] = (
    df_clean['usability_decision_details'].isin(unsupported_doc_reasons).astype(int)
)

In [None]:
# Keywords that indicate a technical or data-consistency failure.
tech_fail_keywords = [
    'TECHNICAL_ERROR',
    'MISMATCHING_DATAPOINTS',
    'MISMATCH_HRZ_MRZ_DATA',
]
tech_pattern = '|'.join(tech_fail_keywords)

# Case-insensitive search in the data-check and extraction detail columns;
# a match in either column sets the flag to 1.
data_checks_text = df_clean['data_checks_decision_details'].astype(str)
extraction_text = df_clean['extraction_decision_details'].astype(str)

tech_hit_data_checks = data_checks_text.str.contains(tech_pattern, case=False, na=False)
tech_hit_extraction = extraction_text.str.contains(tech_pattern, case=False, na=False)

df_clean['is_tech_data_fail'] = (tech_hit_data_checks | tech_hit_extraction).astype(int)

In [None]:
# Age bands are left-closed so each label matches its contents:
#   [0, 18) -> '<18', [18, 26) -> '18-25', [26, 36) -> '26-35',
#   [36, 46) -> '36-45', [46, 56) -> '46-55', [56, 101) -> '55+'.
# The previous right-closed edges (0, 18] put 18-year-olds into '<18'.
# Ages outside 0-100 (and missing ages) still end up as 'Unknown'.
bins = [0, 18, 26, 36, 46, 56, 101]
labels = ['<18', '18-25', '26-35', '36-45', '46-55', '55+']

# Approximate age from the birth year against the fixed reference year 2023;
# birth years that cannot be parsed as numbers become NaN.
df_clean['proxy_age'] = 2023 - pd.to_numeric(df_clean['year_birth'], errors='coerce')

# right=False makes pd.cut use [low, high) intervals; unbinned values come out
# as the string 'nan' after astype(str) and are relabelled 'Unknown'.
df_clean['age_group'] = (
    pd.cut(df_clean['proxy_age'], bins=bins, labels=labels, right=False)
    .astype(str)
    .replace('nan', 'Unknown')
)

In [None]:
# Replace missing values in the document-descriptor columns with a placeholder.
# NOTE(review): the placeholder is spelled 'UNKOWN'; it is kept byte-for-byte so
# the resulting category values stay unchanged.
cols_to_fill = ['data_issuing_country', 'data_type', 'data_sub_type']
df_clean[cols_to_fill] = df_clean[cols_to_fill].fillna('UNKOWN')

In [None]:
# Interaction features: each new column is "<left>_<right>" built from two
# existing columns. Insertion order matches the order of this mapping.
combo_sources = {
    'combo_country_type': ('data_issuing_country', 'data_type'),
    'combo_country_subtype': ('data_issuing_country', 'data_sub_type'),
    'combo_country_age': ('data_issuing_country', 'age_group'),
    'combo_type_age': ('data_type', 'age_group'),
}
for combo_name, (left_col, right_col) in combo_sources.items():
    df_clean[combo_name] = df_clean[left_col] + "_" + df_clean[right_col]

In [None]:
# Categorical inputs that get encoded before modelling.
categorical_cols = [
    # raw columns
    'data_issuing_country', 'data_sub_type',
    # country-based interaction columns
    'combo_country_type', 'combo_country_subtype', 'combo_country_age',
]

# 0/1 failure-mode flags that are passed to the model as they are.
binary_features = [
    'is_confirmed_fraud', 'is_quality_fail', 'is_face_mismatch',
    'is_tech_data_fail', 'is_unsupported_doc',
]

In [None]:
# Feature matrix = categorical columns followed by the binary flags;
# the target is the is_pass indicator.
feature_columns = [*categorical_cols, *binary_features]
X = df_clean[feature_columns]
y = df_clean['is_pass']

# Reproducible 70/30 split (random_state fixed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Start from the binary flags, then append two encodings per categorical column:
#   <col>_TE: target encoding, <col>_CE: count encoding.
model_features_train = X_train[binary_features].copy()
model_features_test = X_test[binary_features].copy()

te = TargetEncoder(smooth="auto", cv=5, random_state=42)

for col in categorical_cols:
    # Training rows must be encoded with fit_transform: it applies the
    # encoder's internal cross-fitting (cv=5), so a row's own target does not
    # leak into its encoding. fit(...).transform(...) on the training data
    # skips that step and makes the cv setting a no-op.
    model_features_train[f"{col}_TE"] = te.fit_transform(X_train[[col]], y_train).ravel()
    # After fit_transform the encoder holds encodings learned on the full
    # training set; these are what the test rows are mapped with.
    model_features_test[f"{col}_TE"] = te.transform(X_test[[col]]).ravel()

    # Count encoding: category frequency in the training split only;
    # categories unseen in training get 0.
    counts = X_train[col].value_counts()
    model_features_train[f"{col}_CE"] = X_train[col].map(counts).fillna(0)
    model_features_test[f"{col}_CE"] = X_test[col].map(counts).fillna(0)

Modeling notes:
- Input: binary feature flags + target-encoded categorical features + count-encoding (CE) for stability.
- Train/Test: a simple 70/30 split is used to create a baseline classifier.
- Purpose: this model is a diagnostic baseline — we mainly use feature importance and SHAP to understand drivers of pass/fail.

In [None]:
# Baseline gradient-boosted classifier: library defaults plus a fixed seed,
# trained on the encoded training features.
xgb_params = {"random_state": 42}
model = xgb.XGBClassifier(**xgb_params)
model.fit(model_features_train, y_train)

In [None]:
# Hard class predictions and class probabilities for the held-out split.
y_predict = model.predict(model_features_test)
y_predict_proba = model.predict_proba(model_features_test)

# Per-class precision / recall / F1 computed from the hard predictions.
report = classification_report(y_test, y_predict)
print(report)

Interpreting the model (SHAP):
- We use SHAP to get a global view of feature importance and per-sample explanations.
- This helps validate that the binary flags and encoded categorical features behave as expected and surface any surprising drivers.

In [None]:
# Explain the fitted tree model on the held-out features.
explainer = shap.TreeExplainer(model)
shap_values = explainer(model_features_test)

plt.figure(figsize=(12, 8))
# plot_size=None stops summary_plot from resizing the current figure; with the
# default plot_size="auto" the figsize set above would be overridden.
# show=False keeps the figure open so the title and layout can still be applied.
shap.summary_plot(shap_values, model_features_test, show=False, plot_size=None)
plt.title("SHAP Value Summary", fontsize=16)
plt.tight_layout()