In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

In [56]:
def extract_data(filepath: str = 'bnpl_dataset_20000.csv') -> pd.DataFrame:
    """Load the BNPL dataset CSV into a DataFrame.

    Parameters
    ----------
    filepath : str, default 'bnpl_dataset_20000.csv'
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    # Read the path the caller passed in; previously the argument was ignored
    # and the default file name was always loaded.
    return pd.read_csv(filepath)

In [57]:
# Load the raw dataset with the notebook's loader, `extract_data`;
# no `load_data` function is defined in this notebook.
df = extract_data('bnpl_dataset_20000.csv')

In [58]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,transaction_id,customer_id,customer_name,gender,age,location,transaction_date,purchase_amount,down_payment,installment_amount,installments,payment_status,product_category,merchant_name,customer_income,credit_score,late_payments,account_created,fraud_flag
0,667a639d-686b-43a7-8040-eb50ee6457df,7120013d-3b9a-4125-9e24-885a88b07fa4,Crystal Singh,Male,33,Sheffield,2024-11-30,1462.06,167.0,177.87,1,Missed,Home,eBay,2554.81,371,2,2023-01-12,0
1,48c39a7f-a94a-408d-a6d1-e5a2ed329562,808d51a2-3564-4ac7-9250-c47e0d215696,Jason Schroeder,Other,28,Leeds,2024-10-26,842.93,72.07,117.19,2,Missed,Home,Amazon,4656.38,411,1,2022-12-15,0
2,d2578768-a2e8-4cb9-8dd4-48987059668d,b357d186-2642-4ae1-82b9-d436ab38a550,Heather Gregory,Other,45,Sheffield,2024-05-15,781.59,244.76,210.26,8,Paid,Electronics,Zara,5317.14,799,0,2023-08-22,0
3,398a7067-a195-4b81-abbe-17fa7cecbffa,d35d0ddd-4cd0-4b7b-8380-767d6ff8d426,Judy Tran,Male,63,Manchester,2024-10-27,1007.49,159.57,268.36,1,Missed,Home,Currys,4290.49,754,1,2022-06-01,0
4,7544ead9-aac7-47df-a6fe-79b338324fb6,34914080-e016-4c6c-ad52-3c050c916554,Terry Page,Female,51,London,2025-02-16,1915.33,362.62,30.25,11,Missed,Sports,Tesco,2233.52,453,0,2022-04-23,0


In [10]:
# Remove invalid / underage customers (keep only rows with age >= 18)

In [11]:
# Keep only rows whose age is at least 18.
df = df.loc[df['age'].ge(18)]


In [13]:
# Inspect column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   transaction_id      20000 non-null  object 
 1   customer_id         20000 non-null  object 
 2   customer_name       20000 non-null  object 
 3   gender              20000 non-null  object 
 4   age                 20000 non-null  int64  
 5   location            20000 non-null  object 
 6   transaction_date    20000 non-null  object 
 7   purchase_amount     20000 non-null  float64
 8   down_payment        20000 non-null  float64
 9   installment_amount  20000 non-null  float64
 10  installments        20000 non-null  int64  
 11  payment_status      20000 non-null  object 
 12  product_category    20000 non-null  object 
 13  merchant_name       20000 non-null  object 
 14  customer_income     20000 non-null  float64
 15  credit_score        20000 non-null  int64  
 16  late

In [17]:
# Drop exact duplicate rows. The explicit copy gives an independent frame, so
# the column assignments below do not write into a frame derived from the
# earlier boolean filter (which pandas may flag with SettingWithCopyWarning).
df = df.drop_duplicates().copy()

# Parse the date columns when present; values that cannot be parsed become
# NaT (errors='coerce') instead of raising.
for date_col in ('transaction_date', 'account_created'):
    if date_col in df.columns:
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [52]:
def clean_data(df):
    """Sanitise and impute the core BNPL fields, returning a cleaned copy.

    Steps, in order:

    1. Negative values in ``purchase_amount``, ``down_payment``,
       ``installments``, ``customer_income`` and ``installment_amount`` are
       treated as invalid and replaced with NaN.
    2. Missing ``age``, ``customer_income`` and ``credit_score`` values are
       filled with the column median rounded to a whole number.
    3. Missing ``gender`` values are filled with ``'Unknown'``.

    Any of these columns that is absent from ``df`` is skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the cleaning steps applied.
    """
    # Work on a copy so the caller's frame keeps its original values.
    df = df.copy()

    # Blank out invalid negatives first, so that the medians computed below
    # are not skewed by them and so that an invalid customer_income is
    # imputed rather than left as NaN after the fill step.
    for col in ['purchase_amount', 'down_payment', 'installments',
                'customer_income', 'installment_amount']:
        if col in df.columns:
            # Series.mask upcasts integer columns only when something is
            # actually replaced, and avoids assigning NaN into an int column.
            df[col] = df[col].mask(df[col] < 0)

    # Fill missing values for the core risk fields with the rounded median.
    for col in ['age', 'customer_income', 'credit_score']:
        if col in df.columns:
            df[col] = df[col].fillna(round(df[col].median(), 0))

    # Missing gender becomes an explicit category.
    if 'gender' in df.columns:
        df['gender'] = df['gender'].fillna('Unknown')

    return df


In [60]:
def transform_data(df):
    """Add derived BNPL features to a copy of ``df``.

    Each feature is only created when all of its input columns are present:

    * ``financial_amount``: ``purchase_amount - down_payment``.
    * ``repayment_rate``: ``purchase_amount / installment_amount``.
    * ``high_risk``: 1 when ``repayment_rate < 0.8``, ``late_payments >= 2``
      or ``credit_score < 550``, otherwise 0.
    * ``monthly_installment_est``: ``financial_amount / installments``.

    A zero or missing denominator yields NaN rather than +/-inf.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the derived columns appended.
    """
    df = df.copy()

    # Both operands are required; checking only purchase_amount raised a
    # KeyError when down_payment was absent.
    if {'purchase_amount', 'down_payment'}.issubset(df.columns):
        df['financial_amount'] = df['purchase_amount'] - df['down_payment']

    if {'purchase_amount', 'installment_amount'}.issubset(df.columns):
        installment = df['installment_amount']
        # Turn zero denominators into NaN so the ratio is NaN instead of inf;
        # missing installment amounts stay NaN rather than being filled with 0.
        df['repayment_rate'] = df['purchase_amount'] / installment.where(installment != 0)

    # The guard must name the column created above ('repayment_rate'); the
    # previous key 'repayment' never exists, so high_risk was never computed.
    # NOTE(review): repayment_rate is purchase / installment, which is usually
    # above 1, so the `< 0.8` rule may rarely fire. Confirm the intended ratio.
    if {'repayment_rate', 'credit_score', 'late_payments'}.issubset(df.columns):
        df['high_risk'] = ((df['repayment_rate'] < 0.8) |
                           (df['late_payments'] >= 2) |
                           (df['credit_score'] < 550)).astype(int)

    if {'financial_amount', 'installments'}.issubset(df.columns):
        n_installments = df['installments']
        # Same zero-denominator protection as for repayment_rate.
        df['monthly_installment_est'] = (
            df['financial_amount'] / n_installments.where(n_installments != 0)
        )

    return df
        


In [61]:
# Smallest credit score present in the data.
df['credit_score'].min()

300

In [62]:
# Largest credit score present in the data.
df['credit_score'].max()

850

In [63]:
def validate_data(df):
    """Run basic sanity checks on ``df`` and report what failed.

    Checks, each skipped when its column is absent:

    * ``credit_score`` has a value below 300 or above 850.
    * ``purchase_amount`` has a negative value.

    Returns
    -------
    list of str
        One message per failed check, in the order listed above; empty when
        every check passes.
    """
    # (column, predicate flagging bad rows, message reported on failure)
    rules = (
        ('credit_score',
         lambda scores: scores.lt(300) | scores.gt(850),
         "Invalid credit scores found."),
        ('purchase_amount',
         lambda amounts: amounts.lt(0),
         "Negative purchase amounts found."),
    )

    return [
        message
        for column, flags_bad, message in rules
        if column in df.columns and flags_bad(df[column]).any()
    ]


In [64]:
def save_data(df, filename: str = "cleaned_bnpl_data.csv"):
    """Write ``df`` to ``filename`` as CSV and return the path that was written.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to persist. The index is not written.
    filename : str, default "cleaned_bnpl_data.csv"
        Destination file. Missing parent directories are created.

    Returns
    -------
    str
        ``filename``, unchanged.
    """
    # Make sure the destination folder exists before pandas opens the file.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(filename, index=False)
    return filename

In [65]:
if __name__ == "__main__":
    # End-to-end run: extract -> clean -> transform -> validate -> save.
    # NOTE(review): the output location is an absolute, machine-specific path;
    # consider a path relative to a configurable data directory.
    output_path = "C:/Users/ajayr/cleaned_bnpl_data.csv"

    df = extract_data("bnpl_dataset_20000.csv")
    dfc = clean_data(df)
    dft = transform_data(dfc)

    # Report data-quality problems before saving; the save still proceeds so
    # the run behaves as before when nothing is flagged.
    for problem in validate_data(dft):
        print(f"Validation warning: {problem}")

    save_data(dft, output_path)
    print("Saved cleaned_bnpl_data.csv")


Saved cleaned_bnpl_data.csv
