In [None]:
# Exploratory Data Analysis (EDA)

# This notebook explores the Xente transaction dataset to understand data structure,
# distributions, data quality issues, and behavioral patterns that inform feature
# engineering and proxy target construction for credit risk modeling.


In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Reset matplotlib to its stock style first, then layer seaborn's default theme on
# top so every figure in the notebook shares one look.
plt.style.use("default")
# set_theme() is the preferred seaborn entry point; sns.set() is only a legacy alias
# for it that seaborn notes may be removed in the future.
sns.set_theme()


In [None]:
# Location of the raw transaction export, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/raw/data.csv"

# Load the full file into memory; every later cell works on this frame.
df = pd.read_csv(RAW_DATA_PATH)


In [None]:
# A notebook cell only renders its final bare expression, so each summary is passed
# to display() explicitly; otherwise head/shape/columns/describe are computed and
# silently thrown away. display() is available without import in IPython kernels.
display(df.head())      # first rows, to eyeball the raw values
display(df.shape)       # (rows, columns)
display(df.columns)     # column names
df.info()               # prints dtypes and non-null counts itself (returns None)
display(df.describe())  # summary statistics for numeric columns
display(df.describe(include="object"))  # count/unique/top/freq for object columns


In [None]:
# Parse the timestamp column once and store the parsed values back on the frame.
txn_start = pd.to_datetime(df["TransactionStartTime"])
df["TransactionStartTime"] = txn_start

# Derive one calendar/time component per column: transaction_hour, transaction_day,
# transaction_month, transaction_year (created in that order).
for part in ("hour", "day", "month", "year"):
    df[f"transaction_{part}"] = getattr(txn_start.dt, part)


In [None]:
# Draw one histogram (50 bins, with a KDE overlay) per monetary column, each on its
# own 6x4 figure.
for column in ("Amount", "Value"):
    plt.figure(figsize=(6, 4))
    sns.histplot(df[column], bins=50, kde=True)
    plt.title(f"Distribution of Transaction {column}")
    plt.show()


In [None]:
# Count transactions per product category (value_counts sorts descending by default)
# and show the counts as a bar chart.
category_counts = df["ProductCategory"].value_counts()
category_counts.plot.bar(figsize=(8, 4))
plt.title("Product Category Distribution")
plt.show()


In [None]:
# Keep only the numeric columns, then compute their pairwise correlations
# (DataFrame.corr defaults to Pearson).
numeric_df = df.select_dtypes(include="number")
corr_matrix = numeric_df.corr()

# Annotated heatmap: each cell shows the coefficient rounded to two decimals.
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Per-column count of missing values. This is not the last expression in the cell,
# so it must be displayed explicitly or the table never appears in the output.
# display() is available without import in IPython kernels.
missing_counts = df.isna().sum()
display(missing_counts)

# Row-by-column view of where the gaps are; start a fresh figure so the heatmap
# never draws onto axes left over from an earlier cell.
plt.figure(figsize=(10, 6))
sns.heatmap(df.isna(), cbar=False)
plt.title("Missing Values Heatmap")
plt.show()


In [None]:
# One boxplot per monetary column. Both columns get the same treatment: their own
# 6x4 figure, a title, and an explicit plt.show(), so the second plot is labelled
# and the cell does not end on a bare Axes repr.
for column in ("Amount", "Value"):
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df[column])
    plt.title(f"Outliers in Transaction {column}")
    plt.show()


In [None]:
## Key Insights

# 1. Transaction amounts and values are highly right-skewed, indicating the presence
#    of extreme values and the potential need for log transformation or robust scaling.

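# 2. A small number of product categories and channels dominate transaction volume,
#    suggesting strong behavioral concentration. (Only the product-category
#    distribution is plotted in this notebook; the channel claim should be confirmed
#    with a channel-level count.)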

# 3. Several numerical features show moderate correlation, particularly between
#    Amount and Value, indicating possible redundancy.

# 4. Missing values are minimal and can be handled through simple imputation strategies.

# 5. The presence of frequent low-value transactions suggests that transaction
#    frequency may be a strong indicator of customer engagement and credit risk.
