In [None]:
# Task 2: Exploratory Data Analysis (EDA) for Bati Bank Credit Risk Model

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# --- Setup: output directory and raw data ---
# Every figure saved by this cell is written under plot_dir; create the folder
# up front so savefig cannot fail on a missing directory.
plot_dir = "../plots/task-2"
os.makedirs(plot_dir, exist_ok=True)

# Load the raw data (path is relative to the notebook's working directory).
df = pd.read_csv("./../data/raw/data.csv")

# --- Fraud label distribution ---
# Percentage share of each FraudResult value.
print("\n Fraud label distribution:")
print(df['FraudResult'].value_counts(normalize=True) * 100)

# Draw on an explicit Figure/Axes instead of the implicit pyplot state so the
# chart can be saved, displayed and released deterministically.
fig, ax = plt.subplots()
sns.countplot(data=df, x='FraudResult', ax=ax)
ax.set_title('Fraud vs Non-Fraud Distribution')
fig.savefig(f"{plot_dir}/fraud_distribution.png")

# Render the finished chart inline, then free the figure. The earlier
# plt.clf() emptied the axes before the notebook displayed them, which left a
# blank "0 Axes" figure as the cell output and kept the figure object open.
plt.show()
plt.close(fig)


🔹 Fraud label distribution:
FraudResult
0    99.798248
1     0.201752
Name: proportion, dtype: float64


<Figure size 640x480 with 0 Axes>

In [None]:
# Class-imbalance check: FraudResult holds 0/1 labels, so the column mean is
# the share of rows flagged as fraud.
fraud_labels = df['FraudResult']
fraud_rate = fraud_labels.mean()
print(f"\n Fraud Rate: {fraud_rate:.2%}")


🔹 Fraud Rate: 0.20%


In [None]:
# Parse the timestamp column so the .dt accessor is available.
# Re-running this cell is safe: to_datetime leaves datetime values unchanged.
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Time-based analysis: hour of day (0-23) at which each transaction started.
df['Hour'] = df['TransactionStartTime'].dt.hour

fig, ax = plt.subplots()
# Hour is an integer, so use discrete bins: one unit-wide bar centred on each
# hour. With bins=24 the edges were spread evenly between the observed min and
# max, so bars did not line up with the hour ticks and could merge or skip
# hours whenever the data did not span exactly 0..23.
sns.histplot(data=df, x='Hour', discrete=True, kde=False, ax=ax)
ax.set_title('Transaction Hour Distribution')
ax.set_xlabel('Hour of Day')
fig.savefig(f"{plot_dir}/hour_distribution.png")

# Show the chart inline and release it. The earlier plt.clf() cleared the axes
# before display, leaving an empty figure as the cell output.
plt.show()
plt.close(fig)

<Figure size 640x480 with 0 Axes>

In [None]:
# Transaction amount distribution: histogram with a KDE overlay.
# Each chart gets its own Figure/Axes so it can be saved, shown and closed
# independently of the other one.
amount_fig, amount_ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['Amount'], bins=100, kde=True, ax=amount_ax)
amount_ax.set_title('Transaction Amount Distribution')
amount_fig.savefig(f"{plot_dir}/amount_distribution.png")
# Display inline and free the figure. The earlier plt.clf() blanked the axes
# before the notebook rendered them, so only an empty figure was shown.
plt.show()
plt.close(amount_fig)

# Compare transaction amount between fraud and non-fraud rows.
box_fig, box_ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='FraudResult', y='Amount', ax=box_ax)
box_ax.set_title('Transaction Amount by Fraud Result')
box_fig.savefig(f"{plot_dir}/amount_by_fraud.png")
plt.show()
plt.close(box_fig)


<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

In [None]:
# Fraud rate per level of each categorical column.
# The bar height is the mean of the 0/1 FraudResult label within each level,
# i.e. the fraction of that level's transactions flagged as fraud.
categorical_cols = ['ProductCategory', 'ChannelId', 'PricingStrategy', 'CountryCode', 'CurrencyCode']

for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.barplot(data=df, x=col, y='FraudResult', estimator=np.mean, ax=ax)
    ax.set_title(f"Fraud Rate by {col}")
    # Tilt the category labels so long names do not overlap.
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig(f"{plot_dir}/fraud_by_{col}.png")
    # Show the chart inline, then close it. The earlier plt.clf() left one
    # empty figure open per iteration, so the cell displayed five blank
    # figures and the figure objects accumulated in memory.
    plt.show()
    plt.close(fig)


<Figure size 1000x400 with 0 Axes>

<Figure size 1000x400 with 0 Axes>

<Figure size 1000x400 with 0 Axes>

<Figure size 1000x400 with 0 Axes>

<Figure size 1000x400 with 0 Axes>

In [10]:
# Correlation heatmap over the numeric columns.
numeric_cols = df.select_dtypes(include=np.number).columns

# A column with a single distinct value has zero variance, so its correlation
# with every other column is NaN and shows up as a blank row and column in the
# heatmap (CountryCode has std 0.0 in the summary below). Keep only columns
# that actually vary; numeric_cols itself is left unchanged.
varying_cols = [c for c in numeric_cols if df[c].nunique(dropna=True) > 1]

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df[varying_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
fig.tight_layout()
fig.savefig(f"{plot_dir}/correlation_heatmap.png")
# Show the chart inline and release it. The earlier plt.clf() cleared the axes
# before display, leaving an empty figure as the cell output.
plt.show()
plt.close(fig)

# 🔹 Summary statistics
# The header previously contained mis-encoded bytes ("ðŸ”¹") in place of the
# intended bullet character; it is written here as the real character.
print("\n🔹 Summary Statistics:")
print(df.describe())

# (Optional) Save cleaned version
# df.to_csv("processed/cleaned_data.csv", index=False)


🔹 Summary Statistics:
       CountryCode        Amount         Value  PricingStrategy   FraudResult  \
count      95662.0  9.566200e+04  9.566200e+04     95662.000000  95662.000000   
mean         256.0  6.717846e+03  9.900584e+03         2.255974      0.002018   
std            0.0  1.233068e+05  1.231221e+05         0.732924      0.044872   
min          256.0 -1.000000e+06  2.000000e+00         0.000000      0.000000   
25%          256.0 -5.000000e+01  2.750000e+02         2.000000      0.000000   
50%          256.0  1.000000e+03  1.000000e+03         2.000000      0.000000   
75%          256.0  2.800000e+03  5.000000e+03         2.000000      0.000000   
max          256.0  9.880000e+06  9.880000e+06         4.000000      1.000000   

               Hour  
count  95662.000000  
mean      12.447722  
std        4.846964  
min        0.000000  
25%        8.000000  
50%       13.000000  
75%       17.000000  
max       23.000000  


<Figure size 1000x600 with 0 Axes>