# Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset


In [None]:
# Read the credit-scoring CSV into a DataFrame.
# The path is relative, so it resolves against the current working directory.
csv_path = '../data/credit_scoring_data.csv'
df = pd.read_csv(csv_path)


# Overview of the Data


In [None]:
# Overview of the Data: dimensions, per-column dtypes, and a preview of the
# first rows.
row_count, column_count = df.shape

print("\n--- Overview of the Dataset ---")
print(f"Number of Rows: {row_count}")
print(f"Number of Columns: {column_count}")
print("\nColumn Names and Data Types:\n", df.dtypes)
print("\nFirst 5 Rows of the Dataset:\n", df.head())

# Check for duplicate values


In [None]:
# Flag every row that repeats an earlier row across all columns, then count
# the flagged rows.
is_repeated_row = df.duplicated()
duplicates = is_repeated_row.sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")

# Summary Statistics


In [None]:
# Descriptive statistics as produced by DataFrame.describe() with its
# default settings.
print("\n--- Summary Statistics ---")
summary_stats = df.describe()
print(summary_stats)

# Distribution of Numerical Features


In [None]:
# Distribution of Numerical Features
numerical_features = df.select_dtypes(include=['number']).columns

# Display summary statistics for numerical features
print("Numerical Features Summary:")
print(df[numerical_features].describe())

# DataFrame.hist() creates its own figure, so no plt.figure() call precedes it;
# an extra plt.figure() would only leave a blank figure behind.
# The subplot grid keeps the usual 3x3 shape but grows by whole rows when there
# are more than nine numeric columns, because hist() raises if the layout has
# fewer cells than columns to plot.
grid_cols = 3
grid_rows = max(3, (len(numerical_features) + grid_cols - 1) // grid_cols)
df[numerical_features].hist(
    bins=30, figsize=(12, 10), layout=(grid_rows, grid_cols)
)
plt.suptitle("Distribution of Numerical Features")
plt.show()

# Print histogram counts for numerical features
print("\nHistogram Counts for Numerical Features:")
for feature in numerical_features:
    print(f"\n{feature} Histogram Counts:")

    # np.histogram cannot derive a bin range from data containing NaN and
    # raises ValueError, so missing values are dropped first (the plotted
    # histograms ignore them as well).
    values = df[feature].dropna()
    if values.empty:
        print("No non-missing values.")
        continue

    counts, bin_edges = np.histogram(values, bins=30)
    # Pair each bin's left edge with its right edge directly instead of
    # searching the edge list for every bin.
    for count, left_edge, right_edge in zip(counts, bin_edges[:-1], bin_edges[1:]):
        print(f"Range {left_edge:.2f} - {right_edge:.2f}: {count}")


# Distribution of Categorical Features


In [None]:

# Distribution of Categorical Features
# Only object-dtype columns are treated as categorical here.
categorical_features = df.select_dtypes(include=['object']).columns

# For each categorical column: print the frequency table, then draw a
# horizontal count plot whose bars follow the same order as that table.
print("\nCategorical Features Counts:")
for feature in categorical_features:
    column = df[feature]
    counts = column.value_counts()

    print(f"\n{feature} Counts:")
    print(counts)

    plt.figure(figsize=(8, 4))
    sns.countplot(y=column, order=counts.index)
    plt.title(f"Distribution of {feature}")
    plt.show()