In [1]:
import pandas as pd

df = pd.read_csv("../data/loans.csv")
df.head()



Unnamed: 0,Loan ID,Status,Client ID,Debtor Identifier,Debtor Identifier Type,Debtor Country,Trust ID,Amount,Created At,Accepted At,Refused At,Repaid At,Due Date,Insurance Status
0,ccba7b5961ac84c6bc09ba20b0497bd666ac10a7ecf123...,REPAID,2477304f4867e7ea86fd4414e0f845c0e4bd06516fe687...,17c277f8d264ccec868dc55add915dc93fd4ae4bd779cf...,siren,FR,e98a46aea01b6f55341744cdddbba3d6a88ab1e7d366eb...,13643.52,2025-01-24T09:52:41.912,2025-01-24T09:52:54.08,,2025-03-06T05:05:38.362,2025-03-25T08:00:00,SETTLED
1,ea43f967166a7809d0e0f27fff33a3732fa600513d1b5f...,REPAID,a57af78c8caa2a9c6efbf8d9535f34441319c66ba56803...,494f60d81e3f8e6effbf093a1c2c6d0a99ca9d561bf624...,siren,FR,e98a46aea01b6f55341744cdddbba3d6a88ab1e7d366eb...,58.97,2025-08-27T07:51:48.154,2025-08-27T09:59:54.827,,2025-09-30T10:30:19,01/10/2025,
2,1d39f1f6c61cef090d31c552b0d3e511280ba64c2fb69a...,REPAID,dd3a54bd37885757d4c4b3dbd85c5fdcffa351fe2d1680...,8ab1efec5e005f628c0e5793276ddaf1e3693cf880a0a9...,cif,ES,e98a46aea01b6f55341744cdddbba3d6a88ab1e7d366eb...,6654.03,2025-02-24T09:13:01.171,2025-02-24T16:51:01.077,,2025-06-03T18:32:50,2025-05-20T09:12:33.344,REFUSED
3,f03b65936792e9d35e66db0572aa43cdc5f2d33d75fcd0...,REPAID,dd3a54bd37885757d4c4b3dbd85c5fdcffa351fe2d1680...,8563f72a004fcba1d5ae23410ccf82a6d8bcc85fb6ccfd...,kvk,NL,c4ed1a68f3ad7b3c85c4400e688e4dd3dcfe4da53d171d...,4452.8,2025-10-07T06:19:38.206,2025-10-07T06:19:47.087,,2025-10-29T15:34:56,2025-11-01T06:19:09.316,SETTLED
4,70290cf7ced390115c4443cab5f519fcf8ac52a11dafbd...,REPAID,dd3a54bd37885757d4c4b3dbd85c5fdcffa351fe2d1680...,17c277f8d264ccec868dc55add915dc93fd4ae4bd779cf...,siret,FR,e98a46aea01b6f55341744cdddbba3d6a88ab1e7d366eb...,966.08,2025-07-29T12:21:45.349,2025-07-29T12:21:54.306,,2025-09-05T14:43:14,2025-09-22T12:21:41.397,SETTLED


In [2]:
import pandas as pd
import numpy as np

df.columns = (
    df.columns.astype(str)
      .str.strip()
      .str.lower()
      .str.replace(" ", "_")
)

date_cols = [c for c in ["created_at","accepted_at","refused_at","repaid_at","due_date"] if c in df.columns]
for c in date_cols:
    df[c] = pd.to_datetime(df[c], errors="coerce")

df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

print("=== Dataset Overview ===")
print(f"Rows: {df.shape[0]:,}")
print(f"Columns: {df.shape[1]:,}")
print("\nColumns:")
print(df.columns.tolist())

print("\n=== Missingness (% of rows) ===")
missing = (df.isna().mean() * 100).sort_values(ascending=False)
print(missing.to_string())

prod = df.loc[
    df["accepted_at"].notna() & df["amount"].notna(),
    ["accepted_at", "amount", "debtor_country", "insurance_status"]
].copy()

print("\n=== Production snapshot ===")
print(f"Accepted (financed) loans: {len(prod):,}")
print(f"Total financed amount: {prod['amount'].sum():,.0f}")

insured_statuses = {"ACTIVATED", "CLAIMED", "EXPIRED", "SETTLED", "TIME_BARRED"}
not_insured_statuses = {"REFUSED", "REJECTED"}

prod["month"] = prod["accepted_at"].dt.to_period("M").dt.to_timestamp()

prod["insured_amount"] = np.select(
    [
        prod["insurance_status"].isin(insured_statuses),
        prod["insurance_status"].isin(not_insured_statuses)
    ],
    [
        prod["amount"],  
        0.0              
    ],
    default=np.nan
)

monthly_country = (
    prod.groupby(["month", "debtor_country"], as_index=False)
        .agg(
            total_production=("amount", "sum"),
            insured_production=("insured_amount", "sum")
        )
)

monthly_country["insurance_rate_of_production"] = (
    monthly_country["insured_production"]
    / monthly_country["total_production"]
)

top_5_per_month = (
    monthly_country
    .dropna(subset=["insurance_rate_of_production"])
    .sort_values(
        ["month", "insurance_rate_of_production"],
        ascending=[True, False]
    )
    .groupby("month", as_index=False)
    .head(5)
    .reset_index(drop=True)
)

print("\n=== Top 5 Countries by Insurance Rate of Production (Per Month) ===")
display(
    top_5_per_month.assign(
        insurance_rate=lambda d: (d["insurance_rate_of_production"] * 100).round(1).astype(str) + "%",
        total_production=lambda d: d["total_production"].round(0).map(lambda x: f"{x:,.0f}")
    )[
        ["month", "debtor_country", "insurance_rate", "total_production"]
    ]
)


=== Dataset Overview ===
Rows: 68,083
Columns: 14

Columns:
['loan_id', 'status', 'client_id', 'debtor_identifier', 'debtor_identifier_type', 'debtor_country', 'trust_id', 'amount', 'created_at', 'accepted_at', 'refused_at', 'repaid_at', 'due_date', 'insurance_status']

=== Missingness (% of rows) ===
repaid_at                 95.916749
refused_at                93.826653
due_date                  87.612179
insurance_status          47.876856
accepted_at                6.255600
created_at                 0.098409
debtor_country             0.000000
debtor_identifier_type     0.000000
debtor_identifier          0.000000
client_id                  0.000000
status                     0.000000
loan_id                    0.000000
trust_id                   0.000000
amount                     0.000000

=== Production snapshot ===
Accepted (financed) loans: 63,824
Total financed amount: 267,425,403

=== Top 5 Countries by Insurance Rate of Production (Per Month) ===


Unnamed: 0,month,debtor_country,insurance_rate,total_production
0,2025-01-01,AT,100.0%,19237
1,2025-01-01,DK,100.0%,12350
2,2025-01-01,IT,100.0%,6933
3,2025-01-01,MC,100.0%,697
4,2025-01-01,PT,100.0%,2335
5,2025-02-01,AT,100.0%,180807
6,2025-02-01,CA,100.0%,71720
7,2025-02-01,DK,100.0%,15079
8,2025-02-01,IE,100.0%,7400
9,2025-02-01,IT,100.0%,19933


=== Brief Findings ===
- The dataset contains 68,083 rows and 14 columns.
- 63,824 loans have accepted_at populated and are treated as financed production.
- Total financed volume across the dataset is 267,425,403.
- Loan amounts appear right-skewed, with a small number of large loans contributing disproportionately to total financed volume.
- Production is concentrated across a subset of debtor countries, indicating geographic concentration within the portfolio.
- Insurance rate of production is measured on a value-weighted basis (insured amount รท total financed amount).
- When ranking countries monthly by insurance rate of production, the top five countries vary over time rather than remaining fixed.
- Countries with fewer observed months or lower production volumes were retained but should be interpreted with caution when comparing monthly rankings.
- Some fields exhibit non-trivial missingness, particularly across lifecycle dates and insurance-related attributes, which should be considered in downstream analysis.