<a href="https://colab.research.google.com/github/Antony-6487/Antony-6487/blob/main/Loan_Default.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:

import pandas as pd
import numpy as np
DATA_DIR = "."

train_dem  = pd.read_csv(f"{DATA_DIR}/traindemographics.csv", parse_dates=["birthdate"])
test_dem   = pd.read_csv(f"{DATA_DIR}/testdemographics.csv",  parse_dates=["birthdate"])
train_perf = pd.read_csv(f"{DATA_DIR}/trainperf.csv", parse_dates=["approveddate","creationdate"])
test_perf  = pd.read_csv(f"{DATA_DIR}/testperf.csv",  parse_dates=["approveddate","creationdate"])
train_prev = pd.read_csv(f"{DATA_DIR}/trainprevloans.csv", parse_dates=["approveddate","creationdate","closeddate","firstduedate","firstrepaiddate"])
test_prev  = pd.read_csv(f"{DATA_DIR}/testprevloans.csv",  parse_dates=["approveddate","creationdate","closeddate","firstduedate","firstrepaiddate"])
sample_sub = pd.read_csv(f"{DATA_DIR}/SampleSubmission.csv")

  test_perf  = pd.read_csv(f"{DATA_DIR}/testperf.csv",  parse_dates=["approveddate","creationdate"])
  test_perf  = pd.read_csv(f"{DATA_DIR}/testperf.csv",  parse_dates=["approveddate","creationdate"])


In [49]:
print("train_dem:",  train_dem.shape, "| test_dem:",  test_dem.shape)
print("train_perf:", train_perf.shape, "| test_perf:", test_perf.shape)
print("train_prev:", train_prev.shape, "| test_prev:", test_prev.shape)
print("sample_sub:", sample_sub.shape)

# peek at a few columns to confirm
print("\ntrain_perf columns:", train_perf.columns.tolist()[:12])
train_perf.head(3)

train_dem: (4346, 9) | test_dem: (1487, 9)
train_perf: (4368, 10) | test_perf: (1450, 9)
train_prev: (18183, 12) | test_prev: (5907, 12)
sample_sub: (1450, 2)

train_perf columns: ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate', 'loanamount', 'totaldue', 'termdays', 'referredby', 'good_bad_flag']


Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56,2017-07-25 07:22:47,30000.0,34500.0,30,,Good
1,8a85886e54beabf90154c0a29ae757c0,301965204,2,2017-07-05 17:04:41,2017-07-05 16:04:18,15000.0,17250.0,30,,Good
2,8a8588f35438fe12015444567666018e,301966580,7,2017-07-06 14:52:57,2017-07-06 13:52:51,20000.0,22250.0,15,,Good


In [52]:


def ensure_datetime(df, cols, dayfirst=False):
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors="coerce", dayfirst=dayfirst)
    return df

# list of date columns in each table
dem_date_cols  = ["birthdate"]
perf_date_cols = ["approveddate", "creationdate"]
prev_date_cols = ["approveddate", "creationdate", "closeddate", "firstduedate", "firstrepaiddate"]

# apply to all six dataframes
train_dem  = ensure_datetime(train_dem,  dem_date_cols)
test_dem   = ensure_datetime(test_dem,   dem_date_cols)

train_perf = ensure_datetime(train_perf, perf_date_cols)
test_perf  = ensure_datetime(test_perf,  perf_date_cols)

train_prev = ensure_datetime(train_prev, prev_date_cols)
test_prev  = ensure_datetime(test_prev,  prev_date_cols)

# re-check
show_dtypes("train_perf (after)", train_perf)
show_dtypes("train_prev (after)", train_prev)
show_dtypes("train_dem  (after)", train_dem)


train_perf (after) dtypes:
customerid               object
systemloanid              int64
loannumber                int64
approveddate     datetime64[ns]
creationdate     datetime64[ns]
loanamount              float64
totaldue                float64
termdays                  int64
referredby               object
good_bad_flag            object
dtype: object

train_prev (after) dtypes:
customerid                 object
systemloanid                int64
loannumber                  int64
approveddate       datetime64[ns]
creationdate       datetime64[ns]
loanamount                float64
totaldue                  float64
termdays                    int64
closeddate         datetime64[ns]
referredby                 object
firstduedate       datetime64[ns]
firstrepaiddate    datetime64[ns]
dtype: object

train_dem  (after) dtypes:
customerid                            object
birthdate                     datetime64[ns]
bank_account_type                     object
longitude_gps            

  df[c] = pd.to_datetime(df[c], errors="coerce", dayfirst=dayfirst)
  df[c] = pd.to_datetime(df[c], errors="coerce", dayfirst=dayfirst)


In [53]:

def date_diff_days(later, earlier):

    return (later - earlier).dt.total_seconds() / (24*3600)



def make_perf_features(df):

    out = df.copy()

    out["days_creation_to_approval"] = date_diff_days(out["approveddate"], out["creationdate"])

    out.loc[out["days_creation_to_approval"] < 0, "days_creation_to_approval"] = np.nan

    out["approved_month"] = out["approveddate"].dt.month

    out["approved_dow"]   = out["approveddate"].dt.dayofweek

    return out



def make_prev_aggregates(df_prev):

    g = df_prev.copy()

    g["days_to_close"]       = date_diff_days(g["closeddate"],      g["approveddate"])

    g["days_to_first_repay"] = date_diff_days(g["firstrepaiddate"], g["approveddate"])

    g["repaid_before_due"]   = (g["firstrepaiddate"] < g["firstduedate"]).astype(float)



    agg = g.groupby("customerid").agg(

        prev_loan_count=("systemloanid","count"),

        prev_loanamount_mean=("loanamount","mean"),

        prev_totaldue_mean=("totaldue","mean"),

        prev_termdays_mean=("termdays","mean"),

        prev_days_to_close_mean=("days_to_close","mean"),

        prev_days_to_first_repay_mean=("days_to_first_repay","mean"),

        prev_repaid_before_due_rate=("repaid_before_due","mean"),

    ).reset_index()

    return agg



def merge_everything(perf_fe, dem, prev_agg, include_target=True):


    df = perf_fe.merge(dem, on="customerid", how="left")

    df = df.merge(prev_agg, on="customerid", how="left")



    # age at approval (NaN if either side missing)

    df["age_years"] = (df["approveddate"] - df["birthdate"]).dt.days / 365.25



    if include_target and "good_bad_flag" in df.columns:

        df["Good_Bad_flag"] = (

            df["good_bad_flag"].astype("string").str.strip().str.lower() == "good"

        ).astype(int)

        df = df.drop(columns=["good_bad_flag"])

    return df

In [54]:
# current-loan features
train_perf_fe = make_perf_features(train_perf)
test_perf_fe  = make_perf_features(test_perf)

# previous-loan aggregates
train_prev_agg = make_prev_aggregates(train_prev)
test_prev_agg  = make_prev_aggregates(test_prev)

# merged modeling tables
train_all = merge_everything(train_perf_fe, train_dem, train_prev_agg, include_target=True)
test_all  = merge_everything(test_perf_fe,  test_dem,  test_prev_agg,  include_target=False)

print("train_all:", train_all.shape)
print("test_all :", test_all.shape)

# sanity peek
train_all[["customerid","loanamount","approveddate","age_years"]].head()

train_all: (4376, 29)
test_all : (1450, 28)


Unnamed: 0,customerid,loanamount,approveddate,age_years
0,8a2a81a74ce8c05d014cfb32a0da1049,30000.0,2017-07-25 08:22:56,45.524983
1,8a85886e54beabf90154c0a29ae757c0,15000.0,2017-07-05 17:04:41,31.865845
2,8a8588f35438fe12015444567666018e,20000.0,2017-07-06 14:52:57,32.796715
3,8a85890754145ace015429211b513e16,10000.0,2017-07-27 19:00:41,39.794661
4,8a858970548359cc0154883481981866,40000.0,2017-07-03 23:42:45,30.819986


In [55]:
numeric_features = [
    "loanamount","totaldue","termdays","loannumber",
    "days_creation_to_approval","approved_month","approved_dow",
    "longitude_gps","latitude_gps","age_years",
    "prev_loan_count","prev_loanamount_mean","prev_totaldue_mean",
    "prev_termdays_mean","prev_days_to_close_mean",
    "prev_days_to_first_repay_mean","prev_repaid_before_due_rate",
]

#avoiding  KeyErrors)
kept = [c for c in numeric_features if c in train_all.columns]

X = train_all[kept].copy()
y = train_all["Good_Bad_flag"].astype(int)
X_test = test_all[kept].copy()

# median fill
train_meds = X.median(numeric_only=True)
X      = X.fillna(train_meds)
X_test = X_test.fillna(train_meds)

print("features used:", kept)
print("X shape:", X.shape, " | X_test shape:", X_test.shape)

features used: ['loanamount', 'totaldue', 'termdays', 'loannumber', 'days_creation_to_approval', 'approved_month', 'approved_dow', 'longitude_gps', 'latitude_gps', 'age_years', 'prev_loan_count', 'prev_loanamount_mean', 'prev_totaldue_mean', 'prev_termdays_mean', 'prev_days_to_close_mean', 'prev_days_to_first_repay_mean', 'prev_repaid_before_due_rate']
X shape: (4376, 17)  | X_test shape: (1450, 17)


In [56]:
#modelling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = DecisionTreeClassifier(max_depth=6, random_state=42)
clf.fit(X_tr, y_tr)

va_pred = clf.predict(X_va)
acc = accuracy_score(y_va, va_pred)
err = 1 - acc


print("Validation accuracy:", round(acc, 4))
print("Validation error rate:", round(err, 4))

Validation accuracy: 0.7523
Validation error rate: 0.2477


In [40]:
# fit on all training rows
clf.fit(X, y)

# predict test labels (0/1)
test_pred = clf.predict(X_test).astype(int)

submission = pd.DataFrame({
    "customerid": test_all["customerid"],
    "Good_Bad_flag": test_pred
})

if "sample_sub" in globals() and set(sample_sub["customerid"]) == set(submission["customerid"]):
    submission = (submission.set_index("customerid")
                            .loc[sample_sub["customerid"]]
                            .reset_index())

OUT_PATH = "submission_superlender.csv"
submission.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)
submission.head()

Saved: submission_superlender.csv


Unnamed: 0,customerid,Good_Bad_flag
0,8a28afc7474813a40147639ec637156b,1
1,8a3735d5518aba7301518ac34413010d,1
2,8a76e7d443e6e97c0143ed099d102b1d,1
3,8a818823525dceef01525deda2480384,1
4,8a818926522ea5ef01523aff15c37482,1


In [None]:
#using an