In [39]:
%pip install missingno MissForest lazypredict

Note: you may need to restart the kernel to use updated packages.


In [40]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
import shap
import lightgbm as lgb

# NOTE
El DataFrame df se crea a partir de la tabla base_datos_pripal de la base de datos credit_scoring.db. El archivo no se incluye en el repositorio debido a su tamaño; la generación de la base de datos a partir del CSV original y la creación de la tabla se explica detalladamente en el notebook data-collection.ipynb

Descriptive analysis:
In this part of the project, we begin exploring the dataset created from the initial information obtained from the LendingClub dataset (Kaggle). The objective of this stage is to describe and understand the structure of the data, the variables and their types, their distributions, skewness, and the presence of missing values.

In [41]:
conn = sqlite3.connect(r"c:\Users\User\Documents\GITHUB\final_project_creditscoring\Data\credit_scoring.db")
df = pd.read_sql("SELECT * FROM main_table", conn)
conn.close()
n_rows,n_cols = df.shape

print(f'En este df existen {n_rows} filas y {n_cols} columnas')

En este df existen 192309 filas y 157 columnas


With the analysis below, we can understand that the dataset contains a large number of numerical variables, along with several categorical features represented as object types. This initial inspection highlights the need for feature selection and type handling in later stages.

In [42]:
cols_types = df.dtypes.reset_index().rename(
    columns={'index': 'column_name', 0: 'dtype'}
)

cols_types['dtype'].value_counts()

dtype
float64    118
object      38
int64        1
Name: count, dtype: int64

In [43]:
cols_types[cols_types['dtype'] == 'object']

Unnamed: 0,column_name,dtype
1,member_id,object
5,term,object
8,grade,object
9,sub_grade,object
10,emp_title,object
11,emp_length,object
12,home_ownership,object
14,verification_status,object
15,issue_d,object
16,loan_status,object


In [44]:
print(cols_types[cols_types['dtype'] == 'float64'].to_string(index=False))

                               column_name   dtype
                                 loan_amnt float64
                               funded_amnt float64
                           funded_amnt_inv float64
                                  int_rate float64
                               installment float64
                                annual_inc float64
                                       dti float64
                               delinq_2yrs float64
                            fico_range_low float64
                           fico_range_high float64
                            inq_last_6mths float64
                    mths_since_last_delinq float64
                    mths_since_last_record float64
                                  open_acc float64
                                   pub_rec float64
                                 revol_bal float64
                                revol_util float64
                                 total_acc float64
                               

In [45]:
cols_types[cols_types['dtype'] == 'int64']

Unnamed: 0,column_name,dtype
0,id,int64


Revision of constant columns

In [46]:
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# Unique values per column
uniq = df.nunique(dropna=False)

# Show results
print(uniq)

id                                            192309
member_id                                          1
loan_amnt                                       1456
funded_amnt                                     1457
funded_amnt_inv                                 4943
term                                               2
int_rate                                         500
installment                                    42329
grade                                              7
sub_grade                                         35
emp_title                                      92693
emp_length                                        12
home_ownership                                     6
annual_inc                                     15650
verification_status                                3
issue_d                                          103
loan_status                                        9
pymnt_plan                                         2
url                                           

In [47]:
uniq = df.nunique(dropna=False)
uniq[uniq == 1]

member_id      1
policy_code    1
dtype: int64

In [48]:
cols = ['member_id', 'policy_code']

for col in cols:
    print(df[col].value_counts(dropna=False))

member_id
None    192309
Name: count, dtype: int64
policy_code
1.0    192309
Name: count, dtype: int64


In [49]:
df = df.drop(columns=['member_id', 'policy_code','id'])

Revision of Duplicated Rows

In [50]:
df.duplicated().sum()

np.int64(0)

Missing values analysis

In [51]:
missing = df.isna().mean()*100
print(missing[missing>0])

emp_title                                      6.883193
emp_length                                     5.486483
desc                                          77.997390
title                                          0.595396
dti                                            0.039520
mths_since_last_delinq                        54.635508
mths_since_last_record                        87.276207
revol_util                                     0.066040
last_pymnt_d                                   0.106079
next_pymnt_d                                  75.548206
last_credit_pull_d                             0.005200
mths_since_last_major_derog                   80.243254
annual_inc_joint                              96.915381
dti_joint                                     96.915381
verification_status_joint                     97.036020
tot_coll_amt                                  23.141403
tot_cur_bal                                   23.141403
open_acc_6m                                   63

In [52]:
missing_threshold = 0

high_missing_cols = missing[missing >= missing_threshold]
print(high_missing_cols)

loan_amnt                                      0.000000
funded_amnt                                    0.000000
funded_amnt_inv                                0.000000
term                                           0.000000
int_rate                                       0.000000
installment                                    0.000000
grade                                          0.000000
sub_grade                                      0.000000
emp_title                                      6.883193
emp_length                                     5.486483
home_ownership                                 0.000000
annual_inc                                     0.000000
verification_status                            0.000000
issue_d                                        0.000000
loan_status                                    0.000000
pymnt_plan                                     0.000000
url                                            0.000000
desc                                          77

In [53]:
# Column categorization
def classify_column(col):
    if col.startswith(("hardship", "settlement", "deferral")):
        return "post_loan"
    if col.startswith("sec_app") or col.endswith("_joint"):
        return "second_applicant"
    if col.startswith("mths_since"):
        return "structural_missing"
    return "other"

categories = ["post_loan", "second_applicant", "structural_missing"]
analysis_dict = {cat: [col for col in df.columns if classify_column(col) == cat] for cat in categories}

print("Column categorization complete")

Column categorization complete


In [54]:
# Remove post-loan and second-applicant columns
post_loan_cols = analysis_dict["post_loan"]
second_applicant = analysis_dict["second_applicant"]

cols_to_drop = post_loan_cols + second_applicant
df = df.drop(columns=cols_to_drop)
print(f"Dropped {len(cols_to_drop)} columns")

Dropped 34 columns


# Train-Test Split

In [55]:
# Filter frequent classes only
print(df['loan_status'].value_counts())

frequent_classes = df['loan_status'].value_counts()[df['loan_status'].value_counts() > 5].index
df = df[df['loan_status'].isin(frequent_classes)]

print(f"Filtered to {len(df)} rows")

loan_status
Fully Paid                                             119172
Current                                                 44568
Charged Off                                             26114
Late (31-120 days)                                       1141
Does not meet the credit policy. Status:Fully Paid        442
In Grace Period                                           436
Late (16-30 days)                                         249
Does not meet the credit policy. Status:Charged Off       186
Default                                                     1
Name: count, dtype: int64
Filtered to 192308 rows


In [56]:
# Create X, y split
X = df.drop(columns=['loan_status'])
y = df['loan_status']

# Split 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")

Train: (153846, 119), Test: (38462, 119)


# Feature Engineering

In [57]:
# Convert term to numeric
X_train['term'] = X_train['term'].astype(str).str.replace(' months','').astype(int)
X_test['term'] = X_test['term'].astype(str).str.replace(' months','').astype(int)

# Remove free text columns
cols_to_drop = ['emp_title', 'url', 'title', 'desc']
X_train = X_train.drop(columns=[c for c in cols_to_drop if c in X_train.columns])
X_test = X_test.drop(columns=[c for c in cols_to_drop if c in X_test.columns])

# Handle zip_code
X_train['zip_code'] = X_train['zip_code'].astype(str).str[:3]
X_test['zip_code'] = X_test['zip_code'].astype(str).str[:3]

ord_enc_zip = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[['zip_code']] = ord_enc_zip.fit_transform(X_train[['zip_code']])
X_test[['zip_code']] = ord_enc_zip.transform(X_test[['zip_code']])

print("Feature engineering completed")

Feature engineering completed


In [58]:
# One-hot encode categorical variables
categorical_cols = ['grade', 'sub_grade', 'home_ownership', 
                    'verification_status', 'purpose', 
                    'initial_list_status', 'application_type']

ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[categorical_cols])

X_train_encoded = pd.DataFrame(ohe.transform(X_train[categorical_cols]), 
                               columns=ohe.get_feature_names_out(categorical_cols), 
                               index=X_train.index)
X_test_encoded = pd.DataFrame(ohe.transform(X_test[categorical_cols]), 
                              columns=ohe.get_feature_names_out(categorical_cols), 
                              index=X_test.index)

X_train = X_train.drop(columns=categorical_cols).join(X_train_encoded)
X_test = X_test.drop(columns=categorical_cols).join(X_test_encoded)

print(f"X_train shape: {X_train.shape}")

X_train shape: (153846, 170)


In [59]:
# Encode binary columns
binary_cols = ['pymnt_plan', 'debt_settlement_flag']
for col in binary_cols:
    if col in X_train.columns:
        X_train[col] = X_train[col].map({'y': 1, 'n': 0})
        X_test[col] = X_test[col].map({'y': 1, 'n': 0})

print("Binary encoding completed")

Binary encoding completed


In [60]:
# One-hot encode addr_state
if 'addr_state' in X_train.columns:
    ohe_state = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
    ohe_state.fit(X_train[['addr_state']])

    X_train_state = pd.DataFrame(ohe_state.transform(X_train[['addr_state']]),
                                 columns=ohe_state.get_feature_names_out(['addr_state']),
                                 index=X_train.index)
    X_test_state = pd.DataFrame(ohe_state.transform(X_test[['addr_state']]),
                                columns=ohe_state.get_feature_names_out(['addr_state']),
                                index=X_test.index)

    X_train = X_train.drop(columns=['addr_state']).join(X_train_state)
    X_test = X_test.drop(columns=['addr_state']).join(X_test_state)

print("State encoding completed")

State encoding completed


# Target Encoding

In [61]:
# Encode target to binary
default_labels = {
    'Charged Off': 1,
    'Does not meet the credit policy. Status:Charged Off': 1,
    'Does not meet the credit policy. Status:Fully Paid': 1,
    'Late (16-30 days)': 1,
    'Late (31-120 days)': 1,
    'Fully Paid': 0,
    'Current': 0,
    'In Grace Period': 0
}

y_train = y_train.map(default_labels)
y_test = y_test.map(default_labels)

print("Target distribution:")
print(y_train.value_counts())
print(y_test.value_counts())

Target distribution:
loan_status
0    131340
1     22506
Name: count, dtype: int64
loan_status
0    32836
1     5626
Name: count, dtype: int64


# Outlier Detection

In [62]:
# Apply IsolationForest
df_train_clean = X_train.copy()

iso = IsolationForest(random_state=123, contamination='auto')
df_train_clean['outlier_flag'] = iso.fit_predict(df_train_clean)

# Remove outliers
df_train_clean = df_train_clean[df_train_clean['outlier_flag'] == 1]
df_train_clean = df_train_clean.drop(columns=['outlier_flag'])
y_train_clean = y_train.loc[df_train_clean.index]

print(f"Rows after removing outliers: {df_train_clean.shape[0]}")

ValueError: could not convert string to float: '4 years'

# Feature Selection

In [None]:
# Method 1: Random Forest Feature Importance
model_rf = RandomForestClassifier(random_state=42, n_jobs=-1).fit(df_train_clean, y_train_clean)
importances = model_rf.feature_importances_ / model_rf.feature_importances_.sum() * 100

df_rf_imp = pd.DataFrame({
    'feature': df_train_clean.columns,
    'rf_importance': importances
}).sort_values(by='rf_importance', ascending=False)

df_rf_imp['rf_importance_acum'] = df_rf_imp['rf_importance'].cumsum()
print(df_rf_imp.head(20))

In [None]:
# Method 2: Permutation Importance
X_train1, X_val, y_train1, y_val = train_test_split(
    df_train_clean, y_train_clean, test_size=0.2, random_state=42
)

model_xgb = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
).fit(X_train1, y_train1)

perm = permutation_importance(
    model_xgb, X_val, y_val, n_repeats=10, random_state=42, n_jobs=-1, scoring='accuracy'
)

df_perm_imp = pd.DataFrame({
    'feature': df_train_clean.columns,
    'perm_imp': perm.importances_mean * 100
}).sort_values('perm_imp', ascending=False)

print(df_perm_imp.head(20))

In [None]:
# Method 3: SHAP Values
model_lgbm = lgb.LGBMClassifier(random_state=42, n_jobs=-1).fit(df_train_clean, y_train_clean)

explainer = shap.Explainer(model_lgbm, X_val)
shap_vals = explainer(X_val).values

imp_shap = np.abs(shap_vals).mean(axis=0)
imp_shap_pct = imp_shap / imp_shap.sum() * 100

df_shap_imp = pd.DataFrame({
    "feature": X_val.columns,
    "shap_imp": imp_shap_pct
}).sort_values('shap_imp', ascending=False)

print(df_shap_imp.head(20))

In [None]:
%pip install missingno MissForest lazypredict

In [None]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
import lightgbm as lgb

In [None]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
import shap
import lightgbm as lgb

# NOTE
El DataFrame df se crea a partir de la tabla basa_datos_pripal de la base de datos credit_scoring.db. El archivo no se incluye en el repositorio debido a su tamaño; la generación de la base de datos a partir del CSV original y la creación de la tabla se explica detalladamente en el notebook data-collection.ipynb

Descriptive analysis:
In this part of the project, we begin exploring the dataset created from the initial information obtained from the LendingClub dataset (Kaggle). The objective of this stage is to describe and understand the structure of the data, the variables and their types, their distributions, skewness, and the presence of missing values.

In [None]:
conn = sqlite3.connect("/workspaces/final_project_creditscoring/Data/credit_scoring.db")
df = pd.read_sql("SELECT * FROM main_table", conn)
conn.close()
n_rows,n_cols = df.shape

print(f'En este df existen {n_rows} filas y {n_cols} columnas')

With the analysis below, we can understand that the dataset contains a large number of numerical variables, along with several categorical features represented as object types. This initial inspection highlights the need for feature selection and type handling in later stages. Now we are proceeding with a list of each type of column to identify possible issues in data types (such as date columns that are objects or numerical data that is shown as objects, etc.).

In [None]:
cols_types = df.dtypes.reset_index().rename(
    columns={'index': 'column_name', 0: 'dtype'}
)

cols_types['dtype'].value_counts()

In [None]:
cols_types[cols_types['dtype'] == 'object']

In [None]:
print(cols_types[cols_types['dtype'] == 'float64'].to_string(index=False))

In [None]:
cols_types[cols_types['dtype'] == 'int64']

Revision of constant columns: 

The following code initially didn't specify dropna=False, which made it show a few columns as constants. This led us to investigate what was happening and whether we were working with the right dataframe. However, this mistake was enlightening, as it helped us identify possible *data leakage variables, such as: hardship_type, deferral_term, and hardship_length.*

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# Unique values per column
uniq = df.nunique(dropna=False)

# Show results
print(uniq)

In [None]:
uniq = df.nunique(dropna=False)
uniq[uniq == 1]

After the code revision, the only two variables with constant values are member_id and policy_code. It does not make much sense to have a unique value for member_id if we have almost 200k data entries, so we needed to check the exact values contained in this column.

In [None]:
cols = ['member_id', 'policy_code']

for col in cols:
    print(df[col].value_counts(dropna=False))

Regarding policy_code, a similar situation occurs. According to the data dictionary, LendingClub has only two types of policies: publicly available (1) and new products not publicly available (2). In this dataset, only publicly available products are present. Therefore, following the same reasoning as above, policy_code is not a relevant column for the analysis.

Lastly, as we are dropping member_id and policy_code because they are not predictors, we are doing the same with two other columns as well, which, even though they do not have constant values, can generate noise in the analysis: the url column and the id column. 

In [None]:
df = df.drop(columns=['member_id', 'policy_code','id'])

Revision of Duplicated Rows: No duplicated rows were identified.

In [None]:
df.duplicated().sum()

Revision of Duplicated Columns: Two variables (deferral_term and hardship_length) were found to be exact duplicates, containing identical values across all observations. Both variables are related to post-loan hardship events (we previously identified them as potential data leakers) and will therefore be excluded from the modeling stage. 

In [None]:
df.T.duplicated().sum()
df.T.duplicated(keep=False)

Missing values: 
We identified columns with a high percentage of missing values, so we proceeded to define a missing threshold of 50%, where variables with more than 50% missing values will be considered for exclusion from the modeling stage. However, first we must evaluate them on a case-by-case basis to understand if any of those variables are conceptually important.

In [None]:
missing = df.isna().mean()*100
missing[missing>0]
print(missing)

In [None]:
missing_threshold = 0

high_missing_cols = missing[missing >= missing_threshold]
print(high_missing_cols)

In [None]:
(missing > 0).sum()

Now we have to identify other missing values and audit them to understand how we should treat each case.

In [None]:
#Identify other missing values
cat_col = df.select_dtypes(include=['object']).columns

for col in cat_col: 
    print(df[col].value_counts())

In [None]:
df_faltantes = df.replace(['None'],np.nan,inplace=True)

Columns with more than 50% missing values were manually reviewed and classified into post-loan variables, second-applicant features, structurally missing variables, and late-reported behavioral features based on domain knowledge and data documentation. No features were removed at this stage; the analysis documents decisions to be applied during model preparation.

In [None]:
def classify_column(col):
    if col.startswith(("hardship", "settlement", "deferral")):
        return "post_loan"
    if col.startswith("sec_app") or col.endswith("_joint"):
        return "second_applicant"
    if col.startswith("mths_since"):
        return "structural_missing"
    return "other"

categories = ["post_loan", "second_applicant", "structural_missing"]
analysis_dict = {cat: [col for col in df.columns if classify_column(col) == cat] for cat in categories}

print("🔍 --- STARTING COLUMN CATEGORIZATION ANALYSIS --- 🔍\n")

for category, cols in analysis_dict.items():
    print(f"📁 CATEGORY: {category.upper()}")
    if not cols:
        print("   ❌ No columns found in this category.\n")
    else:
        print(f"   ✅ Found {len(cols)} columns.")
        missing_stats = df[cols].isnull().mean() * 100
        print(missing_stats.sort_values(ascending=False).to_string())
        print("-" * 40 + "\n")

print("🚀 --- ANALYSIS COMPLETE --- 🚀")

In [None]:
#Column checker: to be able to quickly check the characteristics of a column and its type

target_col = 'loan_status'  

col_type = df[target_col].dtype

summary = pd.DataFrame({
    'Count': df[target_col].value_counts(dropna=False),
    'Percentage (%)': df[target_col].value_counts(dropna=False, normalize=True) * 100
})

print(f"Content Analysis for: {target_col.upper()}")
print(summary)
print('---'*30)
print(f"Data Type: {col_type}")

In [None]:
all_classified_cols = (analysis_dict['post_loan'] + 
                      analysis_dict['second_applicant'] + 
                      analysis_dict['structural_missing'])

other_cols = [col for col in df.columns if col not in all_classified_cols]

print("🔍 --- AUDITING 'OTHER' COLUMNS WITH HIGH MISSING RATIO (>40%) ---")
high_missing_other = df[other_cols].isnull().mean()
high_missing_other = high_missing_other[high_missing_other > 0.4].sort_values(ascending=False)

if high_missing_other.empty:
    print("✅ No additional critical missing values found outside defined categories.")
else:
    print("⚠️ Attention: The following columns also have a high missing ratio:")
    print(high_missing_other.to_string())

1. *Post-loan variables:*

These features contain information generated after loan origination, such as hardship or settlement events. Their high missingness reflects the fact that most loans do not enter these processes. Because these variables include future information relative to the credit decision, they were identified as potential sources of data leakage.

Planned decision: Exclude.

2. *Second-applicant variables:*

These variables describe characteristics of a co-borrower in joint loan applications. The high proportion of missing values reflects that most loans involve a single applicant, meaning missing values indicate the absence of a second applicant rather than missing information.

Rather than modeling the full co-borrower profile, the presence of a second applicant is captured through a binary indicator. This approach preserves potentially relevant information while avoiding additional complexity and extensive imputation.

Planned decision: Create a binary flag indicating whether a loan includes a second applicant, and exclude detailed second-applicant features during model preparation.

3. *Structurally missing variables:*

These features represent the time since the last occurrence of negative credit events. Missing values indicate that the event has never occurred, making the missingness itself informative.

Planned decision: Retain for modeling and apply a dedicated imputation strategy at a later stage.

4. *Late-reported features:*

These variables were introduced into the dataset at later periods and are unavailable for older loans. Missingness is driven by historical reporting limitations rather than borrower behavior.

Planned decision: Evaluate after defining the temporal train-test split.

DATA CLEANING & PREPROCESSING STRATEGY

Now that we have a clearer understanding of the data, we can proceed with data cleaning and processing.

1. DF Backup: Create a full copy of the raw dataset to ensure data integrity and allow for easy rollbacks during the experimentation phase.

2. Target Definition & Filtering: Refine the loan_status variable. We exclude ongoing loans and focus only on definitive outcomes.

Default (1): Charged off, default, or late (30–120 days).
Charged Off
Late (31-120 days)
Default
Does not meet the credit policy. Status:Charged Off

Non-Default (0): Fully paid.
Fully Paid

3. Leakage Removal: Drop all Post-loan variables. These features contain information only available after the credit decision has been made, which would lead to Data Leakage.

4. Structural Simplification (Joint Apps): Consolidate +16 second-applicant features into a single Binary Flag (is_joint_application). This reduces dimensionality while preserving the fact that a co-borrower exists.

5. Zero Ratio & Variance Analysis: Identify features with excessive sparsity. We decide whether to drop columns with near-zero variance or binarize features where the simple presence of an event (0 vs >0) is more predictive than its frequency.

6. Missingness Audit (Missingno): Visualize the remaining missing values to determine the mechanism of missingness (Random vs. Structural). This dictates the final decision: drop the column (if >50% NaN) or keep it for Imputation after the Train-Test Split.

In [None]:
df_backup = df.copy()

In [None]:
# Function to inspect unique values and their prevalence
def inspect_categories(dataframe, column_list):
    """
    Prints frequency and percentage distribution for categorical features.
    """
    for col in column_list:
        print(f"\n--- Feature: {col.upper()} ---")
        
        counts = dataframe[col].value_counts(dropna=False)
        percentages = dataframe[col].value_counts(dropna=False, normalize=True) * 100
        
        summary = pd.DataFrame({
            'Count': counts,
            'Percentage (%)': percentages.round(2)
        })
        
        print(summary)
        print("-" * 30)

target_cols = analysis_dict.get('other', [])
inspect_categories(df, target_cols)

Now we proceed with a zero ratio analysis, that has the objective of helping us decide which variables doesn't have enough information to support the model and which ones are corrupt with 0s that should be NaNs.

In [None]:
#Zero Ratio Analysis
zero_ratio = (df == 0).mean() * 100
zero_ratio_all = zero_ratio[zero_ratio > 0].sort_values(ascending=False)

print("ALL COLUMNS WITH ZEROS")
print(zero_ratio_all.to_string())

Having identified the columns with higher amounts of 0s, we decided to audit them one by one and review their unique values to understand whether the 0s are informative or if they represent null/NaN values. This audit is half based on the code shown below and half on a manual review of the dataset dictionary.

In [None]:
# 1. Calculate Zero Ratio again to get the target columns
zero_ratio = (df == 0).mean() * 100
# Define a threshold (e.g., columns with more than 50% zeros)
high_zero_threshold = 30.0
high_zero_cols = zero_ratio[zero_ratio > high_zero_threshold].sort_values(ascending=False).index.tolist()

def audit_high_zero_columns(dataframe, column_list):
    """
    Audits columns with high zero ratios to see value distribution 
    and help decide between dropping, keeping, or binarizing.
    """
    print(f"🔍 --- AUDITING {len(column_list)} COLUMNS WITH > {high_zero_threshold}% ZEROS --- \n")
    
    for col in column_list:
        print(f"📊 Feature: {col.upper()}")
        print(f"Zero Ratio: {zero_ratio[col]:.2f}%")
        
        # Count unique values excluding zero
        non_zero_values = dataframe[dataframe[col] != 0][col]
        unique_counts = non_zero_values.nunique()
        
        print(f"Unique values (excluding zero): {unique_counts}")
        
        if unique_counts < 15:
            # If few unique values, show frequency
            print("Distribution (Top Values):")
            print(dataframe[col].value_counts().head(10))
        else:
            # If many unique values, show basic stats for non-zero data
            print("Non-zero stats:")
            print(non_zero_values.describe()[['mean', 'min', 'max']])
        
        print("-" * 40)

# Execute the audit
audit_high_zero_columns(df, high_zero_cols)

The function below is intended to serve as a filter to help us quickly verify whether a column was correctly classified in the categories defined above.

In [None]:
def get_column_category(column_name, mapping_dict):
    """
    Checks which category a specific column belongs to based on the analysis_dict.
    """
    for category, columns in mapping_dict.items():
        if column_name in columns:
            return category
    return "other (or not found)"

test_col = 'loan_status' # You can change this name to any column
result = get_column_category(test_col, analysis_dict)
print(f"Verification: The column '{test_col}' is categorized as: {result.upper()}")

# 3. Batch verification (Optional)
# List of columns you want to verify right now
verify_list = ['sec_app_fico_range_low', 'mths_since_last_delinq', 'loan_amnt', 'settlement_term']

print("BATCH VERIFICATION:")
for col in verify_list:
    cat = get_column_category(col, analysis_dict)
    print(f"- {col:30} -> Category: {cat}")

In [None]:
low_variance_cols = []

for col in df.columns:
    vc = df[col].value_counts(dropna=False, normalize=True)
    if vc.iloc[0] > 0.99:   # más del 99% el mismo valor
        low_variance_cols.append(col)

low_variance_cols

In [None]:
desc = df.describe().T
desc.sort_values(by='max', ascending=False).head(20)

The analysis of the maximum values reveals the presence of extreme values in some financial variables, which suggests the need to apply transformations or outlier treatment techniques in later stages.

In [None]:
desc.assign(
    mean_median_ratio = desc['mean'] / desc['50%']
).sort_values('mean_median_ratio', ascending=False).head(10)

Several numerical variables present highly skewed distributions, with median values equal to zero and a small proportion of non-zero observations. This pattern is expected for count-based credit history variables. However, some highly skewed variables correspond to post-loan information and will therefore be excluded from the modeling process to prevent data leakage.

EDA (cerrando)
1. Tratamiento columnas con alto % de 0s ⏳
2. Matriz de missing (missingno) ⏳
3. Valores faltantes explícitos ✅
4. Valores faltantes ocultos ✅
5. Filas duplicadas ✅
6. Drop policy_code ✅

Decisiones de features

7. Identificar columnas data leakage ⏳(ya estan identificadas, ahora hay que hacer drop)
8. Definir estrategia second applicant (flag + drop cols) ⏳
9. Identificar columnas ID / no predictivas ⏳(ya estan identificadas, ahora hay que hacer drop)
10. Definir target ⏳(ya identificado)

Modelado

11. Crear df_model
12. Split temporal
13. Imputación / preprocessing

We removed the pos_loan category because it represents data leaks, and also the second_applicant category, since we only need one flag and this would be application_type, which tells us if the loan was taken by a group or individually.


In [None]:
# Lista de columnas "post_loan"
post_loan_cols = analysis_dict["post_loan"]
# Lista de columnas "second_applicant"
second_applicant = analysis_dict["second_applicant"]

cols_to_drop = post_loan_cols + second_applicant
df = df.drop(columns=cols_to_drop)

# Split train-test

Al explorar la columna objetivo loan_status, se identificó que algunas clases tenían un número extremadamente bajo de registros (por ejemplo, solo uno o dos casos). Esto genera problemas al dividir el dataset en entrenamiento y prueba usando estratificación (stratify=y), ya que no es posible mantener la proporción de clases cuando algunas aparecen muy pocas veces.

Para evitar este error y asegurar que el modelo pueda generalizar correctamente, se filtraron estas clases minoritarias, manteniendo únicamente las clases con un número suficiente de observaciones. Esto permite realizar un train-test split estratificado seguro y garantiza que tanto el conjunto de entrenamiento como el de prueba tengan representatividad adecuada de cada clase relevante.

In [None]:
# Contar la frecuencia de cada clase
print(df['loan_status'].value_counts())

# Mantener solo clases frecuentes 
frequent_classes = df['loan_status'].value_counts()[df['loan_status'].value_counts() > 5].index
df = df[df['loan_status'].isin(frequent_classes)]

In [None]:
# Supongamos que tu DataFrame se llama df
X = df.drop(columns=['loan_status'])  # Variables predictoras
y = df['loan_status']                 # Target

# Split 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Codificacion columnas tipo object

In [None]:
df.dtypes.value_counts()

In [None]:
df.select_dtypes(include='object').columns

Manejo de fechas

Las columnas de tipo fecha (issue_d, earliest_cr_line, last_pymnt_d, etc.) no se pueden usar directamente en un modelo. Por eso:

- Convertimos las columnas a tipo datetime.

- Creamos features derivadas, como antigüedad del crédito (credit_age) en años, que pueden ser más útiles para el modelo que la fecha cruda.

In [None]:
date_cols = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 
             'next_pymnt_d', 'last_credit_pull_d', 
             'payment_plan_start_date', 'debt_settlement_flag_date']

for col in date_cols:
    X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
    X_test[col] = pd.to_datetime(X_test[col], errors='coerce')

# antigüedad del crédito
X_train['credit_age'] = (pd.to_datetime('today') - X_train['earliest_cr_line']).dt.days / 365
X_test['credit_age'] = (pd.to_datetime('today') - X_test['earliest_cr_line']).dt.days / 365

Codificación de variables categóricas (One-hot eficiente)

- a) Variables nominales (sin orden) – One-hot eficiente

Usamos drop='first' para evitar multicolinealidad (importante en regresión logística).

In [None]:
categorical_cols = ['grade', 'sub_grade', 'home_ownership', 
                    'verification_status', 'purpose', 
                    'initial_list_status', 'application_type', 
                    'disbursement_method']

# One-hot encoder eliminando la primera categoría
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# Ajustamos solo con train
ohe.fit(X_train[categorical_cols])

# Transformamos train y test
X_train_encoded = pd.DataFrame(ohe.transform(X_train[categorical_cols]), 
                               columns=ohe.get_feature_names_out(categorical_cols), 
                               index=X_train.index)

X_test_encoded = pd.DataFrame(ohe.transform(X_test[categorical_cols]), 
                              columns=ohe.get_feature_names_out(categorical_cols), 
                              index=X_test.index)

# Reemplazamos las columnas originales por las codificadas
X_train = X_train.drop(columns=categorical_cols).join(X_train_encoded)
X_test = X_test.drop(columns=categorical_cols).join(X_test_encoded)

- b) Variables ordinales (con orden)

El orden es relevante, por eso asignamos números manualmente:

In [None]:
emp_length_map = {
    '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3,
    '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7,
    '8 years': 8, '9 years': 9, '10+ years': 10
}

X_train['emp_length'] = X_train['emp_length'].map(emp_length_map)
X_test['emp_length'] = X_test['emp_length'].map(emp_length_map)

Eliminación de texto libre (desc)

El texto libre no aporta información estructurada inmediata y procesarlo requeriría NLP. Para mantener un modelo interpretable y eficiente, se eliminó.

In [None]:
X_train = X_train.drop(columns=['desc'])
X_test = X_test.drop(columns=['desc'])

Manejo de columnas binarias

Convertimos y/n a 1/0:

In [None]:
binary_cols = ['pymnt_plan', 'debt_settlement_flag']
for col in binary_cols:
    X_train[col] = X_train[col].map({'y': 1, 'n': 0})
    X_test[col] = X_test[col].map({'y': 1, 'n': 0})

In [None]:
# Revisar los tipos de todas las columnas
print(X_train.dtypes.value_counts())

In [None]:
# Columnas que siguen siendo object
object_cols = X_train.select_dtypes(include='object').columns
print(object_cols)

Manejo de columnas tipo object restantes

Al revisar el dataset tras la limpieza y codificación inicial, quedaron algunas columnas tipo object: term, emp_title, url, title, zip_code y addr_state.

Estas columnas no se eliminaron en los pasos anteriores porque algunas requerían transformaciones específicas para ser útiles en el modelo, mientras que otras podían eliminarse para mantener interpretabilidad y eficiencia:

- term: contiene la duración del préstamo como texto (ej. "36 months"). Se convirtió a un valor numérico en meses para que el modelo pueda utilizarlo directamente.

- emp_title, url, title: columnas de texto libre con demasiadas categorías únicas y sin estructura clara. Se eliminaron para simplificar el modelo y mantener su interpretabilidad.

- zip_code: originalmente un código postal completo, se redujo a los primeros 3 dígitos y se codificó numéricamente mediante OrdinalEncoder, manejando correctamente los códigos nuevos que aparezcan en el conjunto de prueba. Esto permite conservar información geográfica sin explotar la dimensionalidad.

- addr_state: contiene el estado de residencia. Como tiene pocas categorías, se codificó mediante One-hot con drop='first', generando columnas independientes que el modelo puede interpretar sin introducir redundancia.

Esta revisión garantiza que todas las variables sean numéricas o codificadas correctamente, evitando errores al entrenar una regresión logística y manteniendo la interpretabilidad y eficiencia del modelo.

In [None]:
# 1️⃣ Convertir 'term' a número de meses (funciona si ya es int o si es string)
# -----------------------------
X_train['term'] = X_train['term'].astype(str).str.replace(' months','').astype(int)
X_test['term'] = X_test['term'].astype(str).str.replace(' months','').astype(int)

# -----------------------------
# 2️⃣ Eliminar columnas de texto libre irrelevantes
# -----------------------------
cols_to_drop = ['emp_title', 'url', 'title', 'desc']
X_train = X_train.drop(columns=[c for c in cols_to_drop if c in X_train.columns])
X_test = X_test.drop(columns=[c for c in cols_to_drop if c in X_test.columns])

# -----------------------------
# 3️⃣ Manejo de zip_code (solo 3 primeros dígitos)
# -----------------------------
X_train['zip_code'] = X_train['zip_code'].astype(str).str[:3]
X_test['zip_code'] = X_test['zip_code'].astype(str).str[:3]

ord_enc_zip = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[['zip_code']] = ord_enc_zip.fit_transform(X_train[['zip_code']])
X_test[['zip_code']] = ord_enc_zip.transform(X_test[['zip_code']])

# -----------------------------
# 4️⃣ One-hot encoding de addr_state (pocas categorías)
# -----------------------------
if 'addr_state' in X_train.columns:
    ohe_state = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
    ohe_state.fit(X_train[['addr_state']])

    X_train_state = pd.DataFrame(ohe_state.transform(X_train[['addr_state']]),
                                 columns=ohe_state.get_feature_names_out(['addr_state']),
                                 index=X_train.index)

    X_test_state = pd.DataFrame(ohe_state.transform(X_test[['addr_state']]),
                                columns=ohe_state.get_feature_names_out(['addr_state']),
                                index=X_test.index)

    X_train = X_train.drop(columns=['addr_state']).join(X_train_state)
    X_test = X_test.drop(columns=['addr_state']).join(X_test_state)

# -----------------------------
# 5️⃣ Convertir columnas datetime a métricas numéricas
# -----------------------------
date_cols = ['earliest_cr_line', 'issue_d', 'last_pymnt_d',
             'next_pymnt_d', 'last_credit_pull_d',
             'payment_plan_start_date', 'debt_settlement_flag_date']

for col in date_cols:
    if col in X_train.columns:
        # Convertir a datetime
        X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
        X_test[col] = pd.to_datetime(X_test[col], errors='coerce')
        
        # Métrica numérica: días desde hoy
        X_train[col + '_days_since'] = (pd.to_datetime('today') - X_train[col]).dt.days
        X_test[col + '_days_since'] = (pd.to_datetime('today') - X_test[col]).dt.days

# Eliminar columnas datetime originales
X_train = X_train.drop(columns=[c for c in date_cols if c in X_train.columns])
X_test = X_test.drop(columns=[c for c in date_cols if c in X_test.columns])

# -----------------------------
# 6️⃣ Verificación final: no object ni datetime
# -----------------------------
print("Tipos de columnas finales X_train:\n", X_train.dtypes.value_counts())
print("Columnas tipo object restantes:", X_train.select_dtypes(include='object').columns.tolist())

Tras la codificación de las columnas tipo object y la eliminación de las columnas de texto libre irrelevantes, el dataset pasó de tener 94 columnas numéricas y 26 columnas tipo object, a:

215 columnas de tipo float64

4 columnas de tipo int64

Esto refleja que todas las variables categóricas han sido correctamente codificadas:

- Las variables binarias y ordinales se mantienen como numéricas (int64 o float64).

- Las variables categóricas con pocas categorías, como addr_state, se codificaron con One-hot.

- Las columnas de fecha se transformaron a métricas numéricas (días transcurridos desde cada fecha), permitiendo que los modelos interpreten la información temporal.

- Las variables de texto libre que no aportaban información estructurada se eliminaron (emp_title, url, title, desc).

Como resultado, no quedan columnas tipo object ni datetime, asegurando que el dataset esté completamente listo para entrenar modelos de regresión logística, IsolationForest o RandomForest, evitando errores y manteniendo interpretabilidad.

### Codificación del Target (loan_status) a Binario

El objetivo del proyecto es predecir si un préstamo caerá en default o no. Originalmente, la columna loan_status contenía múltiples estados textuales como:

'Charged Off', 'Fully Paid', 'Current', 'Late (16-30 days)', 'Late (31-120 days)', 
'Does not meet the credit policy. Status:Charged Off', 'Does not meet the credit policy. Status:Fully Paid', 'In Grace Period'


Para simplificar el problema a clasificación binaria, se realizó la siguiente transformación:

Se definieron ciertos estados como default (1):
'Charged Off', 'Does not meet the credit policy. Status:Charged Off', 'Does not meet the credit policy. Status:Fully Paid', 'Late (16-30 days)', 'Late (31-120 days)'.

Todos los demás estados se consideraron No Default (0): 'Fully Paid', 'Current', 'In Grace Period'.

Esta codificación asegura que el target sea numérico y binario, compatible con modelos de clasificación como RandomForest, XGBoost o regresión logística, evitando errores por valores categóricos y manteniendo la interpretabilidad del modelo.

In [None]:
#  Codificar target a binario usando map
# -----------------------------
default_labels = {
    'Charged Off': 1,
    'Does not meet the credit policy. Status:Charged Off': 1,
    'Does not meet the credit policy. Status:Fully Paid': 1,
    'Late (16-30 days)': 1,
    'Late (31-120 days)': 1,
    'Fully Paid': 0,
    'Current': 0,
    'In Grace Period': 0  # puedes ajustar según tu criterio
}

y_train = y_train.map(default_labels)
y_test = y_test.map(default_labels)

# Revisar conteo de clases
print(y_train.value_counts())
print(y_test.value_counts())

# Outliers

In [None]:
iso = IsolationForest(random_state=123)
df_num = X_train.copy()
df_num['outlier_flag'] = iso.fit_predict(X_train)
df_num['outlier_flag'].value_counts()

In [None]:
# Crear copia del dataset de entrenamiento

df_train_clean = X_train.copy()

#  Aplicar IsolationForest para detectar outliers
iso = IsolationForest(random_state=123, contamination='auto')
df_train_clean['outlier_flag'] = iso.fit_predict(df_train_clean)


# Eliminar registros considerados outliers (-1)

df_train_clean = df_train_clean[df_train_clean['outlier_flag'] == 1]


# Eliminar la columna outlier_flag si no se va a usar como característica
df_train_clean = df_train_clean.drop(columns=['outlier_flag'])
y_train_clean = y_train.loc[df_train_clean.index]

print("Número de registros después de eliminar outliers:", df_train_clean.shape[0])

Se aplicó IsolationForest para identificar registros atípicos en el dataset de entrenamiento. Los outliers son puntos que presentan patrones muy diferentes al resto de los datos y podrían distorsionar los resultados de modelos sensibles, como la regresión logística.

Tras la detección, se eliminaron los 45 registros considerados outliers de un total de 153,846, lo que representa menos del 0.03% del dataset. Esta eliminación asegura que el modelo se entrene sobre datos consistentes, manteniendo la interpretabilidad y evitando que valores extremos afecten los coeficientes.

El dataset resultante conserva prácticamente toda la información original, pero más "limpio", garantizando una base sólida para el entrenamiento de modelos de regresión y otros algoritmos supervisados.

# Seleccion de caracteristicas

## Método 1 Feature importance de RF

In [None]:
# Entrenamiento de RF
model = RandomForestClassifier(random_state=42,n_jobs=-1).fit(df_train_clean, y_train_clean)
# Importancia de características
importances = model.feature_importances_/model.feature_importances_.sum()*100
# Convertir a DataFrame
df_rf_imp = pd.DataFrame({'feature': df_train_clean.columns,'rf_importance': importances}).sort_values(by='rf_importance', ascending=False)
# Calculamos la importancia acumulada
df_rf_imp['rf_importance_acum'] = df_rf_imp['rf_importance'].cumsum()
df_rf_imp

## Metodo 2 Permutation/Shuffle importance

In [None]:
# Para esta técnica y la de shap se necesita conjunto de validación
X_train1, X_val, y_train1, y_val = train_test_split(df_train_clean, y_train_clean, test_size=0.2, random_state=42)

# Ajustamos el modelo
model_xgb = XGBClassifier(objective='binary:logistic',random_state=42,use_label_encoder=False,eval_metric='logloss').fit(X_train1, y_train1)

# Realizamos 10 permutaciones por cada característica (se usa neg_mean_absolute_error)
perm = permutation_importance(model_xgb, X_val, y_val, n_repeats=10, random_state=42, n_jobs=-1, scoring='accuracy')

df_perm_imp = pd.DataFrame({'feature': df_train_clean.columns, 'perm_imp': perm.importances_mean*100}).sort_values('perm_imp', ascending=False)
df_perm_imp

## Metodo 3 SHAP

In [None]:
# Ajustamos el modelo
model_lgbm = lgb.LGBMClassifier(random_state=42, n_jobs=-1).fit(df_train_clean, y_train_clean)

explainer = shap.Explainer(model_lgbm, X_val)   # usa el mismo X_val
shap_vals = explainer(X_val).values

imp_shap = np.abs(shap_vals).mean(axis=0)
imp_shap_pct = imp_shap/imp_shap.sum()*100
df_shap_imp = pd.DataFrame({"feature": X_val.columns, "shap_imp": imp_shap_pct}).sort_values('shap_imp', ascending=False)
df_shap_imp

In [None]:
%pip install missingno MissForest lazypredict

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
import shap
import lightgbm as lgb

  from .autonotebook import tqdm as notebook_tqdm
Matplotlib is building the font cache; this may take a moment.


# NOTE
El DataFrame df se crea a partir de la tabla basa_datos_pripal de la base de datos credit_scoring.db. El archivo no se incluye en el repositorio debido a su tamaño; la generación de la base de datos a partir del CSV original y la creación de la tabla se explica detalladamente en el notebook data-collection.ipynb

Descriptive analysis:
In this part of the project, we begin exploring the dataset created from the initial information obtained from the LendingClub dataset (Kaggle). The objective of this stage is to describe and understand the structure of the data, the variables and their types, their distributions, skewness, and the presence of missing values.

In [None]:
conn = sqlite3.connect(r"C:\Users\User\Documents\GITHUB\final_project_creditscoring\Data\credit_scoring.db")
df = pd.read_sql("SELECT * FROM main_table", conn)
conn.close()
n_rows,n_cols = df.shape

print(f'En este df existen {n_rows} filas y {n_cols} columnas')

En este df existen 192309 filas y 157 columnas


With the analysis below, we can understand that the dataset contains a large number of numerical variables, along with several categorical features represented as object types. This initial inspection highlights the need for feature selection and type handling in later stages. Now we are proceeding with a list of each type of column to identify possible issues in data types (such as date columns that are objects or numerical data that is shown as objects, etc.).

In [None]:
cols_types = df.dtypes.reset_index().rename(
    columns={'index': 'column_name', 0: 'dtype'}
)

cols_types['dtype'].value_counts()

dtype
float64    118
object      38
int64        1
Name: count, dtype: int64

In [None]:
cols_types[cols_types['dtype'] == 'object']

Unnamed: 0,column_name,dtype
1,member_id,object
5,term,object
8,grade,object
9,sub_grade,object
10,emp_title,object
11,emp_length,object
12,home_ownership,object
14,verification_status,object
15,issue_d,object
16,loan_status,object


In [None]:
print(cols_types[cols_types['dtype'] == 'float64'].to_string(index=False))

                               column_name   dtype
                                 loan_amnt float64
                               funded_amnt float64
                           funded_amnt_inv float64
                                  int_rate float64
                               installment float64
                                annual_inc float64
                                       dti float64
                               delinq_2yrs float64
                            fico_range_low float64
                           fico_range_high float64
                            inq_last_6mths float64
                    mths_since_last_delinq float64
                    mths_since_last_record float64
                                  open_acc float64
                                   pub_rec float64
                                 revol_bal float64
                                revol_util float64
                                 total_acc float64
                               

In [None]:
cols_types[cols_types['dtype'] == 'int64']

Unnamed: 0,column_name,dtype
0,id,int64


Revision of constant columns: 

The following code initially didn’t specify dropna=False, which made it show a few columns as constants. This led us to investigate what was happening and whether we were working with the right dataframe. However, this mistake was enlightening, as it helped us identify possible *data leakage variables, such as: hardship_type, deferral_term, and hardship_length.*

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# Unique values per column
uniq = df.nunique(dropna=False)

# Show results
print(uniq)


id                                            192309
member_id                                          1
loan_amnt                                       1456
funded_amnt                                     1457
funded_amnt_inv                                 4943
term                                               2
int_rate                                         500
installment                                    42329
grade                                              7
sub_grade                                         35
emp_title                                      92693
emp_length                                        12
home_ownership                                     6
annual_inc                                     15650
verification_status                                3
issue_d                                          103
loan_status                                        9
pymnt_plan                                         2
url                                           

In [None]:
uniq = df.nunique(dropna=False)
uniq[uniq == 1]

member_id      1
policy_code    1
dtype: int64

After the code revision, the only two variables with constant values are member_id and policy_code. It does not make much sense to have a unique value for member_id if we have almost 200k data entries, so we needed to check the exact values contained in this column.

In [None]:
cols = ['member_id', 'policy_code']

for col in cols:
    print(df[col].value_counts(dropna=False))

member_id
None    192309
Name: count, dtype: int64
policy_code
1.0    192309
Name: count, dtype: int64


Regarding policy_code, a similar situation occurs. According to the data dictionary, LendingClub has only two types of policies: publicly available (1) and new products not publicly available (2). In this dataset, only publicly available products are present. Therefore, following the same reasoning as above, policy_code is not a relevant column for the analysis.

Lastly, as we are dropping member_id and policy_code because they are not predictors, we are doing the same with two other columns as well, which, even though they do not have constant values, can generate noise in the analysis: the url column and the id column. 

In [None]:
df = df.drop(columns=['member_id', 'policy_code','id'])

Revision of Duplicated Rows: No duplicated rows were identified.

In [None]:
df.duplicated().sum()

np.int64(0)

Revision of Duplicated Columns: Two variables (deferral_term and hardship_length) were found to be exact duplicates, containing identical values across all observations. Both variables are related to post-loan hardship events (we previously identified them as potential data leakers) and will therefore be excluded from the modeling stage. 

In [None]:
df.T.duplicated().sum()
df.T.duplicated(keep=False)

loan_amnt                                     False
funded_amnt                                   False
funded_amnt_inv                               False
term                                          False
int_rate                                      False
installment                                   False
grade                                         False
sub_grade                                     False
emp_title                                     False
emp_length                                    False
home_ownership                                False
annual_inc                                    False
verification_status                           False
issue_d                                       False
loan_status                                   False
pymnt_plan                                    False
url                                           False
desc                                          False
purpose                                       False
title       

Missing values: 
We identified columns with a high percentage of missing values, so we proceeded to define a missing threshold of 50%, where variables with more than 50% missing values will be considered for exclusion from the modeling stage. However, first we must evaluate them on a case-by-case basis to understand if any of those variables are conceptually important.

In [None]:
missing = df.isna().mean()*100
missing[missing>0]
print(missing)

In [None]:
missing_threshold = 0

high_missing_cols = missing[missing >= missing_threshold]
print(high_missing_cols)

In [None]:
(missing > 0).sum()

Now we have to identify other missing values and audit them to understand how we should treat each case.

In [None]:
#Identify other missing values
cat_col = df.select_dtypes(include=['object']).columns

for col in cat_col: 
    print(df[col].value_counts())

In [None]:
df_faltantes = df.replace(['None'],np.nan,inplace=True)

Columns with more than 50% missing values were manually reviewed and classified into post-loan variables, second-applicant features, structurally missing variables, and late-reported behavioral features based on domain knowledge and data documentation. No features were removed at this stage; the analysis documents decisions to be applied during model preparation.

In [None]:
def classify_column(col):
    if col.startswith(("hardship", "settlement", "deferral")):
        return "post_loan"
    if col.startswith("sec_app") or col.endswith("_joint"):
        return "second_applicant"
    if col.startswith("mths_since"):
        return "structural_missing"
    return "other"

categories = ["post_loan", "second_applicant", "structural_missing"]
analysis_dict = {cat: [col for col in df.columns if classify_column(col) == cat] for cat in categories}

print("🔍 --- STARTING COLUMN CATEGORIZATION ANALYSIS --- 🔍\n")

for category, cols in analysis_dict.items():
    print(f"📁 CATEGORY: {category.upper()}")
    if not cols:
        print("   ❌ No columns found in this category.\n")
    else:
        print(f"   ✅ Found {len(cols)} columns.")
        missing_stats = df[cols].isnull().mean() * 100
        print(missing_stats.sort_values(ascending=False).to_string())
        print("-" * 40 + "\n")

print("🚀 --- ANALYSIS COMPLETE --- 🚀")

In [None]:
#Column checker: to be able to quickly check the characteristics of a column and its type

target_col = 'loan_status'  

col_type = df[target_col].dtype

summary = pd.DataFrame({
    'Count': df[target_col].value_counts(dropna=False),
    'Percentage (%)': df[target_col].value_counts(dropna=False, normalize=True) * 100
})

print(f"Content Analysis for: {target_col.upper()}")
print(summary)
print('---'*30)
print(f"Data Type: {col_type}")


In [None]:
all_classified_cols = (analysis_dict['post_loan'] + 
                      analysis_dict['second_applicant'] + 
                      analysis_dict['structural_missing'])

other_cols = [col for col in df.columns if col not in all_classified_cols]

print("🔍 --- AUDITING 'OTHER' COLUMNS WITH HIGH MISSING RATIO (>40%) ---")
high_missing_other = df[other_cols].isnull().mean()
high_missing_other = high_missing_other[high_missing_other > 0.4].sort_values(ascending=False)

if high_missing_other.empty:
    print("✅ No additional critical missing values found outside defined categories.")
else:
    print("⚠️ Attention: The following columns also have a high missing ratio:")
    print(high_missing_other.to_string())

1. *Post-loan variables:*

These features contain information generated after loan origination, such as hardship or settlement events. Their high missingness reflects the fact that most loans do not enter these processes. Because these variables include future information relative to the credit decision, they were identified as potential sources of data leakage.

Planned decision: Exclude.

2. *Second-applicant variables:*

These variables describe characteristics of a co-borrower in joint loan applications. The high proportion of missing values reflects that most loans involve a single applicant, meaning missing values indicate the absence of a second applicant rather than missing information.

Rather than modeling the full co-borrower profile, the presence of a second applicant is captured through a binary indicator. This approach preserves potentially relevant information while avoiding additional complexity and extensive imputation.

Planned decision: Create a binary flag indicating whether a loan includes a second applicant, and exclude detailed second-applicant features during model preparation.

3. *Structurally missing variables:*

These features represent the time since the last occurrence of negative credit events. Missing values indicate that the event has never occurred, making the missingness itself informative.

Planned decision: Retain for modeling and apply a dedicated imputation strategy at a later stage.

4. *Late-reported features:*

These variables were introduced into the dataset at later periods and are unavailable for older loans. Missingness is driven by historical reporting limitations rather than borrower behavior.

Planned decision: Evaluate after defining the temporal train-test split.

DATA CLEANING & PREPROCESSING STRATEGY

Now that we have a clearer understanding of the data, we can proceed with data cleaning and processing.

1. DF Backup: Create a full copy of the raw dataset to ensure data integrity and allow for easy rollbacks during the experimentation phase.

2. Target Definition & Filtering: Refine the loan_status variable. We exclude ongoing loans and focus only on definitive outcomes.

Default (1): Charged off, default, or late (30–120 days).
Charged Off
Late (31-120 days)
Default
Does not meet the credit policy. Status:Charged Off

Non-Default (0): Fully paid.
Fully Paid

3. Leakage Removal: Drop all Post-loan variables. These features contain information only available after the credit decision has been made, which would lead to Data Leakage.

4. Structural Simplification (Joint Apps): Consolidate +16 second-applicant features into a single Binary Flag (is_joint_application). This reduces dimensionality while preserving the fact that a co-borrower exists.

5. Zero Ratio & Variance Analysis: Identify features with excessive sparsity. We decide whether to drop columns with near-zero variance or binarize features where the simple presence of an event (0 vs >0) is more predictive than its frequency.

6. Missingness Audit (Missingno): Visualize the remaining missing values to determine the mechanism of missingness (Random vs. Structural). This dictates the final decision: drop the column (if >50% NaN) or keep it for Imputation after the Train-Test Split.

In [None]:
df_backup = df.copy()

In [None]:
# Function to inspect unique values and their prevalence
def inspect_categories(dataframe, column_list):
    """
    Prints frequency and percentage distribution for categorical features.
    """
    for col in column_list:
        print(f"\n--- Feature: {col.upper()} ---")
        
        counts = dataframe[col].value_counts(dropna=False)
        percentages = dataframe[col].value_counts(dropna=False, normalize=True) * 100
        
        summary = pd.DataFrame({
            'Count': counts,
            'Percentage (%)': percentages.round(2)
        })
        
        print(summary)
        print("-" * 30)

target_cols = analysis_dict.get('other', [])
inspect_categories(df, target_cols)

Now we proceed with a zero ratio analysis, that has the objective of helping us decide which variables doesn't have enough information to support the model and which ones are corrupt with 0s that should be NaNs.

In [None]:
#Zero Ratio Analysis
zero_ratio = (df == 0).mean() * 100
zero_ratio_all = zero_ratio[zero_ratio > 0].sort_values(ascending=False)

print("ALL COLUMNS WITH ZEROS")
print(zero_ratio_all.to_string()) 

Having identified the columns with higher amounts of 0s, we decided to audit them one by one and review their unique values to understand whether the 0s are informative or if they represent null/NaN values. This audit is half based on the code shown below and half on a manual review of the dataset dictionary.

In [None]:
# 1. Calculate Zero Ratio again to get the target columns
zero_ratio = (df == 0).mean() * 100
# Define a threshold (e.g., columns with more than 50% zeros)
high_zero_threshold = 30.0
high_zero_cols = zero_ratio[zero_ratio > high_zero_threshold].sort_values(ascending=False).index.tolist()

def audit_high_zero_columns(dataframe, column_list):
    """
    Audits columns with high zero ratios to see value distribution 
    and help decide between dropping, keeping, or binarizing.
    """
    print(f"🔍 --- AUDITING {len(column_list)} COLUMNS WITH > {high_zero_threshold}% ZEROS --- \n")
    
    for col in column_list:
        print(f"📊 Feature: {col.upper()}")
        print(f"Zero Ratio: {zero_ratio[col]:.2f}%")
        
        # Count unique values excluding zero
        non_zero_values = dataframe[dataframe[col] != 0][col]
        unique_counts = non_zero_values.nunique()
        
        print(f"Unique values (excluding zero): {unique_counts}")
        
        if unique_counts < 15:
            # If few unique values, show frequency
            print("Distribution (Top Values):")
            print(dataframe[col].value_counts().head(10))
        else:
            # If many unique values, show basic stats for non-zero data
            print("Non-zero stats:")
            print(non_zero_values.describe()[['mean', 'min', 'max']])
        
        print("-" * 40)

# Execute the audit
audit_high_zero_columns(df, high_zero_cols)

The function below is intended to serve as a filter to help us quickly verify whether a column was correctly classified in the categories defined above.

In [None]:
def get_column_category(column_name, mapping_dict):
    """
    Checks which category a specific column belongs to based on the analysis_dict.
    """
    for category, columns in mapping_dict.items():
        if column_name in columns:
            return category
    return "other (or not found)"

test_col = 'loan_status' # You can change this name to any column
result = get_column_category(test_col, analysis_dict)
print(f"Verification: The column '{test_col}' is categorized as: {result.upper()}")

# 3. Batch verification (Optional)
# List of columns you want to verify right now
verify_list = ['sec_app_fico_range_low', 'mths_since_last_delinq', 'loan_amnt', 'settlement_term']

print("BATCH VERIFICATION:")
for col in verify_list:
    cat = get_column_category(col, analysis_dict)
    print(f"- {col:30} -> Category: {cat}")

In [None]:
low_variance_cols = []

for col in df.columns:
    vc = df[col].value_counts(dropna=False, normalize=True)
    if vc.iloc[0] > 0.99:   # más del 99% el mismo valor
        low_variance_cols.append(col)

low_variance_cols

In [None]:
desc = df.describe().T
desc.sort_values(by='max', ascending=False).head(20)

The analysis of the maximum values reveals the presence of extreme values in some financial variables, which suggests the need to apply transformations or outlier treatment techniques in later stages.

In [None]:
desc.assign(
    mean_median_ratio = desc['mean'] / desc['50%']
).sort_values('mean_median_ratio', ascending=False).head(10)

Several numerical variables present highly skewed distributions, with median values equal to zero and a small proportion of non-zero observations. This pattern is expected for count-based credit history variables. However, some highly skewed variables correspond to post-loan information and will therefore be excluded from the modeling process to prevent data leakage.

EDA (cerrando)
1. Tratamiento columnas con alto % de 0s ⏳
2. Matriz de missing (missingno) ⏳
3. Valores faltantes explícitos ✅
4. Valores faltantes ocultos ✅
5. Filas duplicadas ✅
6. Drop policy_code ✅

Decisiones de features

7. Identificar columnas data leakage ⏳(ya estan identificadas, ahora hay que hacer drop)
8. Definir estrategia second applicant (flag + drop cols) ⏳
9. Identificar columnas ID / no predictivas ⏳(ya estan identificadas, ahora hay que hacer drop)
10. Definir target ⏳(ya identificado)

Modelado

11. Crear df_model
12. Split temporal
13. Imputación / preprocessing

We removed the pos_loan category because it represents data leaks, and also the second_applicant category, since we only need one flag and this would be application_type, which tells us if the loan was taken by a group or individually.


In [None]:
# Lista de columnas "post_loan"
post_loan_cols = analysis_dict["post_loan"]
# Lista de columnas "second_applicant"
second_applicant = analysis_dict["second_applicant"]

cols_to_drop = post_loan_cols + second_applicant
df = df.drop(columns=cols_to_drop)

# Split train-test

Al explorar la columna objetivo loan_status, se identificó que algunas clases tenían un número extremadamente bajo de registros (por ejemplo, solo uno o dos casos). Esto genera problemas al dividir el dataset en entrenamiento y prueba usando estratificación (stratify=y), ya que no es posible mantener la proporción de clases cuando algunas aparecen muy pocas veces.

Para evitar este error y asegurar que el modelo pueda generalizar correctamente, se filtraron estas clases minoritarias, manteniendo únicamente las clases con un número suficiente de observaciones. Esto permite realizar un train-test split estratificado seguro y garantiza que tanto el conjunto de entrenamiento como el de prueba tengan representatividad adecuada de cada clase relevante.

In [None]:
# Contar la frecuencia de cada clase
print(df['loan_status'].value_counts())

# Mantener solo clases frecuentes 
frequent_classes = df['loan_status'].value_counts()[df['loan_status'].value_counts() > 5].index
df = df[df['loan_status'].isin(frequent_classes)]


In [None]:
# Supongamos que tu DataFrame se llama df
X = df.drop(columns=['loan_status'])  # Variables predictoras
y = df['loan_status']                 # Target

# Split 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## Codificacion columnas tipo object

In [None]:
df.dtypes.value_counts()


In [None]:
df.select_dtypes(include='object').columns


Manejo de fechas

Las columnas de tipo fecha (issue_d, earliest_cr_line, last_pymnt_d, etc.) no se pueden usar directamente en un modelo. Por eso:

- Convertimos las columnas a tipo datetime.

- Creamos features derivadas, como antigüedad del crédito (credit_age) en años, que pueden ser más útiles para el modelo que la fecha cruda.

In [None]:
date_cols = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 
             'next_pymnt_d', 'last_credit_pull_d', 
             'payment_plan_start_date', 'debt_settlement_flag_date']

for col in date_cols:
    X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
    X_test[col] = pd.to_datetime(X_test[col], errors='coerce')

# antigüedad del crédito
X_train['credit_age'] = (pd.to_datetime('today') - X_train['earliest_cr_line']).dt.days / 365
X_test['credit_age'] = (pd.to_datetime('today') - X_test['earliest_cr_line']).dt.days / 365


Codificación de variables categóricas (One-hot eficiente)

- a) Variables nominales (sin orden) – One-hot eficiente

Usamos drop='first' para evitar multicolinealidad (importante en regresión logística).

In [None]:
categorical_cols = ['grade', 'sub_grade', 'home_ownership', 
                    'verification_status', 'purpose', 
                    'initial_list_status', 'application_type', 
                    'disbursement_method']

# One-hot encoder eliminando la primera categoría
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# Ajustamos solo con train
ohe.fit(X_train[categorical_cols])

# Transformamos train y test
X_train_encoded = pd.DataFrame(ohe.transform(X_train[categorical_cols]), 
                               columns=ohe.get_feature_names_out(categorical_cols), 
                               index=X_train.index)

X_test_encoded = pd.DataFrame(ohe.transform(X_test[categorical_cols]), 
                              columns=ohe.get_feature_names_out(categorical_cols), 
                              index=X_test.index)

# Reemplazamos las columnas originales por las codificadas
X_train = X_train.drop(columns=categorical_cols).join(X_train_encoded)
X_test = X_test.drop(columns=categorical_cols).join(X_test_encoded)


- b) Variables ordinales (con orden)

El orden es relevante, por eso asignamos números manualmente:

In [None]:
emp_length_map = {
    '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3,
    '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7,
    '8 years': 8, '9 years': 9, '10+ years': 10
}

X_train['emp_length'] = X_train['emp_length'].map(emp_length_map)
X_test['emp_length'] = X_test['emp_length'].map(emp_length_map)


Eliminación de texto libre (desc)

El texto libre no aporta información estructurada inmediata y procesarlo requeriría NLP. Para mantener un modelo interpretable y eficiente, se eliminó.

In [None]:
X_train = X_train.drop(columns=['desc'])
X_test = X_test.drop(columns=['desc'])


Manejo de columnas binarias

Convertimos y/n a 1/0:

In [None]:
binary_cols = ['pymnt_plan', 'debt_settlement_flag']
for col in binary_cols:
    X_train[col] = X_train[col].map({'y': 1, 'n': 0})
    X_test[col] = X_test[col].map({'y': 1, 'n': 0})


In [None]:
# Revisar los tipos de todas las columnas
print(X_train.dtypes.value_counts())


In [None]:
# Columnas que siguen siendo object
object_cols = X_train.select_dtypes(include='object').columns
print(object_cols)


Manejo de columnas tipo object restantes

Al revisar el dataset tras la limpieza y codificación inicial, quedaron algunas columnas tipo object: term, emp_title, url, title, zip_code y addr_state.

Estas columnas no se eliminaron en los pasos anteriores porque algunas requerían transformaciones específicas para ser útiles en el modelo, mientras que otras podían eliminarse para mantener interpretabilidad y eficiencia:

- term: contiene la duración del préstamo como texto (ej. "36 months"). Se convirtió a un valor numérico en meses para que el modelo pueda utilizarlo directamente.

- emp_title, url, title: columnas de texto libre con demasiadas categorías únicas y sin estructura clara. Se eliminaron para simplificar el modelo y mantener su interpretabilidad.

- zip_code: originalmente un código postal completo, se redujo a los primeros 3 dígitos y se codificó numéricamente mediante OrdinalEncoder, manejando correctamente los códigos nuevos que aparezcan en el conjunto de prueba. Esto permite conservar información geográfica sin explotar la dimensionalidad.

- addr_state: contiene el estado de residencia. Como tiene pocas categorías, se codificó mediante One-hot con drop='first', generando columnas independientes que el modelo puede interpretar sin introducir redundancia.

Esta revisión garantiza que todas las variables sean numéricas o codificadas correctamente, evitando errores al entrenar una regresión logística y manteniendo la interpretabilidad y eficiencia del modelo.

In [None]:
# 1️⃣ Convertir 'term' a número de meses (funciona si ya es int o si es string)
# -----------------------------
X_train['term'] = X_train['term'].astype(str).str.replace(' months','').astype(int)
X_test['term'] = X_test['term'].astype(str).str.replace(' months','').astype(int)

# -----------------------------
# 2️⃣ Eliminar columnas de texto libre irrelevantes
# -----------------------------
cols_to_drop = ['emp_title', 'url', 'title', 'desc']
X_train = X_train.drop(columns=[c for c in cols_to_drop if c in X_train.columns])
X_test = X_test.drop(columns=[c for c in cols_to_drop if c in X_test.columns])

# -----------------------------
# 3️⃣ Manejo de zip_code (solo 3 primeros dígitos)
# -----------------------------
X_train['zip_code'] = X_train['zip_code'].astype(str).str[:3]
X_test['zip_code'] = X_test['zip_code'].astype(str).str[:3]

ord_enc_zip = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[['zip_code']] = ord_enc_zip.fit_transform(X_train[['zip_code']])
X_test[['zip_code']] = ord_enc_zip.transform(X_test[['zip_code']])

# -----------------------------
# 4️⃣ One-hot encoding de addr_state (pocas categorías)
# -----------------------------
if 'addr_state' in X_train.columns:
    ohe_state = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
    ohe_state.fit(X_train[['addr_state']])

    X_train_state = pd.DataFrame(ohe_state.transform(X_train[['addr_state']]),
                                 columns=ohe_state.get_feature_names_out(['addr_state']),
                                 index=X_train.index)

    X_test_state = pd.DataFrame(ohe_state.transform(X_test[['addr_state']]),
                                columns=ohe_state.get_feature_names_out(['addr_state']),
                                index=X_test.index)

    X_train = X_train.drop(columns=['addr_state']).join(X_train_state)
    X_test = X_test.drop(columns=['addr_state']).join(X_test_state)

# -----------------------------
# 5️⃣ Convertir columnas datetime a métricas numéricas
# -----------------------------
date_cols = ['earliest_cr_line', 'issue_d', 'last_pymnt_d',
             'next_pymnt_d', 'last_credit_pull_d',
             'payment_plan_start_date', 'debt_settlement_flag_date']

for col in date_cols:
    if col in X_train.columns:
        # Convertir a datetime
        X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
        X_test[col] = pd.to_datetime(X_test[col], errors='coerce')
        
        # Métrica numérica: días desde hoy
        X_train[col + '_days_since'] = (pd.to_datetime('today') - X_train[col]).dt.days
        X_test[col + '_days_since'] = (pd.to_datetime('today') - X_test[col]).dt.days

# Eliminar columnas datetime originales
X_train = X_train.drop(columns=[c for c in date_cols if c in X_train.columns])
X_test = X_test.drop(columns=[c for c in date_cols if c in X_test.columns])

# -----------------------------
# 6️⃣ Verificación final: no object ni datetime
# -----------------------------
print("Tipos de columnas finales X_train:\n", X_train.dtypes.value_counts())
print("Columnas tipo object restantes:", X_train.select_dtypes(include='object').columns.tolist())

Tras la codificación de las columnas tipo object y la eliminación de las columnas de texto libre irrelevantes, el dataset pasó de tener 94 columnas numéricas y 26 columnas tipo object, a:

215 columnas de tipo float64

4 columnas de tipo int64

Esto refleja que todas las variables categóricas han sido correctamente codificadas:

- Las variables binarias y ordinales se mantienen como numéricas (int64 o float64).

- Las variables categóricas con pocas categorías, como addr_state, se codificaron con One-hot.

- Las columnas de fecha se transformaron a métricas numéricas (días transcurridos desde cada fecha), permitiendo que los modelos interpreten la información temporal.

- Las variables de texto libre que no aportaban información estructurada se eliminaron (emp_title, url, title, desc).

Como resultado, no quedan columnas tipo object ni datetime, asegurando que el dataset esté completamente listo para entrenar modelos de regresión logística, IsolationForest o RandomForest, evitando errores y manteniendo interpretabilidad.

### Codificación del Target (loan_status) a Binario

El objetivo del proyecto es predecir si un préstamo caerá en default o no. Originalmente, la columna loan_status contenía múltiples estados textuales como:

'Charged Off', 'Fully Paid', 'Current', 'Late (16-30 days)', 'Late (31-120 days)', 
'Does not meet the credit policy. Status:Charged Off', 'Does not meet the credit policy. Status:Fully Paid', 'In Grace Period'


Para simplificar el problema a clasificación binaria, se realizó la siguiente transformación:

Se definieron ciertos estados como default (1):
'Charged Off', 'Does not meet the credit policy. Status:Charged Off', 'Does not meet the credit policy. Status:Fully Paid', 'Late (16-30 days)', 'Late (31-120 days)'.

Todos los demás estados se consideraron No Default (0): 'Fully Paid', 'Current', 'In Grace Period'.

Esta codificación asegura que el target sea numérico y binario, compatible con modelos de clasificación como RandomForest, XGBoost o regresión logística, evitando errores por valores categóricos y manteniendo la interpretabilidad del modelo.

In [None]:
#  Codificar target a binario usando map
# -----------------------------
default_labels = {
    'Charged Off': 1,
    'Does not meet the credit policy. Status:Charged Off': 1,
    'Does not meet the credit policy. Status:Fully Paid': 1,
    'Late (16-30 days)': 1,
    'Late (31-120 days)': 1,
    'Fully Paid': 0,
    'Current': 0,
    'In Grace Period': 0  # puedes ajustar según tu criterio
}

y_train = y_train.map(default_labels)
y_test = y_test.map(default_labels)

# Revisar conteo de clases
print(y_train.value_counts())
print(y_test.value_counts())

# Outliers

In [None]:
iso = IsolationForest(random_state=123)
df_num = X_train.copy()
df_num['outlier_flag'] = iso.fit_predict(X_train)
df_num['outlier_flag'].value_counts()

In [None]:
# Crear copia del dataset de entrenamiento

df_train_clean = X_train.copy()

#  Aplicar IsolationForest para detectar outliers
iso = IsolationForest(random_state=123, contamination='auto')
df_train_clean['outlier_flag'] = iso.fit_predict(df_train_clean)


# Eliminar registros considerados outliers (-1)

df_train_clean = df_train_clean[df_train_clean['outlier_flag'] == 1]


# Eliminar la columna outlier_flag si no se va a usar como característica
df_train_clean = df_train_clean.drop(columns=['outlier_flag'])
y_train_clean = y_train.loc[df_train_clean.index]

print("Número de registros después de eliminar outliers:", df_train_clean.shape[0])

Se aplicó IsolationForest para identificar registros atípicos en el dataset de entrenamiento. Los outliers son puntos que presentan patrones muy diferentes al resto de los datos y podrían distorsionar los resultados de modelos sensibles, como la regresión logística.

Tras la detección, se eliminaron los 45 registros considerados outliers de un total de 153,846, lo que representa menos del 0.03% del dataset. Esta eliminación asegura que el modelo se entrene sobre datos consistentes, manteniendo la interpretabilidad y evitando que valores extremos afecten los coeficientes.

El dataset resultante conserva prácticamente toda la información original, pero más “limpio”, garantizando una base sólida para el entrenamiento de modelos de regresión y otros algoritmos supervisados.

# Seleccion de caracteristicas

## Método 1 Feature importance de RF

In [None]:
# Entrenamiento de RF
model = RandomForestClassifier(random_state=42,n_jobs=-1).fit(df_train_clean, y_train_clean)
# Importancia de características
importances = model.feature_importances_/model.feature_importances_.sum()*100
# Convertir a DataFrame
df_rf_imp = pd.DataFrame({'feature': df_train_clean.columns,'rf_importance': importances}).sort_values(by='rf_importance', ascending=False)
# Calculamos la importancia acumulada
df_rf_imp['rf_importance_acum'] = df_rf_imp['rf_importance'].cumsum()
df_rf_imp

## Metodo 2 Permutation/Shuffle importance

In [None]:
# Para esta técnica y la de shap se necesita conjunto de validación
X_train1, X_val, y_train1, y_val = train_test_split(df_train_clean, y_train_clean, test_size=0.2, random_state=42)

# Ajustamos el modelo
model_xgb = XGBClassifier(objective='binary:logistic',random_state=42,use_label_encoder=False,eval_metric='logloss').fit(X_train1, y_train1)

# Realizamos 10 permutaciones por cada característica (se usa neg_mean_absolute_error)
perm = permutation_importance(model_xgb, X_val, y_val, n_repeats=10, random_state=42, n_jobs=-1, scoring='accuracy')

df_perm_imp = pd.DataFrame({'feature': df_train_clean.columns, 'perm_imp': perm.importances_mean*100}).sort_values('perm_imp', ascending=False)
df_perm_imp

## Metodo 3 SHAP

In [None]:
# Ajustamos el modelo
model_lgbm = lgb.LGBMClassifier(random_state=42, n_jobs=-1).fit(df_train_clean, y_train)

explainer = shap.Explainer(model_lgbm, X_val)   # usa el mismo X_val
shap_vals = explainer(X_val).values

imp_shap = np.abs(shap_vals).mean(axis=0)
imp_shap_pct = imp_shap/imp_shap.sum()*100
df_shap_imp = pd.DataFrame({"feature": X_val.columns, "shap_imp": imp_shap_pct}).sort_values('shap_imp', ascending=False)
df_shap_imp