In [60]:
import numpy as np 
import pandas as pd 

**Observe Data**

In [61]:
# Directory holding the competition CSVs; change this one line to run outside Kaggle
DATA_DIR = "/kaggle/input/loan-approval-dataset"

# Labelled training rows, unlabelled test rows, and the submission template
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [62]:
# Preview the first ten training rows (column names and sample values)
train_df.head(n=10)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,26.0,34427.0,RENT,5.946998,DEBTCONSOLIDATION,E,19270.0,11.272717,0.193133,Y,4.0,1
1,1,32.0,91102.0,MORTGAGE,8.527473,VENTURE,B,19815.0,12.831859,0.183127,N,4.0,0
2,2,25.0,25847.0,RENT,4.919103,PERSONAL,D,10043.0,15.53212,0.241554,Y,3.0,0
3,3,22.0,32863.0,RENT,3.427904,EDUCATION,,12256.0,9.877151,0.19832,N,4.0,0
4,4,24.0,50740.0,RENT,2.842202,EDUCATION,C,6412.0,13.465822,0.185754,Y,2.0,1
5,5,28.0,41613.0,RENT,4.991867,DEBTCONSOLIDATION,A,6953.0,12.943339,0.045998,N,6.0,1
6,6,27.0,53599.0,RENT,4.994485,DEBTCONSOLIDATION,D,7603.0,13.859111,0.135505,Y,12.0,1
7,7,30.0,72230.0,MORTGAGE,1.426203,PERSONAL,A,6157.0,6.075695,0.118602,N,7.0,0
8,8,27.0,106662.0,RENT,5.005692,PERSONAL,B,19531.0,9.652674,0.215162,N,5.0,0
9,9,22.0,23082.0,RENT,3.987022,EDUCATION,A,3716.0,7.671146,0.089016,N,4.0,0


In [63]:
# Preview the first ten test rows (same features as train, without the target)
test_df.head(n=10)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,39550,22.0,30330.0,RENT,6.221275,MEDICAL,C,19989.0,14.71376,0.377784,Y,4.0
1,39551,31.0,79403.0,MORTGAGE,9.876324,VENTURE,A,11849.0,10.467577,0.135866,N,10.0
2,39552,25.0,26966.0,RENT,2.643441,MEDICAL,C,8481.0,14.937696,0.036416,N,4.0
3,39553,25.0,,MORTGAGE,8.736224,DEBTCONSOLIDATION,A,23894.0,7.972062,,N,
4,39554,27.0,87388.0,MORTGAGE,4.986163,PERSONAL,A,6053.0,6.023534,0.064072,N,2.0
5,39555,27.0,200037.0,MORTGAGE,,VENTURE,A,7209.0,7.649907,,,7.0
6,39556,27.0,32615.0,RENT,3.411023,PERSONAL,B,18981.0,11.030476,0.104336,N,7.0
7,39557,28.0,37485.0,RENT,3.395477,MEDICAL,D,11093.0,13.036917,0.069718,Y,9.0
8,39558,28.0,59157.0,,5.01659,VENTURE,A,,,0.123681,N,8.0
9,39559,23.0,52437.0,RENT,2.233137,VENTURE,B,17416.0,10.198254,0.193084,N,4.0


**Cleaning the data before training the model**

In [64]:
# Target vector and feature matrix for training; 'id' is an identifier, not a feature
y = train_df['loan_status']
X = train_df.drop(columns=['loan_status', 'id'])

# The test frame has no target column, so only the identifier is removed
X_test = test_df.drop(columns=['id'])

# Partition the feature names by dtype so each group can be preprocessed separately
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

> Fill missing values: median for numerical columns, mode (most frequent value) for categorical columns

In [65]:
# Impute missing values with statistics taken from the TRAINING features only.
# The statistics are computed once, before any fill, and then reused for both
# frames, so the test set is filled with exactly the values used for train and
# the result does not depend on the order of the fill statements below.
train_medians = X[numerical_cols].median()
# mode() returns one row per tied mode; row 0 holds the first mode of each column
train_modes = X[categorical_cols].mode().iloc[0]

# For numerical columns, fill with median
X[numerical_cols] = X[numerical_cols].fillna(train_medians)
X_test[numerical_cols] = X_test[numerical_cols].fillna(train_medians)

# For categorical columns, fill with mode
X[categorical_cols] = X[categorical_cols].fillna(train_modes)
X_test[categorical_cols] = X_test[categorical_cols].fillna(train_modes)

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [67]:
# Label-encode every categorical feature with one encoder per column.
# Each encoder is fitted on the values of train AND test together so that a
# category appearing in only one of the frames still receives a code and
# transform() never fails on an unseen label.
# The fitted encoders are kept in `label_encoders` (column name -> encoder).
label_encoders = {}

for col in categorical_cols:
    # Collect the column only from the frames that actually contain it.
    # (No placeholder values are used: a dummy "NA" series would add a phantom
    # class to the encoder and shift the integer codes of the real categories.)
    present = [frame[col] for frame in (X, X_test) if col in frame.columns]

    # Skip if column doesn't exist in either dataframe
    if not present:
        print(f"Skipping column: {col} (not found in either train or test)")
        continue

    le = LabelEncoder()
    le.fit(pd.concat(present, axis=0))

    # Transform only where column exists
    if col in X.columns:
        X[col] = le.transform(X[col])
    if col in X_test.columns:
        X_test[col] = le.transform(X_test[col])

    label_encoders[col] = le


In [68]:
# Sanity check: list the feature names that mention "loan" (case-insensitive)
loan_related_cols = [name for name in X.columns if 'loan' in name.lower()]
print(loan_related_cols)

['loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income']


In [69]:
# Feature engineering: ratio of the applicant's income to the requested loan amount.
# The same formula is applied to train and test; 1e-5 is added to loan_amnt so the
# denominator is never exactly zero.
for frame in (X, X_test):
    frame['income_to_loan'] = frame['person_income'] / (frame['loan_amnt'] + 1e-5)

**Fit Model**

In [70]:

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hand-picked hyper-parameters for the decision tree
tree_params = {
    'criterion': 'entropy',
    'max_depth': 6,
    'min_samples_split': 10,
    'random_state': 42,
}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(X_train, y_train)

# Predict the hold-out rows
y_pred = clf.predict(X_val)

# Compute the validation metrics first, then report them
weighted_f1 = f1_score(y_val, y_pred, average='weighted')
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("F1 Score:", weighted_f1)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)


F1 Score: 0.811480553951196
Accuracy: 0.8192161820480405
Confusion Matrix:
 [[5360  490]
 [ 940 1120]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.88      5850
           1       0.70      0.54      0.61      2060

    accuracy                           0.82      7910
   macro avg       0.77      0.73      0.75      7910
weighted avg       0.81      0.82      0.81      7910



In [71]:
# Refit the same classifier on every labelled row, then predict the test set
test_preds = clf.fit(X, y).predict(X_test)

# Write the predictions into the submission template and export it.
# NOTE(review): the assignment is positional - it assumes sample_submission rows
# are in the same order as test_df; confirm against the competition files.
sample_submission['loan_status'] = test_preds
sample_submission.to_csv('submission02.csv', index=False)
print("Submission file created with label encoding.")


Submission file created with label encoding.
