In [222]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb



Import data

In [223]:
train_df = pd.read_csv('train.csv')
train_df.drop('id', axis=1)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.90,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.10,N,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
58640,34,120000,MORTGAGE,5.0,EDUCATION,D,25000,15.95,0.21,Y,10,0
58641,28,28800,RENT,0.0,MEDICAL,C,10000,12.73,0.35,N,8,1
58642,23,44000,RENT,7.0,EDUCATION,D,6800,16.00,0.15,N,2,1
58643,22,30000,RENT,2.0,EDUCATION,A,5000,8.90,0.17,N,3,0


Encode Features

In [224]:


from sklearn.metrics import accuracy_score, classification_report


def is_valid_column(column): # Count NaN values

    if train_df[column].isnull().any() or train_df[column].empty:
        print('ERROR')
        raise ValueError(f"Error in column{column}")
    else:
        return True
    



# Step 2: Feature selection - separate features (X) and target (y)
train_df = train_df.drop(columns=['id'])


# One hot encoding
one_hot_encoded_columns = ['person_home_ownership', 'loan_intent', 'cb_person_default_on_file', 'loan_grade']
for column in one_hot_encoded_columns:
    if is_valid_column(column):
        one_hot_encoded = pd.get_dummies(train_df[column], prefix=column, drop_first=True).astype(int)
        # Concatenate the original DataFrame with the one-hot encoded DataFrame
        train_df = pd.concat([train_df, one_hot_encoded], axis=1)
        # Drop the original 'homeownership' column
        train_df = train_df.drop(column, axis=1)


X = train_df.drop(columns=['loan_status'])  # Feature columns
y = train_df['loan_status']  # Target column (loan approval status) 

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
binary_features = [col for col in numeric_features if set(X[col].unique()) <= {0, 1}]
scalable_features = list(set(numeric_features) - set(binary_features))



# Step 5: Scale the relevant features directly in the DataFrame
# Create a StandardScaler object
scaler = StandardScaler()
# Scale only the numeric features that need scaling (not the binary ones)
X[scalable_features] = scaler.fit_transform(X[scalable_features])

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.3, random_state=42)
# Step 7: Create an XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # Logistic regression for binary classification
    eval_metric='logloss',        # Loss function for evaluation
    use_label_encoder=False       # Avoids label encoding warnings
)
# Step 8: Train the model
xgb_model.fit(X_train, y_train)
# Step 9: Make predictions on the test set
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0000

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     15126
           1       1.00      1.00      1.00      2468

    accuracy                           1.00     17594
   macro avg       1.00      1.00      1.00     17594
weighted avg       1.00      1.00      1.00     17594



Parameters: { "use_label_encoder" } are not used.



Model