In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [6]:
# Read the credit dataset from a CSV file located in the working directory.
df = pd.read_csv('credit_data.csv')

# Sanity check: show the first five rows to confirm how the columns parsed.
print(df.head(n=5))

# Per-column count of missing entries, taken before any cleaning happens.
print(df.isna().sum())

# Cleaning strategy (deliberately simple): discard every row that contains
# at least one missing value; no imputation is attempted.
df = df.dropna(axis=0, how='any')

   i#clientid        income        age         loan  c#default
0           1  66155.925095  59.017015  8106.532131          0
1           2  34415.153966  48.117153  6564.745018          0
2           3  57317.170063  63.108049  8020.953296          0
3           4  42709.534201  45.751972  6103.642260          0
4           5  66952.688845  18.584336  8770.099235          1
i#clientid    0
income        0
age           3
loan          0
c#default     0
dtype: int64


In [7]:
# Give the target column a plain identifier: 'c#default' -> 'default'.
df = df.rename(columns={'c#default': 'default'})

# Predictors (the three numeric columns) and the label to be predicted.
X = df.loc[:, ['income', 'age', 'loan']]
y = df.loc[:, 'default']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified, and the captured evaluation
# output shows only 61 positives among 400 test rows. Consider passing
# stratify=y -- doing so changes every number reported further down.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Feature scaling: the scaler is fitted on the training split only and the
# same transformation is then applied to the test split, so no statistics
# from the test rows influence it. Both names are rebound to the scaled
# arrays because the cells below read X_train / X_test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model: random forest with a fixed seed so repeated runs build the same trees.
model = RandomForestClassifier(random_state=42)

# Hyperparameter tuning (optional) - exhaustive search over a small grid.
# 3 x 3 x 3 = 27 candidates, each evaluated with 5-fold cross-validation,
# i.e. 135 forest fits plus one final refit of the winner.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

# n_jobs=-1 spreads the independent candidate/fold fits over all available
# cores. The forest is seeded, so the scores and the selected parameters are
# expected to be the same as in a serial run -- only the wall-clock time drops.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 200}


In [9]:
# Pick up the winning model from the search. No extra training happens here:
# the search above does not override `refit`, and GridSearchCV by default
# refits the best configuration on the whole training split.
best_model = grid_search.best_estimator_

# Predict labels for the held-out (already scaled) test rows.
y_pred = best_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.985


In [10]:
# Per-class precision, recall, F1 and support on the held-out test set.
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

# Confusion matrix; in scikit-learn rows are the true classes and columns
# are the predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       339
           1       1.00      0.90      0.95        61

    accuracy                           0.98       400
   macro avg       0.99      0.95      0.97       400
weighted avg       0.99      0.98      0.98       400

Confusion Matrix:
 [[339   0]
 [  6  55]]
