In [1]:
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# --- 1. Load Data ---
# Location of the credit-card default CSV.
# The CREDIT_DEFAULT_CSV environment variable, when set to a non-empty value, overrides
# the original machine-specific location so the notebook can run elsewhere without
# editing this cell. When it is unset or empty, the original path is used unchanged.
d1_path = (
    os.environ.get("CREDIT_DEFAULT_CSV")
    or r"E:\Copy of default_of_credit_card_clients(1).csv"
)

In [3]:
# Read the CSV using its second line (header=1) as the column names, rename the label
# column to 'target', and drop the 'ID' column when it is present.
d1 = (
    pd.read_csv(d1_path, header=1)
    .rename(columns={'default payment next month': 'target'})
    .drop(columns=['ID'], errors='ignore')
)

In [4]:
# --- 2. Define Features and Clean Data ---
# Columns treated as numerical inputs
num_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'PAY_AMT1']
# Columns treated as categorical inputs
cat_features = ['MARRIAGE']
# Every model input: numerical columns first, then categorical ones
all_features = [*num_features, *cat_features]


In [5]:
# Build the modelling inputs from the loaded frame:
#   y - the 'target' label cast to int
#   X - an independent copy of the selected feature columns, so later cleaning
#       steps do not modify d1
y = d1['target'].astype(int)
X = d1.loc[:, all_features].copy()

In [6]:
# Convert every categorical column to string to ensure a consistent type.
# Iterating over cat_features (instead of naming 'MARRIAGE' directly) keeps this step
# in sync with the feature definition if more categorical columns are added later.
for col in cat_features:
    X[col] = X[col].astype(str)

In [7]:
# Clean numerical columns by forcing them to be numbers. Entries that cannot be parsed
# become NaN and are deliberately left missing here: they are filled by the median
# imputer inside the preprocessor below, so the fill values are learned in fit()
# (from whatever rows the preprocessor is fitted on) instead of being computed here
# from every row of X.
for col in num_features:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# --- 3. Preprocess Data ---
# Numerical columns: fill missing values with the column median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Set up a preprocessor to impute + scale numerical data and one-hot encode categorical data.
# handle_unknown='ignore' makes the encoder tolerate categories at transform time that
# were not present when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])


In [8]:
# --- 4. Split Data ---
# Split the raw rows into training and testing sets BEFORE fitting the preprocessor.
# Fitting on the full matrix first would let the preprocessor learn its parameters
# from rows that are later used for testing (data leakage), which biases the reported
# accuracy. test_size and random_state are unchanged from the original split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply the preprocessing: learn the transformation from the training rows only,
# then reuse those fitted parameters for the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so that any later code referring to X_processed still works: the full feature
# matrix, transformed with the parameters fitted on the training rows.
X_processed = preprocessor.transform(X)

# --- 5. Train the RandomForest Model ---
print("--- Training RandomForest Classifier on d1 ---")

# Build a 100-tree forest with a fixed seed, using all available cores (n_jobs=-1),
# and fit it on the training data in one step (fit() returns the estimator itself).
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print("Training complete.")

# --- 6. Evaluate the Model ---
print("\n--- Evaluating Model ---")

# Predict labels for the held-out test rows
y_pred = rf_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy of the RandomForest model on the d1 test set: {accuracy:.4f}")

--- Training RandomForest Classifier on d1 ---
Training complete.

--- Evaluating Model ---
Accuracy of the RandomForest model on the d1 test set: 0.7562
