In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
# --- 1. Load Data ---
# Location of the d2 CSV file. Set the LOAN_CSV_PATH environment variable to
# point at a different copy of the data; when it is unset, the original
# hard-coded location is used so existing runs behave exactly as before.
d2_path = os.environ.get("LOAN_CSV_PATH", r"E:\LOAN.csv")

In [3]:
d2 = pd.read_csv(d2_path)
# Encode the categorical target as integers: 1 where the label is exactly
# 'yes', 0 for every other value. The comparison is vectorized (no per-row
# Python lambda); missing values compare unequal to 'yes' and so become 0.
y = d2['y'].eq('yes').astype('int64')
# Everything except the target column is a candidate feature.
X = d2.drop(columns=['y'])

In [4]:
# --- 2. Define Features ---
# Numerical columns: every numeric dtype (any integer/float width), not only
# int64/float64, so a differently-typed numeric column is not silently skipped.
num_features = X.select_dtypes(include='number').columns.tolist()

# Categorical columns: object (string) columns plus pandas 'category' columns.
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# A column that lands in neither list (e.g. bool or datetime) would never be
# passed to any transformer, so surface it instead of losing it quietly.
assigned = set(num_features) | set(cat_features)
unassigned = [col for col in X.columns if col not in assigned]
if unassigned:
    print(f"Warning: columns not assigned to any feature group: {unassigned}")

print(f"Numerical features: {num_features}")
print(f"Categorical features: {cat_features}")

Numerical features: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']


In [5]:
# --- 3. Preprocess Data ---
# Each entry is (step name, transformer, columns the transformer applies to).
feature_transformers = [
    # Standardize the numerical columns
    ('num', StandardScaler(), num_features),
    # Expand each categorical column into one-hot indicator columns
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
]
preprocessor = ColumnTransformer(transformers=feature_transformers)

# Fit the preprocessor on the feature matrix X and transform it in one step.
# NOTE(review): this fit sees every row of X, including any rows that are
# later held out for testing.
X_processed = preprocessor.fit_transform(X)


In [6]:
# --- 4. Split Data ---
# Split the RAW features first, then fit the preprocessor on the training rows
# only. Fitting the scaler/encoder on all rows before splitting lets test-set
# statistics leak into training and makes the test score optimistic.
# The split depends only on the number of rows and the seed, so the same rows
# are held out as when splitting an already-transformed matrix of equal length.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn scaling statistics and category vocabularies from the training rows...
X_train = preprocessor.fit_transform(X_train_raw)
# ...and only apply them to the held-out rows. The encoder was configured with
# handle_unknown='ignore', so a category seen only in the test rows is encoded
# as all zeros instead of raising.
X_test = preprocessor.transform(X_test_raw)


# --- 5. Train the RandomForest Model ---
print("\n--- Training RandomForest Classifier on d2 ---")
# 100 trees, fixed seed for reproducibility, all CPU cores for fitting
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model on the training data
rf_model.fit(X_train, y_train)
print("Training complete.")


# --- 6. Evaluate the Model ---
print("\n--- Evaluating Model ---")
# Make predictions on the held-out test data
y_pred = rf_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
# NOTE(review): if the two classes are imbalanced, accuracy alone can look high
# for a model that mostly predicts the majority class — consider also checking
# per-class precision/recall.
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of the RandomForest model on the d2 test set: {accuracy:.4f}")



--- Training RandomForest Classifier on d2 ---
Training complete.

--- Evaluating Model ---
Accuracy of the RandomForest model on the d2 test set: 0.9102
