# Classification trees

|                |   |
:----------------|---|
| **Name**     | Eddie Aguilar  |
| **Date**      | 04/07/2025  |
| **ID** | 739352  | 

In [108]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [109]:
import pandas as pd 

data = pd.read_csv(r"C:\Users\AgJo413\Documents\GitHub\Lab_std\labstds\Data\Default.csv")

data.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.13895
3,No,No,529.250605,35704.49394
4,No,No,785.655883,38463.49588


## Simple tree

In [110]:
num_features = ["balance", "income"]
cat_features = ["student"]
target = ["default"]

In [111]:
X = data[num_features + cat_features]
y = data[target]

In [112]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ])

In [113]:
simple_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [115]:
cv_scores = cross_val_score(simple_pipeline, X, y, cv=10, scoring= "roc_auc")

In [116]:
print(f"Cross validation average: {cv_scores.mean():.3f}")

Cross validation average: 0.654


## Random Forest Classifier

Random Forest with optimized hyperparameters (number of trees and depth levels) with a cross validatino of k = 10

In [117]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [118]:
param_grid = {
    'classifier__n_estimators': range(5, 41, 5),
    'classifier__max_depth': [2, 3, 4]
}

In [119]:
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))  
])

In [120]:
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

In [121]:
grid_search.fit(X, y)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  return fit_method(estimator, *args, **kwargs)


In [122]:
print(f"Best 10-fold CV AUC: {grid_search.best_score_:.4f}")

Best 10-fold CV AUC: 0.9378


In [123]:
y_pred_proba = grid_search.predict_proba(X)[:, 1]
test_auc = roc_auc_score(y, y_pred_proba)
print(f"\nTest Set AUC: {test_auc:.4f}")
print(grid_search.best_params_)


Test Set AUC: 0.9538
{'classifier__max_depth': 4, 'classifier__n_estimators': 35}
