In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

# Load the dataset

df = pd.read_csv('bank-additional-full.csv')

# Exclude 'duration' from predictors and consider remaining columns as predictors.
# `drop` returns a new frame, so the encoding below never modifies `df`.
predictors = df.drop(['duration', 'y'], axis=1)
target = df['y']

# Encode categorical variables.
# OrdinalEncoder is scikit-learn's encoder for feature columns; LabelEncoder is
# documented as intended for the target only. A single fitted encoder handles
# every categorical column and keeps each column's category list in
# `encoder.categories_`, so the mapping can be inspected or reapplied later
# (re-fitting one LabelEncoder per column only retained the last mapping).
# Categories are numbered in sorted order and emitted as integers, so the
# resulting codes match what a per-column LabelEncoder produces.
categorical_cols = predictors.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:  # fit_transform rejects a frame with zero columns
    encoder = OrdinalEncoder(dtype=int)
    predictors[categorical_cols] = encoder.fit_transform(predictors[categorical_cols])

# Split data into train and test sets (80:20 ratio)
# NOTE(review): the split is not stratified. The test-set support in the
# classification reports is 7303 'no' vs 935 'yes', so consider passing
# stratify=target -- doing so will change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)


In [5]:
# Baseline model: a decision tree with default hyperparameters, seeded so the
# run is repeatable. `fit` returns the estimator, so build and train are chained.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on both splits so training and held-out accuracy can be compared.
y_train_pred = dt_classifier.predict(X_train)
y_test_pred = dt_classifier.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Headline accuracy figures for the baseline.
print("Initial Model Performance:")
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print()

# Per-class precision / recall / F1 on the held-out split.
print("Classification Report on Test Data:")
baseline_report = classification_report(y_test, y_test_pred)
print(baseline_report)


Initial Model Performance:
Training Accuracy: 0.9953566009104704
Test Accuracy: 0.8336974993930566

Classification Report on Test Data:
              precision    recall  f1-score   support

          no       0.91      0.90      0.91      7303
         yes       0.30      0.34      0.32       935

    accuracy                           0.83      8238
   macro avg       0.61      0.62      0.61      8238
weighted avg       0.84      0.83      0.84      8238



In [6]:
# Hyperparameter tuning for the decision tree.
# GridSearchCV is imported with the other dependencies in the notebook's
# import cell, so every dependency is visible in one place.

# Define parameter grid for GridSearchCV (4 * 3 * 3 = 36 candidates)
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
# `scoring` is spelled out so the selection criterion is visible; accuracy is
# what a classifier's default scorer computes, so the search result is the same
# as leaving it unset.
# NOTE(review): the recorded reports show the accuracy-selected tree has lower
# recall on the minority 'yes' class than the untuned tree (0.29 vs 0.34).
# If the 'yes' class is the one that matters, consider scoring='f1_macro' or
# 'balanced_accuracy' -- that will change the selected parameters and metrics.
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,  # 5-fold cross-validation
                           verbose=1,
                           n_jobs=-1)  # use all available cores

# Fit GridSearchCV on training data only; the test split stays untouched
# until the final evaluation below.
grid_search.fit(X_train, y_train)

# Best parameters and best score from GridSearchCV
print("Best Parameters Found:")
print(grid_search.best_params_)
print("Best Cross-validation Score:", grid_search.best_score_)
print("")

# Use the best model from GridSearchCV (with the default refit=True this is
# the winning configuration re-trained on the full training split)
best_dt_model = grid_search.best_estimator_

# Predictions using the best model
y_test_pred_best = best_dt_model.predict(X_test)

# Evaluate the tuned model
print("Tuned Model Performance:")
print("Test Accuracy:", accuracy_score(y_test, y_test_pred_best))
print("")
print("Classification Report on Test Data:")
print(classification_report(y_test, y_test_pred_best))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters Found:
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best Cross-validation Score: 0.8936267071320183

Tuned Model Performance:
Test Accuracy: 0.8941490653071134

Classification Report on Test Data:
              precision    recall  f1-score   support

          no       0.91      0.97      0.94      7303
         yes       0.57      0.29      0.38       935

    accuracy                           0.89      8238
   macro avg       0.74      0.63      0.66      8238
weighted avg       0.87      0.89      0.88      8238

