<a href="https://colab.research.google.com/github/Abiola97/ajaoabiola0812/blob/main/SCT_DS_03/SCT_DS_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TASK 03: Build a decision tree classifier to predict whether a customer will purchase a product or service based on their demographic and behavioral data. Use the Bank Marketing dataset from the UCI Machine Learning Repository.**

IMPORTING NECESSARY LIBARIES

In [1]:
# IMPORTING NECESSARY LIBARIES
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt


LOADING DATASET

In [2]:
# Load the dataset from file path
file_path = "/content/Bank Marketing Dataset combined.xlsx"

# Read both sheets
df_full = pd.read_excel(file_path, sheet_name="bank-full")  # Full dataset (for training)
df_test = pd.read_excel(file_path, sheet_name="bank")  # Test dataset

# Display the first few rows
print("Full Dataset Sample:")
display(df_full.head())

print("\nTest Dataset Sample:")
display(df_test.head())


Full Dataset Sample:


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,Month list,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,May,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,May,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,May,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,May,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,May,198,1,-1,0,unknown,no



Test Dataset Sample:


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,Month list,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,October,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,May,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,April,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,June,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,May,226,1,-1,0,unknown,no


In [3]:
df_full.dtypes

Unnamed: 0,0
age,int64
job,object
marital,object
education,object
default,object
balance,int64
housing,object
loan,object
contact,object
day,int64


**SPLITTING THE BANK-FULL DATASET IN THE COMBINED DATASET**

In [4]:
from sklearn.model_selection import train_test_split

# Split "bank full" into training (80%) and validation (20%)
train_df, val_df = train_test_split(df_full, test_size=0.2, random_state=42, stratify=df_full['y'])

# Print dataset sizes
print(f"Training Set: {train_df.shape}")
print(f"Validation Set: {val_df.shape}")
print(f"Test Set (from 'bank' sheet): {df_test.shape}")

Training Set: (36168, 17)
Validation Set: (9043, 17)
Test Set (from 'bank' sheet): (4521, 17)


**PREROCESSING THE DATASET FOR LABEL ENCODING OF CATEGORICAL VARIBLES**

In [5]:
# Define target variable (y) and features (X)
target_column = 'y'  # Assuming 'y' is the column indicating purchase (Yes/No)
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

X_val = val_df.drop(columns=[target_column])
y_val = val_df[target_column]

X_test = df_test.drop(columns=[target_column])
y_test = df_test[target_column]

# Import and initialize LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder() # Initialize LabelEncoder

# Iterate through columns and apply label encoding to object type columns
for col in X_train.select_dtypes(include=['object']).columns:
    X_train[col] = label_encoder.fit_transform(X_train[col])
    X_val[col] = label_encoder.transform(X_val[col])  # Use the same encoder for validation
    X_test[col] = label_encoder.transform(X_test[col]) # Use the same encoder for test


# Confirm dataset shapes
print(f"Training Features: {X_train.shape}, Training Labels: {y_train.shape}")
print(f"Validation Features: {X_val.shape}, Validation Labels: {y_val.shape}")
print(f"Test Features: {X_test.shape}, Test Labels: {y_test.shape}")

Training Features: (36168, 16), Training Labels: (36168,)
Validation Features: (9043, 16), Validation Labels: (9043,)
Test Features: (4521, 16), Test Labels: (4521,)


**TRAINING THE DECISION TREE MODEL**

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Decision Tree model
clf = DecisionTreeClassifier(random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Predict on validation set
y_val_pred = clf.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

# Classification report
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

# Confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_val_pred))


Validation Accuracy: 0.88

Classification Report:
               precision    recall  f1-score   support

          no       0.93      0.93      0.93      7985
         yes       0.47      0.48      0.48      1058

    accuracy                           0.88      9043
   macro avg       0.70      0.70      0.70      9043
weighted avg       0.88      0.88      0.88      9043


Confusion Matrix:
 [[7423  562]
 [ 551  507]]


**IMPROVING THE MODEL'S  PRECISION AT PREDICTING ACTUAL PURCHASES BY REEVALUATING HYPERPARAMETERS**

In [7]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'max_depth': [50, 25, 35, None],  # Limit depth to prevent overfitting
    'min_samples_split': [5, 15, 20],  # Minimum samples needed to split a node
    'min_samples_leaf': [6, 14, 30],  # Minimum samples in a leaf node
    'criterion': ['gini', 'entropy']  # Split strategy
}

# Initialize Decision Tree
clf = DecisionTreeClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Train best model
best_clf = grid_search.best_estimator_

# Predict on validation set
y_val_pred = best_clf.predict(X_val)

# Evaluate tuned model
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy After Tuning: {accuracy:.4f}")

print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_val_pred))


Best Hyperparameters: {'criterion': 'gini', 'max_depth': 50, 'min_samples_leaf': 30, 'min_samples_split': 5}
Validation Accuracy After Tuning: 0.9024

Classification Report:
               precision    recall  f1-score   support

          no       0.93      0.96      0.95      7985
         yes       0.61      0.46      0.52      1058

    accuracy                           0.90      9043
   macro avg       0.77      0.71      0.73      9043
weighted avg       0.89      0.90      0.90      9043


Confusion Matrix:
 [[7678  307]
 [ 576  482]]


**Saving the model for re-use later via joblib**

In [None]:
import joblib

# Save the final Decision Tree model
joblib.dump(best_clf, "/content/decision_tree_model.pkl")

print("Model saved successfully!")


Model saved successfully!


**Loading the presaved model to make prediction on new data**

In [None]:
# Load the saved model
loaded_model = joblib.load("/content/decision_tree_model.pkl")

# Make predictions on new data
new_predictions = loaded_model.predict(X_val)

print(new_predictions[:10])  # Show first 10 predictions


[0 0 0 0 0 0 0 1 0 0]


## **Final Model evaluation with test datatset in sheet named "Bank"**

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on test dataset
y_test_pred = best_clf.predict(X_test)

# Evaluate the final model
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

print("\nTest Classification Report:\n", classification_report(y_test, y_test_pred))
print("\nTest Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Test Accuracy: 0.9131

Test Classification Report:
               precision    recall  f1-score   support

          no       0.94      0.96      0.95      4000
         yes       0.65      0.52      0.58       521

    accuracy                           0.91      4521
   macro avg       0.80      0.74      0.77      4521
weighted avg       0.91      0.91      0.91      4521


Test Confusion Matrix:
 [[3856  144]
 [ 249  272]]
