In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import urllib.request
import zipfile
import os

# Function to download a file from a URL
def download_file(url, filename):
    """Download *url* to *filename* unless *filename* already exists.

    The payload is first written to a sibling ``<filename>.part`` file and
    only renamed to *filename* once the transfer has finished. This way an
    interrupted download never leaves a truncated file behind that a later
    call would mistake for a complete one.

    Args:
        url: Location to fetch; any scheme ``urllib.request`` understands.
        filename: Destination path on the local filesystem.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
        OSError: If the destination cannot be written.
    """
    if os.path.exists(filename):
        return

    partial_path = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # os.replace is atomic on the same filesystem, so *filename* is
        # either absent or complete.
        os.replace(partial_path, filename)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# URL and local filenames for the Bank Marketing dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip'
zip_file = 'bank.zip'
# NOTE: despite its name this is the CSV file read below, not a folder.
data_folder = 'bank-full.csv'

# Maximum number of tree levels drawn in the visualisation. The classifier
# itself is grown without a depth limit; drawing every node of such a tree is
# slow and unreadable, so only the top levels are rendered.
PLOT_MAX_DEPTH = 3

# Download the dataset ZIP file (skipped when it is already on disk)
download_file(url, zip_file)

# Extract the dataset from the ZIP file into the current working directory
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall()

# Load the semicolon-separated dataset into a Pandas DataFrame
data_df = pd.read_csv(data_folder, sep=';')

# Display the first few rows of the DataFrame
print(data_df.head())

# Label-encode every text column (features and the target 'y' alike).
# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original labels later via label_encoders[column].
label_encoders = {}
for column in data_df.columns:
    column_dtype = data_df[column].dtype
    # Text columns may be inferred as 'object' or as a dedicated string dtype
    # depending on the pandas version/configuration; handle both.
    is_text_column = (
        pd.api.types.is_object_dtype(column_dtype)
        or pd.api.types.is_string_dtype(column_dtype)
    )
    if is_text_column:
        label_encoder = LabelEncoder()
        data_df[column] = label_encoder.fit_transform(data_df[column])
        label_encoders[column] = label_encoder

# Define features (X) and target variable (y)
X = data_df.drop('y', axis=1)  # Features
y = data_df['y']  # Target variable

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree Classifier (fixed seed for reproducibility)
clf = DecisionTreeClassifier(random_state=42)

# Fit the classifier
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Visualize the decision tree (optional)
plt.figure(figsize=(15, 10))
plot_tree(
    clf,
    filled=True,
    # Pass a plain list: a pandas Index is not accepted by every
    # scikit-learn release.
    feature_names=list(X.columns),
    # Listed in ascending order of the encoded target classes (0, then 1).
    class_names=['No Purchase', 'Purchase'],
    max_depth=PLOT_MAX_DEPTH,
)
plt.show()


   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
Accuracy: 0.8740462235983634

Confusion Matrix:
 [[7378  574]
 [ 565  526]]

Classification Report:
           