In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.utils import resample

In [2]:
url = r"D:\ProdigyInfotech\PRODIGY_DS_03\PRODIGY_DS_03_Data.csv"
data = pd.read_csv(url, sep=';')

In [3]:
data_sample = data.sample(n=1000, random_state=42)

In [4]:
categorical_cols = data_sample.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder().fit(data_sample[col]) for col in categorical_cols}

for col, le in label_encoders.items():
    data_sample[col] = le.transform(data_sample[col])

In [5]:
X = data_sample.drop('y', axis=1)
y = data_sample['y']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=42)

In [7]:
# Resample the training set to balance the dataset
X_train_res, y_train_res = resample(X_train[y_train == 0],
                                    y_train[y_train == 0],
                                    replace=True,
                                    n_samples=len(y_train[y_train == 1]),
                                    random_state=42)
X_train_balanced = pd.concat([X_train_res, X_train[y_train == 1]])
y_train_balanced = pd.concat([y_train_res, y_train[y_train == 1]])

In [8]:
dt = DecisionTreeClassifier(random_state=42)

In [9]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [10]:
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train_balanced, y_train_balanced)

In [11]:
best_dt = grid_search.best_estimator_
best_dt.fit(X_train_balanced, y_train_balanced)

In [12]:
y_pred_dt = best_dt.predict(X_test)

In [13]:
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, pos_label=label_encoders['y'].transform(['yes'])[0])
recall_dt = recall_score(y_test, y_pred_dt, pos_label=label_encoders['y'].transform(['yes'])[0])
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
class_report_dt = classification_report(y_test, y_pred_dt)

In [14]:
results_df = X_test.copy()
results_df['Actual'] = y_test
results_df['Predicted'] = y_pred_dt

In [15]:
# Print evaluation metrics
print(f"Accuracy: {accuracy_dt}")
print(f"Precision: {precision_dt}")
print(f"Recall: {recall_dt}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print("Classification Report:")
print(class_report_dt)
print("Results (first 10 rows):")
print(results_df.head(10))

Accuracy: 0.8133333333333334
Precision: 0.35
Recall: 0.875
Confusion Matrix:
[[216  52]
 [  4  28]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.81      0.89       268
           1       0.35      0.88      0.50        32

    accuracy                           0.81       300
   macro avg       0.67      0.84      0.69       300
weighted avg       0.91      0.81      0.84       300

Results (first 10 rows):
       age  job  marital  education  default  housing  loan  contact  month  \
1681    56    0        2          3        0        0     0        1      6   
21108   33    2        1          5        0        2     0        0      1   
40678   58    9        1          4        0        2     0        0      9   
19487   46    1        1          2        1        0     0        0      1   
1291    45    1        1          0        1        0     2        1      6   
28550   29   10        2          3        0       