In [135]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [154]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [138]:
# Step 1: Load the data
# Load the dataset
file_path = '/content/bank-full.csv'
data = pd.read_csv(file_path, sep=';')

# Display the first few rows of the dataset
print(data.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


In [140]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None


In [141]:
data.isnull().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
default,0
balance,0
housing,0
loan,0
contact,0
day,0


#**Mapping based on the printed known labels:**

**Job:**
0: 'admin.'
1: 'blue-collar'
2: 'entrepreneur'
3: 'housemaid'
4: 'management'
5: 'retired'
6: 'self-employed'
7: 'services'
8: 'student'
9: 'technician'
10: 'unemployed'
11: 'unknown'

**Marital:**
0: 'divorced'
1: 'married'
2: 'single'

**Education:**
0: 'primary'
1: 'secondary'
2: 'tertiary'
3: 'unknown'

**Default:**
0: 'no'
1: 'yes'

**Housing:**
0: 'no'
1: 'yes'

**Loan:**
0: 'no'
1: 'yes'

**Contact:**
0: 'cellular'
1: 'telephone'
2: 'unknown'

**Month:**
0: 'apr'
1: 'aug'
2: 'dec'
3: 'feb'
4: 'jan'
5: 'jul'
6: 'jun'
7: 'mar'
8: 'may'
9: 'nov'
10: 'oct'
11: 'sep'

**Poutcome:**
0: 'failure'
1: 'other'
2: 'success'
3: 'unknown'

In [155]:
# Step 2: Data preprocessing
# Handle missing values, encode categorical variables, and split the dataset into features and labels.

# Handle missing values if any
data.fillna(method='ffill', inplace=True)

# Encoding categorical features
label_encoders = {}
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Encode the target variable
target_encoder = LabelEncoder()
data['y'] = target_encoder.fit_transform(data['y'])

# Split the dataset into features and target variable
X = data.drop(columns=['y'])
y = data['y']

  data.fillna(method='ffill', inplace=True)


In [157]:
# Step 3: Train-test split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [158]:
# Step 4: Resample the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Step 5: Initialize the classifier
clf = DecisionTreeClassifier(random_state=42)

# Step 6: Set up the parameter grid for Grid Search
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10]
}

# Step 7: Perform Grid Search
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1')
grid_search.fit(X_resampled, y_resampled)

# Step 8: Train the best model
best_clf = grid_search.best_estimator_
best_clf.fit(X_resampled, y_resampled)

In [160]:
# Step 9: Evaluate the model
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91      7952
           1       0.40      0.57      0.47      1091

    accuracy                           0.84      9043
   macro avg       0.67      0.73      0.69      9043
weighted avg       0.87      0.84      0.85      9043

Confusion Matrix:
[[7001  951]
 [ 467  624]]


#####**Now let's test our project, for this we are taking a subset of 20 entries from the dataset then we will be predicting on that subset after the the prediction we will be matching the results with the actual one that whether the the user will purchase the product or service or not.**

In [166]:
# Test with a subset of the dataset
def test_subset_of_dataset():
    subset_size = 20
    subset_data = data.sample(n=subset_size, random_state=42)
    subset_X = subset_data.drop(columns=['y'])
    subset_y = subset_data['y']

    subset_X_encoded = encode_input_data(subset_X, label_encoders)

    if subset_X_encoded is None:
        print("Encoding failed due to previously unseen labels.")
        return

    # Ensure that encoded subset has the correct shape
    if subset_X_encoded.shape[1] != X.shape[1]:
        print("Shape mismatch between encoded subset and training data.")
        print(f"Encoded subset shape: {subset_X_encoded.shape}")
        print(f"Training data shape: {X.shape}")
        return

    subset_predictions = best_clf.predict(subset_X_encoded)

    print("Testing with a subset of the dataset:")
    for i in range(subset_size):
        print(f"Data point {i+1}:")
        print(f"Actual: {'Purchase' if subset_y.iloc[i] == 1 else 'No Purchase'}")
        print(f"Prediction: {'Purchase' if subset_predictions[i] == 1 else 'No Purchase'}")
        print()

In [167]:
# Function to encode input data using the trained label encoders
def encode_input_data(input_df, label_encoders):
    encoded_df = input_df.copy()
    for col in label_encoders:
        if col in encoded_df.columns:
            le = label_encoders[col]
            encoded_df[col] = le.transform(encoded_df[col])
        else:
            print(f"Column {col} not found in input data.")
    return encoded_df

In [168]:
# Run the subset test
test_subset_of_dataset()

Testing with a subset of the dataset:
Data point 1:
Actual: No Purchase
Prediction: No Purchase

Data point 2:
Actual: No Purchase
Prediction: No Purchase

Data point 3:
Actual: No Purchase
Prediction: No Purchase

Data point 4:
Actual: No Purchase
Prediction: Purchase

Data point 5:
Actual: No Purchase
Prediction: No Purchase

Data point 6:
Actual: No Purchase
Prediction: Purchase

Data point 7:
Actual: No Purchase
Prediction: No Purchase

Data point 8:
Actual: No Purchase
Prediction: No Purchase

Data point 9:
Actual: No Purchase
Prediction: No Purchase

Data point 10:
Actual: No Purchase
Prediction: No Purchase

Data point 11:
Actual: Purchase
Prediction: No Purchase

Data point 12:
Actual: No Purchase
Prediction: No Purchase

Data point 13:
Actual: No Purchase
Prediction: No Purchase

Data point 14:
Actual: Purchase
Prediction: No Purchase

Data point 15:
Actual: No Purchase
Prediction: No Purchase

Data point 16:
Actual: Purchase
Prediction: Purchase

Data point 17:
Actual: No Pur