Data Processing

In [233]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [234]:
df = pd.read_csv('data/Train.csv')

In [235]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [236]:
df.isnull().sum().sum()

0

In [237]:
print(df['job'].unique())
print(df['marital'].unique())
print(df['housing'].unique())
print(df['loan'].unique())
print(df['contact'].unique())
print(df['month'].unique())
print(df['poutcome'].unique())
print(df['default'].unique())

['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
['married' 'single' 'divorced']
['yes' 'no']
['no' 'yes']
['unknown' 'cellular' 'telephone']
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr' 'sep']
['unknown' 'failure' 'other' 'success']
['no' 'yes']


In [6]:
print(df['job'].value_counts())
print(df['marital'].value_counts())
print(df['housing'].value_counts())
print(df['loan'].value_counts())
print(df['contact'].value_counts())
print(df['month'].value_counts())
print(df['poutcome'].value_counts())

job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: count, dtype: int64
marital
married     27214
single      12790
divorced     5207
Name: count, dtype: int64
housing
yes    25130
no     20081
Name: count, dtype: int64
loan
no     37967
yes     7244
Name: count, dtype: int64
contact
cellular     29285
unknown      13020
telephone     2906
Name: count, dtype: int64
month
may    13766
jul     6895
aug     6247
jun     5341
nov     3970
apr     2932
feb     2649
jan     1403
oct      738
sep      579
mar      477
dec      214
Name: count, dtype: int64
poutcome
unknown    36959
failure     4901
other       1840
success     1511
Name: count, dtype: int64


In [238]:
one_hot_encoded_data = pd.get_dummies(df.iloc[:,:-1], columns = ['job','education','marital','contact','month','poutcome'])
one_hot_encoded_data['loan'] = one_hot_encoded_data['loan'].replace({'yes': 1, 'no': 0})
one_hot_encoded_data['housing'] = one_hot_encoded_data['housing'].replace({'yes': 1, 'no': 0})
one_hot_encoded_data['default'] = one_hot_encoded_data['default'].replace({'yes': 1, 'no': 0})
one_hot_encoded_data['y']=df.iloc[:,-1]
one_hot_encoded_data['y'] = one_hot_encoded_data['y'].replace({'yes': 1, 'no': 0})
print(one_hot_encoded_data.head())

   age  default  balance  housing  loan  day  duration  campaign  pdays  \
0   58        0     2143        1     0    5       261         1     -1   
1   44        0       29        1     0    5       151         1     -1   
2   33        0        2        1     1    5        76         1     -1   
3   47        0     1506        1     0    5        92         1     -1   
4   33        0        1        0     0    5       198         1     -1   

   previous  ...  month_mar  month_may  month_nov  month_oct  month_sep  \
0         0  ...      False       True      False      False      False   
1         0  ...      False       True      False      False      False   
2         0  ...      False       True      False      False      False   
3         0  ...      False       True      False      False      False   
4         0  ...      False       True      False      False      False   

   poutcome_failure  poutcome_other  poutcome_success  poutcome_unknown  y  
0             False  

  one_hot_encoded_data['loan'] = one_hot_encoded_data['loan'].replace({'yes': 1, 'no': 0})
  one_hot_encoded_data['housing'] = one_hot_encoded_data['housing'].replace({'yes': 1, 'no': 0})
  one_hot_encoded_data['default'] = one_hot_encoded_data['default'].replace({'yes': 1, 'no': 0})
  one_hot_encoded_data['y'] = one_hot_encoded_data['y'].replace({'yes': 1, 'no': 0})


Upsampling

In [239]:
from sklearn.utils import resample
df = pd.DataFrame(one_hot_encoded_data)
df_majority = df[df['y'] == 0]
df_minority = df[df['y'] == 1]
df_minority_upsampled = resample(df_minority, replace=True,n_samples=len(df_majority),random_state=42)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)
one_hot_encoded_data = df_upsampled
print(df_upsampled['y'].value_counts())

y
1    39922
0    39922
Name: count, dtype: int64


Decision Tree Implementation

In [336]:
##Decision Tree Implementation
from collections import Counter
class Node():
    def __init__(self,feature=None, threshold=None, left=None, right=None, gain=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.gain = gain
        self.value = value

class Decision_Tree():
    def __init__(self, min_samples=25, max_depth=2, alpha=0.0):
        self.min_samples = min_samples
        self.max_depth = max_depth
        self.alpha = alpha

    def split_data(self, X, feature, threshold):
        left_indices = np.where(X[:, feature] <= threshold)[0]
        right_indices = np.where(X[:, feature] > threshold)[0]
        return left_indices, right_indices
    
    def entropy(self, y):
        unique_vals, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        entropy = -np.sum(probs * np.log2(probs + 1e-9)) 
        return entropy
    
    def gini_index(self, y):
        unique_vals, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        gini = 1 - np.sum(probs ** 2)
        return gini
    
    def info_gain(self, parent, left, right, metric):
        info_gain=0
        if len(left) == 0 or len(right) == 0:
            return 0
        if (metric=="entropy"):
            parent_en = self.entropy(parent)
            weight_left = len(left)/len(parent)
            weigth_right = len(right)/len(parent)
            left_en = self.entropy(left)
            right_en = self.entropy(right)
            info_gain = parent_en - (weight_left*left_en + weigth_right*right_en)
        else:
            parent_gn = self.gini_index(parent)
            weight_left = len(left)/len(parent)
            weigth_right = len(right)/len(parent)
            left_gn = self.gini_index(left)
            right_gn = self.gini_index(right)
            info_gain = parent_gn - (weight_left*left_gn + weigth_right*right_gn)
        return info_gain
    
    def best_split(self, X,y, num_samples, num_features):
        best_split = {'gain':-1, 'feature':None, 'threshold':None}
        for feature_idx in range(num_features):
            thresholds = np.unique(X[:,feature_idx])
            for threshold in thresholds:
                left_idx,right_idx= self.split_data(X, feature_idx, threshold)
                if (len(left_idx)>0 and len(right_idx)>0):
                    left_y = y[left_idx]
                    right_y = y[right_idx]
                    info_gain = self.info_gain(y,left_y,right_y,"entropy")
                    #print(info_gain)
                    if (info_gain > best_split['gain']):
                        best_split['feature']=feature_idx
                        best_split['threshold']=threshold
                        best_split['gain']=info_gain
                        best_split['left_idx']=left_idx
                        best_split['right_idx']=right_idx
        return best_split
    
    def leaf_value(self,y):
        return Counter(y).most_common(1)[0][0]


    def build_tree(self, X, y, curr_depth=0):
        if (self.max_depth is not None and curr_depth >= self.max_depth) or (X.shape[0] < self.min_samples) or len(set(y)) == 1:
            leaf_val=self.leaf_value(y)
            return Node(value=leaf_val)
        n_samples , n_features = X.shape
        best_split = self.best_split(X,y, n_samples, n_features)
        if best_split['feature'] is None:
            leaf_val=self.leaf_value(y)
            return Node(value=leaf_val)
        
        if not np.any(best_split['left_idx']) or not np.any(best_split['right_idx']):
            leaf_val=self.leaf_value(y)
            return Node(value=leaf_val)

        left_node = self.build_tree(X[best_split['left_idx']], y[best_split['left_idx']], curr_depth + 1)  
        right_node = self.build_tree(X[best_split['right_idx']], y[best_split['right_idx']], curr_depth + 1)

        return Node(best_split['feature'],best_split['threshold'],left_node,right_node,best_split['gain'])

    
    def fit(self, X, y):
        self.root = self.build_tree(X,y)

    def predict(self, X):
        predictions = []
        for x in X:
            pred = self.make_pred(x,self.root)
            predictions.append(pred)
        predictions = np.array(predictions)
        return predictions
    
    def make_pred(self, x, node):
        if node.value!=None:
            return node.value
        else:
            feature=x[node.feature]
            if (feature<=node.threshold):
                return self.make_pred(x, node.left)
            else:
                return self.make_pred(x, node.right)
    
    def count_nodes(self, node=None):
        if node is None:
            node = self.root 
        if node.value is not None: 
            return 1
        left_count = self.count_nodes(node.left) if node.left is not None else 0
        right_count = self.count_nodes(node.right) if node.right is not None else 0
        return 1 + left_count + right_count  
    
    def prune_tree(self, X_val, y_val, node=None):
        if node is None:
            node = self.root
        if node.value is not None:
            return node

        left_mask = X_val[:, node.feature] <= node.threshold
        right_mask = ~left_mask

        if node.left is not None:
            node.left = self.prune_tree(X_val[left_mask], y_val[left_mask],node.left)
        if node.right is not None:
            node.right = self.prune_tree(X_val[right_mask], y_val[right_mask],node.right)

        before_prune_error = self.calculate_error(y_val, self.predict(X_val))

        node_value = self.leaf_value(y_val)
        pruned_error = self.calculate_error(y_val, np.full_like(y_val, node_value))

        if pruned_error + self.alpha <= before_prune_error:
            node.left = None
            node.right = None
            node.value = node_value

        return node

    def calculate_error(self, y_true, y_pred):
        return np.sum(y_true != y_pred) / len(y_true)
    

    def precision(self, y_true, y_pred):
        true_positive = np.sum((y_true == 1) & (y_pred == 1))
        false_positive = np.sum((y_true == 0) & (y_pred == 1))
        
        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) != 0 else 0
        return precision

    def recall(self, y_true, y_pred):
        true_positive = np.sum((y_true == 1) & (y_pred == 1))
        false_negative = np.sum((y_true == 1) & (y_pred == 0))
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) != 0 else 0
        return recall

    def f1_score(self, y_true, y_pred):
        precision = self.precision(y_true, y_pred)
        recall = self.recall(y_true, y_pred)
        
        if precision + recall == 0:
            return 0
        f1 = 2 * (precision * recall) / (precision + recall)
        return f1

    def calculate_metrics(self, y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        precision = self.precision(y_true, y_pred)
        recall = self.recall(y_true, y_pred)
        f1 = self.f1_score(y_true, y_pred)
        
        return accuracy, precision, recall, f1

In [337]:
def train_test_split(X, y, random_state=42, test_size=0.2):
    n_samples = X.shape[0]
    np.random.seed(random_state)
    shuffled_indices = np.random.permutation(np.arange(n_samples))
    test_size = int(n_samples * test_size)
    test_indices = shuffled_indices[:test_size]
    train_indices = shuffled_indices[test_size:]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(np.array(one_hot_encoded_data.iloc[:,:-1]), np.array(one_hot_encoded_data.iloc[:,-1]), random_state=42, test_size=0.2)

Hyperparameter tuning

In [230]:
#Hyperparameter tuning
max_depth = [2,5,10,12,15,20]
min_samples = [25, 50, 100, 125]
for i in max_depth:
    for j in min_samples:
        model = Decision_Tree(min_samples=j,max_depth=i)
        model.fit(X_train, y_train)
        pred = model.predict(X_train)
        print(i,j)
        print(f"Model's Train Accuracy: {accuracy(y_train, pred)}")
        predictions = model.predict(X_test)
        print(f"Model's Test Accuracy: {accuracy(y_test, predictions)}")
        print()


2 25
Model's Train Accuracy: 0.8896845364815174
Model's Test Accuracy: 0.887082503870825

2 50
Model's Train Accuracy: 0.8896845364815174
Model's Test Accuracy: 0.887082503870825

2 100
Model's Train Accuracy: 0.8896845364815174
Model's Test Accuracy: 0.887082503870825

2 125
Model's Train Accuracy: 0.8896845364815174
Model's Test Accuracy: 0.887082503870825

5 25
Model's Train Accuracy: 0.9039785451629849
Model's Test Accuracy: 0.8948241539482416

5 50
Model's Train Accuracy: 0.9036744173186984
Model's Test Accuracy: 0.8946029639460297

5 100
Model's Train Accuracy: 0.9036467693328541
Model's Test Accuracy: 0.8951559389515594

5 125
Model's Train Accuracy: 0.9036467693328541
Model's Test Accuracy: 0.8951559389515594

10 25
Model's Train Accuracy: 0.9158671790760043
Model's Test Accuracy: 0.893607608936076

10 50
Model's Train Accuracy: 0.9124111808454755
Model's Test Accuracy: 0.8944923689449237

10 100
Model's Train Accuracy: 0.9105587657939119
Model's Test Accuracy: 0.89570891395708

Training model and calculating Metrics for training and testing data 

In [331]:
# Fit your model
tree = Decision_Tree(min_samples=25, max_depth=10)
tree.fit(X_train, y_train)

# Predict on test data
y_pred = tree.predict(X_train)

# Calculate metrics
accuracy, precision, recall, f1 = tree.calculate_metrics(y_train, y_pred)

# Print results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")


# Predict on test data
y_pred = tree.predict(X_test)

# Calculate metrics
accuracy, precision, recall, f1 = tree.calculate_metrics(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.8719550378859039
Precision: 0.8537543525490313
Recall: 0.8978154732098147
F1-Score: 0.8752307293335164
Accuracy: 0.8623496993987976
Precision: 0.8429182509505704
Recall: 0.8900878293601003
F1-Score: 0.8658611009398267


In [332]:
# Count the number of nodes in the tree
node_count = tree.count_nodes()
print(f"Total number of nodes in the tree: {node_count}")

Total number of nodes in the tree: 705


Pruning Tree

In [333]:
tree.prune_tree(X_val=X_test,y_val=y_test)

<__main__.Node at 0x16dee2460>

calculating Metrics for training and testing data after pruning

In [335]:
y_pred = tree.predict(X_train)

# Calculate metrics
accuracy, precision, recall, f1 = tree.calculate_metrics(y_train, y_pred)

# Print results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")


# Predict on test data
y_pred = tree.predict(X_test)

# Calculate metrics
accuracy, precision, recall, f1 = tree.calculate_metrics(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.8685108648005511
Precision: 0.8482626053526542
Recall: 0.8977215823735604
F1-Score: 0.8722915748019524
Accuracy: 0.8635395791583166
Precision: 0.8425411096652076
Recall: 0.8936010037641154
F1-Score: 0.8673202216403825


In [334]:
# Count the number of nodes in the tree
node_count = tree.count_nodes()
print(f"Total number of nodes in the tree: {node_count}")

Total number of nodes in the tree: 385


Generating results on test.xlsx

In [345]:
df = pd.read_csv('data/Test.csv')
one_hot_encoded_data = pd.get_dummies(df.iloc[:,:-1], columns = ['job','education','marital','contact','month','poutcome'])
one_hot_encoded_data['loan'] = one_hot_encoded_data['loan'].replace({'yes': 1, 'no': 0})
one_hot_encoded_data['housing'] = one_hot_encoded_data['housing'].replace({'yes': 1, 'no': 0})
one_hot_encoded_data['default'] = one_hot_encoded_data['default'].replace({'yes': 1, 'no': 0})
y_pred = tree.predict(np.array(one_hot_encoded_data))

  one_hot_encoded_data['loan'] = one_hot_encoded_data['loan'].replace({'yes': 1, 'no': 0})
  one_hot_encoded_data['housing'] = one_hot_encoded_data['housing'].replace({'yes': 1, 'no': 0})
  one_hot_encoded_data['default'] = one_hot_encoded_data['default'].replace({'yes': 1, 'no': 0})


In [346]:
df = pd.DataFrame(y_pred)
df.to_csv('output.csv', index=False, header=False)