In [1]:
import pandas as pd
import numpy as np
from typing import Tuple, List, Dict
import matplotlib.pyplot as plt
import math

from scipy.optimize import minimize

import sys
sys.path.insert(0, '../scripts/')
import utils as utl
from decision_tree import Decision_Tree_Classifier

NameError: name 'Tuple' is not defined

In [13]:
class Decision_Tree_Classifier:
    def __init__(self) -> None:
        self.res_dict = {}
    
    def fit(self, X, y, X_cat_cols):
        self.res_dict = self.des_tree_nodes(X, y, X_cat_cols)
        
    def predict(self, X):
        return X.apply(lambda x: self.predict_single(x, self.res_dict), axis=1)

    def predict_single(self, x, res_dict):
        tp = res_dict['type']
        attr = res_dict['attr']
        if tp == 'cat':
            res = res_dict['res'][x[attr]]
        else:
            val = res_dict['vals']
            i = self.recover_index(x[attr],val)
            res = res_dict['res'][str(i)]
        
        if isinstance(res,dict):
            return self.predict_single(x, res)
        else:
            return res

    def new_nodes(self, X: pd.DataFrame, y: pd.Series, attr_max_gain: str, X_cat_cols) -> Dict:
        dct_res = {}
        cols = list(X.columns)
        cols.remove(attr_max_gain)
        X_upd = X[cols]
        Xcc = X_cat_cols.copy()
        if attr_max_gain in X_cat_cols:
            Xcc.remove(attr_max_gain)
        
        dct_res['attr'] = attr_max_gain
        if attr_max_gain in X_cat_cols:
            dct_res['type'] = 'cat'
            dct_res['vals'] = np.sort(X[attr_max_gain].unique())
            childs = {}

            for u in dct_res['vals']:
                Xn = X_upd.loc[X[attr_max_gain]==u]
                yn = y.loc[X[attr_max_gain]==u]
                if len(yn)>0:
                    childs[u] = yn.iloc[0]
                    if len(yn.unique()) > 1:
                        childs[u] = self.des_tree_nodes(Xn, yn, Xcc)
            dct_res['res'] = childs
        else:
            dct_res['type'] = 'cont'
            splits = self.get_splits(X[attr_max_gain], y)
            dct_res['vals'] = splits
            childs = {}
            for i in range(len(splits) + 1):                
                Xn = self.iloc_ranges(X_upd, X[attr_max_gain], splits, i)
                yn = self.iloc_ranges(y, X[attr_max_gain], splits, i)
                if len(yn)>0:
                    childs[str(i)] = self.des_tree_nodes(Xn, yn, Xcc)
            dct_res['res'] = childs
        return dct_res
    
    def des_tree_nodes(self, X: pd.DataFrame, y: pd.Series, X_cat_cols):
        cols = list(X.columns)
        attr_max_gain = self.get_max_ig(X, y, X_cat_cols)
        cols.remove(attr_max_gain)
        if len(cols)!=0:
            out = self.new_nodes(X, y, attr_max_gain, X_cat_cols)
            return out
        y_count = {}
        for u in y.unique():
            y_count[u] = y.loc[y==u].count()
        out = max(y_count, key=y_count.get)
        return out
    
    def info_gain_cat(self, x:pd.Series, y:pd.Series) -> float:
        """Calculates the info gain of using a given categorical attribute to describe categorical data

        Parameters
        ----------
        x : pd.Series
            Categorical values of the attribute
        y : pd.Series
            Categorical target

        Returns
        -------
        float
            Information gain
        """
        ES = self.entropy(y)
        ESv = 0
        nt = len(y)
        if nt != 0:
            for u in np.sort(x.unique()):
                Sv = y.loc[x==u]
                n = len(Sv)        
                ESv += - (n/nt)*self.entropy(Sv)
        return ES - ESv

    def info_gain_con(self, x:pd.Series, y:pd.Series) -> float:
        """Calculates the info gain of using a given continuous attribute to describe categorical data

        Parameters
        ----------
        x : pd.Series
            Categorical values of the attribute
        y : pd.Series
            Categorical target

        Returns
        -------
        float
            Information gain
        """
        splits = self.get_splits(x, y)
        ES = self.entropy(y)
        ESv = 0
        nt = len(y)
        if nt != 0:
            for i in range(len(splits) + 1):
                Sv = self.iloc_ranges(y, x, splits, i)
                n = len(Sv)
                ESv += - (n/nt)*self.entropy(Sv)
        return ES - ESv

    def get_max_ig(self, X: pd.DataFrame, y: pd.Series, X_cat_cols: List) -> str:
        """Identifies the column of X with the maximum info gain related to the target y

        Parameters
        ----------
        X : pd.DataFrame
            Input data
        y : pd.Series
            Target data
        X_cat_cols : List
            List of categorical columns of X

        Returns
        -------
        str
            Name of the column with maximum information gain
        """
        dct = {}
        cols = list(X.columns)
        for attr in cols:
            if attr in X_cat_cols:
                dct[attr] = self.info_gain_cat(X[attr], y)
            else:
                dct[attr] = self.info_gain_con(X[attr], y)
        return max(dct, key=dct.get)
    
    def iloc_ranges(self, y: pd.Series, x: pd.Series, splits: List, i: int) -> pd.Series:
        """Filters the data from the series y for which the continuous data x is between the values split[i-1] and split[i]

        Parameters
        ----------
        x : pd.Series
            Continuous data to use as base for the filtering
        y : pd.Series or pd.DataFrame
            Data to be filtered
        splits : List
            Values at which the values of x can be splitted
        i : int
            Index of the split to be used


        Returns
        -------
        pd.Series
            Filtered data
        """
        if len(x) > 0:
            if i > 0 and i < len(splits):
                Sv = y.loc[(x>splits[i-1]) & (x<=splits[i])]
            elif i == 0:
                Sv = y.loc[x<=splits[0]]
            else:
                Sv = y.loc[x>splits[-1]]
            return Sv
        return pd.Series([])

    def recover_index(self, val: float, splits: List) -> int:
        """Recovers the index at which val falls whitin the range of the values specified by splits

        Parameters
        ----------
        val : pd.Series
            Value to evaluate
        splits : List
            Limits of the split ranges.

        Returns
        -------
        int
            Recovered index
        """
        for i,v in enumerate(splits):
            if i > 0:
                if val > splits[i-1] and val <= v:
                    return i
            else:
                if val <= v:
                    return i
        return len(splits)
    
    def get_splits(self, x:pd.Series, y:pd.Series) -> np.array:
        """Returns the splits of the categorical data x, using the median x for each category of y

        Parameters
        ----------
        x : pd.Series
            Input data for which the splits will be calculated
        y : pd.Series
            Target categorical data 

        Returns
        -------
        np.array
            Limits of the splits
        """
        vals = []
        for u in y.unique():
            x_np = x.loc[y==u].to_numpy()
            vals.append(np.median(x_np))
        vals_sorted = np.sort(np.unique(np.array(vals)))
        return vals_sorted
    
    def entropy(self, y: pd.Series) -> float:
        """Calculates the entropy of y

        Parameters
        ----------
        y : pd.Series
            Categorical value for which the entropy will be calculated

        Returns
        -------
        float
            Entropy
        """
        entropy = 0
        ntt = len(y)
        if ntt != 0:
            for u in y.unique():
                y_c = y.loc[y==u]
                p = len(y_c)/ntt
                if p != 0:
                    entropy += -p*math.log2(p)
        return entropy

In [3]:
df = pd.read_csv('../data/1.raw/customerClassification.csv')#, parse_dates=['DateTime'],index_col=['DateTime'])
df.columns
X_cols = ['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size',
       'Segmentation']
X_cat_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession',
       'Spending_Score', 'Segmentation']
y_col = 'Var_1'

X = df[X_cols]
X = X.fillna(0)

for cat in X_cat_cols:
    X[cat] = X[cat].astype('category').cat.codes
# for c in X.columns:
#     X[c] = utl.min_max_scaling(X[c])[0]

y = df[y_col].fillna(0).astype('category').cat.codes

In [4]:
dt = Decision_Tree_Classifier()
dt.fit(X, y, X_cat_cols)

In [5]:
y_p = dt.predict(X)

In [6]:
np.set_printoptions(formatter={'float_kind':'{:4.0f}'.format})
cm = utl.confusion_matrix(y, y_p)
acc = utl.accuracy_classification(cm)
pres = utl.presicion_classification(cm)
rec = utl.recall_classification(cm)
f1 = utl.fbeta_classification(cm)

acc, pres, rec, f1

(0.7222359940505702,
 0.2236842105263158,
 0.8095238095238095,
 0.3505154639175258)

In [7]:
cm

array([[  17,    0,    0,    1,    5,    1,   52,    0],
       [   0,   35,    3,    5,    5,    0,   84,    1],
       [   0,    2,   93,    7,   12,    2,  306,    0],
       [   0,    4,   11,  204,   43,    2,  556,    2],
       [   1,    4,    8,   21,  387,    3,  659,    6],
       [   0,    1,    1,    2,    5,   26,   50,    0],
       [   3,   14,   26,   45,  117,    7, 5016,   10],
       [   0,    0,    1,    3,   10,    0,  140,   49]])

## Comparison with scikit learn

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
clf = DecisionTreeClassifier(random_state=0).fit(X,y)
y_pred_skl_np = clf.predict(X)
y_pred_skl = pd.Series(y_pred_skl_np)

In [10]:
cm_skl = utl.confusion_matrix(y, y_pred_skl)
acc_skl = utl.accuracy_classification(cm_skl)
pres_skl = utl.presicion_classification(cm_skl)
rec_skl = utl.recall_classification(cm_skl)
f1_skl = utl.fbeta_classification(cm_skl)

acc_skl, pres_skl, rec_skl, f1_skl

(0.9556271690629649, 0.9342105263157895, 0.922077922077922, 0.9281045751633986)

In [11]:
cm_skl

array([[  71,    0,    0,    0,    0,    0,    5,    0],
       [   0,  124,    0,    0,    1,    0,    8,    0],
       [   1,    1,  399,    0,    1,    0,   20,    0],
       [   0,    2,    9,  770,    2,    0,   38,    1],
       [   2,    2,   16,   14, 1025,    0,   29,    1],
       [   0,    0,    0,    0,    1,   83,    1,    0],
       [   3,    4,   36,   61,   66,    4, 5063,    1],
       [   0,    0,    1,    3,    4,    0,   20,  175]])