In [None]:
import sys
import warnings
import numpy as np
import pandas as pd

from tabulate import tabulate
from prettytable import PrettyTable
from sklearn.metrics import hamming_loss
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import multilabel_confusion_matrix
from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Silence every warning for the rest of the session. This also hides
# deprecation and metric warnings, so disable it when debugging.
warnings.filterwarnings('ignore')
# Print NumPy arrays in full rather than eliding the middle with "...".
np.set_printoptions(threshold=sys.maxsize)

In [None]:
class DecisionTree:
    """Multi-label decision-tree wrapper with two problem formulations.

    Parameters
    ----------
    type : str
        'powerset' wraps the tree in ``LabelPowerset``; any other value
        wraps it in ``MultiOutputClassifier``.
    depth : int, default 10
        Passed to the tree as ``max_depth``.
    features : int, default 7
        Passed to the tree as ``max_features``.
    criterion : str, default 'gini'
        Passed to the tree as ``criterion``.
    random_state : int or None, default None
        Passed to the tree as ``random_state``. ``None`` keeps the previous
        unseeded behaviour; an integer makes repeated fits reproducible.
    """

    def __init__(self, type, depth=10, features=7, criterion='gini', random_state=None):
        self.type = type
        self.depth = depth
        self.features = features
        self.criterion = criterion
        self.random_state = random_state

    def setDepth(self, depth):
        """Set the ``max_depth`` used by the next call to ``classify``."""
        self.depth = depth

    def setFeatures(self, features):
        """Set the ``max_features`` used by the next call to ``classify``."""
        self.features = features

    def setCriterion(self, criterion):
        """Set the split criterion used by the next call to ``classify``."""
        self.criterion = criterion

    def setRandomState(self, random_state):
        """Set the seed handed to the tree by the next call to ``classify``."""
        self.random_state = random_state

    def classify(self, x_train, y_train, x_test):
        """Fit a fresh model on the training data and predict ``x_test``.

        A new estimator is built on every call from the current settings, so
        nothing learned in an earlier call is reused.

        Returns whatever the fitted wrapper's ``predict`` returns for
        ``x_test``.
        """
        model = DecisionTreeClassifier(
            criterion=self.criterion,
            max_depth=self.depth,
            max_features=self.features,
            random_state=self.random_state,
        )
        if self.type == 'powerset':
            clf = LabelPowerset(classifier=model).fit(x_train, y_train)
        else:
            # Every non-'powerset' value falls back to one tree per label.
            clf = MultiOutputClassifier(model).fit(x_train, y_train)
        return clf.predict(x_test)
    

In [None]:
def encode_label(vec):
    """Binarise whitespace-separated label strings.

    Each element of ``vec`` is split on whitespace into its individual
    labels, and the resulting lists are passed to a freshly created
    ``MultiLabelBinarizer``. Returns the output of its ``fit_transform``.
    """
    token_lists = [entry.split() for entry in vec]
    return MultiLabelBinarizer().fit_transform(token_lists)

def preprocess_data(data):
    """Build the feature frame and label matrix from the raw table.

    The first ten columns are the features and the eleventh column holds the
    label strings. The tenth feature column is renamed to
    'most_brought_item' before the categorical columns are one-hot encoded.

    The input frame is left untouched: the rename is applied to a copy of
    the feature slice rather than to ``data`` itself.

    Returns
    -------
    (X, Y)
        ``X`` is the one-hot encoded feature DataFrame and ``Y`` is the
        result of ``encode_label`` on the label column.
    """
    # Copy so that assigning new column names cannot leak back to the caller.
    X = data.iloc[:, 0:10].copy()
    feature_names = list(X.columns)
    feature_names[9] = 'most_brought_item'
    X.columns = feature_names

    Y = data.iloc[:, 10]

    # one hot encoding
    crt = ['gender', 'education', 'married', 'city', 'occupation', 'most_brought_item']
    X = pd.get_dummies(X, columns=crt)
    Y = encode_label(Y)
    return X, Y

def split_data(X, Y, train_size=0.8, random_state=None):
    """Split features and labels into train and test parts.

    Parameters
    ----------
    X, Y
        Features and labels, forwarded to ``train_test_split``.
    train_size : float, default 0.8
        Share of the rows kept for training.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` keeps the previous unseeded
        behaviour; pass an integer for a reproducible split.

    Returns
    -------
    (x_train, x_test, y_train, y_test)
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, train_size=train_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

def get_accuracy(actual, predicted):
    """Return one minus the Hamming loss between ``actual`` and ``predicted``."""
    loss = hamming_loss(actual, predicted)
    return 1 - loss

def get_scores(actual, predicted):
    """Compute the evaluation metrics for one set of predictions.

    Returns a list in this fixed order:
    ``[F1 macro, F1 micro, accuracy, weighted precision, weighted recall]``.
    Accuracy comes from ``get_accuracy`` (one minus the Hamming loss), not
    from ``accuracy_score``. Every sklearn metric is called with
    ``zero_division=0``.
    """
    f1_pair = [
        f1_score(actual, predicted, zero_division=0, average=mode)
        for mode in ('macro', 'micro')
    ]
    label_accuracy = get_accuracy(actual, predicted)
    weighted_pair = [
        metric(actual, predicted, zero_division=0, average='weighted')
        for metric in (precision_score, recall_score)
    ]
    return f1_pair + [label_accuracy] + weighted_pair

def print_results(actual, predicted):
    """Print a metric table and the summed confusion matrix.

    The table lists the five values returned by ``get_scores`` under their
    names. The confusion matrix printed afterwards is the element-wise sum
    of the per-label matrices from ``multilabel_confusion_matrix``.
    """
    measure_names = ['F1-macro', 'F1-micro', 'Accuracy', 'Precision', 'Recall']
    table = PrettyTable(['Measure', 'Value'])
    for measure, value in zip(measure_names, get_scores(actual, predicted)):
        table.add_row([measure, value])
    print(table)

    print('confusion Matrix: ')
    per_label = multilabel_confusion_matrix(actual, predicted)
    print(np.sum(per_label, axis=0))
    print()

In [None]:
# Load the raw table (path is relative to the working directory) and turn it
# into the feature frame X and the label matrix Y.
data = pd.read_csv('advertisement.csv')
X, Y = preprocess_data(data)
# Hold-out split; split_data keeps 80% of the rows for training.
x_train, x_test, y_train, y_test = split_data(X, Y)

# Hyper-parameter grid explored by the grid-search cells below.
criterion = ['gini', 'entropy']
max_depth = [3, 5, 10, 20, 30]
max_features = [3, 5, 7, 9, 11]

In [None]:
# POWERSET FORMULATION
dTreePS = DecisionTree('powerset')

# Fit one model per (criterion, max depth, max features) combination and keep
# its test-set predictions so the metrics can be reported afterwards.
resultPS = []
for crt in criterion:
    for depth in max_depth:
        for features in max_features:
            dTreePS.setCriterion(crt)
            dTreePS.setDepth(depth)
            dTreePS.setFeatures(features)
            y_pred = dTreePS.classify(x_train, y_train, x_test)
            resultPS.append([crt, depth, features, y_pred])

# Report the metrics
topPS = []
for crt, depth, features, y_pred in resultPS:
    print('Criterion: ', crt)
    print('Max depth: ', depth)
    print('Max features: ', features)
    print_results(y_test, y_pred)
    # Rank on macro F1, which is element 0 of get_scores(); element 2 is
    # accuracy. The top-3 summary labels this column 'F1-score', so the
    # stored value has to be the F1 figure.
    f1_macro = get_scores(y_test, y_pred)[0]
    topPS.append([f1_macro, crt, depth, features])

In [None]:
# MULTIOUTPUT FORMULATION
dTreeMO = DecisionTree('multioutput')

# Fit one model per (criterion, max depth, max features) combination and keep
# its test-set predictions so the metrics can be reported afterwards.
resultMO = []
for crt in criterion:
    for depth in max_depth:
        for features in max_features:
            dTreeMO.setCriterion(crt)
            dTreeMO.setDepth(depth)
            dTreeMO.setFeatures(features)
            y_pred = dTreeMO.classify(x_train, y_train, x_test)
            resultMO.append([crt, depth, features, y_pred])

# Report the metrics
topMO = []
for crt, depth, features, y_pred in resultMO:
    print('Criterion: ', crt)
    print('Max depth: ', depth)
    print('Max features: ', features)
    print_results(y_test, y_pred)
    # Rank on macro F1, which is element 0 of get_scores(); element 2 is
    # accuracy. The top-3 summary labels this column 'F1-score', so the
    # stored value has to be the F1 figure.
    f1_macro = get_scores(y_test, y_pred)[0]
    topMO.append([f1_macro, crt, depth, features])

In [None]:
# Keep the three best hyper-parameter rows of each formulation, ordered by
# the score stored in column 0 of every row (highest first).
# NOTE(review): the 'F1-score' header assumes column 0 holds macro F1 --
# confirm against what the grid-search cells put into topPS / topMO.
headers = ['F1-score', 'Criterion', 'Max depth', 'Max features']

topPS = sorted(topPS, key=lambda row: row[0], reverse=True)[:3]
topMO = sorted(topMO, key=lambda row: row[0], reverse=True)[:3]

# The trailing '' yields the blank line that separates the two tables.
print('POWERSET FORMULATION', tabulate(topPS, headers=headers, tablefmt='grid'), '', sep='\n')
print('MULTIOUTPUT FORMULATION', tabulate(topMO, headers=headers, tablefmt='grid'), sep='\n')

In [None]:
# K Fold validation metrics
K = 8
kf = KFold(n_splits=K, shuffle=True)

# Configure each formulation with its first-ranked hyper-parameters. Rows
# are [score, criterion, max depth, max features]; this assumes the top-3
# cell has already sorted topPS / topMO best-first.
# powerset formulation
dTreePS.setCriterion(topPS[0][1])
dTreePS.setDepth(topPS[0][2])
dTreePS.setFeatures(topPS[0][3])

# multioutput formulation
dTreeMO.setCriterion(topMO[0][1])
dTreeMO.setDepth(topMO[0][2])
dTreeMO.setFeatures(topMO[0][3])

# Cross-validate on the training split only. Fold data gets its own names so
# the notebook-level X, Y, x_train, x_test, y_train and y_test keep their
# values: re-running this cell (or an earlier one) then sees the same data
# instead of the last fold.
X_cv = x_train
Y_cv = y_train
valuesPS = []
valuesMO = []

for fold_train_idx, fold_test_idx in kf.split(X_cv):
    x_fold_train, x_fold_test = X_cv.iloc[fold_train_idx, :], X_cv.iloc[fold_test_idx, :]
    y_fold_train, y_fold_test = Y_cv[fold_train_idx], Y_cv[fold_test_idx]

    # Element 2 of get_scores() is accuracy (one minus the Hamming loss).
    valuesPS.append(get_scores(y_fold_test, dTreePS.classify(x_fold_train, y_fold_train, x_fold_test))[2])
    valuesMO.append(get_scores(y_fold_test, dTreeMO.classify(x_fold_train, y_fold_train, x_fold_test))[2])

print('Average Accuracy after k Fold validation: ')
print('Powerset formulation: ', round(sum(valuesPS) / len(valuesPS), 2))
print('Multioutput formulation: ', round(sum(valuesMO) / len(valuesMO), 2))