In [None]:
# ------------------------2.2.1 Distribution of labels-------------------------
import numpy as np
import matplotlib.pyplot as plt
import heapq
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
import math
import time
from prettytable import PrettyTable

# ------------------Loading the .npy file------------------------
# `data` is reused by every later KNN cell as the full dataset.
# NOTE(review): allow_pickle=True unpickles arbitrary objects while loading;
# only use it on a data.npy file that comes from a trusted source.
data = np.load('data.npy',allow_pickle=True)

# ------------------Extracting the label column------------------
# Column 3 holds the class label of each sample (the KNN class below reads
# the same column as its target).
label_column = data[:, 3]
# np.unique returns the distinct labels in sorted order together with the
# number of rows carrying each of them.
unique_labels, label_counts = np.unique(label_column, return_counts=True)

# Author's note: len(unique_labels) was 193, i.e. there are 193 unique labels
# (the original note said "columns", which was a slip).

# --------------------Creating a bar chart------------------------
# One bar per distinct label, bar height = number of samples with that label.
plt.bar(unique_labels,label_counts)
plt.xlabel('Labels')
plt.ylabel('Frequency')
plt.title('Distribution of labels')

In [None]:
# ------------------------2.3.1 KNN Class-------------------------
class KNN:
    """Brute-force k-nearest-neighbour classifier over pre-computed embeddings.

    The dataset passed to ``split_data`` is a 2-D array whose column 1 holds
    the 'ResNet' embedding, column 2 the 'VIT' embedding and column 3 the
    label of each sample.

    Parameters
    ----------
    encoder_type : str
        'ResNet' or 'VIT'; selects the embedding column used as features.
    k : int
        Number of neighbours that vote on a prediction.
    distance_metric : str
        'euclidean', 'manhattan' or 'cosine'.
    """

    #--------------------Initializing the class with parameters---------------
    def __init__(self, encoder_type, k, distance_metric):
        self.encoder_type = encoder_type
        self.k = k
        self.distance_metric = distance_metric
        # Filled in by fit(); predict() refuses to run while they are unset.
        self.X_train = None
        self.y_train = None

    #--------------------Modifying the parameters of class------------------
    def modify_parameters(self, encoder_type, k, distance_metric):
        """Replace all three hyperparameters at once."""
        self.set_encoder_type(encoder_type)
        self.set_k(k)
        self.set_distance_metric(distance_metric)

    def set_encoder_type(self, encoder_type):
        """Set the embedding ('ResNet' or 'VIT') used by ``split_data``."""
        self.encoder_type = encoder_type

    def set_k(self, k):
        """Set the number of neighbours used by ``predict``."""
        self.k = k

    def set_distance_metric(self, distance_metric):
        """Set the metric ('euclidean', 'manhattan' or 'cosine')."""
        self.distance_metric = distance_metric

    def calculate_metric(self,X_val,y_val):
        """Predict ``X_val`` and score the predictions against ``y_val``.

        Returns
        -------
        tuple
            ``(f1, accuracy, precision, recall)``; f1, precision and recall
            are weighted averages with ``zero_division=0``.
        """
        y_val_pred=self.predict(X_val)
        f1 = f1_score(y_val, y_val_pred, average='weighted',zero_division=0)
        accuracy = accuracy_score(y_val, y_val_pred)
        precision = precision_score(y_val, y_val_pred, average='weighted',zero_division=0)
        recall = recall_score(y_val, y_val_pred, average='weighted',zero_division=0)

        return f1,accuracy,precision,recall

    def split_data(self,data,test_ratio,random_state=None,encoder_type=None):
        """Split ``data`` into train/validation features and labels.

        Parameters
        ----------
        data : array
            2-D array; columns 1/2 are the ResNet/VIT embeddings, column 3
            the labels.
        test_ratio : float
            Fraction of the rows placed in the validation set.
        random_state : int or None, optional
            Forwarded to ``train_test_split``; pass an int to get the same
            partition on every call. ``None`` keeps the original unseeded
            behaviour.
        encoder_type : str or None, optional
            Embedding to use for this call only; defaults to
            ``self.encoder_type``.

        Returns
        -------
        tuple
            ``(X_train, X_val, y_train, y_val)``.

        Raises
        ------
        ValueError
            If the encoder is neither 'ResNet' nor 'VIT'.
        """
        if encoder_type is None:
            encoder_type = self.encoder_type

        if encoder_type == 'ResNet':
            encoded_data = data[:,1]
        elif encoder_type == 'VIT':
            encoded_data = data[:,2]
        else:
            # Previously None was handed to train_test_split, which failed
            # with an unrelated-looking error.
            raise ValueError(f"Unknown encoder_type {encoder_type!r}; expected 'ResNet' or 'VIT'")

        label_data=data[:,3]

        # Split the data into training and testing sets
        X_train, X_val, y_train, y_val = train_test_split(
            encoded_data,label_data,test_size=test_ratio,random_state=random_state)

        return X_train,X_val,y_train,y_val

    def fit(self,X_train,y_train):
        """Memorise the training samples; KNN has no other training step."""
        self.X_train=X_train
        self.y_train=y_train


    def predict(self, X_test):
        """Predict a label for every sample in ``X_test``.

        Each test sample is compared against every stored training sample;
        the k closest ones vote. Ties in distance are resolved in favour of
        the earlier training sample, ties in the vote in favour of the
        smallest label (``np.unique`` sorts the candidates).

        Returns
        -------
        list
            One predicted label per test sample.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        ValueError
            If ``k`` is smaller than 1 or the metric is unknown.
        """
        if getattr(self, 'X_train', None) is None or getattr(self, 'y_train', None) is None:
            raise RuntimeError("KNN.predict called before fit")
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k!r}")

        predictions = []
        for sample in X_test:
            # Distances to all training samples, collected in a plain list;
            # the previous np.append-per-element reallocated the whole array
            # on every step.
            distances = [self.calculate_distance(sample, train_sample)
                         for train_sample in self.X_train]

            #returning indices of the first k neighbours
            k_indices = heapq.nsmallest(self.k, range(len(distances)), key=distances.__getitem__)

            #extracting the labels of the k nearest neighbours training samples
            k_nearest_labels = [self.y_train[index] for index in k_indices]

            # Majority vote over the neighbour labels
            unique_elements, counts = np.unique(k_nearest_labels, return_counts=True)
            predictions.append(unique_elements[np.argmax(counts)])

        return predictions


    # ---------------------Computing the  distance---------------------------
    def calculate_distance(self, data_point1, data_point2):
        """Distance between two samples under ``self.distance_metric``.

        Raises
        ------
        ValueError
            If the metric is not 'euclidean', 'manhattan' or 'cosine'
            (previously this silently returned ``None``).
        """
        if self.distance_metric == 'euclidean':
            return self.euclidean_distance(data_point1, data_point2)
        if self.distance_metric == 'manhattan':
            return self.manhattan_distance(data_point1, data_point2)
        if self.distance_metric == 'cosine':
            return self.cosine_distance(data_point1, data_point2)
        raise ValueError(
            f"Unknown distance_metric {self.distance_metric!r}; "
            "expected 'euclidean', 'manhattan' or 'cosine'")

    #--------------------Calculating euclidean distance--------------------
    def euclidean_distance(self,x,y):
        """Square root of the summed squared element-wise differences."""
        return np.sqrt(np.sum((x - y) ** 2))


    #--------------------Calculating manhattan distance----------------------
    def manhattan_distance(self,x,y):
        """Sum of the absolute element-wise differences."""
        return np.sum(np.abs(x - y))

    #--------------------Calculating cosine distance----------------------
    def cosine_distance(self,x,y):
        """Return ``1 - cos(angle between x and y)`` as a scalar.

        Both inputs are flattened first, so a sample stored as a single row
        (shape ``(1, d)``) yields a scalar instead of a ``(1, 1)`` array.
        """
        np_x = np.ravel(np.array(x))
        np_y = np.ravel(np.array(y))
        dot_product = np.dot(np_x,np_y)

        norm_1 = np.linalg.norm(np_x)
        norm_2 = np.linalg.norm(np_y)

        cos_theta = dot_product / (norm_1 * norm_2)

        return 1 - cos_theta

    #-------------------Calculating inference time for any triplet-------------------
    def calculate_inference_time(self,encoder,k,distance_metric,test_ratio):
        """Time this classifier's ``predict`` for one hyperparameter triplet.

        Reconfigures the instance, splits the notebook-level ``data`` array,
        fits, and measures only the prediction call. Accuracy is computed
        after the clock stops so that metric computation does not count as
        inference time.

        Returns
        -------
        tuple
            ``(inference_time_seconds, accuracy)``.
        """
        #------------------Fitting the hyperparameters-----------------------
        self.modify_parameters(encoder,k,distance_metric)
        X_train,X_val,y_train,y_val = self.split_data(data,test_ratio)
        self.fit(X_train,y_train)

        #-----------------------Timing the prediction only-----------------------
        start_time = time.perf_counter()
        y_val_pred = self.predict(X_val)
        inference_time = time.perf_counter() - start_time

        accuracy = accuracy_score(y_val, y_val_pred)

        return inference_time,accuracy

    #-------------------Calculating inference time for sklearn's KNN-------------------
    def calculate_default_inference_time(self,encoder,k,distance_metric,test_ratio):
        """Time scikit-learn's ``KNeighborsClassifier`` on the same task.

        ``encoder`` and ``distance_metric`` are now honoured: the embedding
        column is chosen from ``encoder`` and the metric is passed to the
        sklearn classifier. Previously both arguments were ignored (the
        instance's current encoder and sklearn's default metric were used).
        The instance's own hyperparameters are left untouched.

        Returns
        -------
        tuple
            ``(inference_time_seconds, accuracy)``.
        """
        X_train,X_val,y_train,y_val = self.split_data(data,test_ratio,encoder_type=encoder)

        knn_classifier = KNeighborsClassifier(n_neighbors=k, metric=distance_metric)

        # sklearn needs one flat feature vector per sample.
        X_train_flat = [np.ravel(sample) for sample in X_train]
        X_val_flat = [np.ravel(sample) for sample in X_val]

        #------------------Fitting the hyperparameters-----------------------
        knn_classifier.fit(X_train_flat, y_train)

        #-----------------------Timing the prediction only-----------------------
        start_time = time.perf_counter()
        predictions = knn_classifier.predict(X_val_flat)
        inference_time = time.perf_counter() - start_time

        #-----------------Calculating the default accuracy------------------
        accuracy_default = accuracy_score(y_val, predictions)

        return inference_time,accuracy_default


In [None]:
# ------------------------2.4.1 Best triplet-------------------------
from prettytable import PrettyTable

distance_calculator = KNN(encoder_type='ResNet', k=29, distance_metric='cosine')

test_ratio = 0.2
# Seed used so that every triplet is scored on the same train/validation
# partition; without it each triplet got its own random split and the ranking
# mixed hyperparameter effect with split noise.
SPLIT_SEED = 42

# Largest k tried is sqrt(number of training samples).
N=(1-test_ratio)*len(data)
max_len=int(math.sqrt(N))
encoders = ['ResNet','VIT']
distance_metrics = ['euclidean', 'manhattan', 'cosine']
triplets_list=[]
# Created here because the accuracy-vs-k cell further down appends to them.
k_list=[]
accuracy_list=[]

# One split per encoder, computed once. split_data does not seed its split, so
# the global NumPy RNG is re-seeded before each call; both encoders then
# receive the same row partition.
splits = {}
for encoder in encoders:
    distance_calculator.set_encoder_type(encoder)
    np.random.seed(SPLIT_SEED)
    splits[encoder] = distance_calculator.split_data(data,test_ratio)

for k in range(1,max_len+1,2):
    for encoder in encoders:
        X_train,X_val,y_train,y_val = splits[encoder]
        for distance_metric in distance_metrics:

            distance_calculator.modify_parameters(encoder,k,distance_metric)
            distance_calculator.fit(X_train,y_train)

            f1,accuracy,precision,recall=distance_calculator.calculate_metric(X_val,y_val)

            triplets_list.append(((k, encoder, distance_metric), accuracy))

# Highest validation accuracy first; sorted() is stable, so equal accuracies
# keep their evaluation order.
sorted_triplets = sorted(triplets_list, key=lambda x: x[1], reverse=True)

#------------------------------Printing best triplet--------------------------
print(f"Best triplet: k={sorted_triplets[0][0][0]} Encoder={sorted_triplets[0][0][1]} Distance_metric={sorted_triplets[0][0][2]}")

In [None]:
# ------------------------2.4.2 20 Best triplets-------------------------

# sorted_triplets is already ordered by accuracy (best first), so the first
# 20 entries are the 20 best (k, encoder, metric) combinations.
top_20_triplets = sorted_triplets[:20]

table=PrettyTable()
table.field_names = ["Rank","k","Encoder","Distance_Metric","Accuracy"]

# One table row per triplet; ranks start at 1.
for rank, (triplet, accuracy) in enumerate(top_20_triplets, start=1):
    k, encoder, metric = triplet
    table.add_row([rank, k, encoder, metric, accuracy])

print(table)

In [None]:
# ------------------------2.4.3 Accuracy vs k plot-------------------------
# Start from empty lists so that re-running this cell does not append a second
# copy of every point to the curve.
k_list = []
accuracy_list = []

# Split once: every k is evaluated on the same validation set, so the curve
# shows the effect of k rather than the noise of a fresh random split per k.
distance_calculator.set_encoder_type('VIT')
X_train,X_val,y_train,y_val = distance_calculator.split_data(data,test_ratio)
distance_calculator.fit(X_train,y_train)

for k in range(1,max_len+1,2):

    distance_calculator.modify_parameters('VIT',k,'manhattan')

    f1,accuracy,precision,recall=distance_calculator.calculate_metric(X_val,y_val)

    k_list.append(k)
    accuracy_list.append(accuracy)

#--------------------------Plotting accuracy against k-------------------
plt.plot(k_list,accuracy_list, marker='o', linestyle='-')
plt.title('k vs. Accuracy Plot(Encoder=VIT distance_metric=manhattan)')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.grid(True) 
plt.show()

In [None]:
# ------------------------2.6.2 Inference time plot-------------------------
# "Initial" = hand-picked configuration; "Default" = scikit-learn's KNN with
# the same arguments.
inference_time_initial,accuracy_initial=distance_calculator.calculate_inference_time('ResNet',29,'manhattan',test_ratio)
inference_time_default,accuracy_default=distance_calculator.calculate_default_inference_time('ResNet',29,'manhattan',test_ratio)

# "Optimized" = fastest triplet found in the search; "Best" = most accurate
# triplet found in the search. Time and accuracy of each are recorded together
# so that every reported pair comes from the same run.
inference_time_optimized = float('inf')
accuracy_optimized = None
best_accuracy=float('-inf')
inference_time_best = None
best_triplet = None

for k in range(1,max_len+1,2):
    for encoder in encoders:
        for distance_metric in distance_metrics:
            inference_time,accuracy=distance_calculator.calculate_inference_time(encoder,k,distance_metric,test_ratio)

            if inference_time < inference_time_optimized:
                inference_time_optimized = inference_time
                # Previously the reported "optimized" accuracy was simply the
                # accuracy of the last triplet in the loop.
                accuracy_optimized = accuracy

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_triplet = (encoder,k,distance_metric)
                inference_time_best = inference_time

accuracy_best = best_accuracy

print(f"Initial inference time={inference_time_initial} Initial accuracy={accuracy_initial}")
print(f"Best inference time={inference_time_best} Best accuracy={best_accuracy}")
print(f"Optimized inference time={inference_time_optimized} Optimized accuracy={accuracy_optimized}")
print(f"Default inference time={inference_time_default} Default accuracy={accuracy_default}")

models=['Initial KNN','Best KNN','Optimized KNN','Default KNN']
Inference_time = [inference_time_initial,inference_time_best,inference_time_optimized,inference_time_default]

table=PrettyTable()
table.field_names = ["Model","Time","Accuracy"]
table.add_row(['Initial', inference_time_initial, accuracy_initial])
table.add_row(['Best', inference_time_best, best_accuracy])
table.add_row(['Optimized', inference_time_optimized, accuracy_optimized])
# The Default row used to show inference_time_initial by mistake.
table.add_row(['Default', inference_time_default, accuracy_default])

print(table)

plt.bar(models,Inference_time)
plt.xlabel("Model")
plt.ylabel("inference time")
plt.title("Model vs Inference time")
plt.show()

In [None]:
# ------------------------2.6.2 Inference time vs train datasize-------------------------
# k of the "initial" configuration ('ResNet', 29, 'manhattan'), matching the
# previous inference-time cell. The loop used to read whatever value the global
# `k` happened to hold from an earlier cell.
INITIAL_K = 29

inference_time_initial_list=[]
inference_time_best_list=[]
inference_time_optimized_list=[]
inference_time_default_list=[]
train_ratio_list=[]
train_size_list=[]

# Local loop names are used on purpose: the notebook-level test_ratio, N and
# max_len that other cells rely on are left untouched.
for split_ratio in np.arange(0.1,0.6,0.1):

    train_size = (1-split_ratio)*len(data)
    max_k = int(math.sqrt(train_size))

    inference_time_initial,accuracy_initial=distance_calculator.calculate_inference_time('ResNet',INITIAL_K,'manhattan',split_ratio)
    inference_time_initial_list.append(inference_time_initial)

    inference_time_default,accuracy_default=distance_calculator.calculate_default_inference_time('ResNet',INITIAL_K,'manhattan',split_ratio)
    inference_time_default_list.append(inference_time_default)

    # Fastest ("optimized") and most accurate ("best") triplet for this split
    # size; the time of the best triplet is taken from the same run that
    # produced its accuracy.
    inference_time_optimized = float('inf')
    best_accuracy=float('-inf')
    inference_time_best = None
    best_triplet = None

    for k in range(1,max_k+1,2):
        for encoder in encoders:
            for distance_metric in distance_metrics:

                inference_time,accuracy_optimized=distance_calculator.calculate_inference_time(encoder,k,distance_metric,split_ratio)
                inference_time_optimized=min(inference_time_optimized,inference_time)

                if(accuracy_optimized>best_accuracy):
                    best_accuracy=accuracy_optimized
                    best_triplet=(encoder,k,distance_metric)
                    inference_time_best=inference_time

    # The x-axis is the TRAINING set size; the previous version stored
    # test_ratio * len(data), i.e. the validation set size.
    train_ratio_list.append(1-split_ratio)
    train_size_list.append(train_size)

    inference_time_optimized_list.append(inference_time_optimized)
    inference_time_best_list.append(inference_time_best)


plt.plot(train_size_list, inference_time_initial_list, marker='o', label='Initial KNN')
plt.plot(train_size_list, inference_time_best_list, marker='o', label='Best KNN')
plt.plot(train_size_list, inference_time_optimized_list, marker='o', label='Optimized KNN')
plt.plot(train_size_list, inference_time_default_list, marker='o', label='Default KNN')

plt.title('Inference Time vs Training Dataset Size for KNN Models')
plt.xlabel('Training Dataset Size')
plt.ylabel('Inference Time (seconds)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# ------------------------3.3.1 Decision Tree Class-------------------------
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from itertools import chain, combinations
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score,confusion_matrix,multilabel_confusion_matrix,hamming_loss
from prettytable import PrettyTable

class MultiPowerDecisionTreeClassifier:
    """Decision tree for multi-label data in two target formulations.

    * 'MultiOutput' - the target is one 0/1 indicator per distinct label.
    * 'Powerset'    - the target is a one-hot vector over every non-empty
      subset of the distinct labels (2**n - 1 classes for n labels).

    Both encodings are produced by ``encode_data`` and fed to a scikit-learn
    ``DecisionTreeClassifier`` as a multi-column target.

    Parameters
    ----------
    max_depth, max_features, criterion
        Forwarded to ``DecisionTreeClassifier`` each time ``fit`` is called.
    """

    #--------------------Initializing the class with parameters---------------
    def __init__(self, max_depth, max_features, criterion):
        self.max_depth = max_depth
        self.max_features = max_features
        self.criterion = criterion
        # Placeholder; fit() replaces it with a configured, trained tree.
        self.dt_model = DecisionTreeClassifier()

    #--------------------Modifying the parameters of class------------------
    def modify_parameters(self,max_depth, max_features, criterion):
        """Replace all three hyperparameters; takes effect on the next fit."""
        self.set_max_depth(max_depth)
        self.set_max_features(max_features)
        self.set_criterion(criterion)

    def set_max_depth(self,max_depth):
        """Set the maximum tree depth used by the next ``fit``."""
        self.max_depth = max_depth

    def set_max_features(self,max_features):
        """Set the number of features considered per split in the next ``fit``."""
        self.max_features = max_features

    def set_criterion(self,criterion):
        """Set the split criterion used by the next ``fit``."""
        self.criterion = criterion

    def calculate_metric(self,X_test,y_test,formulation_type):
        """Predict ``X_test`` and score the result against ``y_test``.

        Returns
        -------
        tuple
            ``(accuracy, f1_micro, f1_macro, confusion_matrix, precision,
            recall, 1 - hamming_loss)``. Precision and recall are weighted
            averages; the confusion matrix is the 2x2 sum produced by
            ``append_confusion_matrix``.
        """
        y_val_pred=self.predict(X_test)

        y_test = np.array(y_test)

        accuracy = accuracy_score(y_test, y_val_pred)
        f1_micro = f1_score(y_test, y_val_pred, average='micro',zero_division=0)
        f1_macro = f1_score(y_test, y_val_pred, average='macro',zero_division=0)
        confusion = multilabel_confusion_matrix(y_test, y_val_pred)
        cm=self.append_confusion_matrix(confusion,formulation_type,y_val_pred,y_test)
        precision = precision_score(y_test, y_val_pred, average='weighted',zero_division=0)
        recall = recall_score(y_test, y_val_pred, average='weighted',zero_division=0)
        loss = hamming_loss(y_test, y_val_pred)

        return accuracy,f1_micro,f1_macro,cm,precision,recall,1-loss

    def append_confusion_matrix(self,confusion_matric,formulation_type,y_pred,y_test):
        """Collapse per-column confusion matrices into a single 2x2 matrix.

        For 'MultiOutput' the pre-computed stack ``confusion_matric`` is
        summed over its first axis. For any other formulation a binary
        confusion matrix is computed for every target column of
        ``y_test``/``y_pred`` and the matrices are added up.

        The column count is taken from ``y_test``. It used to be hard-coded
        to 8, which for the Powerset encoding covered only the first 8 of the
        class columns and so disagreed with the other metrics, which are
        computed over all columns.
        """
        if(formulation_type=='MultiOutput'):
            return np.sum(confusion_matric, axis=0)

        y_test = np.asarray(y_test)
        y_pred = np.asarray(y_pred)

        # labels=[0, 1] keeps every matrix 2x2 even when a column contains
        # only one of the two values.
        per_column = [
            confusion_matrix(y_test[:, column], y_pred[:, column], labels=[0, 1])
            for column in range(y_test.shape[1])
        ]

        # Element-wise sum over all columns
        return np.sum(np.array(per_column), axis=0)


    def split_data(self,df,test_ratio,formulation_type,random_state=None):
        """Encode ``df`` and split it into train and test parts.

        ``random_state`` is forwarded to ``train_test_split``; the default
        ``None`` keeps the original unseeded behaviour.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``.
        """
        X_encoded,binary_label_list = self.encode_data(df,formulation_type)
        # Split the dataset into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X_encoded,binary_label_list, test_size=test_ratio, random_state=random_state)
        return X_train,X_test,y_train,y_test

    def encode_data(self,df,formulation_type):
        """Turn the raw frame into features and binary target vectors.

        Features: every column except 'labels', with 'gender', 'education',
        'married', 'occupation' and 'most bought item' one-hot encoded.

        Targets: one integer vector per row. 'labels' holds space-separated
        label names. The distinct labels are sorted, so the position of every
        label (and of every label subset) is the same on every run; the
        previous set-based order changed between interpreter sessions.

        Returns
        -------
        tuple
            ``(X_encoded, binary_label_list)`` where ``binary_label_list`` is
            a list of 1-D integer arrays.

        Raises
        ------
        ValueError
            If ``formulation_type`` is neither 'Powerset' nor 'MultiOutput'
            (previously ``None`` was returned and callers failed while
            unpacking it).
        """
        if formulation_type not in ('Powerset', 'MultiOutput'):
            raise ValueError(
                f"Unknown formulation_type {formulation_type!r}; "
                "expected 'Powerset' or 'MultiOutput'")

        # Perform one-hot encoding on categorical columns
        X_encoded = pd.get_dummies(df.drop('labels', axis=1), columns=['gender','education','married','occupation','most bought item'])
        y = df['labels']

        # All label tokens that occur anywhere, de-duplicated and sorted
        unique_labels = sorted(set(' '.join(y).split()))

        binary_label_list=[]

        if(formulation_type=='Powerset'):
            # Every non-empty subset, smallest subsets first. Because
            # unique_labels is sorted, each joined subset is already in the
            # canonical (sorted) form used as lookup key below.
            non_empty_subsets = chain.from_iterable(
                combinations(unique_labels, size) for size in range(1, len(unique_labels) + 1))
            subset_to_index = {' '.join(subset): index for index, subset in enumerate(non_empty_subsets)}

            class_count = 2**len(unique_labels)-1

            for label in y:
                # Repeated tokens in a row count once.
                subset_key = ' '.join(sorted(set(label.split())))

                # One-hot vector over the subset classes
                binary_vector = np.zeros(class_count,dtype=int)
                binary_vector[subset_to_index[subset_key]]=1

                binary_label_list.append(binary_vector)

        else:
            # Position of every label in the indicator vector
            label_to_index = {label: index for index,label in enumerate(unique_labels)}

            for label in y:
                binary_vector = np.zeros(len(unique_labels),dtype=int)

                for each_label in label.split():
                    binary_vector[label_to_index[each_label]]=1

                binary_label_list.append(binary_vector)

        return X_encoded,binary_label_list


    def predict(self,X_test):
        """Return the fitted tree's predictions for ``X_test``."""
        y_pred = self.dt_model.predict(X_test)
        return y_pred

    def fit(self,X_train,y_train):
        """Build a fresh tree from the current hyperparameters and train it.

        Returns ``self`` so the object follows the scikit-learn estimator
        convention.
        """
        self.dt_model=DecisionTreeClassifier(max_depth=self.max_depth,max_features=self.max_features,criterion=self.criterion)
        self.dt_model.fit(X_train,y_train)
        return self

    #----Used ChatGPT-------
    # get_params / set_params let scikit-learn utilities such as
    # cross_val_score clone and reconfigure this object.
    def get_params(self, deep=True):
        """Return the constructor arguments as a dict."""
        return {
            'max_depth': self.max_depth,
            'max_features': self.max_features,
            'criterion': self.criterion,
            # Include other hyperparameters as needed
        }

    def set_params(self, **params):
        """Set each given parameter as an attribute and return ``self``."""
        for param, value in params.items():
            setattr(self, param, value)
        return self

In [None]:
# ------------------------3.4.1 Metric-------------------------
# Load the CSV file
df = pd.read_csv('advertisement.csv')
df = df.drop('city', axis=1)
criterions = ['gini','entropy']
max_depths = [3,5,10,20,30]
max_features = [3,5,7,9,11]
formulation_types=['Powerset','MultiOutput']
powerset_metrics=[]
multioutput_metrics=[]
test_ratio=0.2

DT = MultiPowerDecisionTreeClassifier(max_depth=30,max_features=11,criterion="gini")

for formulation_type in formulation_types:
    # Encode and split once per formulation, so all hyperparameter
    # combinations are compared on the same train/test partition.
    X_train,X_test,y_train,y_test = DT.split_data(df,test_ratio,formulation_type)
    if formulation_type=='Powerset':
        collected_metrics = powerset_metrics
    else:
        collected_metrics = multioutput_metrics

    for criterion in criterions:
        for max_depth in max_depths:
            for max_feature in max_features:
                # Apply the combination under test. This call was missing, so
                # every row was produced by the same (30, 11, 'gini') tree.
                DT.modify_parameters(max_depth,max_feature,criterion)
                DT.fit(X_train,y_train)
                val=DT.calculate_metric(X_test,y_test,formulation_type)
                collected_metrics.append((formulation_type,criterion,max_depth,max_feature,) + val)

# Column layout of one metrics row; positions 4.. are what calculate_metric returns.
METRIC_FIELD_NAMES = ['Formulation_type','criterion','max_depth','max_features','Accuracy','F1_micro','F1_macro','Confusion_Matrix','Precision', 'Recall','Hamming']

# Create a PrettyTable instance with column names
table_powerset = PrettyTable()
table_powerset.field_names = METRIC_FIELD_NAMES

# Add rows of data 
for metrics_tuple in powerset_metrics:
    table_powerset.add_row(list(metrics_tuple))

print(table_powerset)     

# Create a PrettyTable instance with column names
table_MultiOutput = PrettyTable()
table_MultiOutput.field_names = METRIC_FIELD_NAMES

# Add rows of data
for metrics_tuple in multioutput_metrics:
    table_MultiOutput.add_row(list(metrics_tuple))

print(table_MultiOutput)

In [None]:
# ------------------------3.4.2 Top three performing sets of hyperparameters-------------------------
# Row layout of the metrics tuples: index 1 = criterion, 2 = max_depth,
# 3 = max_features, 5 = F1 micro, 6 = F1 macro.
def _top_three_table(metrics, score_index, score_name, formulation):
    """Return the three best rows by ``metrics[i][score_index]`` and a table of them."""
    best_rows = sorted(metrics, key=lambda row: row[score_index], reverse=True)[:3]

    summary = PrettyTable()
    summary.field_names = ["Formulation_Type","criterion","max_depth","max_features",score_name]
    for row in best_rows:
        summary.add_row([formulation, row[1], row[2], row[3], row[score_index]])

    return best_rows, summary


# Powerset, top 3 by F1 micro
top_results_micro_powerset, table_f1micro = _top_three_table(powerset_metrics, 5, "F1 Micro Score", 'Powerset')
print(table_f1micro)

# Powerset, top 3 by F1 macro
top_results_macro_powerset, table_f1macro = _top_three_table(powerset_metrics, 6, "F1 Macro Score", 'Powerset')
print(table_f1macro)

# MultiOutput, top 3 by F1 micro
top_results_micro, table_f1micro = _top_three_table(multioutput_metrics, 5, "F1 Micro Score", 'MultiOutput')
print(table_f1micro)

# MultiOutput, top 3 by F1 macro. These rows used to be taken from the list
# sorted by F1 micro, so the "macro" table did not show the best macro scores.
top_results_macro, table_f1macro = _top_three_table(multioutput_metrics, 6, "F1 Macro Score", 'MultiOutput')
print(table_f1macro)

In [None]:
# ------------------------3.4.3 K-Fold-------------------------

from sklearn.model_selection import cross_val_score, KFold

#Implementing cross validation
# Named n_splits rather than k so the KNN cells' global `k` is not overwritten.
n_splits = 9
kf = KFold(n_splits=n_splits, random_state=None)

table_accuracy = PrettyTable()
table_accuracy.field_names = ["Type","Model","Accuracy"]

# Best hyperparameters by F1 macro, per formulation. Each model now uses its
# own winner; previously both models were built from the Powerset max_depth
# combined with the MultiOutput max_features and criterion.
best_row_by_formulation = {
    'MultiOutput': top_results_macro[0],
    'Powerset': top_results_macro_powerset[0],
}

for formulation_type in ('MultiOutput', 'Powerset'):
    best_row = best_row_by_formulation[formulation_type]
    # Metrics rows are (formulation, criterion, max_depth, max_features, ...).
    DT = MultiPowerDecisionTreeClassifier(best_row[2], best_row[3], best_row[1])
    X,y = DT.encode_data(df,formulation_type)

    # "Default": scikit-learn's cross_val_score drives the folds.
    scores = cross_val_score(DT, X, y, cv=kf, scoring='accuracy')
    mean_accuracy = scores.mean()
    table_accuracy.add_row(['Default', formulation_type, mean_accuracy])

    # "Self": the same folds, fitted and scored by hand.
    acc_score = []

    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = [y[i] for i in train_index] , [y[i] for i in test_index]
        DT.fit(X_train,y_train)
        pred_values = DT.predict(X_test)
        # accuracy_score expects (y_true, y_pred)
        acc = accuracy_score(y_test, pred_values)
        acc_score.append(acc)

    avg_acc_score = sum(acc_score)/n_splits
    table_accuracy.add_row(['Self', formulation_type, avg_acc_score])

print(table_accuracy)
