<a href="https://colab.research.google.com/github/GabiSnow/ClassifierChainModification/blob/main/TFM_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Instalaciones e Includes**

In [None]:
!pip install scikit-multilearn==0.2.0



In [None]:
!pip install arff



In [None]:
from sklearn.cluster import KMeans
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
import numpy as np
import statistics
from sklearn.datasets import make_multilabel_classification
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

#**Declaración de Balanced K-Means**

In [None]:
import numpy as np
from sklearn.cluster import KMeans
import networkx as nx
from networkx.algorithms.community.kernighan_lin import kernighan_lin_bisection
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize as normalize_features
from sklearn.metrics.pairwise import euclidean_distances


class BalancedKMeansPartitioner():
	"""Split the rows of a matrix into size-balanced clusters with a k-means loop.

	Every cluster holds at most ``ceil(num_rows / num_partitions)`` rows. The
	initial means are ``num_partitions`` rows drawn at random, so results vary
	between calls unless the NumPy random state is fixed by the caller.

	Parameters:
		num_partitions: number of clusters to return.
		tol: the loop stops once the spectral norm of the change in the means
			is not greater than this value.
		max_iter: upper bound on the number of k-means iterations.
	"""

	def __init__(self,num_partitions=2,tol=0.003,max_iter=300):
		self.num_partitions=num_partitions
		self.tol=tol
		self.max_iter=max_iter

	def partition(self,x_mat):
		"""Return ``num_partitions`` lists with the row indices of each cluster.

		Parameters:
			x_mat: 2-D array with one row per item to cluster.

		Returns:
			A list of ``num_partitions`` lists of row indices. When there are
			at least as many partitions as rows, each row gets its own
			partition and the remaining partitions are empty lists.
		"""
		num_samples=x_mat.shape[0]
		# base case: one row per partition, padded with empty partitions
		if self.num_partitions >= num_samples:
			padding=[[] for _ in range(self.num_partitions-num_samples)]
			one_per_part=[[idx] for idx in range(num_samples)]
			return one_per_part+padding
		# maximum size of each balanced cluster
		max_size=int(np.ceil(num_samples/self.num_partitions))
		# initialize means with randomly chosen rows
		permute=np.random.permutation(num_samples)
		means=x_mat[permute[:self.num_partitions],:]
		assert(means.shape[0]==self.num_partitions)
		# initialize loop vars
		diff=np.inf
		num_iter=0
		sorted_clusters=[[] for _ in range(self.num_partitions)]
		# loop to convergence / limit
		while diff > self.tol and num_iter < self.max_iter:
			# euclidean_distances already returns plain (non-squared) distances,
			# so no extra square root is taken here
			dist=euclidean_distances(x_mat,means)
			assert(dist.shape==(num_samples,self.num_partitions))
			# each cluster is kept sorted by distance to its mean
			sorted_clusters=[[] for _ in range(self.num_partitions)]
			# assign rows while keeping clusters balanced
			for idx in range(num_samples):
				# insert idx into its best cluster; if that overflows the cluster,
				# its farthest member is evicted and re-inserted elsewhere
				ins_idx=idx
				while True:
					best_cluster=np.argmin(dist[ins_idx,:])
					sorted_clusters[best_cluster]=self._insert_in_sorted_list(
						sorted_clusters[best_cluster],
						(ins_idx,dist[ins_idx,best_cluster]))
					if len(sorted_clusters[best_cluster])<=max_size:
						# done with the cascade (if any)
						break
					# evict the farthest row and forbid it from returning here
					ins_idx=sorted_clusters[best_cluster].pop(-1)[0]
					dist[ins_idx,best_cluster]=np.inf
			# calculate new means in a FRESH array: reusing one buffer for both
			# old and new means made diff equal to 0 from the second iteration on
			new_means=np.array(means,dtype=float)
			for i,cluster in enumerate(sorted_clusters):
				idcs=[el[0] for el in cluster]
				# an empty cluster keeps its previous mean instead of becoming NaN
				if idcs:
					new_means[i,:]=np.mean(x_mat[idcs,:],axis=0)
			# update
			diff=np.linalg.norm(means-new_means,ord=2)
			means=new_means
			num_iter=num_iter+1
		return [[j for (j,_) in cluster] for cluster in sorted_clusters]

	def _insert_in_sorted_list(self,sorted_list,element):
		"""Return a new list with ``element`` inserted by its sorting key.

		``sorted_list`` holds ``(sample_id, sorting_key)`` pairs in ascending
		key order and is not modified. The element is placed after every entry
		whose key is less than or equal to its own.
		"""
		_,sorting_key=element
		# position of the first entry with a strictly larger key (end if none)
		position=len(sorted_list)
		for i,(_,key) in enumerate(sorted_list):
			if key>sorting_key:
				position=i
				break
		return sorted_list[:position]+[element]+sorted_list[position:]

#**ClassifierChain basado en powersets**

In [None]:
class ClassChain_Powerset ():
  """Classifier chain whose links are label-powerset models over label clusters.

  The label columns are grouped with BalancedKMeansPartitioner into clusters
  of at most ``cluster_size`` labels. One LabelPowerset(RandomForestClassifier)
  is trained per cluster; each model after the first also receives the labels
  of the previous clusters as extra leading features (true labels while
  training, predicted labels while predicting).

  Parameters:
    X_trainF: dense 2-D training feature array.
    Y_trainF: dense 2-D binary label array (samples x labels).
    cluster_size: maximum number of labels per cluster (default 5).
    verbose: when True (default) the label partition is printed.

  Raises:
    ValueError: if ``cluster_size`` is smaller than 1.
  """
  nClusters = 2     # number of label clusters; recomputed from the label count
  cluster_size = 5  # maximum number of labels per cluster
  verbose = True    # print the label partition when it is computed

  def __init__(self,X_trainF,Y_trainF,cluster_size=5,verbose=True):
    if cluster_size < 1:
      raise ValueError("cluster_size must be at least 1")
    self.cluster_size = cluster_size
    self.verbose = verbose
    self.lista_clasificadores = []  # one fitted model per label cluster
    self.etiquetas = np.zeros(0, dtype=int)  # cluster id of every label column
    self.aplicar_kmeans_etiquetas(Y_trainF)
    self.entrenar_modelo(X_trainF,Y_trainF)

  def entrenar_modelo(self,X_trainF,Y_trainF):
    """Fit one LabelPowerset model per label cluster, chaining the clusters."""
    self.lista_clasificadores = []
    # one label sub-matrix per cluster
    Y_train_list = self.separar_etiquetas(Y_trainF)
    X_train_aux = X_trainF.copy()

    for i in range(self.nClusters):
      m = LabelPowerset(RandomForestClassifier(n_estimators=25))
      m.fit(X_train_aux , Y_train_list[i])
      self.lista_clasificadores.append(m)

      # the next link sees this cluster's true labels as leading features
      X_train_aux = np.concatenate((Y_train_list[i], X_train_aux),axis=1)

  def predict(self,X_test):
    """Predict every label; returns a dense (samples x labels) array."""
    X_test_aux = X_test.copy()
    Y = np.zeros([X_test_aux.shape[0],self.etiquetas.shape[0]])

    for i in range(self.nClusters):
      # the models return a sparse matrix, hence toarray()
      labelsPred = self.lista_clasificadores[i].predict(X_test_aux).toarray()

      # write the predictions into the columns owned by cluster i
      Y[:, self.etiquetas == i] = labelsPred

      # the next link sees this cluster's predictions as leading features
      X_test_aux = np.concatenate((labelsPred, X_test_aux),axis=1)

    return Y

  def aplicar_kmeans_etiquetas(self,labels):
    """Cluster the label columns and store each label's cluster id."""
    labels_tp = np.transpose(labels)
    num_lb = labels_tp.shape[0]
    # ceil(num_lb / cluster_size) without going through floats
    self.nClusters = -(-num_lb // self.cluster_size)

    k_means = BalancedKMeansPartitioner(num_partitions=self.nClusters)
    particion = k_means.partition(labels_tp)
    if self.verbose:
      print(particion)

    self.etiquetas = self.cambioCadena(particion,num_lb)

  def separar_etiquetas(self,labels):
    """Split the label matrix into a list with one sub-matrix per cluster."""
    labels = np.asarray(labels)
    return [labels[:, self.etiquetas == i] for i in range(self.nClusters)]

  def cambioCadena(self,etiquetas_cluster,num_lb):
    """Turn a list of label-index lists into a per-label array of cluster ids.

    Labels that appear in no cluster keep the id 0.
    """
    resultado = np.zeros(num_lb, dtype=int)
    for cluster_num, elementos_cluster in enumerate(etiquetas_cluster):
      for elemento_pos in elementos_cluster:
        resultado[elemento_pos] = cluster_num
    return resultado

#**Homer**

In [None]:
import numpy as np
import copy


class LabelTree:
	"""Tree over label subsets with one multi-label classifier per node.

	Internal nodes route samples to their children; leaves predict the labels
	they own. The tree is built by recursively partitioning the labels.

	Parameters:
		partitioner: object with ``partition(matrix)`` returning lists of row
			indices; used to split the labels of a node.
		leaf_classifier: template classifier (deep-copied per leaf) offering
			``fit`` and ``predict_proba``.
		internal_classifier: template classifier (deep-copied per internal node).
		stopping_condition: object with ``check(node,x_mat,y_mat,repre)`` that
			returns True when a node must become a leaf.
	"""

	def __init__(self, partitioner, leaf_classifier,
					internal_classifier, stopping_condition):
		# tree params
		self.partitioner=partitioner
		self.stopping_condition=stopping_condition
		# classification params
		self.leaf_classifier=leaf_classifier
		self.internal_classifier=internal_classifier
		# initialize tree params
		self.root=None
		self.nodes=[]

	def _get_new_node(self,parent,node_labels,train_idcs,depth):
		"""Create a node, register it in ``self.nodes`` and return it."""
		new_node=LabelTreeNode(self,parent,node_labels,train_idcs,depth)
		# idx is the node's position in self.nodes
		new_node.idx=len(self.nodes)
		self.nodes.append(new_node)
		return new_node

	def fit(self,x_mat,y_mat,repre):
		"""Build the tree and fit every node classifier.

		Parameters:
			x_mat: training features, one row per sample.
			y_mat: training labels, one row per sample and one column per label.
			repre: one row per label, used by the partitioner to split labels.
		"""
		assert(x_mat.shape[0]==y_mat.shape[0])
		assert(repre.shape[0]==y_mat.shape[1])
		self.num_features=x_mat.shape[1]
		self.num_labels=y_mat.shape[1]
		self.num_trn_points=x_mat.shape[0]
		self._fit_tree(x_mat,y_mat,repre)
		self._fit_classifiers(x_mat,y_mat)

	def _fit_tree(self,x_mat,y_mat,repre):
		"""Create the root over all labels and samples and split it recursively."""
		all_idcs=list(range(self.num_trn_points))
		all_labels=list(range(self.num_labels))
		self.root=self._get_new_node(None,all_labels,all_idcs,0)
		self.root._split(x_mat,y_mat,repre)

	def _fit_classifiers(self,x_mat,y_mat):
		"""Fit the classifier of every node of the tree."""
		print("Fitting ",len(self.nodes)," classifiers...")
		for node in self.nodes:
			node._fit(x_mat,y_mat)
		print("Done fitting")

	def predict_proba(self,x_tst,method="beam_search",num_paths=10,recurse_threshold=0.5):
		"""Return a (samples x labels) array of label scores.

		Parameters:
			x_tst: test features with ``num_features`` columns.
			method: ``"beam_search"`` or ``"recursive"``.
			num_paths: beam width (beam search only).
			recurse_threshold: routing threshold (recursive only).
		"""
		assert(x_tst.shape[1]==self.num_features)
		assert(method in ["beam_search","recursive"])
		if method=="recursive":
			probs=np.zeros((x_tst.shape[0],self.num_labels))
			tst_idcs=list(range(x_tst.shape[0]))
			self.root._predict_proba(x_tst,probs,tst_idcs,recurse_threshold)
			return probs
		elif method=="beam_search":
			return self._predict_proba_beam_search(x_tst,num_paths)

	def predict(self,x_tst,threshold=0.5,method="beam_search",num_paths=10,
				recurse_threshold=0.5,return_probs=False):
		"""Return 0/1 predictions (scores above ``threshold``), optionally with the scores."""
		probs=self.predict_proba(x_tst,method,num_paths,recurse_threshold)
		y_pred=(probs>threshold)*1
		if return_probs:
			return y_pred,probs
		else:
			return y_pred

	def _predict_proba_beam_search(self,x_tst,num_paths):
		"""Vectorized beam search over the tree for all samples at once.

		For each sample at most ``num_paths`` nodes with the highest path
		probability are kept per level; leaf scores are multiplied by the path
		probability of the leaf. Known limitation (noted by the original
		author): ties between path probabilities are not handled specially.
		"""
		num_nodes=len(self.nodes)
		num_samples=x_tst.shape[0]
		# a beam cannot be wider than the number of nodes; without this cap the
		# negative indexing below runs off the array on very small trees
		beam_width=min(num_paths,num_nodes)
		# for each sample store the boundary as an array of path probs (to a node)
		boundary=np.zeros((num_samples,num_nodes))
		search_done=np.zeros(num_samples).astype(bool)
		boundary[:,self.root.idx]=1.0
		while not np.all(search_done):
			# nodes which have at least one sample at them
			active_nodes_list=np.nonzero(np.sum(boundary,axis=0)>0)[0]
			# assume finished; expanding an internal node below resets its points
			search_done[:]=True
			for node_idx in active_nodes_list:
				node=self.nodes[node_idx]
				# points sitting at leaves are done
				if node.node_type=="leaf":
					continue
				# every node stores its own position in self.nodes
				children_global_idcs=[ch.idx for ch in node.children]
				active_points=np.nonzero(boundary[:,node_idx])[0]
				search_done[active_points]=False
				# route points to children
				routing_probs=node.classifier.predict_proba(
									x_tst[active_points,:].reshape((-1,self.num_features)))
				assert(routing_probs.shape==(len(active_points),len(children_global_idcs)))
				path_probs=(routing_probs.T * boundary[active_points,node_idx]).T
				for ch_par_idx,ch_glo_idx in enumerate(children_global_idcs):
					boundary[active_points,ch_glo_idx]=path_probs[:,ch_par_idx]
				# erase parent from boundary
				boundary[active_points,node_idx]=0
			# keep only the best beam_width nodes for each sample
			best_nodes_per_sample=np.argsort(boundary,axis=1)[:,-beam_width:]
			new_boundary=np.zeros_like(boundary)
			for s_idx in range(num_samples):
				for rank in range(beam_width):
					best_node_idx=best_nodes_per_sample[s_idx,-(rank+1)]
					# fewer than beam_width nodes are active for this sample
					if boundary[s_idx,best_node_idx]==0:
						break
					new_boundary[s_idx,best_node_idx]=boundary[s_idx,best_node_idx]
			boundary=new_boundary

		# only leaves are active now: multiply path probs (boundary) with leaf probs
		active_nodes_list=np.nonzero(np.sum(boundary,axis=0)>0)[0]
		probs=np.zeros((num_samples,self.num_labels))
		for node_idx in active_nodes_list:
			node=self.nodes[node_idx]
			assert(node.node_type=="leaf")
			active_points=np.nonzero(boundary[:,node_idx])[0]
			probs_leaf=node.classifier.predict_proba(x_tst[active_points,:].reshape((-1,self.num_features)))
			# transpose back so rows are samples and columns are the leaf's labels
			total_probs=(probs_leaf.T * boundary[active_points,node_idx]).T
			# assign probs
			for i,lab in enumerate(node.labels):
				probs[active_points,lab]=total_probs[:,i]
		return probs

	def walk_tree(self,walker):
		"""Visit every node, parents before children, calling ``walker.process_node``."""
		self.root._walk(walker)


class LabelTreeNode:
	"""One node of a LabelTree: a subset of labels plus the samples that carry them.

	``labels`` are column indices of the full label matrix and ``train_idcs``
	are row indices of the training set. ``node_type`` is None until the node
	has been split, then "leaf" or "internal".
	"""

	def __init__(self,tree,parent,node_labels,train_idcs,depth):
		self.tree=tree
		self.parent=parent
		self.depth=depth
		# label values refer to the original label set (columns of y_mat)
		self.labels=list(node_labels)
		# row indices into the training set (x_mat,y_mat)
		self.train_idcs=list(train_idcs)
		self.node_type=None
		self.classifier=None
		self.children=[]

	def _check_partitions(self,lparts,dparts):
		"""Assert that the label parts cover this node's labels and, for any
		node other than the root, that the data parts cover its samples."""
		seen_labels=set()
		seen_points=set()
		for pos,dpart in enumerate(dparts):
			seen_labels.update(lparts[pos])
			seen_points.update(dpart)
		assert(seen_labels==set(self.labels))
		if self.parent is not None:
			assert(seen_points==set(self.train_idcs))

	def _split(self,x_mat,y_mat,repre):
		"""Recursively split this node's labels, creating child nodes."""
		# the stopping condition decides whether this node is a leaf
		if self.tree.stopping_condition.check(self,x_mat,y_mat,repre):
			self.node_type="leaf"
			return
		self.node_type="internal"
		# the partitioner works on positions within self.labels; drop empty
		# partitions and translate positions back to original label indices
		raw_parts=self.tree.partitioner.partition(repre[self.labels,:])
		lparts=[[self.labels[pos] for pos in part] for part in raw_parts if part!=[]]
		# samples that belong to each label partition
		dparts=self._partition_data_by_labels(lparts,y_mat)
		self._check_partitions(lparts,dparts)
		# nothing to route between: treat the node as a leaf after all
		if len(dparts)<=1:
			self.node_type="leaf"
			return
		# one child per partition, split in turn
		for pos,child_labels in enumerate(lparts):
			child=self.tree._get_new_node(self,child_labels,dparts[pos],self.depth+1)
			self.children.append(child)
			child._split(x_mat,y_mat,repre)

	def _partition_data_by_labels(self,label_partitions,y_mat):
		"""For each non-empty label partition return the row indices of the
		samples having at least one of its labels active."""
		# every label must be a valid column index
		num_columns=y_mat.shape[1]
		assert(all(label<num_columns for lpart in label_partitions for label in lpart))
		return [
			np.nonzero(np.sum(y_mat[:,lpart],axis=1) > 0)[0].tolist()
			for lpart in label_partitions
			if lpart!=[]
		]

	def _fit(self,x_mat,y_mat):
		"""Fit a deep copy of the tree's template classifier on this node's samples."""
		is_leaf=self.node_type=="leaf"
		template=self.tree.leaf_classifier if is_leaf else self.tree.internal_classifier
		self.classifier=copy.deepcopy(template)
		targets=None
		if self.node_type=="internal":
			# pseudo labels: column i is 1 for the samples that belong to child i
			pseudo=np.zeros((y_mat.shape[0],len(self.children)))
			for col,child in enumerate(self.children):
				pseudo[child.train_idcs,col]=1
			# restrict to this node's samples
			targets=pseudo[self.train_idcs,:]
		elif is_leaf:
			# this node's samples and this node's labels only
			targets=y_mat[self.train_idcs,:][:,self.labels]
		self.classifier.fit(x_mat[self.train_idcs,:],targets)

	def _predict_proba(self,x_tst_global,probs_global,tst_idcs,recurse_threshold):
		"""Recursive prediction: internal nodes forward the samples whose routing
		score exceeds ``recurse_threshold``; leaves write their scores into
		``probs_global`` (modified in place)."""
		if self.node_type=="leaf":
			probs_leaf=self.classifier.predict_proba(x_tst_global[tst_idcs,:])
			for col,lab in enumerate(self.labels):
				probs_global[tst_idcs,lab]=probs_leaf[:,col]
			return
		if self.node_type!="internal":
			return
		routing_probs=self.classifier.predict_proba(x_tst_global[tst_idcs,:])
		routed=routing_probs>recurse_threshold
		for ch_idx,child in enumerate(self.children):
			local_rows=np.nonzero(routed[:,ch_idx])[0]
			# translate positions within tst_idcs to rows of the global arrays
			child_tst_idcs=[tst_idcs[row] for row in local_rows]
			if child_tst_idcs:
				child._predict_proba(x_tst_global,probs_global,child_tst_idcs,recurse_threshold)

	def _walk(self,walker):
		"""Pre-order traversal calling ``walker.process_node`` on every node."""
		walker.process_node(self)
		for child in self.children:
			child._walk(walker)


class LeafSizeStoppingCondition:
	"""Stopping rule: a node becomes a leaf once it holds at most
	``min_leaf_size`` labels."""

	def __init__(self,min_leaf_size):
		# the bound must be strictly positive
		assert(0<min_leaf_size)
		self.min_leaf_size=min_leaf_size

	def check(self,lnode,x_mat,y_mat,repre):
		"""Return True when ``lnode`` should not be split further.

		Only ``lnode.labels`` is inspected; the remaining arguments are unused.
		"""
		num_labels=len(lnode.labels)
		return not (num_labels > self.min_leaf_size)

In [None]:
def generate_parabel_label_representations(x_mat,y_mat):
	"""Build one feature-space vector per label.

	The vector of a label is the mean of the rows of ``x_mat`` whose entry for
	that label in ``y_mat`` is positive; labels without positive samples keep
	an all-zero vector and a warning is printed. The resulting matrix is passed
	through ``normalize_features`` before being returned.

	Returns:
		Array with one row per label and one column per feature.
	"""
	assert(y_mat.shape[0]==x_mat.shape[0])
	repre=np.zeros((y_mat.shape[1],x_mat.shape[1]))
	for l in range(repre.shape[0]):
		# rows in which label l is active
		supp_idcs=np.nonzero(y_mat[:,l]>0)[0]
		if len(supp_idcs)>0:
			repre[l,:]=np.mean(x_mat[supp_idcs,:],axis=0)
		else:
			print("Warning! Label ",l,"has no positive examples. Setting representation to 0.")
	# normalization happens once, after all means are computed
	return normalize_features(repre)


**Declaración de Classifier Chain**

In [None]:
class ClassifierChainClass:
	"""Thin wrapper around ClassifierChain(LogisticRegression) that returns
	dense arrays from ``predict`` and ``predict_proba``."""

	def __init__(self,max_itera=150):
		# max_itera is forwarded to LogisticRegression as max_iter
		base_estimator=LogisticRegression(max_iter=max_itera)
		self.classifier=ClassifierChain(base_estimator)
		# not used by any method of this class
		self.num_labels=None
		self.classifier_list=None

	def fit(self,x_mat,y_mat):
		"""Fit the wrapped chain on features ``x_mat`` and labels ``y_mat``."""
		self.classifier.fit(x_mat,y_mat)

	def predict_proba(self,x_tst):
		"""Return the chain's label probabilities converted with ``toarray()``."""
		sparse_probs=self.classifier.predict_proba(x_tst)
		return sparse_probs.toarray()

	def predict(self,x_tst):
		"""Return the chain's label predictions converted with ``toarray()``."""
		sparse_pred=self.classifier.predict(x_tst)
		return sparse_pred.toarray()

#**Pruebas de rendimiento**

In [None]:
from skmultilearn.problem_transform import ClassifierChain, BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, jaccard_score
import time

In [None]:
def _cronometrar(accion):
  """Call ``accion()`` and return ``(result, elapsed_seconds)`` using time.time()."""
  ini = time.time()
  resultado = accion()
  return resultado, time.time() - ini


def evaluar_dataset(X_train,Y_train,X_test,Y_test,max_itera=150,leaf_cond=5,num=3):
  """Benchmark three multi-label models and print their averaged results.

  The models are a plain ClassifierChain over LogisticRegression, the
  ClassChain_Powerset chain, and a LabelTree whose nodes are
  ClassifierChainClass models. Each one is trained and evaluated ``num``
  times; the mean training time, prediction time, micro F1 and micro Jaccard
  score are printed (the Jaccard score is printed under the "Accuracy" label).

  Parameters:
    X_train, Y_train: dense training features and binary labels.
    X_test, Y_test: dense test features and binary labels.
    max_itera: max_iter of every LogisticRegression.
    leaf_cond: maximum number of labels in a LabelTree leaf.
    num: number of repetitions to average over (default 3).

  Raises:
    ValueError: if ``num`` is smaller than 1.
  """
  if num < 1:
    raise ValueError("num must be at least 1")

  nombres = ("Classifier Chain", "Powerset Classifier Chain", "Homer Class Chain")
  # per model: [training time, prediction time, F1, Jaccard], summed over the runs
  totales = {nombre: [0, 0, 0, 0] for nombre in nombres}

  def acumular(nombre, t_train, t_test, Y_pred):
    # add the timings and scores of one run to the model's running totals
    fila = totales[nombre]
    fila[0] += t_train
    fila[1] += t_test
    fila[2] += f1_score(Y_test,Y_pred, average='micro')
    fila[3] += jaccard_score(Y_test,Y_pred, average='micro')

  for _ in range(num):
    # plain classifier chain (construction is not part of the timed section)
    cc = ClassifierChain(LogisticRegression(max_iter=max_itera))
    _, t_train = _cronometrar(lambda: cc.fit(X_train, Y_train))
    cc_Y_pred, t_test = _cronometrar(lambda: cc.predict(X_test).toarray())
    acumular("Classifier Chain", t_train, t_test, cc_Y_pred)

    # powerset-based classifier chain (it trains inside its constructor)
    pw, t_train = _cronometrar(lambda: ClassChain_Powerset(X_train,Y_train))
    pw_Y_pred, t_test = _cronometrar(lambda: pw.predict(X_test))
    acumular("Powerset Classifier Chain", t_train, t_test, pw_Y_pred)

    # HOMER-style label tree; the label representations are built untimed
    repre = generate_parabel_label_representations(X_train,Y_train)
    lb = LabelTree(BalancedKMeansPartitioner(), ClassifierChainClass(max_itera),ClassifierChainClass(max_itera), LeafSizeStoppingCondition(leaf_cond))
    _, t_train = _cronometrar(lambda: lb.fit(X_train,Y_train,repre))
    lb_Y_pred, t_test = _cronometrar(
      lambda: lb.predict(X_test,method="recursive",num_paths=10,recurse_threshold=0.5))
    acumular("Homer Class Chain", t_train, t_test, lb_Y_pred)

  for pos, nombre in enumerate(nombres):
    t_train, t_test, f1, jac = (total/num for total in totales[nombre])
    # every section after the first is preceded by a blank line
    print("-----------------------" if pos == 0 else "\n-----------------------")
    print(nombre)
    print(f"Tiempo entrenamiento: {t_train}")
    print(f"Tiempo prediccion: {t_test}")
    print(f"F1 Score: {f1}")
    print(f"Accuracy: {jac}")



#**Carga de datasets**

In [None]:
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from skmultilearn.dataset import load_dataset
from sklearn.preprocessing import StandardScaler #Para escalar los datos

In [None]:
from skmultilearn.dataset import available_data_sets
# Distinct first components of the keys returned by available_data_sets()
{clave[0] for clave in available_data_sets().keys()}

{'Corel5k',
 'bibtex',
 'birds',
 'delicious',
 'emotions',
 'enron',
 'genbase',
 'mediamill',
 'medical',
 'rcv1subset1',
 'rcv1subset2',
 'rcv1subset3',
 'rcv1subset4',
 'rcv1subset5',
 'scene',
 'tmc2007_500',
 'yeast'}

#**Scene**

In [None]:
# Load the train and test splits of the 'scene' dataset
X_train, Y_train, feature_names, label_names = load_dataset('scene', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('scene', 'test')
print(X_train.shape, Y_train.shape, sep="\n")

# Convert the loaded matrices with toarray() before they are used
X_train, Y_train, X_test, Y_test = [
    matriz.toarray() for matriz in (X_train, Y_train, X_test, Y_test)
]

scene:train - exists, not redownloading
scene:test - exists, not redownloading
(1211, 294)
(1211, 6)


In [None]:
# Benchmark the three models on the currently loaded split: max_itera=150, leaf_cond=3
evaluar_dataset(X_train,Y_train,X_test,Y_test,150,3)

[[2, 0, 4], [1, 3, 5]]
Fitting  3  classifiers...
Done fitting
[[1, 3, 2], [0, 5, 4]]
Fitting  3  classifiers...
Done fitting
[[1, 3, 2], [0, 5, 4]]
Fitting  3  classifiers...
Done fitting
[[1, 3, 5], [2, 0, 4]]
Fitting  3  classifiers...
Done fitting
[[1, 3, 5], [2, 0, 4]]
Fitting  3  classifiers...
Done fitting
-----------------------
Classifier Chain
Tiempo entrenamiento: 0.6644410610198974
Tiempo prediccion: 0.06157040596008301
F1 Score: 0.7101159536185526
Accuracy: 0.5505269683818971

-----------------------
Powerset Classifier Chain
Tiempo entrenamiento: 1.0079029560089112
Tiempo prediccion: 0.1577674388885498
F1 Score: 0.7132513792975697
Accuracy: 0.5544744776429196

-----------------------
Homer Class Chain
Tiempo entrenamiento: 0.6488527774810791
Tiempo prediccion: 0.11079916954040528
F1 Score: 0.680870503688609
Accuracy: 0.5161583861378245


#**Emotions**

In [None]:
# Load the train and test splits of the 'emotions' dataset
X_train, Y_train, feature_names, label_names = load_dataset('emotions', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('emotions', 'test')

print(X_train.shape)
print(Y_train.shape)

X_train = X_train.toarray()
Y_train = Y_train.toarray()
X_test = X_test.toarray()
Y_test = Y_test.toarray()

# Fit the scaler on the training split only and reuse it for the test split,
# so both splits are standardized with the same mean and variance.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

emotions:train - does not exists downloading
Downloaded emotions-train
emotions:test - does not exists downloading
Downloaded emotions-test
(391, 72)
(391, 6)


Número de clases únicas: 2
Clases únicas: [0 1]


In [None]:
# Benchmark the three models on the currently loaded split: max_itera=150, leaf_cond=3
evaluar_dataset(X_train,Y_train,X_test,Y_test,150,3)

[[0, 5, 1], [3, 4, 2]]
Fitting  3  classifiers...
Done fitting
[[4, 3, 0], [1, 2, 5]]
Fitting  3  classifiers...
Done fitting
[[0, 5, 1], [3, 4, 2]]
Fitting  3  classifiers...
Done fitting
[[0, 5, 4], [2, 3, 1]]
Fitting  3  classifiers...
Done fitting
[[1, 2, 5], [4, 3, 0]]
Fitting  3  classifiers...
Done fitting
-----------------------
Classifier Chain
Tiempo entrenamiento: 0.1836625099182129
Tiempo prediccion: 0.008986711502075195
F1 Score: 0.6066225165562914
Accuracy: 0.43536121673003797

-----------------------
Powerset Classifier Chain
Tiempo entrenamiento: 0.2200599193572998
Tiempo prediccion: 0.02763199806213379
F1 Score: 0.6307562952162277
Accuracy: 0.4608109637090907

-----------------------
Homer Class Chain
Tiempo entrenamiento: 0.18706755638122557
Tiempo prediccion: 0.02287611961364746
F1 Score: 0.6359444118222745
Accuracy: 0.46621591291661985


#**Birds**

In [None]:
# Load the train and test splits of the 'birds' dataset
X_train, Y_train, feature_names, label_names = load_dataset('birds', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('birds', 'test')

print(X_train.shape)
print(Y_train.shape)

X_train = X_train.toarray()
Y_train = Y_train.toarray()
X_test = X_test.toarray()
Y_test = Y_test.toarray()

# Fit the scaler on the training split only and reuse it for the test split,
# so both splits are standardized with the same mean and variance.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

birds:train - does not exists downloading
Downloaded birds-train
birds:test - does not exists downloading
Downloaded birds-test
(322, 260)
(322, 19)


In [None]:
# Benchmark the three models on the currently loaded split: max_itera=150, leaf_cond=3
evaluar_dataset(X_train,Y_train,X_test,Y_test,150,3)

[[18, 12, 7, 2, 9], [0, 5, 4, 11, 14], [16, 3, 13, 17, 15], [6, 8, 1, 10]]
Fitting  15  classifiers...
Done fitting
[[6, 9, 8, 10], [12, 7, 18, 2, 1], [3, 16, 13, 17, 5], [15, 0, 4, 11, 14]]
Fitting  15  classifiers...
Done fitting
[[16, 3, 13, 17, 15], [14, 2, 7, 8, 6], [18, 9, 1, 10], [0, 5, 11, 4, 12]]
Fitting  15  classifiers...
Done fitting
[[16, 13, 17, 0, 11], [12, 6, 1, 10], [3, 5, 18, 2, 7], [15, 4, 14, 8, 9]]
Fitting  15  classifiers...
Done fitting
[[16, 3, 13, 15, 4], [0, 5, 17, 18, 9], [11, 12, 14, 6, 1], [2, 7, 8, 10]]
Fitting  15  classifiers...
Done fitting
-----------------------
Classifier Chain
Tiempo entrenamiento: 0.5789075374603272
Tiempo prediccion: 0.041504859924316406
F1 Score: 0.4350877192982455
Accuracy: 0.27802690582959644

-----------------------
Powerset Classifier Chain
Tiempo entrenamiento: 0.4663733959197998
Tiempo prediccion: 0.0937422275543213
F1 Score: 0.300337773495092
Accuracy: 0.17695618647525282

-----------------------
Homer Class Chain
Tiempo e

#**Mediamill**

In [None]:
# Load the train and test splits of the 'mediamill' dataset
X_train, Y_train, feature_names, label_names = load_dataset('mediamill', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('mediamill', 'test')
print(X_train.shape, Y_train.shape, X_test.shape, sep="\n")

# Convert the loaded matrices with toarray() before they are used
X_train, Y_train, X_test, Y_test = [
    matriz.toarray() for matriz in (X_train, Y_train, X_test, Y_test)
]

mediamill:train - does not exists downloading
Downloaded mediamill-train
mediamill:test - does not exists downloading
Downloaded mediamill-test
(30993, 120)
(30993, 101)
(12914, 120)


In [None]:
# Benchmark the three models on the currently loaded split: max_itera=450, leaf_cond=10
evaluar_dataset(X_train,Y_train,X_test,Y_test,450,10)

[[40, 23, 8, 1, 41], [80, 61, 38, 0, 79], [70, 59, 99, 85, 7], [11, 95, 93, 24, 43], [57, 56, 16, 96, 54], [75, 78, 97, 65, 31], [48, 87, 47, 74, 63], [37, 13, 76, 64, 60], [53, 49, 62, 71, 18], [55, 68, 44, 98, 22], [5, 50, 77, 46, 89], [27, 29, 9, 25, 14], [84, 51, 2], [88, 39, 4, 6, 86], [20, 15, 17, 28, 72], [81, 83, 94, 58, 34], [12, 32, 90, 10, 21], [35, 26, 69, 3, 92], [30, 36, 73, 42, 52], [19, 91, 45, 100, 82], [66, 67, 33]]
Fitting  31  classifiers...
Done fitting
[[55, 60, 81, 68, 44], [32, 6, 90, 79, 4], [17, 28, 72, 74, 29], [27, 9, 25, 63, 73], [69, 38, 0, 100, 92], [41, 36, 42, 52, 62], [51, 24, 33, 67], [40, 14, 8, 1, 49], [5, 50, 77, 46, 99], [23, 71, 12, 30, 39], [48, 7, 85, 15, 47], [10, 21, 26, 82, 3], [16, 95, 75, 57, 11], [97, 96, 31], [98, 58, 94, 83, 34], [2, 54, 56, 43, 84], [70, 20, 89, 59, 87], [53, 88, 35, 80, 18], [37, 64, 76, 13, 22], [91, 86, 19, 61, 45], [93, 78, 65, 66]]
Fitting  31  classifiers...
Done fitting
[[81, 98, 94, 83, 58], [71, 100, 55, 18, 0

#**tmc2007_500**

In [None]:
# Load the train and test splits of the 'tmc2007_500' dataset
X_train, Y_train, feature_names, label_names = load_dataset('tmc2007_500', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('tmc2007_500', 'test')

print(X_train.shape, Y_train.shape, X_test.shape, sep="\n")

# Convert the loaded matrices with toarray() before they are used
X_train, Y_train, X_test, Y_test = [
    matriz.toarray() for matriz in (X_train, Y_train, X_test, Y_test)
]

tmc2007_500:train - exists, not redownloading
tmc2007_500:test - exists, not redownloading
(21519, 500)
(21519, 22)
(7077, 500)


In [None]:
# Benchmark the three models on the currently loaded split: max_itera=300, leaf_cond=6
evaluar_dataset(X_train,Y_train,X_test,Y_test,300,6)

[[8, 16, 21, 19, 15], [14, 20, 10, 2, 3], [9, 17, 13, 0, 7], [6, 12, 4, 5, 18], [1, 11]]
Fitting  7  classifiers...
Done fitting
[[9, 15, 17, 6, 7], [16, 8, 3, 21, 13], [0, 12, 4, 11, 18], [14, 20, 10, 2, 19], [1, 5]]
Fitting  7  classifiers...
Done fitting
[[3, 17, 13, 0, 6], [8, 16, 21, 15, 9], [7, 12, 4, 11, 5], [14, 20, 10, 2, 19], [1, 18]]
Fitting  7  classifiers...
Done fitting
-----------------------
Classifier Chain
Tiempo entrenamiento: 72.80069414774577
Tiempo prediccion: 0.5329740047454834
F1 Score: 0.7088711329645564
Accuracy: 0.5490320533164075

-----------------------
Powerset Classifier Chain
Tiempo entrenamiento: 16.432985464731853
Tiempo prediccion: 2.5953524907430015
F1 Score: 0.9926594916194421
Accuracy: 0.985426445563078

-----------------------
Homer Class Chain
Tiempo entrenamiento: 53.23684024810791
Tiempo prediccion: 1.0303197701772053
F1 Score: 0.71344245663539
Accuracy: 0.5545380579921628


#**Yeast**

In [None]:
# Load the train and test splits of the 'yeast' dataset
X_train, Y_train, feature_names, label_names = load_dataset('yeast', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('yeast', 'test')

print(X_train.shape, Y_train.shape, X_test.shape, sep="\n")

# Convert the loaded matrices with toarray() before they are used
X_train, Y_train, X_test, Y_test = [
    matriz.toarray() for matriz in (X_train, Y_train, X_test, Y_test)
]

yeast:train - does not exists downloading
Downloaded yeast-train
yeast:test - does not exists downloading
Downloaded yeast-test
(1500, 103)
(1500, 14)
(917, 103)


In [None]:
# Benchmark the three models on the currently loaded split: max_itera=150, leaf_cond=10
evaluar_dataset(X_train,Y_train,X_test,Y_test,150,10)

[[12, 11, 4, 3], [9, 13, 8, 10, 2], [7, 6, 0, 1, 5]]
Fitting  3  classifiers...
Done fitting
[[12, 11, 4, 5], [13, 9, 8, 10, 0], [7, 6, 2, 3, 1]]
Fitting  3  classifiers...
Done fitting
[[6, 5, 7, 13, 8], [11, 12, 0, 4], [9, 2, 10, 3, 1]]
Fitting  3  classifiers...
Done fitting
[[13, 9, 10, 8, 6], [0, 7, 5, 1], [12, 11, 3, 2, 4]]
Fitting  3  classifiers...
Done fitting
[[8, 13, 9, 10, 7], [0, 6, 5, 1], [12, 11, 3, 2, 4]]
Fitting  3  classifiers...
Done fitting
-----------------------
Classifier Chain
Tiempo entrenamiento: 0.3684194564819336
Tiempo prediccion: 0.026512861251831055
F1 Score: 0.6097318768619663
Accuracy: 0.4385714285714285

-----------------------
Powerset Classifier Chain
Tiempo entrenamiento: 1.467143440246582
Tiempo prediccion: 0.21868138313293456
F1 Score: 0.6267452581403309
Accuracy: 0.45640470853379467

-----------------------
Homer Class Chain
Tiempo entrenamiento: 0.466257905960083
Tiempo prediccion: 0.08161706924438476
F1 Score: 0.6481305123603345
Accuracy: 0.479

#**Enron**

In [None]:
# Load the train and test splits of the 'enron' dataset
X_train, Y_train, feature_names, label_names = load_dataset('enron', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('enron', 'test')
print(X_train.shape, Y_train.shape, sep="\n")

# Convert the loaded matrices with toarray() before they are used
X_train, Y_train, X_test, Y_test = [
    matriz.toarray() for matriz in (X_train, Y_train, X_test, Y_test)
]

enron:train - exists, not redownloading
enron:test - exists, not redownloading
(1123, 1001)
(1123, 53)


In [None]:
import numpy as np

# Report the label columns of Y_train that do not contain exactly two distinct
# values (for instance a label that is constant over the training split).
num_labels = Y_train.shape[1]  # number of label columns
etiq = []
for label_idx in range(num_labels):
    unique_classes = np.unique(Y_train[:, label_idx])  # distinct values in this column
    num_unique_classes = len(unique_classes)
    if num_unique_classes == 2:
        continue  # exactly two values: nothing to report
    etiq.append(label_idx)
print(etiq)



[45]


In [None]:
# Column index of the label to drop (0 is the first label column, 1 the second, ...).
label_idx_to_remove = 45

# Rebuild both label matrices without that column so train and test stay aligned.
Y_train, Y_test = (
    np.delete(Y_train, label_idx_to_remove, axis=1),
    np.delete(Y_test, label_idx_to_remove, axis=1),
)

In [None]:
# Run the evaluation on the Enron split (label column 45 already removed from Y).
# NOTE(review): evaluar_dataset is defined outside this section; the meaning of
# the trailing arguments (150, 10) must be confirmed against its definition.
evaluar_dataset(X_train,Y_train,X_test,Y_test,150,10)

[[0, 37, 18, 7, 42], [36, 19, 47, 28, 17], [3, 34, 49, 33, 9], [25, 6, 14], [13, 39, 45, 11], [4, 29, 12, 48, 21], [46, 51, 30, 2, 16], [41, 31, 22, 43, 24], [26, 8, 38, 10, 40], [1, 5, 20, 23, 44], [32, 27, 50, 15, 35]]
Fitting  39  classifiers...
Done fitting
[[49, 1, 43, 5, 22], [29, 44, 48, 39, 20], [36, 19, 41, 37, 18], [30, 50, 15, 16, 26], [8, 3, 28, 33, 34], [25, 6, 11], [35, 38, 10, 47, 17], [45, 23, 21, 14], [46, 51, 2, 32, 27], [40, 9, 7, 0, 42], [31, 4, 12, 24, 13]]
Fitting  39  classifiers...
Done fitting
[[28, 19, 49, 33, 18], [6, 11], [46, 51, 30, 2, 32], [27, 50, 15, 16, 8], [35, 26, 10, 36, 40], [41, 7, 31, 1, 43], [39, 45, 21, 25, 14], [9, 0, 37, 42, 22], [5, 24, 20, 23, 4], [38, 47, 17, 3, 34], [44, 12, 29, 13, 48]]
Fitting  39  classifiers...
Done fitting
-----------------------
Classifier Chain
Tiempo entrenamiento: 8.01002812385559
Tiempo prediccion: 0.2501362164815267
F1 Score: 0.5448071216617211
Accuracy: 0.37438825448613383

-----------------------
Powerset Cla

#**Bibtex**

In [None]:
# Bibtex dataset: fetch the predefined train and test splits.
X_train, Y_train, feature_names, label_names = load_dataset('bibtex', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('bibtex', 'test')

# One shape per line: X_train, then Y_train.
print(X_train.shape, Y_train.shape, sep="\n")

# Replace every matrix by the result of its .toarray() call
# (presumably sparse -> dense; confirm against load_dataset's return type).
X_train, Y_train = X_train.toarray(), Y_train.toarray()
X_test, Y_test = X_test.toarray(), Y_test.toarray()

bibtex:train - exists, not redownloading
bibtex:test - exists, not redownloading
(4880, 1836)
(4880, 159)


In [None]:
# Run the evaluation on the Bibtex split loaded above.
# NOTE(review): evaluar_dataset is defined outside this section; the meaning of
# the trailing arguments (150, 10) must be confirmed against its definition.
evaluar_dataset(X_train,Y_train,X_test,Y_test,150,10)

[[50, 46, 92, 116, 99], [123, 137, 103, 20, 2], [45, 69, 147, 158, 31], [61, 9, 124, 6, 117], [33, 34, 4, 125, 87], [37, 40, 82, 115, 74], [80, 35, 16, 22, 70], [155, 48, 11, 15, 66], [107, 77, 119, 151, 42], [89, 71, 7, 19, 5], [108, 47, 150, 30, 142], [120, 51, 67, 126, 110], [144, 138, 96, 97, 141], [100, 85, 143, 149, 111], [84, 27, 113, 36, 83], [10, 131, 14, 134], [145, 114, 153, 90, 65], [133, 58, 98, 8, 78], [53, 29, 55, 121, 101], [130, 128, 59, 109, 17], [112, 0, 26, 127, 102], [68, 106, 43, 76, 154], [1, 18, 93, 146, 13], [12, 39, 132, 94, 60], [44, 41, 63, 129, 88], [57, 25, 140, 79, 148], [105, 64, 32, 73, 24], [56, 95, 72, 28, 136], [49, 23, 81, 3, 135], [122, 156, 104, 75, 52], [62, 38, 118, 91, 86], [139, 54, 157, 21, 152]]
Fitting  31  classifiers...
Done fitting
[[76, 3, 61, 23, 119], [64, 65, 5, 114, 142], [157, 21, 139, 16, 1], [130, 59, 128, 0, 34], [143, 41, 141, 6, 117], [28, 136, 40, 58, 20], [68, 106, 43, 73, 121], [57, 140, 154, 110, 84], [22, 111, 9, 146, 96]

#**Delicious**

In [None]:
# Delicious dataset: fetch the predefined train and test splits.
X_train, Y_train, feature_names, label_names = load_dataset('delicious', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('delicious', 'test')

# One shape per line: X_train, then Y_train.
print(X_train.shape, Y_train.shape, sep="\n")

# Replace every matrix by the result of its .toarray() call
# (presumably sparse -> dense; confirm against load_dataset's return type).
X_train, Y_train = X_train.toarray(), Y_train.toarray()
X_test, Y_test = X_test.toarray(), Y_test.toarray()

delicious:train - exists, not redownloading
delicious:test - exists, not redownloading
(12920, 500)
(12920, 983)


In [None]:
# Run the evaluation on the Delicious split loaded above.
# NOTE(review): evaluar_dataset is defined outside this section; the meaning of
# the trailing arguments (250, 10) must be confirmed against its definition.
evaluar_dataset(X_train,Y_train,X_test,Y_test,250,10)

[[9, 28, 37, 36, 30], [14, 15, 25, 13, 29], [19, 20, 0, 26, 35], [2, 21, 23, 16, 10], [27, 4, 8], [3, 5, 6, 11, 12], [31, 1, 33, 18, 32], [7, 17, 22, 24, 34]]


KeyboardInterrupt: ignored

#**Medical**

In [None]:
# Medical dataset: fetch the predefined train and test splits.
X_train, Y_train, feature_names, label_names = load_dataset('medical', 'train')
X_test, Y_test, feature_names, label_names = load_dataset('medical', 'test')

# One shape per line: X_train, then Y_train.
print(X_train.shape, Y_train.shape, sep="\n")

# Replace every matrix by the result of its .toarray() call
# (presumably sparse -> dense; confirm against load_dataset's return type).
X_train, Y_train = X_train.toarray(), Y_train.toarray()
X_test, Y_test = X_test.toarray(), Y_test.toarray()

medical:train - does not exists downloading
Downloaded medical-train
medical:test - does not exists downloading
Downloaded medical-test
(333, 1449)
(333, 45)


In [None]:
import numpy as np

# Report the label columns of Y_train that do not contain exactly two distinct
# values (for instance a label that is constant over the training split).
num_labels = Y_train.shape[1]  # number of label columns
etiq = []
for label_idx in range(num_labels):
    unique_classes = np.unique(Y_train[:, label_idx])  # distinct values in this column
    num_unique_classes = len(unique_classes)
    if num_unique_classes == 2:
        continue  # exactly two values: nothing to report
    etiq.append(label_idx)
print(etiq)

In [None]:
# Run the evaluation on the Medical data currently held in X_train/Y_train/X_test/Y_test.
# NOTE(review): evaluar_dataset is defined outside this section; the meaning of
# the trailing arguments (250, 8) must be confirmed against its definition.
evaluar_dataset(X_train,Y_train,X_test,Y_test,250,8)

[[37, 20, 26, 0, 35], [27, 4, 8], [1, 9, 29, 33, 18], [14, 25, 10, 13, 31], [3, 5, 6, 7, 11], [12, 17, 24, 34, 2], [22, 21, 23, 15, 16], [32, 28, 19, 36, 30]]
Fitting  15  classifiers...
Done fitting
[[32, 28, 19, 36, 37], [30, 20, 35, 0, 26], [1, 9, 29, 33, 18], [27, 4, 8], [14, 25, 10, 13, 31], [3, 5, 11, 12, 17], [7, 22, 24, 34, 2], [6, 21, 23, 15, 16]]
Fitting  15  classifiers...
Done fitting
[[3, 5, 7, 11, 22], [21, 25, 31, 1, 9], [10, 29, 33, 18, 30], [36, 20, 0, 26, 35], [12, 17, 24, 34, 2], [6, 23, 14, 15, 16], [27, 4, 8], [13, 32, 28, 19, 37]]
Fitting  15  classifiers...
Done fitting
-----------------------
Classifier Chain
Tiempo entrenamiento: 1.4407409032185872
Tiempo prediccion: 0.21867974599202475
F1 Score: 0.6863636363636362
Accuracy: 0.5224913494809689

-----------------------
Powerset Classifier Chain
Tiempo entrenamiento: 0.9213058153788248
Tiempo prediccion: 0.35993361473083496
F1 Score: 0.6669534318806715
Accuracy: 0.5004623716529512

-----------------------
Homer C

#Wilcoxon


In [None]:
# Sign convention for the paired differences: when True, d = x - y (so r_plus
# collects the ranks where x beats y); when False, d = y - x.
better_is_higher = True


def Wilcoxon(x, y=None, zero_method="wilcox", correction=False):
    """
    Calculate the Wilcoxon signed-rank test.

    The Wilcoxon signed-rank test tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero. It is a non-parametric version of the paired T-test.

    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements.  If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements (the module-level ``better_is_higher`` flag is not
        applied in that case).
    zero_method : string, {"pratt", "wilcox", "zsplit"}, optional
        "pratt":
            Pratt treatment: includes zero-differences in the ranking process
            (more conservative)
        "wilcox":
            Wilcox treatment: discards all zero-differences
        "zsplit":
            Zero rank split: just like Pratt, but splitting the zero rank
            between positive and negative ones
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon rank
        statistic by 0.5 towards the mean value when computing the
        z-statistic.  Default is False.

    Returns
    -------
    r_minus : float
        Sum of the ranks of the negative differences.
    r_plus : float
        Sum of the ranks of the positive differences.
    p-value : float
        The two-sided p-value for the test, computed from the smaller of
        the two rank sums with the normal approximation.  It is NaN when
        no differences are left to rank.

    Raises
    ------
    ValueError
        If `zero_method` is not one of the supported values, or if `x` and
        `y` have different lengths.

    Notes
    -----
    Because the normal approximation is used for the calculations, the
    samples used should be large.  A typical rule is to require that
    n > 20.

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test

    """

    if zero_method not in ("wilcox", "pratt", "zsplit"):
        raise ValueError("Zero method should be either 'wilcox' or 'pratt' or 'zsplit'")

    if y is None:
        # Convert so that the element-wise comparisons below also work for
        # plain Python sequences.
        d = np.asarray(x)
    else:
        x, y = np.asarray(x), np.asarray(y)
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon.  Aborting.')
        d = x - y if better_is_higher else y - x

    if zero_method == "wilcox":
        d = np.compress(np.not_equal(d, 0), d, axis=-1)  # Keep all non-zero differences

    count = len(d)
    if count == 0:
        # Nothing left to rank: both rank sums are zero and the normal
        # approximation is undefined (0/0), so report NaN explicitly.
        return 0.0, 0.0, float("nan")

    r = stats.rankdata(np.abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    if zero_method == "zsplit":
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    # np.minimum is the element-wise minimum of two values; np.min(a, b)
    # would interpret its second argument as an axis.
    T = np.minimum(r_plus, r_minus)
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]

    # Correction for repeated (tied) ranks.  Ranks that occur once contribute
    # t * (t**2 - 1) == 0, so they need not be filtered out.
    _, repnum = np.unique(r, return_counts=True)
    se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = np.sqrt(se / 24)
    correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
    z = (T - mn - correction) / se
    prob = 2. * stats.norm.sf(np.abs(z))
    return r_minus, r_plus, prob

# Combine the values of one or more columns of a row into a single number
def AverageColums(c, clist, w):
    """Return the weighted sum (or geometric mean) of selected entries of `c`.

    c     : indexable row; each selected entry is converted with float().
    clist : indices to pick from `c`; an index i with i >= len(c) counts as 0.0.
    w     : weights paired positionally with the selected values.

    When the weights add up to zero or less, the geometric mean of the
    selected values is returned; otherwise the sum of value * weight.
    """
    v = []
    for i in clist:
        v.append(float(c[i]) if i < len(c) else 0.0)

    if sum(w) <= 0.0:   # No usable weights: fall back to the geometric mean
        return gmean(v)

    return sum([value * weight for value, weight in zip(v, w)])

In [None]:
import sys
import numpy as np
import math
from numpy import *
from scipy.stats import wilcoxon, ttest_ind, mannwhitneyu, find_repeats, distributions, norm, rankdata, f, gmean
from scipy import stats
from pathlib import Path
import argparse
from scipy.stats import studentized_range

In [None]:
# Quick check on two paired samples; the result is the tuple (r_minus, r_plus, p-value).
Wilcoxon([0.1,0.2],[0.2,0.3])

(3.0, 0.0, 0.17971249487899976)