In [1]:
import math
import warnings
warnings.filterwarnings("ignore")
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestNeighbors
from skmultilearn.dataset import load_dataset
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.model_selection import IterativeStratification
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from scipy import sparse
import random

In [2]:
# data processing 
def feature_select(p, X, feature_names):
    """Keep the fraction ``p`` of features with the largest column sums.

    Parameters
    ----------
    p : float
        Fraction of features to keep. ``p == 1`` keeps everything.
    X : sparse matrix
        Feature matrix; must provide ``toarray()``, ``sum(axis=0)`` and
        ``shape``.
    feature_names : sequence
        One entry per column of ``X``.

    Returns
    -------
    (numpy.ndarray, list)
        The dense matrix restricted to the kept columns (in their original
        column order) and the feature names of exactly those columns, in the
        same order. For ``p == 1`` the ``feature_names`` object is returned
        unchanged.
    """
    if p == 1:
        return X.toarray(), feature_names

    feature_count = int(X.shape[1] * p)
    column_sums = np.asarray(X.sum(axis=0)).ravel().tolist()
    # Stable descending sort: ties keep the lower column index first.
    ranked = sorted(range(X.shape[1]), key=lambda i: column_sums[i], reverse=True)
    # Re-sort the winners by position so names[i] always describes column i
    # of the returned matrix.
    kept = sorted(ranked[:feature_count])
    new_x = X.toarray()[:, kept]
    new_feature_names = [feature_names[i] for i in kept]
    return new_x, new_feature_names

def label_select(y, label_names, threshold=5):
    """Drop labels that have too few positive samples.

    Parameters
    ----------
    y : sparse matrix
        Label matrix; must provide ``toarray()``, ``sum(axis=0)`` and
        ``shape``.
    label_names : sequence
        One entry per column of ``y``. It is not modified.
    threshold : int, optional
        A label is removed when its column sum is ``<= threshold``.
        Defaults to 5.

    Returns
    -------
    (numpy.ndarray, list)
        The dense label matrix without the removed columns and the names of
        the surviving columns, in column order.
    """
    column_sums = np.asarray(y.sum(axis=0)).ravel()
    keep_mask = column_sums > threshold
    new_y = y.toarray()[:, keep_mask]
    # Filter by position rather than by value so that repeated names cannot
    # end up attached to the wrong column.
    new_label_names = [name for name, keep in zip(label_names, keep_mask) if keep]
    return new_y, new_label_names

def get_most_related_nodes(y):
    """Return, for every label index, its highest-weight graph neighbour.

    The graph comes from ``LabelCooccurrenceGraphBuilder(weighted=True,
    include_self_edges=False)`` applied to ``y``; each edge is treated as
    undirected. A label that appears in no edge maps to itself. When several
    neighbours share the top weight, the one met first while iterating over
    the graph wins.
    """
    builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
    cooccurrence = builder.transform(y)

    # node -> [(partner, weight), ...] in edge-iteration order
    neighbours = {}
    for (src, dst), weight in cooccurrence.items():
        for node, partner in ((src, dst), (dst, src)):
            if node not in neighbours:
                neighbours[node] = []
            neighbours[node].append((partner, weight))

    strongest = []
    for node in range(y.shape[1]):
        candidates = neighbours.get(node)
        if candidates is None:
            strongest.append(node)
            continue
        partner, _ = max(candidates, key=lambda pair: pair[1])
        strongest.append(partner)
    return strongest

def Labeltype(X,y):
    """Split the labels into minority and majority groups.

    A label is a minority label when its imbalance ratio (as returned by
    ``Imbalance``) is strictly greater than the mean imbalance ratio; every
    other label is a majority label.

    Returns two dicts ``(minority, majority)`` mapping label column index to
    the first element of the matching entry of the module-level
    ``label_names``.
    """
    ratios, mean_ratio, _ = Imbalance(X, y)
    minority = {}
    majority = {}
    for position, ratio in enumerate(ratios):
        group = minority if ratio - mean_ratio > 0 else majority
        group[position] = label_names[position][0]
    return minority, majority

def Imbalance(X,y):
    """Compute per-label imbalance ratios.

    For every column of ``y`` the number of entries equal to 1 is counted;
    the imbalance ratio of a label is the largest such count divided by the
    label's own count. ``X`` is not used.

    Returns ``(ratios, mean_ratio, counts)``. Raises ``ZeroDivisionError``
    when some label has no entry equal to 1.
    """
    n_rows = y.shape[0]
    positives = [
        sum(1 for row in range(n_rows) if y[row, col] == 1)
        for col in range(y.shape[1])
    ]
    largest = max(positives)
    ratios = [largest / count for count in positives]
    mean_ratio = sum(ratios) / len(ratios)
    return ratios, mean_ratio, positives

def CardAndDens(X,y):
    """Return the label cardinality and label density of ``y``.

    Cardinality is the mean number of entries equal to 1 per row (rows are
    enumerated via ``X.shape[0]``); density is the cardinality divided by the
    number of label columns.
    """
    n_labels = y.shape[1]
    per_sample = [
        sum(1 for col in range(n_labels) if y[row, col] == 1)
        for row in range(X.shape[0])
    ]
    Card = sum(per_sample) / len(per_sample)
    return Card, Card / n_labels

def ImR(X,y):
    """Return, per label, the ratio of the larger class to the smaller one.

    For each column of ``y`` the entries equal to 1 are the positive class and
    everything else is the negative class. ``X`` is not used. Raises
    ``ZeroDivisionError`` when the smaller class of a label is empty.
    """
    n_rows = y.shape[0]
    ratios = []
    for col in range(y.shape[1]):
        positives = sum(1 for row in range(n_rows) if y[row, col] == 1)
        negatives = n_rows - positives
        if positives <= negatives:
            smaller, larger = positives, negatives
        else:
            smaller, larger = negatives, positives
        ratios.append(larger / smaller)
    return ratios

def compare_arrays(a, b):
    """Share of positions set to 1 in both arrays among positions where a + b > 0."""
    both_set = (a == 1) & (b == 1)
    any_set = (a + b) > 0
    return np.sum(both_set) / np.sum(any_set)

In [3]:
# Find natural neighbors
def SearchNaN(dfX):
    """Find the natural neighbours of every row of ``dfX``.

    The neighbourhood size ``k`` grows from 1. In each round the reverse
    neighbours of a sample are the samples that list it among their ``k``
    nearest neighbours (column 0 of the query result is skipped, presumably
    the query point itself). The search stops when the number of samples
    without any reverse neighbour is the same as in the previous round, or
    when ``k`` reaches ``n_samples - 1``.

    Parameters
    ----------
    dfX : array-like of shape (n_samples, n_features)

    Returns
    -------
    list of list of int
        Entry ``i`` holds the samples ``j`` that are among the nearest
        neighbours of ``i`` and that also have ``i`` among theirs, using the
        neighbourhood of the last round. With fewer than two samples every
        entry is empty.
    """
    n_samples = dfX.shape[0]
    # No neighbour can exist with fewer than two samples, and the search loop
    # below would not run at all.
    if n_samples < 2:
        return [[] for _ in range(n_samples)]

    previous_orphans = n_samples + 1
    for k in range(1, n_samples):
        nbs = NearestNeighbors(n_neighbors=k+1, metric='euclidean', algorithm='kd_tree').fit(dfX)
        _, indices = nbs.kneighbors(dfX)
        # Build all reverse-neighbour lists in one pass over the neighbour
        # table; visiting j in ascending order keeps each list sorted.
        RKNN = [[] for _ in range(n_samples)]
        for j in range(n_samples):
            for neighbour in indices[j, 1:].tolist():
                RKNN[neighbour].append(j)
        orphans = sum(1 for reverse in RKNN if not reverse)
        if orphans == previous_orphans:
            break
        previous_orphans = orphans

    NaN = []
    for i in range(n_samples):
        reverse = set(RKNN[i])
        NaN.append([j for j in indices[i, 1:].tolist() if j in reverse])
    return NaN
#  MLONC
def MLONC(df1,df2,p):
    """Oversample minority labels by interpolating between natural neighbours.

    Reads the module-level ``feature_names`` and ``label_names`` (sequences
    whose items expose the column name at index 0 and, for features, the
    type at index 1) and reseeds the global NumPy random generator with 10.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Feature matrix.
    df2 : pandas.DataFrame
        Label matrix with one column per label name.
    p : float
        Oversampling ratio: ``int(len(df1) * p)`` synthetic samples are
        shared among the minority labels in proportion to their ``ImR``
        value.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``df1`` and ``df2`` with the synthetic rows appended and the
        index reset. When nothing is generated the plain copies are returned.
    """
    most_related_nodes = get_most_related_nodes(np.array(df2))
    np.random.seed(10)
    NewSample = int(df1.shape[0]*p)
    MinLabeldic, _ = Labeltype(np.array(df1), np.array(df2))
    Imr = ImR(np.array(df1), np.array(df2))
    minImR = [Imr[i] for i in MinLabeldic.keys()]
    total_min_imr = sum(minImR)
    samplematrix = [int(i*NewSample/total_min_imr) for i in minImR]

    feature_columns = [x[0] for x in feature_names]
    label_columns = [y[0] for y in label_names]
    is_numeric = [feature_names[j][1] == 'NUMERIC' for j in range(df1.shape[1])]

    generated_X = []
    generated_y = []
    # Pair every minority label with its own quota so that skipping a label
    # cannot shift the quotas of the labels that follow.
    for tail_label, pergenerate in zip(MinLabeldic.keys(), samplematrix):
        sub_index = list(df2[df2[MinLabeldic[tail_label]]==1].index)
        dfX = df1[df1.index.isin(sub_index)].reset_index(drop=True)
        dfy = df2[df2.index.isin(sub_index)].reset_index(drop=True)
        if dfX.shape[0] < 2:
            continue

        natural_neighbours = SearchNaN(dfX)
        # Seeds are the samples that have at least one natural neighbour,
        # visited from fewest to most neighbours (ties by row position).
        # Each count is paired with its own row position explicitly.
        ranked = sorted(
            (len(nbrs), row) for row, nbrs in enumerate(natural_neighbours) if nbrs
        )
        if not ranked:
            continue
        seeds = [row for _, row in ranked]

        X_values = np.array(dfX)
        y_values = np.array(dfy)
        new_X = np.zeros((pergenerate, dfX.shape[1]))
        target = np.zeros((pergenerate, dfy.shape[1]))
        for i in range(pergenerate):
            seed = seeds[i % len(seeds)]
            reference = np.random.choice(natural_neighbours[seed])
            npseed = y_values[seed]
            npreference = y_values[reference]
            for j in range(dfX.shape[1]):
                # One draw per feature regardless of its type, so the random
                # sequence does not depend on the feature types.
                rmd = np.random.random()
                if is_numeric[j]:
                    new_X[i,j] = X_values[seed,j] + rmd*(X_values[reference,j]-X_values[seed,j])
                else:
                    new_X[i,j] = X_values[seed,j]
            for j in range(dfy.shape[1]):
                a = most_related_nodes[j]
                if npseed[j] == npreference[j]:
                    # Seed and reference agree: copy the shared value.
                    target[i,j] = npseed[j]
                elif npseed[a] == 1 or target[i,a] == 1:
                    # They disagree: switch the label on when its most
                    # related label is on in the seed or already on in the
                    # synthetic sample.
                    target[i,j] = 1
                else:
                    target[i,j] = 0

        generated_X.append(pd.DataFrame(new_X, columns=feature_columns))
        generated_y.append(pd.DataFrame(target, columns=label_columns))

    MLONC_new_X = df1.copy(deep=True)
    MLONC_target = df2.copy(deep=True)
    if generated_X:
        MLONC_new_X = pd.concat([MLONC_new_X] + generated_X, axis=0).reset_index(drop=True)
        MLONC_target = pd.concat([MLONC_target] + generated_y, axis=0).reset_index(drop=True)
    return MLONC_new_X, MLONC_target

In [4]:
def training(X, y, index, p=None):
    """Cross-validate a binary-relevance decision tree and return mean macro-F1.

    For each of five fixed random states the data is split into two
    iteratively stratified folds; a ``BinaryRelevance`` wrapper around
    ``DecisionTreeClassifier(random_state=20)`` is fitted on the training
    part and scored on the test part.

    Parameters
    ----------
    X, y : indexable feature and label matrices
    index : int
        ``1`` trains on the raw training fold; any other value first
        oversamples the training fold with ``MLONC`` (which needs the
        module-level ``feature_names`` and ``label_names``).
    p : float, optional
        Oversampling ratio passed to ``MLONC``; required when ``index != 1``.

    Returns
    -------
    float
        Mean macro-averaged F1 over all folds.

    Raises
    ------
    ValueError
        If oversampling is requested without ``p``.
    """
    if index != 1 and p is None:
        # Fail before any fold is built instead of deep inside MLONC.
        raise ValueError("p must be provided when index != 1")

    Randomlist = [7,10,19,30,23]
    Macro = []
    for random_state in Randomlist:
        k_fold = IterativeStratification(n_splits=2,order=1,random_state=random_state)
        for train, test in k_fold.split(X, y):
            classifier = BinaryRelevance(
                classifier = DecisionTreeClassifier(random_state=20),
                require_dense = [False, True]
            )
            if index == 1:
                X1, y1 = X[train], y[train]
            else:
                dfx = pd.DataFrame(X[train], columns=[x[0] for x in feature_names])
                dfy = pd.DataFrame(y[train], columns=[x[0] for x in label_names])
                new_X, new_y = MLONC(dfx, dfy, p)
                X1, y1 = np.array(new_X), np.array(new_y)
            classifier.fit(X1, y1)
            # Only hard predictions feed the metric, so no probabilities
            # are computed.
            ypred = classifier.predict(X[test])
            Macro.append(metrics.f1_score(y[test], ypred, average='macro'))
    return sum(Macro)/len(Macro)

In [5]:
def main(p):
    """Print the macro-F1 of the baseline and of the MLONC-oversampled run.

    Uses the module-level ``X`` and ``y``; ``p`` is forwarded to ``training``
    for the oversampled run only.
    """
    baseline_f1 = training(X, y, 1)
    print(f"Macro F1 score for baseline: {round(baseline_f1, 4)}")

    mlonc_f1 = training(X, y, 2, p)
    print(f"Macro F1 score for MLONC: {round(mlonc_f1, 4)}")

if __name__ == "__main__":
    # Choose the dataset. The four names bound here are module-level globals:
    # Labeltype, MLONC, training and main read them directly instead of
    # taking them as arguments, so they must exist before main() is called.
    X, y, feature_names, label_names = load_dataset('emotions', 'undivided')
    # p == 1 keeps every feature and only converts X to a dense array.
    X, feature_names = feature_select(1, X, feature_names)
    # Drops labels whose column sum is at most 5 and converts y to dense.
    y, label_names = label_select(y, label_names)
    # Set the sampling ratio p: MLONC generates about int(n_train * p)
    # synthetic samples per training fold.
    main(p=0.6)

emotions:undivided - exists, not redownloading
Macro F1 score for baseline: 0.5509
Macro F1 score for MLONC: 0.5649
