# Эксперименты с дисбалансом классов

2020, Дьяконов Александр (https://dyakonov.org/ag/)

In [2]:
# подгружаем все нужные пакеты
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, cohen_kappa_score, f1_score, log_loss, matthews_corrcoef, roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_moons
from imblearn.datasets import make_imbalance
from sklearn.model_selection import cross_val_predict
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_circles

# Quality metrics to track.
# `scores` are evaluated on hard class predictions (model.predict output).
scores = [
    accuracy_score,
    balanced_accuracy_score,
    cohen_kappa_score,
    f1_score,
    matthews_corrcoef,
]
# `scores2` are evaluated on predicted positive-class probabilities.
scores2 = [
    log_loss,
    roc_auc_score,
    average_precision_score,
]

### Полезные функции

In [3]:
def imbalancing(X, y, p=0.1, random_state=None):
    """
    Introduce class imbalance by thinning out the samples with y > 0.

    Every sample with ``y <= 0`` is kept; every other sample is kept
    independently with probability ``p``.

    Parameters
    ----------
    X : 2-D array indexable as ``X[mask, :]`` with a boolean mask.
    y : array of labels with one entry per row of ``X``.
    p : probability of keeping a sample whose label is greater than 0.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) draws from the global NumPy RNG, exactly as the
        function did before this parameter existed. An int seeds a private
        generator so that the subsample is reproducible; an existing
        ``RandomState`` instance is used as is.

    Returns
    -------
    (X, y) : the rows of ``X`` and entries of ``y`` that were kept.
    """
    if random_state is None:
        rng = np.random  # global generator: unchanged default behaviour
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # rand() is uniform on [0, 1): p=0 drops all positives, p=1 keeps all
    keep = (y <= 0) | (rng.rand(X.shape[0]) < p)
    return (X[keep, :], y[keep])

from imblearn import FunctionSampler
def func(X, y):
    """
    Identity sampler: hand the data back untouched.
    """
    unchanged = (X, y)
    return unchanged

def investigate_model(model, X, y, cv, X_test, y_test):
    """
    Evaluate the strategy of choosing a decision threshold by cross-validation.

    The model is first used to get out-of-fold positive-class probabilities
    on (X, y) and is then refitted on the whole of (X, y), so ``model`` is
    left fitted on the full training data.

    Parameters
    ----------
    model : classifier providing fit, predict and predict_proba.
    X, y : training data.
    cv : passed unchanged as the ``cv`` argument of cross_val_predict.
    X_test, y_test : hold-out data.

    Returns
    -------
    pandas.DataFrame with one column per metric and the rows
        'train' - metric of model.predict on the training data,
        'test0' - metric of model.predict on the test data,
        'cv'    - best metric value over the threshold grid on the
                  out-of-fold probabilities,
        'test'  - metric on the test data at the selected threshold,
        'theta' - the selected threshold.
    For the probability-based metrics no threshold is selected: 'train',
    'test0' and 'cv' are computed on probabilities, 'test' repeats 'test0'
    and 'theta' is NaN.

    Every metric line is also printed as it is computed.
    """
    df = pd.DataFrame(index=['train', 'test0', 'cv', 'test', 'theta'])
    # metrics of hard (binarized) predictions
    scores = [accuracy_score, balanced_accuracy_score, cohen_kappa_score, f1_score, matthews_corrcoef]
    # metrics of predicted probabilities
    scores2 = [log_loss, roc_auc_score, average_precision_score]
    # out-of-fold probabilities of the positive class
    a_cv = cross_val_predict(model, X, y, cv=cv, method='predict_proba')[:, 1]
    model.fit(X, y)
    # on the training data
    a_train = model.predict_proba(X)[:, 1]
    b_train = model.predict(X)
    # on the test data
    a_test = model.predict_proba(X_test)[:, 1]
    b_test = model.predict(X_test)

    # threshold grid 0.00, 0.01, ..., 1.00; the best one is chosen per metric
    thetas = np.linspace(0, 1, 101)
    for s in scores:
        q = np.array([s(y, a_cv > t) for t in thetas])
        i = np.argmax(q)  # first maximum, i.e. the smallest best threshold
        t = thetas[i]
        s_cv = q[i]
        s_test = s(y_test, a_test > t)
        sb_train = s(y, b_train)
        sb_test = s(y_test, b_test)
        print(s.__name__, sb_train, sb_test, s_cv, s_test, t)
        df[s.__name__] = [sb_train, sb_test, s_cv, s_test, t]
    for s in scores2:
        sb_train = s(y, a_train)
        sb_test = s(y_test, a_test)
        s_cv = s(y, a_cv)

        print(s.__name__, sb_train, sb_test, s_cv)
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        df[s.__name__] = [sb_train, sb_test, s_cv, sb_test, np.nan]
    return df

def make_statistics(model, X, y, X_test, y_test):
    """
    Compute train/test quality of an already fitted model.

    Uses the module-level metric lists: ``scores`` on model.predict output
    and ``scores2`` on the positive-class column of model.predict_proba.
    Each metric is printed and stored as a column of the returned DataFrame,
    whose rows are 'train' and 'test'.
    """
    report = pd.DataFrame(index=['train', 'test'])

    proba_train = model.predict_proba(X)[:, 1]
    labels_train = model.predict(X)
    proba_test = model.predict_proba(X_test)[:, 1]
    labels_test = model.predict(X_test)

    # metrics of hard class predictions
    for metric in scores:
        name = metric.__name__
        on_test = metric(y_test, labels_test)
        on_train = metric(y, labels_train)
        print(name, on_train, on_test)
        report[name] = [on_train, on_test]

    # metrics of predicted probabilities
    for metric in scores2:
        name = metric.__name__
        on_train = metric(y, proba_train)
        on_test = metric(y_test, proba_test)
        report[name] = [on_train, on_test]
        print(name, on_train, on_test)

    return report

# LogReg + TwoMoons

In [4]:
# Models: plain logistic regression and its class-weighted variant
model = LogisticRegression()
model_w = LogisticRegression(class_weight='balanced')

# Data: a small training sample and a large test sample of "two moons",
# each thinned by imbalancing() with p=0.06
X, y = imbalancing(
    *make_moons(n_samples=1000, shuffle=True, noise=0.1, random_state=1),
    p=0.06,
)
X_test, y_test = imbalancing(
    *make_moons(n_samples=1000000, shuffle=True, noise=0.1, random_state=2),
    p=0.06,
)

print('------ DataSet --------')
print(f'X.shape={X.shape}, y.mean={y.mean()}')
print(f'X_test.shape={X_test.shape}, y_test.mean={y_test.mean()}')

dfs = []  # report tables, one per strategy, collected for the summary

# Baselines: fit each model on the imbalanced data as is
for banner, estimator in (('------ None --------', model),
                          ('------ weight_balanced --------', model_w)):
    print(banner)
    estimator.fit(X, y)
    dfs.append(make_statistics(estimator, X, y, X_test, y_test))

print('------ Selection of a threshold --------')

# Threshold chosen by 10-fold cross-validation, for both models
for estimator in (model, model_w):
    dfs.append(investigate_model(estimator, X, y, 10, X_test, y_test))


from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, EditedNearestNeighbours


# Resampling strategies, grouped under the banner printed before each group.
# FunctionSampler(func=func) can be added to a group as an identity baseline.
resampling_groups = (
    ('------ Oversampling methods ------',
     [RandomOverSampler(),
      SMOTE(),
      ADASYN()]),
    ('------ Undersampling methods ------',
     [RandomUnderSampler(),
      NearMiss(version=1),
      NearMiss(version=2),
      TomekLinks(),
      EditedNearestNeighbours()]),
)

for banner, techniques in resampling_groups:
    print(banner)
    for sampler in techniques:
        technique = type(sampler).__name__
        print(f'Technique: {technique}')
        # the model is trained on the resampled data ...
        X_resampled, y_resampled = sampler.fit_resample(X, y)
        print(f'X_resampled={X_resampled.shape}, y_resampled={y_resampled.mean()}')
        model.fit(X_resampled, y_resampled)
        # ... but evaluated on the original train and test samples
        dfs.append(make_statistics(model, X, y, X_test, y_test))

------ DataSet --------
X.shape=(526, 2), y.mean=0.049429657794676805
X_test.shape=(529806, 2), y_test.mean=0.05625832852025081
------ None --------
accuracy_score 0.9771863117870723 0.9687678131240491
balanced_accuracy_score 0.7692307692307692 0.7227056126954304
cohen_kappa_score 0.6892477353288696 0.6022842497269536
f1_score 0.7000000000000001 0.616087793786687
matthews_corrcoef 0.7251491888941517 0.6561664944867404
log_loss 0.08936455351083333 0.09757047610645972
roc_auc_score 0.9610769230769232 0.9626310765617659
average_precision_score 0.7757950101877162 0.7890625601162011
------ weight_balanced --------
accuracy_score 0.8821292775665399 0.8813358097114793
balanced_accuracy_score 0.8468461538461538 0.8817444625914246
cohen_kappa_score 0.3561048807455378 0.40589639923394794
f1_score 0.40384615384615385 0.45548636312457236
matthews_corrcoef 0.42310898355047466 0.4778614572799835
log_loss 0.2554812209422345 0.2552905595748801
roc_auc_score 0.9613076923076923 0.9618829709454473
averag

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


matthews_corrcoef 0.7251491888941517 0.6561664944867404 0.7251491888941517 0.7312322788798225 0.35000000000000003
log_loss 0.08936455351083333 0.09757047610645972 0.09203396371918841
roc_auc_score 0.9610769230769232 0.9626310765617659 0.9576923076923077
average_precision_score 0.7757950101877162 0.7890625601162011 0.758585625992195
accuracy_score 0.8821292775665399 0.8813358097114793 0.9790874524714829 0.9742962518355776 0.91
balanced_accuracy_score 0.8468461538461538 0.8817444625914246 0.879 0.8646628845869959 0.29
cohen_kappa_score 0.3561048807455378 0.40589639923394794 0.7337811723566762 0.7018911591351833 0.91
f1_score 0.40384615384615385 0.45548636312457236 0.744186046511628 0.7144714219818006 0.91


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


matthews_corrcoef 0.42310898355047466 0.4778614572799835 0.7518326869207617 0.7270715961034643 0.91
log_loss 0.2554812209422345 0.2552905595748801 0.258477311412552
roc_auc_score 0.9613076923076923 0.9618829709454473 0.9566153846153846
average_precision_score 0.7860861866549161 0.7859703615202439 0.7766214965623943
------ Oversampling methods ------
Technique: RandomOverSampler
X_resampled=(1000, 2), y_resampled=0.5
accuracy_score 0.8821292775665399 0.878368685896347
balanced_accuracy_score 0.8468461538461538 0.8769070073810642
cohen_kappa_score 0.3561048807455378 0.3968983413323913
f1_score 0.40384615384615385 0.4474133273879451
matthews_corrcoef 0.42310898355047466 0.46926078902318197
log_loss 0.2517942761438466 0.2523349171788454
roc_auc_score 0.9603846153846154 0.9604503643561699
average_precision_score 0.7877516877681922 0.7821651826096271
Technique: SMOTE
X_resampled=(1000, 2), y_resampled=0.5
accuracy_score 0.8954372623574145 0.8896086492036708
balanced_accuracy_score 0.83561538

In [5]:
# Column labels, one per table in dfs, in the order the strategies were run
keys = ['None', 'Weights', 'Th-d', 'Th-d + W', 'RandOS', 'SMOTE', 'ADASYN', 'RandUS', 'NM1', 'NM2', 'TLinks', 'ENNs']
# Take the 'test' row of every report and place them side by side:
# rows are metrics, columns are strategies
d = pd.concat([df.T['test'] for df in dfs], keys=keys, axis=1).round(3)
# Negate log_loss so that, like every other row, larger means better and
# highlight_max marks the best strategy
d.loc['log_loss', :] = -d.loc['log_loss', :]
d = d.rename(index={'log_loss': '$-$log_loss'})

# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its replacement
d.style.format(precision=3).highlight_max(axis=1, color='#FFDDDD')

Unnamed: 0,None,Weights,Th-d,Th-d + W,RandOS,SMOTE,ADASYN,RandUS,NM1,NM2,TLinks,ENNs
accuracy_score,0.969,0.881,0.975,0.974,0.878,0.89,0.901,0.851,0.913,0.855,0.969,0.969
balanced_accuracy_score,0.723,0.882,0.862,0.865,0.877,0.873,0.882,0.87,0.798,0.77,0.723,0.723
cohen_kappa_score,0.602,0.406,0.699,0.702,0.397,0.418,0.45,0.346,0.421,0.283,0.602,0.602
f1_score,0.616,0.455,0.715,0.714,0.447,0.466,0.494,0.403,0.463,0.343,0.616,0.616
matthews_corrcoef,0.656,0.478,0.731,0.727,0.469,0.481,0.508,0.432,0.446,0.335,0.656,0.656
$-$log_loss,-0.098,-0.255,-0.098,-0.255,-0.252,-0.229,-0.211,-0.341,-0.344,-0.38,-0.098,-0.098
roc_auc_score,0.963,0.962,0.963,0.962,0.96,0.959,0.963,0.96,0.915,0.871,0.963,0.963
average_precision_score,0.789,0.786,0.789,0.786,0.782,0.78,0.789,0.788,0.636,0.439,0.789,0.789
