In [1]:
import pandas as pd
import numpy as np
from sklearn import (metrics, linear_model, datasets,
                     model_selection, preprocessing, dummy)
# Fetch the Labeled Faces in the Wild images (min_faces_per_person=20) and
# shrink them heavily (resize=0.04) so each face becomes a handful of features.
lfw_people = datasets.fetch_lfw_people(min_faces_per_person=20, resize=0.04)
data = pd.DataFrame(lfw_people.data)
# Capture the feature column labels now, before label columns are appended below.
feature_columns=data.columns
# Series of class names; its default integer index is what the merge below
# joins against the 'target' column.
names=pd.Series(lfw_people.target_names, name='class_name')
# Keep only the three people this notebook studies.
names = names[names.isin(['George W Bush', 'Hugo Chavez',  'John Ashcroft'])]
data['target'] = lfw_people['target']
# merge() defaults to an inner join, so rows whose target is not one of the
# three retained names are dropped here.
data = data.merge(names, left_on='target', right_index=True)

In [2]:
# Fit an encoder that maps the remaining class names to integer labels.
le = preprocessing.LabelEncoder().fit(data.class_name)
# Feature matrix: only the original feature columns, i.e. without the
# 'target' and 'class_name' columns that were added afterwards.
X = np.array(data[feature_columns])
y = le.transform(data.class_name)
# Show both shapes as a quick sanity check.
X.shape, y.shape

((654, 15), (654,))

In [3]:
# Map each encoded label, as a string, to its class name; classif_report reads
# this global to relabel its per-class columns.
name_dict = {str(i):c for i,c in enumerate(le.classes_)}
print(name_dict)
# Class prevalence: fraction of samples carrying each encoded label.
pd.Series(y).value_counts(normalize=True).sort_index()

{'0': 'George W Bush', '1': 'Hugo Chavez', '2': 'John Ashcroft'}


0    0.810398
1    0.108563
2    0.081040
dtype: float64

In [4]:
# Half of the samples go to the test set; stratify=y keeps the class
# proportions the same in both halves, and the fixed random_state makes the
# split reproducible.
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X, y, 
            test_size=0.5, stratify=y, random_state=12345)

In [5]:
def classif_report(y, y_pred):
    """Return a one-row DataFrame holding the recall figures for a prediction.

    Parameters
    ----------
    y : true labels.
    y_pred : predicted labels.

    Returns
    -------
    pandas.DataFrame
        The single 'recall' row of scikit-learn's classification report.
        Encoded class labels are renamed through the module-level
        ``name_dict``; the 'macro avg' and 'weighted avg' columns are
        relabelled to spell out that, for recall, they equal balanced
        accuracy and plain accuracy respectively.
    """
    report = metrics.classification_report(
        y, y_pred, output_dict=True, zero_division=0)

    # Column relabelling: the two averages first, then the class names.
    column_labels = {'macro avg':'macro avg = bal accur',
                     'weighted avg':'wtd avg = accur'} | name_dict

    table = pd.DataFrame(report).rename_axis(columns='class')
    recall_only = table.loc[['recall'],:]
    return recall_only.rename(columns=column_labels)

## First, dummy classifier which always chooses the majority class

In [6]:
# Baseline that always predicts the most frequent training class. The strategy
# is spelled out because DummyClassifier's default differs between
# scikit-learn releases (older ones defaulted to 'stratified', which predicts
# at random and would not match the "always the majority class" baseline).
classif_report(y_te, dummy.DummyClassifier(strategy='most_frequent')
               .fit(X_tr, y_tr).predict(X_te))

class,George W Bush,Hugo Chavez,John Ashcroft,accuracy,macro avg = bal accur,wtd avg = accur
recall,1.0,0.0,0.0,0.810398,0.333333,0.810398


**We already get 81% accuracy just from the dummy classifier, which always chooses Bush.**

## Now original model with original imbalanced data

In [7]:
%%time
# Iteration cap for the solver; reused by the later LogisticRegression cell.
MAX_ITER=10000
# Unpenalized logistic regression on the original, imbalanced training data.
# C=np.inf turns the regularization off. The string penalty='none' used
# previously was deprecated in scikit-learn 1.2 and is rejected from 1.4 on,
# whereas an infinite C is accepted by old and new releases alike.
orig_model = linear_model.LogisticRegression(C=np.inf,
               max_iter=MAX_ITER).fit(X_tr, y_tr)
classif_report(y_te, orig_model.predict(X_te))

CPU times: total: 844 ms
Wall time: 833 ms


class,George W Bush,Hugo Chavez,John Ashcroft,accuracy,macro avg = bal accur,wtd avg = accur
recall,0.962264,0.277778,0.269231,0.831804,0.503091,0.831804


**83% accuracy is only slightly better than the dummy classifier.  Balanced accuracy is still terrible at 50%.**

## Using BenefitRebalancingClassifier (instead of PrevalenceAdjustingClassifier, which was renamed from RebalancingClassifier)

In [8]:
%%time
import rebalancing # downloaded rebalancing.py from link above
# Wrap the already-fitted orig_model rather than training a new estimator;
# benefit_per_class is not passed, so the wrapper's default applies.
model = rebalancing.BenefitRebalancingClassifier(orig_model).fit(X_tr, y_tr)
classif_report(y_te, model.predict(X_te))

CPU times: total: 15.6 ms
Wall time: 22.1 ms


class,George W Bush,Hugo Chavez,John Ashcroft,accuracy,macro avg = bal accur,wtd avg = accur
recall,0.686792,0.75,0.807692,0.703364,0.748162,0.703364


**Balanced accuracy went from .50 to .75 just above!  This time it's even better than class_weight='balanced' down below, though it won't always be.**

## Now just testing explicit uniform benefit_per_class *list* with arbitrary overall scale to get same result as just above:

In [9]:
%%time
# Same benefit for every class, given as an explicit list with a deliberately
# tiny scale; the recall row is expected to match the default-benefit run,
# i.e. only the ratios between the entries should matter.
model = rebalancing.BenefitRebalancingClassifier(orig_model, 
  benefit_per_class=[7e-6, 7e-6, 7e-6]).fit(X_tr, y_tr)
classif_report(y_te, model.predict(X_te))

CPU times: total: 15.6 ms
Wall time: 15.6 ms


class,George W Bush,Hugo Chavez,John Ashcroft,accuracy,macro avg = bal accur,wtd avg = accur
recall,0.686792,0.75,0.807692,0.703364,0.748162,0.703364


## Now try assigning a little more benefit per class to classifying majority class Bush correctly:

In [10]:
%%time
# Slightly larger benefit for the first entry than for the other two.
# NOTE(review): presumably the list follows the encoded label order
# (0 = George W Bush, the majority class) - confirm against rebalancing.py.
model = rebalancing.BenefitRebalancingClassifier(orig_model, 
  benefit_per_class=[9, 7, 7]).fit(X_tr, y_tr)
classif_report(y_te, model.predict(X_te))

CPU times: total: 0 ns
Wall time: 0 ns


class,George W Bush,Hugo Chavez,John Ashcroft,accuracy,macro avg = bal accur,wtd avg = accur
recall,0.750943,0.75,0.769231,0.752294,0.756725,0.752294


## Now fully re-train the model with class_weight='balanced' for comparison (very similar to randomly oversampling the training set):

In [11]:
%%time
# Full re-train with class_weight='balanced' as the comparison point.
# C=np.inf keeps the fit unpenalized. The string penalty='none' used
# previously was deprecated in scikit-learn 1.2 and is rejected from 1.4 on,
# whereas an infinite C is accepted by old and new releases alike.
model = linear_model.LogisticRegression(C=np.inf,
          class_weight='balanced', max_iter=MAX_ITER).fit(X_tr, y_tr)
classif_report(y_te, model.predict(X_te))

CPU times: total: 484 ms
Wall time: 486 ms


class,George W Bush,Hugo Chavez,John Ashcroft,accuracy,macro avg = bal accur,wtd avg = accur
recall,0.686792,0.75,0.730769,0.697248,0.722521,0.697248


**Balanced accuracy went from original .50 to .72 just above.  I don't care about the precision or F1.**