In [None]:
%matplotlib inline

print('Loading libraries... Please wait.')

from IPython.display import display, clear_output
import ipywidgets as widgets
import random
import sys
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.calibration import calibration_curve

from esper.prelude import *
from esper.stdlib import *
import esper.face_embeddings as face_embeddings

# Gender Validation
How good is our gender classifier? How accurate are the scores?

We have two sources of ground truth for gender labels: manual gender labels and genders from identities.

## Confusion matrix from gender hand labels

In [None]:
# Name of the labeler whose FaceGender rows are used as hand-labeled ground truth.
HANDLABELER_NAME = 'handlabeled-gender-validation'
# Name of the labeler whose FaceGender rows (with probabilities) are being evaluated.
MODEL_LABELER_NAME = 'rudecarnie'


def print_gender_validation_stats(normalize=False, threshold=0.5):
    """Plot the model-vs-hand-label gender confusion matrix and print accuracy.

    Hand labels are read from the FaceGender rows of HANDLABELER_NAME and the
    predictions from the rows of MODEL_LABELER_NAME for the same faces.

    Args:
        normalize: If True, each row of the matrix is divided by its row total
            so cells show fractions of the hand-labeled class instead of counts.
        threshold: Minimum male probability for a face to be predicted male;
            everything below is predicted female.

    Notes:
        The matrix rows/columns are pinned to (Male, Female) so they always
        match the tick labels, whatever the numeric gender ids are. Faces hand
        labeled with any other gender are left out of the matrix but still
        count towards the printed accuracy.
    """
    labeler = Labeler.objects.get(name=HANDLABELER_NAME)
    hand_face_genders = {
        fg['face__id']: fg['gender__id']
        for fg in FaceGender.objects.filter(
            labeler=labeler
        ).values('face__id', 'gender__id')
    }
    gender_id_dict = {g.name: g.id for g in Gender.objects.all()}
    male_id = gender_id_dict['M']
    female_id = gender_id_dict['F']
    male_count = sum(1 for g in hand_face_genders.values() if g == male_id)
    female_count = sum(1 for g in hand_face_genders.values() if g == female_id)
    print('{} faces have been hand-labeled ({} male, {} female)'.format(
          len(hand_face_genders), male_count, female_count))

    y_pred = []
    y_truth = []
    for fg in FaceGender.objects.filter(
        face__id__in=list(hand_face_genders.keys()),
        labeler__name=MODEL_LABELER_NAME
    ).values('face__id', 'gender__id', 'probability'):
        # 'probability' is treated as the confidence in the stored gender, so
        # it is flipped when the stored gender is not male.
        if fg['gender__id'] == male_id:
            male_probability = fg['probability']
        else:
            male_probability = 1 - fg['probability']
        y_pred.append(male_id if male_probability >= threshold else female_id)
        y_truth.append(hand_face_genders[fg['face__id']])

    # Without explicit labels sklearn orders classes by sorted id, which would
    # silently swap the axes whenever the female id sorts before the male id.
    cm = confusion_matrix(y_truth, y_pred, labels=[male_id, female_id])

    if normalize:
        # Rows with no samples stay at 0 instead of becoming NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    classes = ['Male', 'Female']
    plt.figure(figsize=(5, 5))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Reds)
    plt.title('Gender confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    text_color_cutoff = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > text_color_cutoff else "black")

    plt.ylabel('Hand label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

    print('Overall accuracy: {:0.2f}'.format(
          accuracy_score(y_truth, y_pred)))

In [None]:
# Raw counts first, then row-normalized; each at the default 0.5 cut-off and
# at a stricter 0.8 cut-off for predicting male.
for normalize_rows in (False, True):
    for male_threshold in (0.5, 0.8):
        print_gender_validation_stats(normalize=normalize_rows,
                                      threshold=male_threshold)

## ROC curves from gender hand labels (male and female)

In [None]:
def plot_gender_roc_curve(gender='M'):
    """Plot the ROC curve of the model's scores for one gender.

    The hand labels of HANDLABELER_NAME are the ground truth and the
    probabilities stored by MODEL_LABELER_NAME are the scores; `gender` is the
    Gender name treated as the positive class.
    """
    hand_labeler = Labeler.objects.get(name=HANDLABELER_NAME)
    hand_rows = FaceGender.objects.filter(
        labeler=hand_labeler
    ).values('face__id', 'gender__id')
    truth_by_face = {row['face__id']: row['gender__id'] for row in hand_rows}
    name_to_id = {g.name: g.id for g in Gender.objects.all()}

    n_male = sum(
        1 for gid in truth_by_face.values() if gid == name_to_id['M'])
    n_female = sum(
        1 for gid in truth_by_face.values() if gid == name_to_id['F'])
    print('{} faces have been hand-labeled ({} male, {} female)'.format(
          len(truth_by_face), n_male, n_female))

    model_rows = FaceGender.objects.filter(
        face__id__in=list(truth_by_face.keys()),
        labeler__name=MODEL_LABELER_NAME
    ).values('face__id', 'gender__id', 'probability')

    scores, targets = [], []
    for row in model_rows:
        # The stored probability belongs to the stored gender; flip it when
        # that gender is not the positive class.
        stored_is_positive = row['gender__id'] == name_to_id[gender]
        scores.append(
            row['probability'] if stored_is_positive
            else 1 - row['probability'])
        targets.append(
            int(truth_by_face[row['face__id']] == name_to_id[gender]))

    fpr, tpr, _ = roc_curve(targets, scores)
    roc_auc = auc(fpr, tpr)

    line_width = 2
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=line_width,
             label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC for gender classifier')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# One ROC curve per gender, each taking that gender as the positive class.
for target_gender in ('M', 'F'):
    plot_gender_roc_curve(gender=target_gender)

## Calibration Curves

In [None]:
def plot_calibration_curve(gender='M', bins=10):
    """Plot a calibration (reliability) curve of the model's gender scores.

    Args:
        gender: Gender name treated as the positive class.
        bins: Number of probability bins passed to sklearn's calibration_curve.

    The hand labels of HANDLABELER_NAME are the ground truth; the scores come
    from the probabilities stored by MODEL_LABELER_NAME for the same faces.
    """
    labeler = Labeler.objects.get(name=HANDLABELER_NAME)
    hand_face_genders = {
        fg['face__id']: fg['gender__id']
        for fg in FaceGender.objects.filter(
            labeler=labeler
        ).values('face__id', 'gender__id')
    }
    gender_id_dict = {g.name: g.id for g in Gender.objects.all()}
    male_count = sum((
        1 for g in hand_face_genders.values() if g == gender_id_dict['M']
    ))
    female_count = sum((
        1 for g in hand_face_genders.values() if g == gender_id_dict['F']
    ))
    print('{} faces have been hand-labeled ({} male, {} female)'.format(
          len(hand_face_genders), male_count, female_count))

    y_score = []
    y_truth = []
    for fg in FaceGender.objects.filter(
        face__id__in=list(hand_face_genders.keys()),
        labeler__name=MODEL_LABELER_NAME
    ).values('face__id', 'gender__id', 'probability'):
        # The stored probability belongs to the stored gender; flip it when
        # that gender is not the positive class.
        if fg['gender__id'] == gender_id_dict[gender]:
            y_score.append(fg['probability'])
        else:
            y_score.append(1-fg['probability'])
        y_truth.append(1 if hand_face_genders[fg['face__id']] == gender_id_dict[gender] else 0)

    fraction_of_positives, mean_predicted_values = calibration_curve(y_truth, y_score, n_bins=bins)

    plt.figure()
    lw = 2
    plt.plot(mean_predicted_values, fraction_of_positives, 's-', color='darkorange',
             lw=lw, label='Calibration curve')
    # Diagonal = perfectly calibrated scores.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Mean predicted value')
    # The y values are per-bin fractions of positive hand labels, not counts.
    plt.ylabel('Fraction of positives')
    plt.title('Calibration curve for gender classifier')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# One 25-bin calibration curve per gender.
for target_gender in ('M', 'F'):
    plot_calibration_curve(gender=target_gender, bins=25)

## Train a logistic classifier on the output of the gender classifier

In [None]:
# Seed for shuffling the scored faces. The cells below split y_score/y_truth
# positionally into train and test, so a fixed seed keeps that split (and
# every number derived from it) identical across kernel restarts.
SPLIT_SEED = 0

labeler = Labeler.objects.get(name=HANDLABELER_NAME)
hand_face_genders = {
    fg['face__id']: fg['gender__id']
    for fg in FaceGender.objects.filter(
        labeler=labeler
    ).values('face__id', 'gender__id')
}
gender_id_dict = {g.name: g.id for g in Gender.objects.all()}
male_count = sum((
    1 for g in hand_face_genders.values() if g == gender_id_dict['M']
))
female_count = sum((
    1 for g in hand_face_genders.values() if g == gender_id_dict['F']
))
print('{} faces have been hand-labeled ({} male, {} female)'.format(
      len(hand_face_genders), male_count, female_count))

# Fetch in a deterministic order and shuffle locally with a seeded RNG rather
# than asking the database for a random order, which cannot be reproduced.
model_face_genders = list(FaceGender.objects.filter(
    face__id__in=list(hand_face_genders.keys()),
    labeler__name=MODEL_LABELER_NAME
).order_by('face__id', 'pk').values('face__id', 'gender__id', 'probability'))
random.Random(SPLIT_SEED).shuffle(model_face_genders)

# y_score: model probability that the face is male.
# y_truth: 1 if the hand label is male, else 0.
y_score = []
y_truth = []
for fg in model_face_genders:
    if fg['gender__id'] == gender_id_dict['M']:
        y_score.append(fg['probability'])
    else:
        y_score.append(1-fg['probability'])
    y_truth.append(1 if hand_face_genders[fg['face__id']] == gender_id_dict['M'] else 0)

In [None]:
# Positional split: the first `training_size` samples train the calibrators,
# the remainder is held out for evaluation.
training_size = 5000
train_part = slice(None, training_size)
test_part = slice(training_size, None)

x_train, y_train = np.array(y_score[train_part]), np.array(y_truth[train_part])
x_test, y_test = np.array(y_score[test_part]), np.array(y_truth[test_part])

In [None]:
from sklearn.linear_model import LogisticRegression as LR

# Fit a one-feature logistic regression that maps the raw model score to the
# hand-labeled outcome.
lr = LR()
lr.fit(x_train.reshape(-1, 1), y_train)     # LR needs X to be 2-dimensional
# The calibration plot in the next cell reads y_calibrated, so it has to be
# computed here; column 1 of predict_proba is the probability of class 1.
y_calibrated = lr.predict_proba(x_test.reshape(-1, 1))[:, 1]

In [None]:
# Calibration curve of y_calibrated against the held-out labels y_test.
fraction_of_positives, mean_predicted_values = calibration_curve(y_test, y_calibrated, n_bins=25)

plt.figure()
lw = 2
plt.plot(mean_predicted_values, fraction_of_positives, 's-', color='darkorange',
         lw=lw, label='Calibration curve')
# Diagonal = perfectly calibrated scores.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Mean predicted value')
# The y values are per-bin fractions of positive labels, not counts.
plt.ylabel('Fraction of positives')
plt.title('Calibration curve for gender classifier')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Learned weight(s) of the fitted logistic regression `lr`.
print(lr.coef_)

In [None]:
# Learned bias term of the fitted logistic regression `lr`.
print(lr.intercept_)

In [None]:
# Score of the logistic model on the samples it was fitted on.
lr.score(x_train[:, np.newaxis], y_train)

In [None]:
# Score of the logistic model on the held-out samples.
lr.score(x_test[:, np.newaxis], y_test)

In [None]:
# Probe the calibrator with a single raw score of 0.001 and show the
# probability it assigns to class 1.
lr.predict_proba(np.array([[.001]]))[:, 1]

## Linear Regression on output of gender classifier

In [None]:
from sklearn.linear_model import LinearRegression as LinR

# Least-squares line from raw model score to the 0/1 hand label.
linr = LinR()
linr.fit(x_train[:, np.newaxis], y_train)  # fit expects a 2-D feature matrix
slope, intercept = linr.coef_[0], linr.intercept_
print(slope, intercept)
# Apply the fitted line to every held-out score.
y_calibrated = list(slope * x_test + intercept)

In [None]:
# Calibration curve of y_calibrated against the held-out labels y_test.
# A linear fit is unbounded, so its outputs can fall slightly outside [0, 1];
# calibration_curve only accepts probabilities, hence the clip.
fraction_of_positives, mean_predicted_values = calibration_curve(
    y_test, np.clip(y_calibrated, 0.0, 1.0), n_bins=25)

plt.figure()
lw = 2
plt.plot(mean_predicted_values, fraction_of_positives, 's-', color='darkorange',
         lw=lw, label='Calibration curve')
# Diagonal = perfectly calibrated scores.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Mean predicted value')
# The y values are per-bin fractions of positive labels, not counts.
plt.ylabel('Fraction of positives')
plt.title('Calibration curve for gender classifier')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Score of the linear model on the held-out samples (for a regressor this is
# the coefficient of determination R^2, not a classification accuracy).
linr.score(x_test[:, np.newaxis], y_test)

## KNN on labeled dataset

In [None]:
import random

# Seed for the k-NN train/test shuffle so the split is the same on every run.
KNN_SHUFFLE_SEED = 0

# Sort first: the dict's key order follows whatever order the database
# returned the rows in, so a seeded shuffle alone would not be reproducible.
face_ids = sorted(hand_face_genders.keys())
random.Random(KNN_SHUFFLE_SEED).shuffle(face_ids)

In [None]:
# The first 10000 shuffled faces train the k-NN model; the rest are held out.
knn_training_size = 10000
knn_training_ids, knn_test_ids = (
    face_ids[:knn_training_size],
    face_ids[knn_training_size:],
)

In [None]:
import esper.face_embeddings

In [None]:
# Look up the face-embedding features for the training ids, then the test ids.
training_features, test_features = (
    face_embeddings.features(id_batch)
    for id_batch in (knn_training_ids, knn_test_ids)
)

In [None]:
def _male_indicator(face_id):
    """Return 1 when the face's hand label is male, otherwise 0."""
    return 1 if hand_face_genders[face_id] == gender_id_dict['M'] else 0


# Binary targets aligned one-to-one with the id lists.
training_ground_truth = list(map(_male_indicator, knn_training_ids))
test_ground_truth = list(map(_male_indicator, knn_test_ids))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier configured to use 10 neighbours.
neigh = KNeighborsClassifier(n_neighbors=10)

In [None]:
# Fit on the training embeddings with the 1 = male / 0 = not-male targets.
neigh.fit(training_features, training_ground_truth)

In [None]:
# Per-class probabilities for every held-out face; later cells read index 1
# of each row as the probability of the male class.
predicted = neigh.predict_proba(test_features)

In [None]:
# Peek at the class probabilities predicted for the first held-out face.
predicted[0]

In [None]:
# Calibration curve of the k-NN probability for class 1 against the held-out
# hand labels.
fraction_of_positives, mean_predicted_values = calibration_curve(
    test_ground_truth,
    [p[1] for p in predicted],
    n_bins=10)

plt.figure()
lw = 2
plt.plot(mean_predicted_values, fraction_of_positives, 's-', color='darkorange',
         lw=lw, label='Calibration curve')
# Diagonal = perfectly calibrated scores.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Mean predicted value')
# The y values are per-bin fractions of positive labels, not counts.
plt.ylabel('Fraction of positives')
plt.title('Calibration curve for gender classifier')
plt.legend(loc="lower right")
plt.show()

In [None]:
def show_confusion_matrix(y_truth, y_pred, normalize=False, labels=None,
                          classes=('Male', 'Female')):
    """Plot a confusion matrix and print the overall accuracy.

    Args:
        y_truth: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with y_truth.
        normalize: If True, divide each row by its total so cells show
            fractions of the true class instead of counts.
        labels: Optional explicit class order for the matrix rows/columns.
            When None, sklearn uses the sorted label values, so the caller
            must make sure that order matches `classes`.
        classes: Tick labels, in the same order as the matrix rows/columns.
    """
    cm = confusion_matrix(y_truth, y_pred, labels=labels)

    if normalize:
        # Rows with no samples stay at 0 instead of becoming NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    classes = list(classes)
    plt.figure(figsize=(5, 5))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Reds)
    plt.title('Gender confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    text_color_cutoff = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > text_color_cutoff else "black")

    plt.ylabel('Hand label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

    print('Overall accuracy: {:0.2f}'.format(
          accuracy_score(y_truth, y_pred)))

In [None]:
# show_confusion_matrix labels the first row/column 'Male' and the second
# 'Female', while the matrix itself is ordered by sorted label value. The
# ground truth uses 1 = male / 0 = not male, which would put the male class
# second and mislabel both axes, so re-encode with male as the smaller value.
MALE_CLASS, FEMALE_CLASS = 0, 1
show_confusion_matrix(
    [MALE_CLASS if is_male else FEMALE_CLASS for is_male in test_ground_truth],
    # A face is predicted male when its class-1 probability is at least 0.8.
    [MALE_CLASS if p[1] >= 0.8 else FEMALE_CLASS for p in predicted],
    normalize=True
)

In [None]:
# Distribution of the k-NN probabilities for class 1 over the held-out faces.
plt.hist(predicted[:, 1])