# Bias Buccaneers Image Recognition Challenge: Unsupervised Solution

In [1]:
#import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PIL.Image as Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


## Prepare the data

### Data Pre-processing

In [3]:
# Root folder of the challenge data; the test labels are read from
# test/labels.csv underneath it.
dataset_path = "/kaggle/input/bias-buccaneers"
labels_csv = f"{dataset_path}/test/labels.csv"
test_df = pd.read_csv(labels_csv)

In [4]:
# Every column after the first one is treated as a category to score.
categories = list(test_df.columns[1:])
print(categories)

['skin_tone', 'gender', 'age']


In [5]:
# Label vocabularies for the three categories (ten skin-tone levels, two
# genders, four age brackets).
skin_tone_labels = ["monk_" + str(level) for level in range(1, 11)]
gender_labels = [
    "male",
    "female",
]
age_labels = [
    "0_17",
    "18_30",
    "31_60",
    "61_100",
]

In [6]:
#encode test samples
# Encode each label column as integer codes: a label's code is its position in
# the matching *_labels list.
#
# The encoded column is assigned back explicitly. The previous form,
# `test_df[col].replace(..., inplace=True)`, is a chained in-place update:
# pandas warns about it and, with Copy-on-Write enabled, it leaves `test_df`
# untouched, so the labels would silently stay as strings.
#
# Values that are not in the vocabulary (including already-encoded integers
# when this cell is re-run) pass through unchanged, as they did with `replace`.
for column, vocabulary in (
    ('skin_tone', skin_tone_labels),
    ('gender', gender_labels),
    ('age', age_labels),
):
    codes = {label: code for code, label in enumerate(vocabulary)}
    test_df[column] = test_df[column].map(lambda value: codes.get(value, value))

### Building the Dataset

In [7]:
class ImageDataset(Dataset):
    """Dataset yielding ``(image, (skin_tone, gender, age))`` per row of a labels frame.

    Args:
        df: DataFrame with a ``name`` column (image file name) and
            ``skin_tone``, ``gender`` and ``age`` label columns.
        data_path: Prefix prepended verbatim to each file name, so it must
            already end with a path separator.
        image_transform: Optional callable applied to the opened PIL image;
            skipped when falsy.
    """

    def __init__(self, df, data_path, image_transform):
        self.df = df
        self.data_path = data_path
        self.image_transform = image_transform

    def __len__(self):
        """Number of rows in the labels frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the (optionally transformed) image and label triple at position ``index``."""
        # Rows are addressed by position (.iloc). Label-based lookup
        # (`self.df['name'][index]`) only works when the frame's index happens
        # to be 0..n-1 and breaks for filtered or re-indexed frames.
        file_name = self.df['name'].iloc[index]
        img = Image.open(f"{self.data_path}{file_name}")
        if self.image_transform:
            img = self.image_transform(img)

        # Labels are returned in the fixed order: skin_tone, gender, age.
        labels = (
            self.df['skin_tone'].iloc[index],
            self.df['gender'].iloc[index],
            self.df['age'].iloc[index],
        )
        return img, labels

## Helper functions

In [8]:
def extract_features(dataset:torch.utils.data.Dataset, feature_extractor:torch.nn.Module, device=None):
    """Run every sample of ``dataset`` through ``feature_extractor`` and stack the results.

    Args:
        dataset: Indexable collection whose items are ``(image_tensor, labels)``;
            only the image tensor is used.
        feature_extractor: Module applied to each image after a leading batch
            dimension of size 1 has been added.
        device: Device the inputs are moved to. Defaults to the device of the
            extractor's first parameter, or the CPU when it has no parameters,
            so the function no longer relies on a notebook-level ``device``.

    Returns:
        ``np.ndarray`` with one flattened feature vector per sample, in
        dataset order.
    """
    if device is None:
        first_param = next(feature_extractor.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    features = []
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for i in range(len(dataset)):
            img, _ = dataset[i]
            out = feature_extractor(img.unsqueeze(0).to(device))
            # Flatten to 1-D so each sample contributes a single feature row.
            features.append(torch.reshape(out, (-1,)).detach().cpu().numpy())
    return np.array(features)

In [9]:
def make_predictions(data_features, model):
    """Return whatever ``model.predict`` produces for ``data_features``."""
    return model.predict(data_features)

In [10]:
# K-Means knows nothing about our labels, so its cluster ids must be mapped to our label codes before they can be scored

# mapping labels from cluster to original labels
def get_reference_dict(clusters,data_label):
    """Map each cluster id to the most frequent true label among its members.

    Args:
        clusters: 1-D array-like of cluster ids, one per sample.
        data_label: 1-D array-like of non-negative integer labels aligned
            with ``clusters`` (a requirement of ``np.bincount``).

    Returns:
        dict ``{cluster_id: majority_label}`` with one entry for every
        cluster id that occurs in ``clusters``.
    """
    clusters = np.asarray(clusters)
    data_label = np.asarray(data_label)
    reference_label = {}
    # Walk the ids that actually occur. Assuming ids are exactly 0..k-1 fails
    # as soon as one id is absent: its empty member set has no argmax, and the
    # larger ids never get an entry.
    for cluster_id in np.unique(clusters):
        members = data_label[clusters == cluster_id]
        reference_label[cluster_id.item()] = np.bincount(members).argmax()
    return reference_label

# Mapping predictions to original labels
def get_labels(clusters,refernce_labels):
    """Translate cluster ids into labels using the supplied mapping.

    Args:
        clusters: Iterable of cluster ids, one per sample.
        refernce_labels: dict ``{cluster_id: label}``. The misspelled
            parameter name is kept so existing keyword callers keep working.

    Returns:
        1-D float ``np.ndarray`` holding the mapped label of every sample.
    """
    # Read the mapping from the argument. The body used to look up a global
    # named `reference_labels`, so the value passed in was silently ignored.
    return np.array(
        [refernce_labels[cluster_id] for cluster_id in clusters],
        dtype=float,
    )

## Extract features using AlexNet

In [11]:
from torchvision.models import resnet18

In [12]:
class Feature_Extractor(nn.Module):
    """Pretrained torchvision AlexNet with its classifier replaced by an identity layer.

    ``forward`` collapses the backbone output into one 1-D tensor. Every
    dimension is flattened, the batch dimension included.
    """

    def __init__(self):
        super().__init__()
        backbone = torchvision.models.alexnet(pretrained=True)
        # Swap the classifier for a pass-through so the model returns what
        # reaches the classifier instead of class scores.
        backbone.classifier = nn.Identity()
        self.model = backbone

    def forward(self, x):
        """Return the backbone output for ``x`` as a flat 1-D tensor."""
        return self.model(x).reshape(-1)

In [13]:
# Build the extractor, then move it to the selected device.
feature_extractor = Feature_Extractor()
feature_extractor = feature_extractor.to(device)

Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth


  0%|          | 0.00/233M [00:00<?, ?B/s]

## Load the Model

In [14]:
import pickle

In [15]:
model_path = "/kaggle/input/savedmodels/saved models"


def _load_pickled_model(path):
    """Unpickle the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point this at model files you created or trust.
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


# One saved model per category, all stored under model_path.
skintone_model = _load_pickled_model(f"{model_path}/skintone_model.pkl")
gender_model = _load_pickled_model(f"{model_path}/gender_model.pkl")
age_model = _load_pickled_model(f"{model_path}/age_model.pkl")

## Test the Model

In [16]:
# Convert each image to a tensor, then normalise its three channels.
# NOTE(review): the constants look like the usual ImageNet mean/std - confirm.
test_image_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

In [17]:
# dataset_path is already absolute, so no extra leading slash is added
# (the old prefix produced "//kaggle/...").
test_data = ImageDataset(test_df, f"{dataset_path}/test/", test_image_transform)

In [18]:
# One feature row per test image, in dataset order.
test_features = extract_features(
    dataset=test_data,
    feature_extractor=feature_extractor,
)

In [19]:
# Cluster id of every test sample according to the skin-tone model.
clusters = make_predictions(test_features, skintone_model)

# Encoded skin-tone labels as an integer array.
data_label = np.array([int(value) for value in test_df['skin_tone'].tolist()])

# NOTE(review): the cluster -> label mapping is derived from the same test
# labels that the predictions are scored against later.
reference_labels = get_reference_dict(clusters, data_label)
skintone_predictions = get_labels(clusters, reference_labels)

In [20]:
# Cluster id of every test sample according to the gender model.
clusters = make_predictions(test_features, gender_model)

# Encoded gender labels as an integer array.
data_label = np.array([int(value) for value in test_df['gender'].tolist()])

# Map each cluster to its majority label, then relabel the samples.
reference_labels = get_reference_dict(clusters, data_label)
gender_predictions = get_labels(clusters, reference_labels)

In [21]:
# Cluster id of every test sample according to the age model.
clusters = make_predictions(test_features, age_model)

# Encoded age labels as an integer array.
data_label = np.array([int(value) for value in test_df['age'].tolist()])

# Map each cluster to its majority label, then relabel the samples.
reference_labels = get_reference_dict(clusters, data_label)
age_predictions = get_labels(clusters, reference_labels)

In [22]:
# Scoring pairs predictions[k] with categories[k], so the order matters.
predictions = [
    skintone_predictions,
    gender_predictions,
    age_predictions,
]

## Prepare Submission

In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [24]:
# calculate accuracy
# Overall accuracy per category; predictions[k] is scored against categories[k].
acc = {
    categories[k]: accuracy_score(test_df[categories[k]], predictions[k])
    for k in range(3)
}

# calculate disparity
def disparity_score(ytrue, ypred):
    """Gap between the best and worst per-class accuracy.

    Per-class accuracy is the confusion-matrix diagonal divided by the number
    of true samples of that class (row sum).

    Args:
        ytrue: True labels.
        ypred: Predicted labels, aligned with ``ytrue``.

    Returns:
        ``max - min`` of the per-class accuracies over every class that has
        at least one true sample.
    """
    cm = confusion_matrix(ytrue, ypred)
    support = cm.sum(axis=1)
    # A label that occurs only in ypred has a zero row sum. Dividing by it
    # gives NaN, and max()/min() over values containing NaN depend on element
    # order, so such classes are left out.
    has_support = support > 0
    per_class_acc = cm.diagonal()[has_support] / support[has_support]
    return per_class_acc.max() - per_class_acc.min()

# Disparity per category, using the same categories[k] / predictions[k] pairing.
disp = {
    categories[k]: disparity_score(test_df[categories[k]], predictions[k])
    for k in range(3)
}

# Both metric tables together; the trailing expression displays them.
results = {'accuracy': acc, 'disparity': disp}
results

{'accuracy': {'skin_tone': 0.20633333333333334,
  'gender': 0.618,
  'age': 0.4866666666666667},
 'disparity': {'skin_tone': 0.6567944250871081,
  'gender': 0.35196150224918926,
  'age': 0.8885813148788927}}

In [25]:
def getScore(results):
    """Combine per-category accuracy and disparity into one score.

    Each category contributes ``weight * accuracy * (1 - disparity ** power)``
    with (weight, power) of (2, 1) for gender, (4, 2) for age and (10, 5) for
    skin_tone; the three terms are added in that order.

    Args:
        results: dict with ``'accuracy'`` and ``'disparity'`` sub-dicts, each
            keyed by ``'gender'``, ``'age'`` and ``'skin_tone'``.
    """
    accuracy = results['accuracy']
    disparity = results['disparity']

    gender_term = 2*accuracy['gender']*(1-disparity['gender'])
    age_term = 4*accuracy['age']*(1-disparity['age']**2)
    skin_tone_term = 10*accuracy['skin_tone']*(1-disparity['skin_tone']**5)

    return gender_term + age_term + skin_tone_term

title = 'Unsupervised Submission'

# Submission payload: name, aggregate score, then the raw metric tables.
submission = {}
submission['submission_name'] = title
submission['score'] = getScore(results)
submission['metrics'] = results
submission

{'submission_name': 'Unsupervised Submission',
 'score': 3.02174932812456,
 'metrics': {'accuracy': {'skin_tone': 0.20633333333333334,
   'gender': 0.618,
   'age': 0.4866666666666667},
  'disparity': {'skin_tone': 0.6567944250871081,
   'gender': 0.35196150224918926,
   'age': 0.8885813148788927}}}

In [26]:
import json
# Serialise the submission as 4-space-indented JSON into submission.json.
with open("submission.json", "w") as out_file:
    out_file.write(json.dumps(submission, indent=4))

---