In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# from utkface_loader import parse_utkface_data

In [2]:
random.seed(0)

In [2]:
def parse_utkface_data(path):

    images, ages, genders, races = [], [], [], []

    for filename in sorted(os.listdir(path)):
        try:
            parts = filename.split('_')
            age = int(parts[0])
            gender = int(parts[1])
            race = int(parts[2])

            if age < 15:
                continue

            ages.append(age)
            genders.append(gender)
            races.append(race)
            images.append(Image.open(path + '/' + filename))

        except Exception as e:
            print(f"Error processing file: {filename} - {e}")
            continue

    images = pd.Series(list(images), name='image')
    ages = pd.Series(list(ages), name='age')
    genders = pd.Series(list(genders), name='gender')
    races = pd.Series(list(races), name='race')

    dataframe = pd.concat([images, ages, genders, races], axis=1)

    return dataframe

In [3]:
path = 'data/utkcropped'
data = parse_utkface_data(path)
print(data)

Error processing file: .DS_Store - invalid literal for int() with base 10: '.DS'
Error processing file: 39_1_20170116174525125.jpg.chip.jpg - invalid literal for int() with base 10: '20170116174525125.jpg.chip.jpg'
Error processing file: 61_1_20170109142408075.jpg.chip.jpg - invalid literal for int() with base 10: '20170109142408075.jpg.chip.jpg'
Error processing file: 61_1_20170109150557335.jpg.chip.jpg - invalid literal for int() with base 10: '20170109150557335.jpg.chip.jpg'
Error processing file: 61_3_20170109150557335.jpg.chip.jpg - invalid literal for int() with base 10: '20170109150557335.jpg.chip.jpg'
                                                   image  age  gender  race
0      <PIL.JpegImagePlugin.JpegImageFile image mode=...  100       0     0
1      <PIL.JpegImagePlugin.JpegImageFile image mode=...  100       0     0
2      <PIL.JpegImagePlugin.JpegImageFile image mode=...  100       1     0
3      <PIL.JpegImagePlugin.JpegImageFile image mode=...  100       1     0
4  

In [4]:
data = data.sample(frac=1, random_state=0).reset_index(drop=True)

In [5]:
data.head()

Unnamed: 0,image,age,gender,race
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,45,1,0
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,27,0,0
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,29,0,1
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,26,0,2
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,52,0,0


In [6]:
d_train, d_aux, d_test = data.iloc[:7000], data.iloc[7000:14000], data.iloc[14000:]

In [7]:
d_train.head()

Unnamed: 0,image,age,gender,race
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,45,1,0
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,27,0,0
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,29,0,1
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,26,0,2
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,52,0,0


In [8]:
d_aux.head()

Unnamed: 0,image,age,gender,race
7000,<PIL.JpegImagePlugin.JpegImageFile image mode=...,21,1,2
7001,<PIL.JpegImagePlugin.JpegImageFile image mode=...,43,1,0
7002,<PIL.JpegImagePlugin.JpegImageFile image mode=...,48,0,2
7003,<PIL.JpegImagePlugin.JpegImageFile image mode=...,26,0,1
7004,<PIL.JpegImagePlugin.JpegImageFile image mode=...,50,1,3


In [9]:
d_test.head()

Unnamed: 0,image,age,gender,race
14000,<PIL.JpegImagePlugin.JpegImageFile image mode=...,50,0,0
14001,<PIL.JpegImagePlugin.JpegImageFile image mode=...,30,1,2
14002,<PIL.JpegImagePlugin.JpegImageFile image mode=...,30,1,2
14003,<PIL.JpegImagePlugin.JpegImageFile image mode=...,38,0,2
14004,<PIL.JpegImagePlugin.JpegImageFile image mode=...,26,1,1


## Model

In [87]:
class UTK_Dataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        image = self.dataframe.iloc[idx]['image']
        label = self.dataframe.iloc[idx]['gender']

        if self.transform:
            image = self.transform(image)

        return image, label
    
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [None]:
vgg16 = models.vgg16(pretrained=True)
for param in list(vgg16.parameters())[:-1]:
    param.requires_grad = False

num_features = vgg16.classifier[-1].in_features
vgg16.classifier[-1] = nn.Linear(num_features, 2)

criterion = nn.CrossEntropyLoss()
optimiser = optim.Adam(vgg16.classifier[-1].parameters(), lr=0.0001, weight_decay=0.01)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
vgg16.to(device)



VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [90]:
utk_train = UTK_Dataset(d_train, transform=transform)
utk_train_loader = DataLoader(utk_train, batch_size=32, shuffle=True)

num_epochs = 12

for epoch in range(num_epochs):
    vgg16.train()
    running_loss = 0.0
    correct = 0
    total = 0

    print(f"Epoch {epoch+1}")

    for images, labels in utk_train_loader:
        images, labels = images.to(device), labels.to(device)

        optimiser.zero_grad()
        outputs = vgg16(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimiser.step()

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    accuracy = 100 * correct / total

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(utk_train_loader):.4f}, Accuracy: {accuracy:.2f}")

print("Training complete!")

Epoch 1


KeyboardInterrupt: 

## FeatureMatch

In [10]:
def categorise_age(age):
    if 15 <= age < 30:
        return '[15, 30]'
    elif 30 <= age < 45:
        return '[30, 45]'
    elif 45 <= age < 60:
        return '[45, 60]'
    elif age >= 60:
        return '[60, inf]'
    else:
        return 'unknown'
    
bucket_mapping = {
    '[15, 30]': 0,
    '[30, 45]': 1,
    '[45, 60]': 2,
    '[60, inf]': 3
}

In [11]:
d_aux['age bucket'] = d_aux['age'].apply(categorise_age)
d_train['age bucket'] = d_train['age'].apply(categorise_age)
d_test['age bucket'] = d_test['age'].apply(categorise_age)

d_aux['age bucket'] = d_aux['age bucket'].map(bucket_mapping)
d_train['age bucket'] = d_train['age bucket'].map(bucket_mapping)
d_test['age bucket'] = d_test['age bucket'].map(bucket_mapping)

feature_columns = ['age bucket', 'race']

aux_feature = d_aux[feature_columns].values
test_feature = d_test[feature_columns].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_aux['age bucket'] = d_aux['age'].apply(categorise_age)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_train['age bucket'] = d_train['age'].apply(categorise_age)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_test['age bucket'] = d_test['age'].apply(categorise_age)
A value is trying to be set 

In [61]:
d_aux.head()

Unnamed: 0,image,age,gender,race,age bucket
7000,<PIL.JpegImagePlugin.JpegImageFile image mode=...,21,1,2,0
7001,<PIL.JpegImagePlugin.JpegImageFile image mode=...,43,1,0,1
7002,<PIL.JpegImagePlugin.JpegImageFile image mode=...,48,0,2,2
7003,<PIL.JpegImagePlugin.JpegImageFile image mode=...,26,0,1,0
7004,<PIL.JpegImagePlugin.JpegImageFile image mode=...,50,1,3,2


In [62]:
print(type(aux_feature))
print(type(aux_feature[0]))
print(aux_feature[0:5])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[[0 2]
 [1 0]
 [2 2]
 [0 1]
 [2 3]]


In [12]:
af_tuples = [(x[0], int(x[1])) for x in aux_feature]
# af_tuples = np.array(af_tuples)
tuples, counts = np.unique(af_tuples, axis=0, return_counts=True)

print(f'There are {len(tuples)} unique tuples in the auxilliary dataset')
print(counts)
tuples = [(t[0], int(t[1])) for t in tuples]
print(tuples)

There are 20 unique tuples in the auxilliary dataset
[891 725 469 541 262 798 556 231 334 113 643 176  62 230  43 615 124 102
  81   4]
[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (3, 0), (3, 1), (3, 2), (3, 3), (3, 4)]


In [13]:
poison_rates = [0.5, 1, 2]
features = [(subpop, count) for subpop, count in zip(tuples, counts)]

print(f"There are {len(features)} features in the auxilliary dataset")

There are 20 features in the auxilliary dataset


In [16]:
# subpop = d_aux[(d_aux['age bucket'] == tuples[0][0]) & (d_aux['race'] == int(tuples[0][1]))]
# print(subpop.head())
# for i, r in subpop.iterrows():
#     subpop.loc[i, 'gender'] = 1 - r['gender']
# print(subpop.head())

for i, (subpop, count) in enumerate(features):

    print('\n')
    print(f"Subpopulation {i}")

    aux_indices = np.where(np.linalg.norm(aux_feature - subpop, axis=1)==0)[0]
    aux_poison = d_aux.iloc[aux_indices]

    test_indices = np.where(np.linalg.norm(test_feature - subpop, axis=1)==0)[0]
    test_poison = d_test.iloc[test_indices]

    print(aux_indices)
    print(test_indices)

    sub_count = aux_indices.shape[0]
    print(f"Subpopulation count: {sub_count}")

    for j, pois_count in enumerate([int(sub_count * rate) for rate in poison_rates]):

        print(f'Poison rate: {poison_rates[j]}')
        print(f'Number of poisoned samples: {pois_count}')

        pois_indices = np.random.choice(aux_poison.shape[0], pois_count, replace=True)
        poison = aux_poison.iloc[pois_indices]
        for i, r in poison.iterrows():
            poison.loc[i, 'gender'] = 1 - r['gender']
        
        poisoned_train = pd.concat([d_train, poison])
        # pois_data = UTK_Dataset(poisoned_train, transform=transform)
        # pois_loader = DataLoader(pois_data, batch_size=32, shuffle=True)




Subpopulation 0
[  19   43   48   58   77   79   83   88   89   90   99  111  123  132
  143  145  150  152  176  180  187  206  216  221  236  245  247  253
  280  295  297  299  303  313  315  317  319  320  324  326  336  340
  343  346  349  355  356  362  390  396  397  407  411  431  437  448
  451  453  460  463  494  500  514  515  529  536  543  545  546  563
  566  573  596  598  600  601  608  614  616  622  625  629  631  635
  649  659  662  667  689  694  697  699  702  706  713  724  725  726
  730  731  739  746  751  755  767  777  785  787  796  797  800  807
  813  821  843  844  850  862  888  901  902  913  916  923  926  931
  933  942  943  964  969  972  983  990  999 1001 1013 1015 1033 1036
 1042 1045 1068 1074 1087 1092 1095 1098 1100 1108 1121 1133 1139 1145
 1150 1155 1163 1182 1197 1202 1203 1206 1207 1217 1235 1241 1242 1260
 1273 1277 1280 1301 1315 1316 1319 1321 1325 1331 1332 1335 1352 1359
 1362 1377 1379 1381 1386 1388 1391 1392 1393 1394 1395 139

In [91]:
print(len(d_train))

7000
