## Import Library

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet50
import time
from torch.autograd import Variable
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

## Data Preparation

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data_path='/content/drive/MyDrive/FaceReco/Dataset'

In [4]:
os.listdir(data_path)

['gender_classification.csv',
 'class_identity.txt',
 'list_attribute.txt',
 'gender_classification.xlsx',
 'Images']

In [5]:
images_list = os.listdir(data_path+'/Images')

In [6]:
# Membaca file list_attribute.txt yang berisi informasi atribut gambar. Merapikan data dengan separasi spasi dan skip kolom metadata
data = pd.read_csv(data_path+'/list_attribute.txt', sep='\s+', skiprows=1)

# images_list berisi nama file 5000 gambar yang benar-benar ada.
# Kode ini menyaring agar hanya atribut gambar tersebut yang diambil.
filtered_data = data[data.index.isin(images_list)]

# Hanya kolom 'Male' yang diambil dari dataset. reset_index() mengubah nama gambar dari index menjadi kolom biasa.
filtered_data = filtered_data[['Male']].reset_index()

# Kolom yang berisi nama gambar diubah namanya menjadi image_id agar lebih jelas.
filtered_data = filtered_data.rename(columns={'index': 'image_id'})

# Awalnya data 'Male' berisi 1 (laki-laki) dan -1 (bukan laki-laki). Diubah menjadi 1 (laki-laki) dan 0 (bukan laki-laki) supaya cocok untuk model machine learning.
filtered_data['Male'] = filtered_data['Male'].apply(lambda x: 1 if x == 1 else 0)

# Buat variabel filtered_data dengan data
data = filtered_data

print(data.head())
print(data.shape)
print("Unique values in 'Male' column after conversion:", data['Male'].unique())

  data = pd.read_csv(data_path+'/list_attribute.txt', sep='\s+', skiprows=1)


     image_id  Male
0  000051.jpg     1
1  000052.jpg     1
2  000352.jpg     1
3  000409.jpg     1
4  000545.jpg     1
(1768, 2)
Unique values in 'Male' column after conversion: [1 0]


In [7]:
# Informasi dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1768 entries, 0 to 1767
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   image_id  1768 non-null   object
 1   Male      1768 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.8+ KB


In [9]:
data.duplicated().sum()

np.int64(0)

In [None]:
# @title
!pip install imagehash

In [13]:
import imagehash
from PIL import Image

# Check for duplicate images
hashes = {}
duplicate_images = []

for image_name in images_list:
    image_path = os.path.join(data_path, 'Images', image_name)
    try:
        img = Image.open(image_path)
        img_hash = str(imagehash.average_hash(img))
        if img_hash in hashes:
            duplicate_images.append((image_name, hashes[img_hash]))
        else:
            hashes[img_hash] = image_name
    except Exception as e:
        print(f"Could not process image {image_name}: {e}")

if duplicate_images:
    print("Duplicate images found:")
    for img1, img2 in duplicate_images:
        print(f"- {img1} is a duplicate of {img2}")
else:
    print("No duplicate images found.")

Duplicate images found:
- 098446.jpg is a duplicate of 098446 (1).jpg
- 100409.jpg is a duplicate of 100409 (1).jpg
- 138425.jpg is a duplicate of 138425 (1).jpg
- 099764.jpg is a duplicate of 099764 (1).jpg
- 046845.jpg is a duplicate of 046845 (1).jpg
- 070893.jpg is a duplicate of 070893 (1).jpg
- 167323.jpg is a duplicate of 167323 (1).jpg
- 175928.jpg is a duplicate of 175928 (1).jpg
- 161746.jpg is a duplicate of 161746 (1).jpg
- 168979 (1).jpg is a duplicate of 168979 (2).jpg
- 168979.jpg is a duplicate of 168979 (2).jpg
- 183111.jpg is a duplicate of 183111(1).jpg
- 182793.jpg is a duplicate of 182793(1).jpg
- 044908.jpg is a duplicate of 044908 (1).jpg
- 199994.jpg is a duplicate of 199994 (1).jpg
- 182912(1).jpg is a duplicate of 182912.jpg
- 072921.jpg is a duplicate of 072921 (1).jpg
- 123213.jpg is a duplicate of 123213 (1).jpg
- 150707.jpg is a duplicate of 178562.jpg
- 147959.jpg is a duplicate of 147959 (1).jpg
- 155434.jpg is a duplicate of 155434 (1).jpg
- 074149.jpg 

In [14]:
# Handle duplicate images by removing one copy of each duplicate
# Create a set of duplicate image names to remove
duplicate_image_names_to_remove = set()
for img1, img2 in duplicate_images:
    # Decide which image to remove, for simplicity, remove the one with (1) or (2) in the name first
    if "(1)" in img1 or "(2)" in img1:
        duplicate_image_names_to_remove.add(img1)
    elif "(1)" in img2 or "(2)" in img2:
        duplicate_image_names_to_remove.add(img2)
    else:
        # If no (1) or (2) in name, just remove the second image found
        duplicate_image_names_to_remove.add(img2)

# Filter images_list to remove duplicate images
images_list_filtered = [image for image in images_list if image not in duplicate_image_names_to_remove]

print(f"Original number of images: {len(images_list)}")
print(f"Number of duplicate images removed: {len(duplicate_image_names_to_remove)}")
print(f"Number of images after removing duplicates: {len(images_list_filtered)}")

# Update the images_list to the filtered list for subsequent steps
images_list = images_list_filtered

Original number of images: 1802
Number of duplicate images removed: 33
Number of images after removing duplicates: 1769


In [16]:
# Create a new dataframe with the filtered image list
filtered_images_df = pd.DataFrame({'image_id': images_list})

# Merge with the original data to keep the 'Male' column
data_filtered = pd.merge(filtered_images_df, data, on='image_id', how='left')

print("Shape of the new filtered dataframe:", data_filtered.shape)
print(data_filtered.head())

# Update the 'data' variable to use the filtered dataframe for subsequent steps
data = data_filtered

Shape of the new filtered dataframe: (1769, 2)
     image_id  Male
0  122740.jpg   0.0
1  128816.jpg   1.0
2  133834.jpg   0.0
3  095987.jpg   1.0
4  128545.jpg   0.0


In [48]:
# Drop rows with NaN values in the 'Male' column
data = data.dropna(subset=['Male'])

print("Shape of data after dropping NaN:", data.shape)
print("Number of NaN values in 'Male' column:", data['Male'].isnull().sum())

Shape of data after dropping NaN: (1765, 2)
Number of NaN values in 'Male' column: 0


In [49]:
# split the data into train and test sets with a 80:20 ratio
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

## Preprocessing

In [50]:
class GenderDataset(Dataset):
    def __init__(self, data, image_folder_path, transform=None):
        self.data = data
        self.image_folder_path = image_folder_path
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path = os.path.join(self.image_folder_path, self.data.iloc[idx, 0])
        # please define image convertion technique to RGB here
        image = Image.open(image_path).convert('RGB')
        gender = self.data.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(int(gender), dtype=torch.long)

In [51]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),           # resize ke 224x224 (ukuran input Resnet D)
    transforms.RandomHorizontalFlip(p=0.5),  # augmentasi: flipping kiri/kanan
    transforms.ColorJitter(                  # augmentasi: variasi warna/kontras
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1
    ),

    transforms.ToTensor(),                   # ubah ke tensor [0,1]
    transforms.Normalize(                    # normalisasi pakai mean & std ImageNet
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

In [52]:
train_set = GenderDataset(train_data, image_folder_path=os.path.join(data_path, "Images"), transform=transform)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=2)

test_set = GenderDataset(test_data, os.path.join(data_path, "Images"), transform=transform)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False, num_workers=2)

## Architecture

In [62]:
# please define the model (VGG/GoogleNet/ResNet) here
model = resnet50(pretrained=True)
num_ftrs = model.fc.in_features
# Add a dropout layer before the final fully connected layer
model.fc = nn.Sequential(
    nn.Dropout(p=0.5), # You can adjust the dropout probability (p) here
    nn.Linear(num_ftrs, 2)  # Output layer for 2 classes (male/female)
)



In [63]:
# Define optimizer and criterion
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5) #Gunakan optimizer Adam agar model lebih konvergen, gunakan juga weight decay agar weight/ bobot tidak terlalu besar
criterion = nn.CrossEntropyLoss() #CrossEntropyLoss untuk multiclass classification

## Modeling

In [64]:
import time
import copy
from torch.autograd import Variable

def train_model(model, dataloaders, dataset_sizes, criterion, optimizer, use_gpu=torch.cuda.is_available(), num_epochs=10, patience=5, min_delta=0.001):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict()) # buat copy / model cadangan
    best_acc = 0.0 # Set akurasi terbaik mulai dari 0

    train_acc_history = []
    test_acc_history = []

    epochs_no_improve = 0 # Variabel epochs_no_improve dipakai untuk menghitung berapa kali berturut-turut akurasi test/validasi tidak membaik. Training berhenti otomatis ketika sudah jelas tidak ada perbaikan, hal ini dapat meminimalisir overfitting

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and evaluation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            # alat ukur yang bikin kita tahu apakah overfitting terjadi.

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                if use_gpu: # Gunakan GPU jika ada (agar proses training lebih cepat)
                    inputs = Variable(inputs.cuda())
                    labels = Variable(labels.cuda())
                else: # Jika tidak ada GPU, gunakan CPU (lebih lambat dari GPU)
                    inputs = Variable(inputs)
                    labels = Variable(labels)
                    # Jika training berjalan lambat, ada potensi training berhenti di tengah jalan, hal ini akan menyebabkan model underfitting

                # zero the parameter gradients
                optimizer.zero_grad() # mengosongkan gradien lama sebelum hitung gradien baru.
                # Jika tiap batch dilakukan zero_grad, Model akan update bobot dengan informasi yang jernih, sehingga akurasi train & test lebih stabil.

                # forward
                # ResNet50 only outputs one value
                outputs = model(inputs)
                loss = criterion(outputs, labels)


                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward() # inti dari deep learning. Tanpa ini, bobot tidak akan pernah belajar dari kesalahan.
                    optimizer.step() # kalau ini tidak ada, model akan stuck dengan bobot awal (tidak ada perbaikan).

                # statistics
                running_loss += loss.data * inputs.size(0) # Multiply by batch size for correct loss calculation
                _, preds = torch.max(outputs.data, 1)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase] # Convert to double for accurate division

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'train':
                train_acc_history.append(epoch_acc.item())
            else:
                test_acc_history.append(epoch_acc.item())

            # deep copy the model if test accuracy improved
            if phase == 'test':
                if epoch_acc > best_acc + min_delta:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1
                    if epochs_no_improve == patience:
                        print("Early stopping!")
                        time_elapsed = time.time() - since
                        print('Training complete in {:.0f}m {:.0f}s'.format(
                            time_elapsed // 60, time_elapsed % 60))
                        print('Best test Accuracy: {:4f}'.format(best_acc)) # Corrected print statement
                        model.load_state_dict(best_model_wts)
                        return model, train_acc_history, test_acc_history


        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best test Accuracy: {:4f}'.format(best_acc)) # Corrected print statement

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, train_acc_history, test_acc_history

In [65]:
dataloders = {
    "train":train_loader, "test":test_loader
}
dataset_sizes= {
    "train":len(train_set), "test":len(test_set)
}

In [66]:
use_gpu = torch.cuda.is_available()

if use_gpu:
  model = model.to("cuda")

In [67]:
model = train_model(model, dataloders, dataset_sizes, criterion, optimizer, use_gpu, 10)

Epoch 0/9
----------
train Loss: 0.2571 Acc: 0.8874
test Loss: 0.1410 Acc: 0.9320

Epoch 1/9
----------
train Loss: 0.1016 Acc: 0.9625
test Loss: 0.0627 Acc: 0.9688

Epoch 2/9
----------
train Loss: 0.0447 Acc: 0.9858
test Loss: 0.1148 Acc: 0.9603

Epoch 3/9
----------
train Loss: 0.0308 Acc: 0.9873
test Loss: 0.0688 Acc: 0.9717

Epoch 4/9
----------
train Loss: 0.0241 Acc: 0.9901
test Loss: 0.0630 Acc: 0.9773

Epoch 5/9
----------
train Loss: 0.0335 Acc: 0.9887
test Loss: 0.1037 Acc: 0.9603

Epoch 6/9
----------
train Loss: 0.0668 Acc: 0.9731
test Loss: 0.0951 Acc: 0.9575

Epoch 7/9
----------
train Loss: 0.0450 Acc: 0.9830
test Loss: 0.1387 Acc: 0.9518

Epoch 8/9
----------
train Loss: 0.0235 Acc: 0.9922
test Loss: 0.1067 Acc: 0.9603

Epoch 9/9
----------
train Loss: 0.0195 Acc: 0.9936
test Loss: 0.1022 Acc: 0.9518
Early stopping!
Training complete in 2m 53s
Best test Accuracy: 0.977337


## Evaluation

In [68]:
def evaluate_model(model, test_loader, target_labels):
    # please define the evaluation function here
    model.eval() # Set model to evaluate mode

    all_preds = []
    all_labels = []

    with torch.no_grad(): # Disable gradient calculation for evaluation
        for inputs, labels in test_loader:
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Print classification report
    print(classification_report(all_labels, all_preds, target_names=target_labels))

    # Print confusion matrix
    print("Confusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))

In [69]:
# Extract the trained model from the tuple returned by train_model
trained_model, train_acc_history, test_acc_history = model

# Now pass the actual model object to the evaluation function
evaluate_model(trained_model, dataloders['test'], ["female", "male"])

              precision    recall  f1-score   support

      female       0.99      0.97      0.98       210
        male       0.96      0.98      0.97       143

    accuracy                           0.97       353
   macro avg       0.97      0.98      0.97       353
weighted avg       0.97      0.97      0.97       353

Confusion Matrix:
[[204   6]
 [  3 140]]
