## Baseline Model
With Local Binary Pattern, MobileNetV3-Small with one added layer, flat architecture.

### Imports

In [None]:
!pip install hiclass[all]

import os
import pandas as pd
import torch
import numpy as np
import csv
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from PIL import Image
from skimage.feature import local_binary_pattern
from skimage import color
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode, LocalClassifierPerLevel
from hiclass.metrics import f1
import sklearn
import joblib
import xgboost as xgb

from google.colab import drive
drive.mount('/content/drive')

Collecting hiclass[all]
  Downloading hiclass-5.0.4-py3-none-any.whl.metadata (16 kB)
Downloading hiclass-5.0.4-py3-none-any.whl (50 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hiclass
Successfully installed hiclass-5.0.4
Mounted at /content/drive


### LBP Transform

In [None]:
class LBPTransform:
    """Convert an image into a 3-channel, min-max normalised LBP map.

    Args:
        radius: Radius handed to ``local_binary_pattern``.
        n_points: Number of sampling points handed to
            ``local_binary_pattern``; when omitted (or falsy) it defaults to
            ``8 * radius``.
        method: LBP method name handed to ``local_binary_pattern``.
    """

    def __init__(self, radius=3, n_points=None, method='uniform'):
        self.radius = radius
        self.n_points = n_points if n_points else 8 * radius
        self.method = method

    def __call__(self, img):
        """Compute the LBP map of ``img``.

        Args:
            img: A PIL image or an array. Three-dimensional input is converted
                to grayscale with ``color.rgb2gray``; any other input is used
                as the grayscale image directly.

        Returns:
            A float array with the LBP map rescaled by its own min/max and
            repeated three times along a new last axis.
        """
        if isinstance(img, Image.Image):
            img = np.array(img)

        if len(img.shape) == 3:
            gray = color.rgb2gray(img)
        else:
            gray = img

        # Only floating-point data is rescaled to 0-255. Integer input (for
        # example a 2-D uint8 grayscale image) is already on that scale, and
        # multiplying it by 255 would overflow and wrap around.
        # NOTE(review): float input is assumed to lie in [0, 1] - confirm.
        if np.issubdtype(gray.dtype, np.floating):
            gray = gray * 255
        gray = gray.astype(np.uint8)

        lbp = local_binary_pattern(gray, self.n_points, self.radius, self.method)

        # Min-max normalisation; the epsilon avoids a division by zero when
        # the LBP map is constant.
        lbp = (lbp - lbp.min()) / (lbp.max() - lbp.min() + 1e-7)

        # Replicate the single channel three times along the last axis.
        lbp_3 = np.stack([lbp, lbp, lbp], axis=-1)

        return lbp_3

### Make LBP Images

In [None]:
def make_lbp_csv(input_folder, csv_path, lbp_transformer):
    """Write the flattened LBP representation of every PNG in a folder to a CSV.

    The first column (``PGCname``) holds the image file name; the remaining
    ``pixel_<i>`` columns hold the flattened output of ``lbp_transformer``.
    The header is sized from the first image, so all images are expected to
    produce outputs of the same length.

    Args:
        input_folder: Directory that is scanned for ``*.png`` files.
        csv_path: Destination CSV file (overwritten if it exists).
        lbp_transformer: Callable that takes an RGB PIL image and returns an
            array exposing ``flatten()``.
    """
    # Sorted so the row order does not depend on the directory listing order.
    img_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.png'))

    print(f"Processing {len(img_files)} images from {input_folder}...")
    # newline='' is what the csv module asks for; without it extra blank
    # lines can appear between rows on some platforms.
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)

        header_written = False

        for fname in img_files:
            in_path = os.path.join(input_folder, fname)
            # convert() returns a new in-memory image, so the file handle can
            # be released straight away.
            with Image.open(in_path) as src:
                img = src.convert('RGB')

            pixels = lbp_transformer(img).flatten()

            if not header_written:
                header = ['PGCname'] + [f'pixel_{i}' for i in range(len(pixels))]
                writer.writerow(header)
                header_written = True

            # tolist must be *called*; concatenating the bound method itself
            # to a list raises TypeError.
            writer.writerow([fname] + pixels.tolist())

    print("LBP preprocessing complete.")

### Dataset Class
Dataset class that loads each image and pairs it with its label.

In [None]:
class PGCDataset(Dataset):
    """Dataset that pairs ``<id>.png`` images in a folder with labels from a DataFrame.

    Rows whose id has no matching ``<id>.png`` file in ``img_folder`` are
    dropped at construction time.

    Args:
        labels_df: DataFrame holding at least the id and label columns.
        img_folder: Directory containing the ``<id>.png`` files.
        id_col: Name of the column holding the image id.
        label_col: Name of the column holding the label (cast to ``int``).
        transform: Optional callable applied to the RGB PIL image.
    """

    _IMG_EXT = '.png'

    def __init__(self, labels_df, img_folder, id_col='PGCname', label_col='T', transform=None):
        self.labels_df = labels_df.reset_index(drop=True)
        self.img_folder = img_folder
        self.id_col = id_col
        self.label_col = label_col
        self.transform = transform

        # Strip only the trailing extension. str.replace would also remove a
        # '.png' occurring in the middle of a name, producing an id that does
        # not match the path rebuilt in __getitem__.
        available_imgs = {f[:-len(self._IMG_EXT)] for f in os.listdir(img_folder)
                          if f.endswith(self._IMG_EXT)}
        self.labels_df = self.labels_df[self.labels_df[id_col].isin(available_imgs)].reset_index(drop=True)

        print(f"Dataset created with {len(self.labels_df)} imgs")

    def __len__(self):
        """Return the number of rows that have a matching image file."""
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Return ``(image, label, image_id)`` for the row at position ``idx``."""
        row = self.labels_df.iloc[idx]

        img_id = row[self.id_col]
        img_path = os.path.join(self.img_folder, f"{img_id}{self._IMG_EXT}")
        # convert() yields a new in-memory image, so the file handle is closed
        # right away instead of lingering until garbage collection.
        with Image.open(img_path) as src:
            img = src.convert('RGB')

        label = torch.tensor(int(row[self.label_col]), dtype=torch.long)

        if self.transform:
            img = self.transform(img)

        return img, label, img_id

### Dataset creation

In [None]:
# --- Locations and column names ------------------------------------------
path = '/content/drive/Othercomputers/My laptop/Thesis/Galaxy-Classifier/'
img_folder = path + 'images'
lbp_csv = path + 'lbp.csv'

id_col = 'PGCname'
label_col = 'T'

img_size = 224

# --- Labels ----------------------------------------------------------------
labels_df = pd.read_csv(path + 'EFIGI_attributes.txt', sep=r'\s+', comment='#')

# Each step merges some T values into one representative value; the final
# step renumbers the surviving values. The steps are applied in this order.
label_merge_steps = (
    {-3: -2, -1: -2},  # S0
    {0: 1, 2: 1},      # Sa
    {3: 4},            # Sb
    {5: 6},            # Sc
    {8: 7, 9: 7},      # Sd
    {10: 11},          # Irr
    {-6: 0, -5: 1, -4: 2, -2: 3, 1: 4, 4: 5, 11: 8},  # Adjust to 0 - 8
)

labels = labels_df[label_col]
for merge_step in label_merge_steps:
    labels = labels.replace(merge_step)

labels_df[label_col] = labels

# --- Stratified train / val / test split -----------------------------------
# 20% is held out for testing; 12.5% of the remaining 80% (i.e. 10% of the
# whole) becomes the validation set.
train_df, test_df = train_test_split(
    labels_df,
    test_size=0.2,
    random_state=0,
    stratify=labels_df[label_col],
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.125,
    random_state=0,
    stratify=train_df[label_col],
)

# --- Transforms --------------------------------------------------------------
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training: random rotation/flips, then resize, tensor conversion, normalise.
train_transform = transforms.Compose([
    transforms.RandomRotation(180),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
])

# Validation/test: no random augmentation.
test_transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
])

# --- Datasets (built in train, val, test order) --------------------------------
split_specs = (
    (train_df, train_transform),
    (val_df, test_transform),
    (test_df, test_transform),
)
train_dataset, val_dataset, test_dataset = [
    PGCDataset(
        labels_df=split_df,
        img_folder=img_folder,
        id_col=id_col,
        label_col=label_col,
        transform=split_transform,
    )
    for split_df, split_transform in split_specs
]

Dataset created with 3120 imgs
Dataset created with 446 imgs
Dataset created with 892 imgs


### Data loader
Loads the data in batches.

In [None]:
"""labels = train_df[label_col].values
classes= np.unique(labels)
class_weights = compute_class_weight('balanced', classes=classes, y=labels)"""


# The validation and test loaders use identical settings: fixed order,
# batches of 32, loading in the main process.
eval_loader_kwargs = dict(batch_size=32, shuffle=False, num_workers=0)

# Training batches are larger and reshuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, **eval_loader_kwargs)
test_loader = DataLoader(test_dataset, **eval_loader_kwargs)

### Hierarchical model
Uses a pretrained ResNet-18 as the feature extractor.

In [None]:
class FeatureResnet18(nn.Module):
    """ResNet-18 that returns both its pooled features and class logits.

    The torchvision ResNet-18 (``IMAGENET1K_V1`` weights) is used without its
    last child module; a new linear layer maps the 512 flattened features to
    ``num_classes`` outputs.
    """

    def __init__(self, num_classes=None):
        super().__init__()
        self.num_classes = num_classes

        pretrained = models.resnet18(weights='IMAGENET1K_V1')
        # Every child except the last one becomes the feature extractor.
        stages = list(pretrained.children())
        self.backbone = nn.Sequential(*stages[:-1])
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return ``(features, logits)`` for the batch ``x``."""
        pooled = self.backbone(x)
        embedding = pooled.flatten(1)
        logits = self.fc(embedding)
        return embedding, logits

## Calculate class weights

In [None]:
def calculate_class_weights(labels, fine_per_coarse, device='cpu'):
    """Compute 'balanced' class weights at the fine and the coarse level.

    Fine class ids are assumed to be numbered consecutively, coarse group by
    coarse group: the first ``fine_per_coarse[0]`` ids belong to coarse class
    0, the next ``fine_per_coarse[1]`` to coarse class 1, and so on.

    Args:
        labels: Fine-level labels of the samples.
        fine_per_coarse: Number of fine classes in each coarse class.
        device: Device on which the weight tensors are created.

    Returns:
        ``(fine_weights, coarse_weights)`` float tensors of length
        ``sum(fine_per_coarse)`` and ``len(fine_per_coarse)``. Classes that do
        not occur in ``labels`` keep a weight of 1.
    """

    def _scatter_weights(size, class_ids, weights):
        # Start from all-ones and overwrite the entries of observed classes.
        full = torch.ones(size, dtype=torch.float, device=device)
        for class_id, weight in zip(class_ids, weights):
            full[class_id] = weight
        return full

    labels = np.array(labels)

    # fine id -> coarse id lookup
    fine_to_coarse_mapping = {}
    for coarse_id, num_fine in enumerate(fine_per_coarse):
        first_fine_id = len(fine_to_coarse_mapping)
        for offset in range(num_fine):
            fine_to_coarse_mapping[first_fine_id + offset] = coarse_id

    fine_classes = np.unique(labels)
    fine_weights = compute_class_weight('balanced', classes=fine_classes, y=labels)
    fine_weights_full = _scatter_weights(sum(fine_per_coarse), fine_classes, fine_weights)

    coarse_labels = np.array([fine_to_coarse_mapping[label] for label in labels])
    coarse_classes = np.unique(coarse_labels)
    coarse_weights = compute_class_weight('balanced', classes=coarse_classes, y=coarse_labels)
    coarse_weights_full = _scatter_weights(len(fine_per_coarse), coarse_classes, coarse_weights)

    return fine_weights_full, coarse_weights_full

## Train/extract feature methods

In [None]:
def train_CNN(model, dataloader, criterion, optimizer, device, epochs):
    """Train ``model`` for ``epochs`` epochs and checkpoint the best epoch.

    ``model`` must return a ``(features, logits)`` pair; only the logits are
    used for the loss. After every epoch the mean batch loss and the training
    accuracy are printed. Whenever the training accuracy exceeds the best value
    seen so far, the state dict is saved to ``path + 'feature_CNN.pth'``
    (``path`` is a notebook-level variable).

    Args:
        model: Network to train.
        dataloader: Iterable of ``(images, labels, ids)`` batches.
        criterion: Loss function called as ``criterion(logits, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to.
        epochs: Number of passes over ``dataloader``.
    """
    model.train()
    best_acc = 0

    for epoch_idx in range(epochs):
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        for batch_imgs, batch_labels, _batch_ids in dataloader:
            batch_imgs = batch_imgs.to(device, non_blocking=True)
            batch_labels = batch_labels.to(device, non_blocking=True)

            optimizer.zero_grad()
            _features, logits = model(batch_imgs)
            batch_loss = criterion(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()
            predicted = logits.data.max(1)[1]
            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()
            print(".", end="")  # one dot per processed batch
        print("")

        epoch_loss = loss_sum / len(dataloader)
        epoch_acc = 100.0 * n_correct / n_seen
        print(f"Epoch {epoch_idx+1}/{epochs}")
        print(f"  Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc:.2f}%")

        # Keep the weights of the epoch with the best training accuracy.
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            torch.save(model.state_dict(), path + 'feature_CNN.pth')


def extract_features(model, dataloader, device, file_name=None, loss_fn=None):
    """Run ``model`` over ``dataloader`` and collect its features and labels.

    The model is switched to evaluation mode for the duration of the call so
    that mode-dependent layers (e.g. batch norm, dropout) behave
    deterministically, and is put back into its previous mode afterwards.

    Args:
        model: ``nn.Module`` returning a ``(features, logits)`` pair.
        dataloader: Iterable of ``(inputs, labels, ids)`` batches.
        device: Device the batches are moved to.
        file_name: If given, the arrays are saved as
            ``path + file_name + '_X.npy'`` / ``'_y.npy'`` (``path`` is a
            notebook-level variable).
        loss_fn: If given, the mean batch loss, the accuracy and a
            classification report of the logits' argmax are printed.

    Returns:
        ``(X, y)``: the stacked feature matrix and the concatenated labels as
        NumPy arrays.
    """
    evaluate = loss_fn is not None

    all_features = []
    all_labels = []
    all_preds = []
    test_loss, n_correct, n_batches = 0.0, 0, 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y, _ in dataloader:
                X, y = X.to(device), y.to(device)

                features, outputs = model(X)

                if evaluate:
                    batch_preds = outputs.argmax(1)
                    test_loss += loss_fn(outputs, y).item()
                    n_correct += (batch_preds == y).sum().item()
                    n_batches += 1
                    all_preds.append(batch_preds.cpu().numpy())

                all_features.append(features.cpu().numpy())
                all_labels.append(y.cpu().numpy())

                print(".", end="")  # one dot per processed batch
            print("")
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    X = np.vstack(all_features)
    y = np.concatenate(all_labels)

    if evaluate:
        all_preds = np.concatenate(all_preds)

        print(f"Loss: {test_loss / n_batches:.4f} | Acc: {100.0 * n_correct / len(y):.2f}%")
        print(classification_report(y, all_preds, digits=4))

    if file_name:
        np.save(path + file_name + '_X.npy', X)
        np.save(path + file_name + '_y.npy', y)

    return X, y

def convert_labels(labels, fine_to_coarse):
    """Expand fine labels into ``[coarse, fine]`` pairs.

    Args:
        labels: Iterable of fine labels.
        fine_to_coarse: Mapping from each fine label to its coarse label.

    Returns:
        A list with one ``[coarse, fine]`` list per input label, in input order.
    """
    return [[fine_to_coarse[fine], fine] for fine in labels]


## Train feature model and extract features

In [None]:
# NOTE: this whole cell is wrapped in a triple-quoted string, so none of it
# executes: fine_to_coarse, num_classes, device, feature_CNN, criterion,
# X_test, y_test, ... are NOT defined by running this cell as it stands.
# Remove the surrounding quotes to re-enable it. When enabled it builds the
# model, loads the weights stored at path + 'feature_CNN.pth' and extracts
# (and saves) the test features; the training call and the train-feature
# extraction are additionally commented out inside the string.
"""feature_name = 'feature_CNN.pth'

fine_to_coarse = {
    0:0, 1:0, 2:0,
    3:1,
    4:2, 5:2, 6:2, 7:2,
    8:3
}

num_classes = 9

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

feature_CNN = FeatureResnet18(num_classes).to(device)

class_weights = compute_class_weight('balanced', classes=np.unique(train_df[label_col]), y=train_df[label_col])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(class_weights)
test_criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(feature_CNN.parameters(), lr=0.001)

#train_CNN(feature_CNN, train_loader, criterion, optimizer, device, epochs=50)

feature_CNN.load_state_dict(torch.load(path + feature_name))

#X_train, y_train = extract_features(feature_CNN, train_loader, device, file_name="train")
X_test, y_test = extract_features(feature_CNN, test_loader, device, file_name="test", loss_fn=test_criterion)"""

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 47.5MB/s]


............................
              precision    recall  f1-score   support

           0     0.1579    0.7500    0.2609         4
           1     0.7209    0.6889    0.7045        45
           2     0.8571    0.6667    0.7500         9
           3     0.7059    0.5607    0.6250       107
           4     0.6000    0.6667    0.6316       135
           5     0.6961    0.7172    0.7065       198
           6     0.7143    0.5333    0.6107       150
           7     0.7415    0.8398    0.7876       181
           8     0.7313    0.7778    0.7538        63

    accuracy                         0.6872       892
   macro avg     0.6583    0.6890    0.6478       892
weighted avg     0.6979    0.6872    0.6874       892



## Train and test classifier

In [None]:
# NOTE: this whole cell is wrapped in a triple-quoted string, so none of it
# executes: model_name, X_train, X_test, y_test_f, classifier, ... are NOT
# defined by running this cell as it stands. Remove the surrounding quotes to
# re-enable it. When enabled it loads the saved train/test feature arrays,
# converts the fine labels to [coarse, fine] pairs (this needs fine_to_coarse
# to be defined), fits a LocalClassifierPerNode around an SVC and dumps it to
# path + model_name.
"""model_name = 'Hi_node.joblib'

X_train = np.load(path + 'train_X.npy')
y_train_f = np.load(path + 'train_y.npy')

X_test = np.load(path + 'test_X.npy')
y_test_f = np.load(path + 'test_y.npy')

y_train_h = convert_labels(y_train_f, fine_to_coarse)
y_test_h = convert_labels(y_test_f, fine_to_coarse)

local_classifier = sklearn.svm.SVC(C=5, kernel="rbf", gamma="scale", probability=True, class_weight="balanced")

classifier = LocalClassifierPerNode(local_classifier, n_jobs=-1, verbose=20)

classifier.fit(X_train, y_train_h)

joblib.dump(classifier, path + model_name)"""

'model_name = \'Hi_node.joblib\'\n\nX_train = np.load(path + \'train_X.npy\')\ny_train_f = np.load(path + \'train_y.npy\')\n\nX_test = np.load(path + \'test_X.npy\')\ny_test_f = np.load(path + \'test_y.npy\')\n\ny_train_h = convert_labels(y_train_f, fine_to_coarse)\ny_test_h = convert_labels(y_test_f, fine_to_coarse)\n\nlocal_classifier = sklearn.svm.SVC(C=5, kernel="rbf", gamma="scale", probability=True, class_weight="balanced")\n\nclassifier = LocalClassifierPerNode(local_classifier, n_jobs=-1, verbose=20)\n\nclassifier.fit(X_train, y_train_h)\n\njoblib.dump(classifier, path + model_name)'

In [None]:
# This cell is self-contained so that it also works after
# "Restart kernel -> Run all": the cells that would otherwise define
# model_name, X_test and y_test_f are currently disabled (wrapped in string
# literals), which made this cell fail with a NameError on a fresh kernel.
# It expects the artefacts of an earlier run to exist under `path`:
# the fitted classifier and the saved test features/labels.
model_name = 'Hi_node.joblib'

X_test = np.load(path + 'test_X.npy')
y_test_f = np.load(path + 'test_y.npy')

# NOTE: joblib.load unpickles arbitrary Python objects - only load model
# files that you created yourself or otherwise trust.
classifier = joblib.load(path + model_name)

y_pred_h = classifier.predict(X_test)

# Keep only the last entry of every predicted label path and compare it with
# the fine test labels.
y_pred_f = np.array([int(pred[-1]) for pred in y_pred_h])


print(classification_report(y_test_f, y_pred_f))

'#classifier = joblib.load(path + model_name)\n\ny_pred_h = classifier.predict(X_test)\n\ny_pred_f = np.array([int(pred[-1]) for pred in y_pred_h])\n\n\nprint(classification_report(y_test_f, y_pred_f))'