In [None]:
pip install kaggle

In [None]:
!pip install kaggle

In [None]:
from google.colab import files

# Upload kaggle.json through Colab's file picker (a later cell moves that file
# into ~/.kaggle/). The return value is bound to a name on purpose: left as the
# last expression of the cell it would be echoed as the cell output, and it holds
# the contents of the uploaded file, i.e. the API credentials.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

In [None]:
# Move the uploaded kaggle.json into ~/.kaggle/ (presumably where the kaggle CLI
# used in the next cell reads its credentials -- confirm) and restrict the file
# to owner read/write.
# NOTE(review): `mv` fails if this cell is re-run after the file was already moved.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive; the unzip step that follows expects
# human-protein-atlas-image-classification.zip in the working directory.
!kaggle competitions download -c human-protein-atlas-image-classification

In [None]:
!unzip human-protein-atlas-image-classification.zip -d /content/data/

In [None]:
import os

# Folder the competition archive was extracted into; show what it contains.
data_dir = "/content/data/"
print("File Name:", os.listdir(path=data_dir))

In [None]:
# Report how many entries each image folder under data_dir holds.
for split_label, split_folder in (("Train:", "train"), ("Test:", "test")):
    split_entries = os.listdir(os.path.join(data_dir, split_folder))
    print(split_label, len(split_entries))

In [None]:
import pandas as pd

# Load the training label table from train.csv directly under data_dir
train_csv_path = os.path.join(data_dir, "train.csv")
train_labels = pd.read_csv(train_csv_path)

# Preview the first rows
print(train_labels.head())

In [None]:
import numpy as np

NUM_CLASSES = 28


def multi_label_encoding(labels):
    """Turn a whitespace-separated string of class indices into a multi-hot vector.

    Args:
        labels: String such as "0 5 27"; every token must parse as an int.

    Returns:
        np.ndarray of shape (NUM_CLASSES,) and dtype float32 holding 1 at each
        listed index and 0 everywhere else.
    """
    class_ids = np.array([int(token) for token in labels.split()], dtype=np.intp)
    multi_hot = np.zeros(NUM_CLASSES, dtype=np.float32)
    # Set all listed positions in one indexed assignment
    multi_hot[class_ids] = 1
    return multi_hot

# Add a column holding the encoded label vector (one NumPy array per row)
# computed from each row's Target string.
train_labels['Encoded_Labels'] = train_labels['Target'].apply(multi_label_encoding)

# Preview the table with the new column
print(train_labels.head())

In [None]:
import torchvision.transforms as transforms
from PIL import Image

# Preprocessing / augmentation pipeline for a 3-channel PIL image.
# NOTE(review): `train_transform` is reassigned later in the notebook with a
# 4-channel tensor pipeline, so this version only serves the check in this cell.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # resize every image to 224x224
    transforms.RandomHorizontalFlip(),  # random horizontal flip
    transforms.RandomRotation(15),  # random rotation (degrees argument: 15)
    transforms.ColorJitter(brightness=0.2, contrast=0.2),  # random brightness / contrast jitter
    transforms.ToTensor(),  # convert the PIL image to a tensor
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # per-channel normalization (mean 0.5, std 0.5)
])

data_dir = "/content/data/"
train_dir = os.path.join(data_dir, "train")

# Names of the regular files (sub-directories excluded) inside the train folder
image_files = [
    name for name in os.listdir(train_dir)
    if os.path.isfile(os.path.join(train_dir, name))
]

if not image_files:
    print("No image files found in the train directory.")
else:
    # Run the first listed file through the transform and report the tensor shape
    img_path = os.path.join(train_dir, image_files[0])
    image = train_transform(Image.open(img_path).convert("RGB"))
    print(image.shape)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from PIL import Image
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Locations of the label CSV and of the training image folder
data_dir = "/content/data"
train_csv_path = os.path.join(data_dir, "train.csv")
train_images_path = os.path.join(data_dir, "train")

# Read the label table, then show its row count and first rows
train_labels = pd.read_csv(train_csv_path)
print("training data:", len(train_labels))
print("first lines:\n", train_labels.head())

# Store Target as strings so it can be split on whitespace when labels are encoded
train_labels["Target"] = train_labels["Target"].astype(str)

# Defining a ProteinDataset

In [None]:
class ProteinDataset(Dataset):
    """Dataset yielding a 4-channel image tensor and a multi-hot label vector.

    Each sample is assembled from four PNG files named
    ``{Id}_red.png``, ``{Id}_green.png``, ``{Id}_blue.png`` and
    ``{Id}_yellow.png`` inside ``img_dir``.

    Args:
        dataframe: Table with an ``Id`` column (image identifier) and a
            ``Target`` column (whitespace-separated class indices).
        img_dir: Directory containing the per-channel PNG files.
        transform: Optional callable applied to the stacked image tensor.
        num_classes: Length of the label vector. Defaults to 28.
    """

    # Filter suffixes, in the channel order of the stacked tensor.
    _CHANNELS = ("red", "green", "blue", "yellow")

    def __init__(self, dataframe, img_dir, transform=None, num_classes=28):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            Tuple ``(image, encoded_labels)``: the four channels concatenated
            along dim 0 (after ``transform``, if one was given) and a float32
            tensor of length ``num_classes`` with 1 at every listed class.

        Raises:
            FileNotFoundError: If one of the four channel files is missing.
            IndexError: If a class index in ``Target`` is outside ``num_classes``.
        """
        # Look the row up once instead of once per column.
        row = self.dataframe.iloc[idx]
        img_id = row["Id"]

        to_tensor = transforms.ToTensor()
        channels = []
        for color in self._CHANNELS:
            path = os.path.join(self.img_dir, f"{img_id}_{color}.png")
            with Image.open(path) as img:
                # Force every file to single-channel mode "L" so each one
                # contributes exactly one channel to the stack.
                channels.append(to_tensor(img.convert("L")))

        # Merge into 4 channels
        image = torch.cat(channels, dim=0)

        # Multi-hot encode the Target string
        encoded_labels = torch.zeros(self.num_classes, dtype=torch.float32)
        for label in str(row["Target"]).split():
            encoded_labels[int(label)] = 1

        if self.transform:
            image = self.transform(image)

        return image, encoded_labels

# Preprocessing & training only 1/3 of the data

In [None]:
# Preprocessing for the 4-channel tensors returned by ProteinDataset.
# No ToTensor() here: the dataset already converts the images to tensors.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.5, 0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5, 0.5])
])

# Create a dataset
train_dataset = ProteinDataset(train_labels, train_images_path, transform=train_transform)

# Pick 1/3 of the row indices with a seeded generator, so the same subset is
# selected on every run (the global NumPy RNG is never seeded in this notebook).
SUBSET_SEED = 42
subset_size = len(train_dataset) // 3
subset_rng = np.random.default_rng(SUBSET_SEED)
subset_indices = subset_rng.choice(len(train_dataset), size=subset_size, replace=False)

# SubsetRandomSampler restricts loading to the chosen indices and draws them in
# random order; `shuffle` stays False because a sampler is supplied.
# Only subset membership is fixed by the seed above, not the batch order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False, sampler=SubsetRandomSampler(subset_indices))

# Smoke-test the DataLoader on a single batch
for images, labels in train_loader:
    print("Batch image shape:", images.shape)
    print("Batch label sahpe:", labels.shape)
    break


# Defining CNNs (DenseNet121, supports 4-channel inputs)

In [None]:
# Load DenseNet121 with pretrained weights.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` -- confirm against the installed version before switching.
model = models.densenet121(pretrained=True)

# Widen the first convolution from 3 to 4 input channels while keeping the
# pretrained filters: the first three input channels reuse the pretrained
# weights, and the extra fourth channel starts from their per-filter mean.
# (Simply assigning a fresh nn.Conv2d would discard the pretrained stem.)
pretrained_conv0 = model.features.conv0
conv0_4ch = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)
with torch.no_grad():
    conv0_4ch.weight[:, :3] = pretrained_conv0.weight
    conv0_4ch.weight[:, 3:] = pretrained_conv0.weight.mean(dim=1, keepdim=True)
model.features.conv0 = conv0_4ch

# Replace the classifier head with one producing 28 outputs (one per class)
num_ftrs = model.classifier.in_features
model.classifier = nn.Linear(num_ftrs, 28)

# Send to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss on raw logits (sigmoid is applied inside BCEWithLogitsLoss) and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Train the CNN
TODO: try other loss functions, e.g. BCE loss with different per-class weights.


In [None]:
# Number of full passes over train_loader
EPOCHS = 10
for epoch in range(EPOCHS):
    # Put the model in training mode at the start of every epoch
    model.train()
    # Sum of the per-batch losses seen in this epoch
    running_loss = 0.0

    for images, labels in train_loader:
        # Move the batch to the device the model was moved to
        images, labels = images.to(device), labels.to(device)

        # Clear gradients left from the previous step before the new backward pass
        optimizer.zero_grad()
        outputs = model(images)

        # criterion is BCEWithLogitsLoss, so `outputs` are passed as raw logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean per-batch loss of the epoch
    print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {running_loss/len(train_loader):.4f}")

print("Successful！")

# Evaluate the model
TODO: add a confusion matrix.

In [None]:
def evaluate_model(model, dataloader):
    """Compute and print the mean per-batch loss of ``model`` over ``dataloader``.

    Uses the module-level ``device`` and ``criterion``. The model is switched to
    eval mode and is left in that mode afterwards.

    Args:
        model: Network to evaluate.
        dataloader: Iterable of ``(images, labels)`` batches.

    Returns:
        The mean per-batch loss as a float, or ``nan`` when the dataloader
        yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            total_loss += criterion(outputs, labels).item()
            # Count batches while iterating so an empty loader can be detected
            num_batches += 1

    if num_batches == 0:
        # Avoid dividing by zero when there is nothing to evaluate
        print("Test Loss: n/a (dataloader yielded no batches)")
        return float("nan")

    mean_loss = total_loss / num_batches
    print(f"Test Loss: {mean_loss:.4f}")
    return mean_loss

# NOTE(review): this evaluates on train_loader, i.e. the data the model was
# trained on, so the printed "Test Loss" is not a held-out metric.
# The result is bound to a name so the cell does not also echo the returned value.
train_subset_loss = evaluate_model(model, train_loader)

# Predict new pictures

In [None]:
def predict(model, img_id, img_dir="/content/data/train", transform=None, threshold=0.5):
    """Predict the multi-label class vector for a single image id.

    Loads ``{img_id}_red.png``, ``_green``, ``_blue`` and ``_yellow`` from
    ``img_dir``, stacks them into one 4-channel tensor, applies the same
    preprocessing used for training, and thresholds the sigmoid outputs.

    Uses the module-level ``device``.

    Args:
        model: Trained network taking a 4-channel image batch.
        img_id: Image identifier (file-name prefix without the colour suffix).
        img_dir: Directory holding the per-channel PNG files.
        transform: Callable applied to the stacked tensor. Defaults to the
            module-level ``train_transform`` (the one given to the training
            dataset), so inference sees the same resize/normalization as training.
        threshold: Probability above which a class is reported as present.

    Returns:
        Integer NumPy array with one 0/1 entry per model output.
    """
    model.eval()

    if transform is None:
        # Without this the model would receive un-resized, un-normalized input,
        # unlike the batches it was trained on.
        transform = train_transform

    # Read the four filter images, each forced to single-channel mode "L"
    to_tensor = transforms.ToTensor()
    channels = []
    for color in ("red", "green", "blue", "yellow"):
        with Image.open(os.path.join(img_dir, f"{img_id}_{color}.png")) as img:
            channels.append(to_tensor(img.convert("L")))

    # Stack to 4 channels, preprocess, add the batch dimension, move to the device
    image = transform(torch.cat(channels, dim=0)).unsqueeze(0).to(device)

    # Predict
    with torch.no_grad():
        output = torch.sigmoid(model(image)).cpu().numpy()

    predicted_labels = (output[0] > threshold).astype(int)
    return predicted_labels

# Prediction example
sample_img_id = "0007d0f0-bbc3-11e8-b2bc-ac1f6b6435d0"
predicted_labels = predict(model, sample_img_id)
print(f"Predicted Labels: {predicted_labels}")