In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from PIL import Image
import pandas as pd
from transformers import DeiTForImageClassification, DeiTFeatureExtractor, get_scheduler

# Define paths
TRAIN_LABELED_CSV = "./train_labeled.csv"
TRAIN_LABELED_DIR = "./train/labeled"
TRAIN_UNLABELED_CSV = "./train_unlabeled.csv"
TRAIN_UNLABELED_DIR = "./train/unlabeled"
TEST_CSV = "./test.csv"
TEST_DIR = "./test"
CATEGORIES_CSV = "./categories.csv"
MODEL_SAVE_PATH = "./model.pth"

# Check if MPS is available and select the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define Dataset Class for Image Classification
class CustomDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.data.iloc[idx, 0])  # Assuming filename is in the first column
        image = Image.open(img_name).convert("RGB")
        label = int(self.data.iloc[idx, 1])  # Assuming label is in the second column
        
        if self.transform:
            image = self.transform(image)
        
        return image, label

# Image Transformations for Preprocessing
transform = transforms.Compose([
    transforms.RandomResizedCrop(256),  # RandomCrop followed by Resize (Resize and RandomCrop to keep augmentation)
    transforms.RandomHorizontalFlip(),  # Random Horizontal Flip for data augmentation
    transforms.ToTensor(),              # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize based on ImageNet stats
])

# Load the Labeled Training Dataset
train_labeled = CustomDataset(csv_file=TRAIN_LABELED_CSV, img_dir=TRAIN_LABELED_DIR, transform=transform)
train_loader = DataLoader(train_labeled, batch_size=32, shuffle=True)

# Load the DeiT Model and Feature Extractor
model_name = 'facebook/deit-base-patch16-224-in21k'
model = DeiTForImageClassification.from_pretrained(model_name, num_labels=len(pd.read_csv(CATEGORIES_CSV)))
feature_extractor = DeiTFeatureExtractor.from_pretrained(model_name)

model.to(device)  # Move the model to the device (CUDA or CPU)

# Replace the final classifier head with the last two layers from DeiT
# Extracting the penultimate layer and the final classification head from the model
# First, we'll identify the layers
last_two_layers = nn.Sequential(
    model.deit.encoder.layer[-2],  # Penultimate block of the transformer encoder
    model.deit.encoder.layer[-1],  # Final block of the transformer encoder
    model.classifier  # Keep the original classifier head
)

# Reassign these layers back to the model
model.deit.encoder.layer = nn.ModuleList(last_two_layers)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()

# Set optimizer with learning rate and weight decay (as per your baseline)
optimizer = optim.Adam(model.parameters(), lr=5e-5, weight_decay=1e-5)

# Exponential decay for learning rate
lr_scheduler = get_scheduler(
    "linear",  # Linear decay scheduler
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * 10  # Assume 10 epochs
)

model.to(device)  # Ensure the model is moved to the correct device

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        # Move inputs and labels to the device (CUDA/CPU)
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()  # Zero the gradients
        
        # Forward pass
        outputs = model(inputs)  # Model will use the correct device
        
        # Compute loss
        loss = criterion(outputs.logits, labels)  # Use logits from the model output
        
        # Backward pass
        loss.backward()
        
        # Optimize the parameters
        optimizer.step()
        
        # Update learning rate
        lr_scheduler.step()

        # Accumulate the loss
        running_loss += loss.item()
        
        # Get predictions
        _, predicted = torch.max(outputs.logits, 1)
        
        # Update accuracy
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    # Calculate average loss and accuracy for the epoch
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    
    # Print epoch loss and accuracy
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

# Save the trained model
torch.save(model.state_dict(), MODEL_SAVE_PATH)

# To load the model later:
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
model.eval()  # Set the model to evaluation mode

def get_predictions(model, test_path, transform, device):
    predictions = []

    # Loop through all the images in the test directory
    for image_name in os.listdir(test_path):
        if image_name.endswith('.jpg'):  # Assuming images are in JPG format
            image_path = os.path.join(test_path, image_name)
            
            # Load and transform the image
            image = Image.open(image_path)
            image = transform(image).unsqueeze(0)  # Add batch dimension
            
            # Move the image tensor to the same device as the model
            image = image.to(device)

            # Make the prediction
            with torch.no_grad():
                output = model(image)
                _, predicted_class = torch.max(output.logits, 1)  # Get the predicted class
            
            predictions.append((image_name, predicted_class.item()))  # Append the result

    return predictions

# Get predictions for all images in the test folder
predictions = get_predictions(model, "./test1/test", transform, device)

# Convert predictions to DataFrame and save as CSV
df = pd.DataFrame(predictions, columns=['image', 'id'])
df.to_csv('predictions.csv', index=False)


In [3]:
from transformers import AutoImageProcessor, DeiTModel
import torch
from datasets import load_dataset

dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
model = DeiTModel.from_pretrained("facebook/deit-base-distilled-patch16-224")

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

Downloading builder script: 100%|██████████| 2.56k/2.56k [00:00<00:00, 3.92MB/s]
Downloading data: 100%|██████████| 173k/173k [00:00<00:00, 2.76MB/s]
Generating test split: 1 examples [00:00, 41.27 examples/s]
Some weights of DeiTModel were not initialized from the model checkpoint at facebook/deit-base-distilled-patch16-224 and are newly initialized: ['deit.pooler.dense.bias', 'deit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1, 198, 768]

In [1]:
from transformers import pipeline
# Allocate a pipeline for sentiment-analysis
classifier = pipeline('sentiment-analysis')
classifier('We are very happy to introduce pipeline to the transformers repository.')

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


: 

In [1]:
from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import requests

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base-imagenet1k-1-layer')
model = AutoModelForImageClassification.from_pretrained('facebook/dinov2-base-imagenet1k-1-layer')

inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])

  from .autonotebook import tqdm as notebook_tqdm


Predicted class: tabby, tabby cat
