In [6]:
# Install required package first
!pip install transformers



In [7]:
import torch

# Root folder of the Kaggle dataset that holds the serialized tensors
data_path = "/kaggle/input/houston2018-tensors-probably-skewed"

# Read the train/test features and labels, in the same order as before.
# NOTE(review): torch.load unpickles the file contents, so only point this
# at dataset files you trust.
X_train, y_train, X_test, y_test = (
    torch.load(tensor_file)
    for tensor_file in (
        f"{data_path}/X_train.pt",
        f"{data_path}/y_train.pt",
        f"{data_path}/X_test.pt",
        f"{data_path}/y_test.pt",
    )
)

# Report the shape of every tensor, one line each
print(
    *(
        f"{caption} {tensor.shape}"
        for caption, tensor in (
            ("X_train shape:", X_train),
            ("y_train shape:", y_train),
            ("X_test shape:", X_test),
            ("y_test shape:", y_test),
        )
    ),
    sep="\n",
)

X_train shape: torch.Size([11829, 50, 3, 3])
y_train shape: torch.Size([11829])
X_test shape: torch.Size([1312965, 50, 3, 3])
y_test shape: torch.Size([1312965])


In [8]:
import torch
import torch.nn as nn

class PatchEmbedding(nn.Module):
    """Embed a small hyperspectral patch as a token sequence with a CLS token.

    Every spatial position of the patch becomes one token whose feature
    vector is the spectrum at that pixel. Tokens are linearly projected to
    ``embed_dim``, a learnable CLS token is prepended, a learnable positional
    embedding is added, and dropout is applied.

    Args:
        embed_dim: Size of each output token embedding.
        dropout: Dropout probability applied to the embedded sequence.
        num_bands: Number of spectral bands (channel dimension of the input).
            Defaults to 50, the value that used to be hard-coded.
        patch_size: Height and width of the square input patch. Defaults to 3,
            i.e. 9 spatial tokens, the value that used to be hard-coded.

    Raises:
        ValueError: If ``num_bands`` or ``patch_size`` is not positive.
    """

    def __init__(self, embed_dim=256, dropout=0.1, num_bands=50, patch_size=3):
        super().__init__()
        if num_bands <= 0 or patch_size <= 0:
            raise ValueError(
                f"num_bands and patch_size must be positive, got "
                f"num_bands={num_bands}, patch_size={patch_size}"
            )

        self.num_bands = num_bands
        self.patch_size = patch_size
        self.num_tokens = patch_size * patch_size  # one token per pixel

        # Linear projection for spectral dimension
        self.projection = nn.Linear(self.num_bands, embed_dim)

        # Learnable positional embedding (+1 for CLS token)
        self.pos_embedding = nn.Parameter(torch.randn(1, self.num_tokens + 1, embed_dim))

        # CLS token
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))

        self.dropout = nn.Dropout(dropout)

        # Initialize weights (overwrites the placeholder values created above)
        self._init_weights()

    def _init_weights(self):
        """Xavier-init the projection; small-normal init for CLS/positional params."""
        nn.init.xavier_uniform_(self.projection.weight)
        if self.projection.bias is not None:
            nn.init.zeros_(self.projection.bias)
        nn.init.normal_(self.pos_embedding, std=0.02)
        nn.init.normal_(self.cls_token, std=0.02)

    def forward(self, x):
        """Embed a batch of patches.

        Args:
            x: Tensor of shape ``(batch_size, num_bands, patch_size, patch_size)``.

        Returns:
            Tensor of shape ``(batch_size, num_tokens + 1, embed_dim)``; index 0
            along dimension 1 is the CLS token.

        Raises:
            ValueError: If ``x`` does not have the expected channels-first
                shape. A plain reshape would silently accept any layout with
                the same number of elements (e.g. channels-last) and mix
                spectral and spatial values.
        """
        expected = (self.num_bands, self.patch_size, self.patch_size)
        if x.dim() != 4 or tuple(x.shape[1:]) != expected:
            raise ValueError(
                f"Expected input of shape (batch, {expected[0]}, {expected[1]}, "
                f"{expected[2]}), got {tuple(x.shape)}"
            )
        batch_size = x.shape[0]

        # (batch, bands, H, W) -> (batch, bands, H*W) -> (batch, H*W, bands)
        x = x.flatten(2).transpose(1, 2)

        # Project each token's spectral vector to embedding dimension
        x = self.projection(x)  # (batch_size, num_tokens, embed_dim)

        # Prepend CLS token
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)  # (batch_size, num_tokens + 1, embed_dim)

        # Add positional embedding (broadcast over the batch dimension)
        x = x + self.pos_embedding

        return self.dropout(x)

class SpectralTransformer(nn.Module):
    """Transformer encoder over patch tokens that returns the CLS embedding.

    The input patch is embedded by ``PatchEmbedding``, passed through a stack
    of pre-norm ``nn.TransformerEncoderLayer`` blocks, layer-normalized, and
    the vector at the CLS position (index 0) is returned.

    Args:
        embed_dim: Token embedding size (``d_model``).
        num_layers: Number of encoder layers.
        num_heads: Number of attention heads per layer.
        mlp_ratio: Feed-forward width as a multiple of ``embed_dim``.
        dropout: Dropout used by the patch embedding and by the residual and
            feed-forward paths of every encoder layer.
        attention_dropout: Dropout applied to the attention weights. This
            argument used to be accepted but ignored; it is now honored.
    """

    def __init__(
        self,
        embed_dim=256,
        num_layers=6,
        num_heads=8,
        mlp_ratio=4,
        dropout=0.1,
        attention_dropout=0.1,
    ):
        super().__init__()

        self.patch_embed = PatchEmbedding(
            embed_dim=embed_dim,
            dropout=dropout
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            # int() keeps a float mlp_ratio (e.g. 4.0) from producing a float width
            dim_feedforward=int(embed_dim * mlp_ratio),
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True
        )
        # nn.TransformerEncoderLayer exposes a single `dropout` argument that it
        # also forwards to its self-attention module. Override the attention
        # module's rate here so `attention_dropout` takes effect. This is done
        # before nn.TransformerEncoder clones the layer, so every clone gets it.
        encoder_layer.self_attn.dropout = attention_dropout

        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
            norm=nn.LayerNorm(embed_dim)
        )

        # NOTE: the encoder above already ends with its own LayerNorm, so the
        # output is normalized twice. Kept as-is so existing checkpoints
        # (state_dict keys `norm.*`) continue to load.
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Encode a batch of patches.

        Args:
            x: Input accepted by ``PatchEmbedding`` (a batch of patches).

        Returns:
            Tensor of shape ``(batch_size, embed_dim)``: the normalized
            encoder output at the CLS position.
        """
        x = self.patch_embed(x)
        x = self.transformer(x)
        x = self.norm(x)
        return x[:, 0]

def create_model(
    embed_dim=256,
    num_layers=6,
    num_heads=8,
    mlp_ratio=4,
    dropout=0.1,
    attention_dropout=0.1,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Build a ``SpectralTransformer`` and place it on ``device``.

    All hyperparameters are passed through unchanged to the
    ``SpectralTransformer`` constructor.

    Args:
        embed_dim: Token embedding size.
        num_layers: Number of encoder layers.
        num_heads: Number of attention heads.
        mlp_ratio: Feed-forward width multiplier.
        dropout: Dropout probability.
        attention_dropout: Attention dropout probability.
        device: Target device passed to ``.to()``. The default is chosen once,
            when this function is defined: "cuda" if available, else "cpu".

    Returns:
        The constructed model, already moved to ``device``.
    """
    hyperparams = dict(
        embed_dim=embed_dim,
        num_layers=num_layers,
        num_heads=num_heads,
        mlp_ratio=mlp_ratio,
        dropout=dropout,
        attention_dropout=attention_dropout,
    )
    network = SpectralTransformer(**hyperparams)
    return network.to(device)

# Smoke-test the model: random input first, then real training patches if loaded
if __name__ == "__main__":
    import traceback  # used only by the error handler below

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        # Create a test batch
        batch_size = 32
        x = torch.randn(batch_size, 50, 3, 3).to(device)
        print(f"Created test input with shape: {x.shape}")

        # Create model
        model = create_model(
            embed_dim=256,
            num_layers=6,
            num_heads=8,
            device=device
        )
        print("Created model successfully")

        # Forward pass (no gradients needed for a shape check)
        with torch.no_grad():
            output = model(x)
        print("Forward pass successful!")
        print(f"Input shape: {x.shape}")
        print(f"Output shape: {output.shape}")
        # Enforce the expectation instead of only stating it in a comment
        assert output.shape == (batch_size, 256), (
            f"Expected output shape {(batch_size, 256)}, got {tuple(output.shape)}"
        )

        # Optional: Test with actual data if an earlier cell has loaded it
        if 'X_train' in globals():
            print("\nTesting with actual training data:")
            x_sample = X_train[:32].to(device)
            with torch.no_grad():
                output = model(x_sample)
            print(f"Real data input shape: {x_sample.shape}")
            print(f"Real data output shape: {output.shape}")

    except Exception as e:
        # Still best-effort (the notebook keeps running), but show where the
        # failure happened rather than only its message.
        print(f"An error occurred: {str(e)}")
        traceback.print_exc()

Using device: cuda
Created test input with shape: torch.Size([32, 50, 3, 3])
Created model successfully
Forward pass successful!
Input shape: torch.Size([32, 50, 3, 3])
Output shape: torch.Size([32, 256])

Testing with actual training data:
Real data input shape: torch.Size([32, 50, 3, 3])
Real data output shape: torch.Size([32, 256])




In [11]:
# Install required package first
!pip install transformers

# Now import required libraries
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Define TextEncoder class with all class descriptions
class TextEncoder(nn.Module):
    """
    Text encoder using BERT for encoding class names/descriptions into semantic feature vectors.
    This forms the text encoder component of the vision-language system.

    A class name is mapped to its long-form description (when one is known),
    tokenized, and passed through BERT. The hidden state at the first ([CLS])
    position is linearly projected to ``embedding_dim`` and L2-normalized.

    BERT starts frozen. After ``unfreeze()`` gradients flow into BERT whenever
    autograd is enabled; while frozen, the BERT forward pass runs without
    building a graph.

    Args:
        model_name: Hugging Face model identifier for tokenizer and encoder.
        embedding_dim: Size of the returned feature vectors.
        device: Device for BERT, the projection and the tokenized inputs.
            Defaults to CUDA when available, otherwise CPU.
    """

    # Class descriptions for Houston dataset (copied onto each instance as
    # ``self.class_descriptions`` so instances can be customized independently)
    DEFAULT_CLASS_DESCRIPTIONS = {
        "Healthy grass": (
            "A region showing vibrant, well-maintained grass vegetation with high chlorophyll content. "
            "The spectral signature indicates optimal plant health and photosynthetic activity. "
            "This class represents areas of grass that receive adequate water and nutrients."
        ),
        "Stressed grass": (
            "An area of grass vegetation showing signs of environmental stress or deterioration. "
            "The spectral signature shows reduced chlorophyll content and photosynthetic activity. "
            "This may be due to insufficient water, nutrient deficiency, or other environmental stressors."
        ),
        "Artificial turf": (
            "A synthetic surface designed to mimic natural grass, typically used in sports fields. "
            "The spectral signature is highly uniform and lacks the natural variation of real vegetation. "
            "This material shows consistent reflectance patterns across the spectrum."
        ),
        "Evergreen trees": (
            "Dense canopy of trees that maintain their foliage throughout the year. "
            "The spectral signature shows strong absorption in visible bands and high reflectance in NIR. "
            "These trees maintain consistent photosynthetic activity across seasons."
        ),
        "Deciduous trees": (
            "Trees that seasonally shed their leaves, showing variable spectral patterns. "
            "The canopy structure and leaf characteristics affect the spectral signature. "
            "These trees show seasonal variations in their spectral response."
        ),
        "Bare earth": (
            "Exposed soil surface without vegetation cover. "
            "The spectral signature is influenced by soil composition, moisture, and organic content. "
            "This class represents areas of bare ground or cleared land."
        ),
        "Water": (
            "Bodies of water including ponds, lakes, or other water features. "
            "The spectral signature shows strong absorption in NIR and characteristic water absorption bands. "
            "Water depth and quality influence the spectral response."
        ),
        "Residential buildings": (
            "Single-family homes and residential structures. "
            "The spectral signature is influenced by roofing materials and urban features. "
            "These areas show typical patterns of residential development."
        ),
        "Non-residential buildings": (
            "Commercial, industrial, or institutional buildings. "
            "The spectral signature varies with building materials and roof types. "
            "These structures often have larger footprints than residential buildings."
        ),
        "Roads": (
            "Paved transportation routes including streets and access roads. "
            "The spectral signature is characteristic of asphalt or concrete surfaces. "
            "These features show linear patterns in the urban landscape."
        ),
        "Sidewalks": (
            "Concrete or paved pedestrian walkways. "
            "The spectral signature is typical of concrete or similar materials. "
            "These features are usually adjacent to roads and buildings."
        ),
        "Crosswalks": (
            "Marked pedestrian crossing areas on roads. "
            "The spectral signature shows patterns of road marking materials. "
            "These features have distinctive patterns within road surfaces."
        ),
        "Major thoroughfares": (
            "Wide, main roads with multiple lanes for higher traffic volume. "
            "The spectral signature indicates extensive paved surfaces. "
            "These roads are major transportation arteries."
        ),
        "Highways": (
            "Large, multi-lane roads for high-speed vehicular traffic. "
            "The spectral signature shows extensive asphalt or concrete surfaces. "
            "These are the largest road features in the urban landscape."
        ),
        "Railways": (
            "Train tracks and associated railroad infrastructure. "
            "The spectral signature includes tracks, gravel, and surrounding features. "
            "These features show distinctive linear patterns."
        ),
        "Paved parking lots": (
            "Asphalt or concrete surfaces designated for vehicle parking. "
            "The spectral signature is similar to roads but in larger continuous areas. "
            "These areas show extensive impervious surface coverage."
        ),
        "Unpaved parking lots": (
            "Gravel or dirt surfaces used for vehicle parking. "
            "The spectral signature indicates unpaved, compacted surfaces. "
            "These areas show different patterns from paved surfaces."
        ),
        "Cars": (
            "Parked or moving vehicles visible in the image. "
            "The spectral signature is influenced by vehicle materials and shadows. "
            "These features appear as small objects in parking areas or on roads."
        ),
        "Trains": (
            "Railroad cars or locomotives on railway tracks. "
            "The spectral signature includes metal surfaces and associated features. "
            "These objects appear along railway infrastructure."
        ),
        "Stadium seats": (
            "Seating areas in sports stadiums or amphitheaters. "
            "The spectral signature is influenced by seating materials and arrangement. "
            "These features show regular patterns in recreational facilities."
        )
    }

    def __init__(
        self,
        model_name: str = "bert-base-uncased",
        embedding_dim: int = 256,
        device: str = None,
    ):
        super().__init__()
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load BERT model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name).to(self.device)

        # Freeze BERT by default
        for param in self.bert.parameters():
            param.requires_grad = False

        # Project from the encoder's hidden size (768 for bert-base) to the
        # desired embedding dimension. Reading it from the config keeps other
        # `model_name` choices with a different width working.
        self.projection = nn.Linear(self.bert.config.hidden_size, embedding_dim).to(self.device)

        # Per-instance copy so edits never leak into the class-level default
        self.class_descriptions = dict(self.DEFAULT_CLASS_DESCRIPTIONS)

    def _encode_texts(self, texts: list) -> torch.Tensor:
        """Encode a non-empty list of strings into L2-normalized feature rows."""
        # Tokenize with truncation and padding (padding aligns texts in a batch)
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(self.device)

        # Build an autograd graph through BERT only when it is trainable AND
        # the caller has not disabled gradients (e.g. an outer torch.no_grad()).
        bert_trainable = any(param.requires_grad for param in self.bert.parameters())
        with torch.set_grad_enabled(torch.is_grad_enabled() and bert_trainable):
            outputs = self.bert(**inputs)
            # Use CLS token embedding as text representation
            text_features = outputs.last_hidden_state[:, 0, :]

        # Project to desired dimension and normalize
        text_features = self.projection(text_features)
        return torch.nn.functional.normalize(text_features, p=2, dim=-1)

    def encode_class_description(self, class_name: str) -> torch.Tensor:
        """
        Encode a class description into a feature vector.
        Uses full description if available, otherwise uses the class name.

        Returns:
            Tensor of shape ``(1, embedding_dim)``.
        """
        text = self.class_descriptions.get(class_name, class_name)
        return self._encode_texts([text])

    def encode_batch(self, class_names: list) -> torch.Tensor:
        """
        Encode a batch of class names into feature vectors.

        All texts go through BERT in a single padded batch rather than one
        forward pass per class.

        Returns:
            Tensor of shape ``(len(class_names), embedding_dim)``; an empty
            ``(0, embedding_dim)`` tensor when ``class_names`` is empty.
        """
        texts = [self.class_descriptions.get(name, name) for name in class_names]
        if not texts:
            # Nothing to encode: return an empty result instead of failing
            return self.projection.weight.new_empty((0, self.projection.out_features))
        return self._encode_texts(texts)

    def forward(self, class_names: list) -> torch.Tensor:
        """
        Forward pass - encode a batch of class names.
        """
        return self.encode_batch(class_names)

    def unfreeze(self, lr_scale: float = 0.1):
        """Unfreeze BERT for fine-tuning with scaled learning rate.

        Every BERT parameter gets ``requires_grad = True`` and an ``lr_scale``
        attribute set to ``lr_scale``. The attribute is only a tag: whatever
        builds the optimizer has to read it for the scale to take effect.
        """
        for param in self.bert.parameters():
            param.requires_grad = True
            # Set unconditionally: a `hasattr` guard would never be true on
            # fresh parameters, so the scale would never be recorded.
            param.lr_scale = lr_scale

    def freeze(self):
        """Freeze BERT."""
        for param in self.bert.parameters():
            param.requires_grad = False

# Smoke test for the text encoder
def test_text_encoder():
    """Build a TextEncoder, print embedding shapes and pairwise cosine similarities."""
    encoder = TextEncoder(embedding_dim=256)
    print(f"Created text encoder on device: {encoder.device}")

    # Classes used for the single / batch shape checks
    shape_check_classes = ["Healthy grass", "Stressed grass", "Artificial turf"]

    # One class at a time
    first_vector = encoder.encode_class_description(shape_check_classes[0])
    print(f"\nSingle class embedding shape: {first_vector.shape}")

    # Several classes at once
    stacked_vectors = encoder.encode_batch(shape_check_classes)
    print(f"Batch embeddings shape: {stacked_vectors.shape}")

    # Encode two related classes and one unrelated class for comparison
    vectors = {}
    for label in ("Healthy grass", "Stressed grass", "Water"):
        vectors[label] = encoder.encode_class_description(label)

    # Print the full similarity matrix, row by row
    print("\nCosine similarities:")
    cosine = torch.nn.functional.cosine_similarity
    for left, left_vector in vectors.items():
        for right, right_vector in vectors.items():
            score = cosine(left_vector, right_vector).item()
            print(f"{left} vs {right}: {score:.3f}")

# Run the test. It instantiates TextEncoder, which loads the pretrained
# tokenizer and BERT weights via `from_pretrained`, then prints embedding
# shapes and pairwise cosine similarities.
test_text_encoder()



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2025-07-28 07:44:43.371082: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753688683.501898      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753688683.537894      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Created text encoder on device: cuda

Single class embedding shape: torch.Size([1, 256])
Batch embeddings shape: torch.Size([3, 256])

Cosine similarities:
Healthy grass vs Healthy grass: 1.000
Healthy grass vs Stressed grass: 0.897
Healthy grass vs Water: 0.827
Stressed grass vs Healthy grass: 0.897
Stressed grass vs Stressed grass: 1.000
Stressed grass vs Water: 0.873
Water vs Healthy grass: 0.827
Water vs Stressed grass: 0.873
Water vs Water: 1.000
