In [2]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from tqdm import tqdm
import glob
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import timm
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import spearmanr

In [3]:
DATA_STRUCTURE_PATH_BER = "/kaggle/input/berlin-bin-data/download_checkpoint.json"
IMAGE_BASE_DIR_BER = "/kaggle/input/geovaluatorberlin/images"
DATA_STRUCTURE_PATH_MUN = "/kaggle/input/muen-structure/download_checkpoint_muen_old.json"
IMAGE_BASE_DIR_MUN = "/kaggle/input/muenchen-data/muenchen_images"

In [4]:
def set_seed():
    seed = 28
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True    
set_seed()

In [9]:
class EfficientNetRegressor(nn.Module):
    def __init__(self, model_name='efficientnet_b3'):
        super(EfficientNetRegressor, self).__init__()
        
        self.backbone = timm.create_model(model_name, 
                                         pretrained=True,
                                         num_classes=0,
                                         global_pool='avg')
        
        # Single output neuron for regression
        self.regressor = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(self.backbone.num_features, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(0.1),
            
            nn.Dropout(0.4),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.1),
            
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.1),
            
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.1),
            
            nn.Dropout(0.1),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        features = self.backbone(x)
        return self.regressor(features)

class StreetViewDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['image_path']
        label = self.dataframe.iloc[idx]['label']
        
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)
            
        return image, label

def cross_city_spearman_analysis(berlin_model, munich_model, berlin_test_loader, munich_test_loader, device):
    print("Berlin Model with Munich Data")
    berlin_model.eval()
    munich_predictions = []
    munich_true_labels = []
    
    with torch.no_grad():
        for images, labels in munich_test_loader:
            images = images.to(device)
            outputs = berlin_model(images)
            munich_predictions.extend(outputs.cpu().numpy().flatten())
            munich_true_labels.extend(labels.numpy().flatten())
    
    spearman_berlin_on_munich = spearmanr(munich_predictions, munich_true_labels)
    print(f"Spearman Correlation: {spearman_berlin_on_munich.correlation:.4f}")
    
    print(" Munich Model with Berlin Data")
    munich_model.eval()
    berlin_predictions = []
    berlin_true_labels = []
    
    with torch.no_grad():
        for images, labels in berlin_test_loader:
            images = images.to(device)
            outputs = munich_model(images)
            berlin_predictions.extend(outputs.cpu().numpy().flatten())
            berlin_true_labels.extend(labels.numpy().flatten())
    
    spearman_munich_on_berlin = spearmanr(berlin_predictions, berlin_true_labels)
    print(f"Spearman Correlation: {spearman_munich_on_berlin.correlation:.4f}")
    
    return {
        'berlin_on_munich': spearman_berlin_on_munich.correlation,
        'munich_on_berlin': spearman_munich_on_berlin.correlation
    }

In [7]:
# Usage:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Models load
berlin_model = EfficientNetRegressor(model_name='efficientnet_b3')
berlin_model.load_state_dict(torch.load('/kaggle/input/best_regression_model_berlin.pth/pytorch/default/1/best_regression_model_berlin.pth'))
berlin_model.to(device)

munich_model = EfficientNetRegressor(model_name='efficientnet_b3')  
munich_model.load_state_dict(torch.load('/kaggle/input/best_regression_model_muenchen.pth/pytorch/default/1/best_regression_model_muenchen.pth'))
munich_model.to(device)

Using device: cuda


EfficientNetRegressor(
  (backbone): EfficientNet(
    (conv_stem): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNormAct2d(
      40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): SiLU(inplace=True)
    )
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
          (bn1): BatchNormAct2d(
            40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
          )
          (aa): Identity()
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
            (act1): SiLU(inplace=True)
            (conv_expand): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
            (gate): Sigmoid()
          )
          

In [10]:
# BERLIN DATA
with open(DATA_STRUCTURE_PATH_BER, 'r') as f:
    data = json.load(f)
    
# Create a district to normalized price mapping
district_price_mapping = {}

for item in data['successful_downloads']:
    district = item['district']
    normalized_price = item['normalized_price']
    district_price_mapping[district] = normalized_price

image_paths = []
image_labels = []
district_names_list = []

# Get all district folders
all_district_folders = [d for d in os.listdir(IMAGE_BASE_DIR_BER) 
                       if os.path.isdir(os.path.join(IMAGE_BASE_DIR_BER, d))]

# Load images and assign the normalized price labels
for district_folder in all_district_folders:
    if district_folder in district_price_mapping:
        average_price_id = district_price_mapping[district_folder]
        district_image_dir = os.path.join(IMAGE_BASE_DIR_BER, district_folder)
        
        # Get all images in this district folder
        pattern = os.path.join(district_image_dir, '*.jpg')
        image_files = glob.glob(pattern)
        
        for image_path in image_files:
            image_paths.append(image_path)
            image_labels.append(average_price_id)
            district_names_list.append(district_folder)

# Create DataFrame
image_df = pd.DataFrame({
    'image_path': image_paths,
    'label': image_labels,
    'district': district_names_list
})

# Group images by label
label_groups = {}
for label in image_df['label'].unique():
    label_groups[label] = image_df[image_df['label'] == label]

# Split each label separately
train_dfs, val_dfs, test_dfs = [], [], []

for label, group in label_groups.items():
    train_val, test = train_test_split(group, test_size=0.0654, random_state=42)
    train, val = train_test_split(train_val, test_size=0.07, random_state=42)
    
    train_dfs.append(train)
    val_dfs.append(val)
    test_dfs.append(test)

train_df = pd.concat(train_dfs).reset_index(drop=True)
val_df = pd.concat(val_dfs).reset_index(drop=True)
test_df = pd.concat(test_dfs).reset_index(drop=True)


# Define transforms for Imagenet B3 (300x300)
INPUT_SIZE = 300

train_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # normalization for ImageNet
])

val_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create datasets
train_dataset = StreetViewDataset(train_df, transform=train_transform)
val_dataset = StreetViewDataset(val_df, transform=val_transform)
test_dataset = StreetViewDataset(test_df, transform=val_transform)

# Create dataloaders
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
berlin_test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [11]:
# Munich DATA
with open(DATA_STRUCTURE_PATH_MUN, 'r') as f:
    data = json.load(f)
    
# Create a district to normalized price mapping
district_price_mapping = {}

for item in data['successful_downloads']:
    district = item['district']
    normalized_price = item['normalized_price']
    district_price_mapping[district] = normalized_price

image_paths = []
image_labels = []
district_names_list = []

# Get all district folders
all_district_folders = [d for d in os.listdir(IMAGE_BASE_DIR_MUN) 
                       if os.path.isdir(os.path.join(IMAGE_BASE_DIR_MUN, d))]

# Load images and assign the normalized price labels
for district_folder in all_district_folders:
    if district_folder in district_price_mapping:
        average_price_id = district_price_mapping[district_folder]
        district_image_dir = os.path.join(IMAGE_BASE_DIR_MUN, district_folder)
        
        # Get all images in this district folder
        pattern = os.path.join(district_image_dir, '*.jpg')
        image_files = glob.glob(pattern)
        
        for image_path in image_files:
            image_paths.append(image_path)
            image_labels.append(average_price_id)
            district_names_list.append(district_folder)

# Create DataFrame
image_df = pd.DataFrame({
    'image_path': image_paths,
    'label': image_labels,
    'district': district_names_list
})

# Group images by label
label_groups = {}
for label in image_df['label'].unique():
    label_groups[label] = image_df[image_df['label'] == label]

# Split each label separately
train_dfs, val_dfs, test_dfs = [], [], []

for label, group in label_groups.items():
    train_val, test = train_test_split(group, test_size=0.0654, random_state=42)
    train, val = train_test_split(train_val, test_size=0.07, random_state=42)
    
    train_dfs.append(train)
    val_dfs.append(val)
    test_dfs.append(test)

train_df = pd.concat(train_dfs).reset_index(drop=True)
val_df = pd.concat(val_dfs).reset_index(drop=True)
test_df = pd.concat(test_dfs).reset_index(drop=True)


# Define transforms for Imagenet B3 (300x300)
INPUT_SIZE = 300

train_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # normalization for ImageNet
])

val_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create datasets
train_dataset = StreetViewDataset(train_df, transform=train_transform)
val_dataset = StreetViewDataset(val_df, transform=val_transform)
test_dataset = StreetViewDataset(test_df, transform=val_transform)

# Create dataloaders
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
munich_test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [14]:
# Run the analysis
results = cross_city_spearman_analysis(
    berlin_model, 
    munich_model, 
    berlin_test_loader,
    munich_test_loader, 
    device
)

Berlin Model with Munich Data
Spearman Correlation: 0.4318
p-value: 0.0000
 Munich Model with Berlin Data
Spearman Correlation: 0.2750
p-value: 0.0000


In [15]:
# Berlin Model with Munich Data: Moderate transferlearning
# Munich Model with Munich Data: Weak transferlearning