In [63]:
import os

base_path = '/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset'
files = os.listdir(base_path)
print("Files and folders in the dataset:")
for f in files:
    print(f)


Files and folders in the dataset:
Sentinel2.csv
SampleSubmission.csv
Sentinel1.csv
Test.csv
Train


In [64]:
pip install rasterio


Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from glob import glob

import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torch.nn as nn
import torch.optim as optim


In [None]:
class CroplandDataset(Dataset):
    def __init__(self, image_paths, mask_paths, transform=None):
        self.image_paths = image_paths
        self.mask_paths = mask_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img = Image.open(self.image_paths[idx])
        mask = Image.open(self.mask_paths[idx]).convert("L")

        img = img.resize((256, 256))
        mask = mask.resize((256, 256))

        img = np.array(img) / 255.0
        mask = np.array(mask) / 255.0
        mask = np.round(mask)  # ensure binary

        img = torch.tensor(img, dtype=torch.float).permute(2, 0, 1)
        mask = torch.tensor(mask, dtype=torch.float).unsqueeze(0)

        return img, mask


In [None]:
class UNet(nn.Module):
    def __init__(self):
        super(UNet, self).__init__()

        def conv_block(in_c, out_c):
            return nn.Sequential(
                nn.Conv2d(in_c, out_c, 3, padding=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(out_c, out_c, 3, padding=1),
                nn.ReLU(inplace=True)
            )

        self.enc1 = conv_block(3, 64)
        self.pool1 = nn.MaxPool2d(2)
        self.enc2 = conv_block(64, 128)
        self.pool2 = nn.MaxPool2d(2)

        self.bottleneck = conv_block(128, 256)

        self.up2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = conv_block(256, 128)
        self.up1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = conv_block(128, 64)

        self.out = nn.Conv2d(64, 1, kernel_size=1)

    def forward(self, x):
        enc1 = self.enc1(x)
        enc2 = self.enc2(self.pool1(enc1))
        bottleneck = self.bottleneck(self.pool2(enc2))
        dec2 = self.dec2(torch.cat([self.up2(bottleneck), enc2], dim=1))
        dec1 = self.dec1(torch.cat([self.up1(dec2), enc1], dim=1))
        return torch.sigmoid(self.out(dec1))


In [None]:
def train(model, loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for img, mask in loader:
        img, mask = img.to(device), mask.to(device)
        pred = model(img)
        loss = loss_fn(pred, mask)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


In [None]:
# You will need to glob actual image/mask paths based on your dataset
image_paths = sorted(glob('/kaggle/input/geoai-challenge-for-cropland-mapping-dry/images/*.tif'))
mask_paths  = sorted(glob('/kaggle/input/geoai-challenge-for-cropland-mapping-dry/masks/*.tif'))

dataset = CroplandDataset(image_paths, mask_paths)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UNet().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.BCELoss()

for epoch in range(5):  # keep small for baseline
    loss = train(model, loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch+1} | Loss: {loss:.4f}")


In [None]:
import os

base_path = '/kaggle/input/geoai-challenge-for-cropland-mapping-dry'
for root, dirs, files in os.walk(base_path):
    print(f"\n📁 Folder: {root}")
    for file in files[:5]:  # Show only a few files per folder
        print(f"   📄 {file}")


In [None]:
print("Number of images found:", len(image_paths))
print("Number of masks found:", len(mask_paths))
print("Sample image path:", image_paths[0] if image_paths else "None")
print("Sample mask path:", mask_paths[0] if mask_paths else "None")


In [None]:
import os

base_path = '/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset'

for root, dirs, files in os.walk(base_path):
    print(f"\n📁 {root}")
    for f in files[:5]:  # Show only the first 5 files in each folder
        print(f"   📄 {f}")


In [70]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import ModelCheckpoint
from shapely.geometry import Point
from sklearn.neighbors import KDTree
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, Dense, Flatten, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier

In [71]:
import tensorflow as tf
print(tf.__version__)

2.18.0


In [72]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Sentinel2.csv
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/SampleSubmission.csv
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Sentinel1.csv
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Test.csv
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Orenburg_training_samples.shx
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Fergana_training_samples.dbf
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Fergana_training_samples.shp
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Orenburg_training_samples.dbf
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Fergana_training_samples.shx
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Orenburg_training_samples.shp


In [73]:
# Load Sentinel-1 and 2
s1 = pd.read_csv("/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Sentinel1.csv").drop(columns=['date'])
s2 = pd.read_csv("/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Sentinel2.csv").drop(columns=['date'])

In [76]:
s1.head()

Unnamed: 0,ID,VH,VV,orbit,polarization,rel_orbit,translated_lat,translated_lon
0,ID_AFQOFP,-21.479683,-16.633259,DESCENDING,"[VV, VH]",78.0,41.652292,72.144256
1,ID_AFQOFP,-24.76911,-15.943674,DESCENDING,"[VV, VH]",78.0,41.652289,72.144375
2,ID_AFQOFP,-25.370838,-15.185609,DESCENDING,"[VV, VH]",78.0,41.652286,72.144495
3,ID_AFQOFP,-24.134005,-16.351102,DESCENDING,"[VV, VH]",78.0,41.652283,72.144614
4,ID_AFQOFP,-20.654249,-16.792723,DESCENDING,"[VV, VH]",78.0,41.65228,72.144733


In [77]:
# Sentinel 1 & 2 information
s1.info()
print('-'*50)
s2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752570 entries, 0 to 1752569
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ID              object 
 1   VH              float64
 2   VV              float64
 3   orbit           object 
 4   polarization    object 
 5   rel_orbit       float64
 6   translated_lat  float64
 7   translated_lon  float64
dtypes: float64(5), object(3)
memory usage: 107.0+ MB
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5610393 entries, 0 to 5610392
Data columns (total 16 columns):
 #   Column          Dtype  
---  ------          -----  
 0   B11             int64  
 1   B12             int64  
 2   B2              int64  
 3   B3              int64  
 4   B4              int64  
 5   B5              int64  
 6   B6              int64  
 7   B7              int64  
 8   B8              int64  
 9   B8A             int64  
 10  ID              object 
 11

In [78]:
# shape of sentinel 1 & 2
print('s1 shape:', s1.shape)
print('\n')
print('s2 shape:', s2.shape)

s1 shape: (1752570, 8)


s2 shape: (5610393, 16)


In [79]:
# Sentinel 1 & 2 information
s1.info()
print('-'*50)
s2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752570 entries, 0 to 1752569
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ID              object 
 1   VH              float64
 2   VV              float64
 3   orbit           object 
 4   polarization    object 
 5   rel_orbit       float64
 6   translated_lat  float64
 7   translated_lon  float64
dtypes: float64(5), object(3)
memory usage: 107.0+ MB
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5610393 entries, 0 to 5610392
Data columns (total 16 columns):
 #   Column          Dtype  
---  ------          -----  
 0   B11             int64  
 1   B12             int64  
 2   B2              int64  
 3   B3              int64  
 4   B4              int64  
 5   B5              int64  
 6   B6              int64  
 7   B7              int64  
 8   B8              int64  
 9   B8A             int64  
 10  ID              object 
 11

In [80]:
# Check null values
print('s1')
print(s1.isna().sum())
print('-'*50)
print('s2')
print(s2.isna().sum())

s1
ID                0
VH                0
VV                0
orbit             0
polarization      0
rel_orbit         0
translated_lat    0
translated_lon    0
dtype: int64
--------------------------------------------------
s2
B11               0
B12               0
B2                0
B3                0
B4                0
B5                0
B6                0
B7                0
B8                0
B8A               0
ID                0
cloud_pct         0
solar_azimuth     0
solar_zenith      0
translated_lat    0
translated_lon    0
dtype: int64


In [82]:
# Load shapefiles and extract labeled geodata
fergana_gdf = gpd.read_file("/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Fergana_training_samples.shp")
orenburg_gdf = gpd.read_file("/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Orenburg_training_samples.shp")
train_gdf = pd.concat([fergana_gdf, orenburg_gdf])
train_gdf = train_gdf[['Cropland', 'geometry']]
train_gdf["lon"] = train_gdf.geometry.x
train_gdf["lat"] = train_gdf.geometry.y

In [83]:
# Match Sentinel2 ID points to nearest labeled point
tree = KDTree(train_gdf[["lat", "lon"]].values)
s2_points = s2.groupby("ID")[["translated_lat", "translated_lon"]].mean().reset_index()
dist, idx = tree.query(s2_points[["translated_lat", "translated_lon"]].values, k=1)
s2_points["label"] = train_gdf.iloc[idx.flatten()].Cropland.values
s2_labels = s2_points[["ID", "label"]]

In [84]:
# Aggregate function (excluding ID)
def aggregate_features(df, id_col="ID"):
    return df.drop(columns=["translated_lat", "translated_lon"], errors='ignore') \
             .groupby(id_col).agg(['mean', 'std', 'min', 'max']).reset_index()

# Aggregate train features
s1_feats = aggregate_features(s1[["ID", "VH", "VV"]])
s2_feats = aggregate_features(s2[["ID", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B11", "B12"]])

# Flatten multi-index columns
s1_feats.columns = ['_'.join(col).strip("_") for col in s1_feats.columns.values]
s2_feats.columns = ['_'.join(col).strip("_") for col in s2_feats.columns.values]

In [85]:
# Merge training features and labels
train_df = s2_feats.merge(s1_feats, on="ID", how="outer").merge(s2_labels, on="ID", how="inner")
train_df = train_df.dropna()

In [86]:
train_df.head()

Unnamed: 0,ID,B2_mean,B2_std,B2_min,B2_max,B3_mean,B3_std,B3_min,B3_max,B4_mean,...,B12_max,VH_mean,VH_std,VH_min,VH_max,VV_mean,VV_std,VV_min,VV_max,label
0,ID_ABQOQT,2678.931516,2533.538818,749,16975,3087.956733,2446.428918,1015,16883,3313.985663,...,4938,-20.59984,3.788596,-48.104045,-11.83779,-10.241005,2.984731,-20.647057,-0.260766,0
1,ID_ADDROF,2180.467148,793.942887,314,10032,2551.1519,745.78422,684,9640,2725.154697,...,8095,-19.009345,3.215299,-37.340685,-9.947574,-9.077417,3.564367,-20.568176,6.495257,0
2,ID_AFIWZH,1760.09741,735.938649,76,6572,2015.75032,689.742879,342,6560,2055.006831,...,6238,-17.171517,2.445001,-33.776153,-10.090858,-7.539021,3.765559,-18.14654,4.695586,1
3,ID_AFQOFP,2459.410762,1700.451288,789,13704,2913.663884,1596.009589,1090,13088,3316.157296,...,7570,-24.251169,4.94117,-49.731128,-12.438228,-14.699856,3.231105,-25.396003,-2.288849,0
4,ID_AHRONV,2965.180911,3141.400874,439,10392,3160.444765,2983.807369,855,10456,3339.176704,...,7629,-25.955984,5.098076,-47.658529,-16.043004,-15.333656,3.817844,-26.721309,-5.322858,0


In [87]:
# Train-test split
X = train_df.drop(columns=["ID", "label"])
y = train_df["label"]
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [88]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Prepare data for CNN: reshape to (samples, features, 1)
X_train_cnn = np.expand_dims(X_train_scaled, axis=2)
X_val_cnn = np.expand_dims(X_val_scaled, axis=2)

# Encode labels if not already 0/1
num_classes = len(np.unique(y_train))
y_train_cnn = to_categorical(y_train, num_classes)
y_val_cnn = to_categorical(y_val, num_classes)

# Build a simple 1D CNN model with Input layer
model = Sequential([
    Input(shape=(X_train_cnn.shape[1], 1)),
    Conv1D(32, kernel_size=3, activation='relu'),
    Dropout(0.2),
    Conv1D(64, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [89]:
# Save the best model based on validation accuracy
checkpoint = ModelCheckpoint('best_cnn_model.h5', monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)

# Train the model with the callback
model.fit(
    X_train_cnn, y_train_cnn,
    epochs=20,
    batch_size=32,
    validation_data=(X_val_cnn, y_val_cnn),
    callbacks=[checkpoint],
    verbose=2
)

# Load the best weights before evaluation
model.load_weights('best_cnn_model.h5')

Epoch 1/20

Epoch 1: val_accuracy improved from -inf to 0.60000, saving model to best_cnn_model.h5
15/15 - 3s - 181ms/step - accuracy: 0.6271 - loss: 0.6654 - val_accuracy: 0.6000 - val_loss: 0.6302
Epoch 2/20

Epoch 2: val_accuracy improved from 0.60000 to 0.65000, saving model to best_cnn_model.h5
15/15 - 0s - 14ms/step - accuracy: 0.6417 - loss: 0.6336 - val_accuracy: 0.6500 - val_loss: 0.5843
Epoch 3/20

Epoch 3: val_accuracy improved from 0.65000 to 0.66667, saving model to best_cnn_model.h5
15/15 - 0s - 14ms/step - accuracy: 0.6458 - loss: 0.6020 - val_accuracy: 0.6667 - val_loss: 0.5828
Epoch 4/20

Epoch 4: val_accuracy did not improve from 0.66667
15/15 - 0s - 12ms/step - accuracy: 0.6292 - loss: 0.5962 - val_accuracy: 0.6500 - val_loss: 0.5921
Epoch 5/20

Epoch 5: val_accuracy improved from 0.66667 to 0.68333, saving model to best_cnn_model.h5
15/15 - 0s - 14ms/step - accuracy: 0.6229 - loss: 0.5846 - val_accuracy: 0.6833 - val_loss: 0.5843
Epoch 6/20

Epoch 6: val_accuracy im

In [69]:
print(X_train_cnn.shape[0])  # Number of input samples
print(y_train_cnn.shape[0])  # Number of labels

10
480


In [90]:
model.add(Dropout(0.5))

In [91]:
model.add(BatchNormalization())

In [92]:
from tensorflow.keras.layers import BatchNormalization

In [93]:
model.add(BatchNormalization())

In [94]:
from tensorflow.keras.regularizers import l2
model.add(Dense(64, kernel_regularizer=l2(0.01)))

In [95]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)

In [96]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)
datagen.fit(X_train_cnn)

ValueError: Input to `.fit()` should have rank 4. Got array with shape: (480, 48, 1)

In [None]:
X_train_cnn = X_train_cnn.reshape((480, 48, 48, 1))

In [None]:
X_train_cnn = X_train_cnn.reshape((10, 48, 48, 1))

In [None]:
print(X_train_cnn.shape)
print(X_train_cnn.size)

In [None]:
datagen.fit(X_train_cnn)

In [97]:
# Evaluate on validation set
val_loss, val_acc = model.evaluate(X_val_cnn, y_val_cnn, verbose=0)
print("Validation Accuracy:", val_acc)

Validation Accuracy: 0.7083333134651184


In [98]:
# Prepare test set
test_meta = pd.read_csv("/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Test.csv")
test_ids = test_meta["ID"].unique()
s1_test = s1[s1["ID"].isin(test_ids)]
s2_test = s2[s2["ID"].isin(test_ids)]

In [99]:
# Drop date + aggregate test features
s1_test_feats = aggregate_features(s1_test[["ID", "VH", "VV"]])
s2_test_feats = aggregate_features(s2_test[["ID", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B11", "B12"]])
s1_test_feats.columns = ['_'.join(col).strip("_") for col in s1_test_feats.columns.values]
s2_test_feats.columns = ['_'.join(col).strip("_") for col in s2_test_feats.columns.values]

In [100]:
# Merge test features
test_df = s2_test_feats.merge(s1_test_feats, on="ID", how="outer").fillna(0)
X_test = test_df.drop(columns=["ID"])
test_preds = model.predict(X_test)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


In [101]:
# Save predictions to CSV
predicted_labels = np.argmax(test_preds, axis=1)
submission = pd.DataFrame({'ID': test_df['ID'], 'label': predicted_labels})
submission.to_csv('submission.csv', index=False)