### Import necessary libraries

In [1]:
import os

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, Conv1D, Dense, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed, to_categorical

2025-07-14 16:23:24.005610: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752510204.229998      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752510204.295175      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Loading Data

In [2]:
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Sentinel2.csv
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/SampleSubmission.csv
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Sentinel1.csv
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Test.csv
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Orenburg_training_samples.shx
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Fergana_training_samples.dbf
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Fergana_training_samples.shp
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Orenburg_training_samples.dbf
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Fergana_training_samples.shx
/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Orenburg_training_samples.shp


In [3]:
# Read the Sentinel-1 and Sentinel-2 tables; the 'date' column is discarded
# as each file is loaded.
s1, s2 = (
    pd.read_csv(csv_path).drop(columns=['date'])
    for csv_path in (
        "/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Sentinel1.csv",
        "/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Sentinel2.csv",
    )
)

In [4]:
# Preview the first five Sentinel-1 rows.
s1.head(n=5)

Unnamed: 0,ID,VH,VV,orbit,polarization,rel_orbit,translated_lat,translated_lon
0,ID_AFQOFP,-21.479683,-16.633259,DESCENDING,"[VV, VH]",78.0,41.652292,72.144256
1,ID_AFQOFP,-24.76911,-15.943674,DESCENDING,"[VV, VH]",78.0,41.652289,72.144375
2,ID_AFQOFP,-25.370838,-15.185609,DESCENDING,"[VV, VH]",78.0,41.652286,72.144495
3,ID_AFQOFP,-24.134005,-16.351102,DESCENDING,"[VV, VH]",78.0,41.652283,72.144614
4,ID_AFQOFP,-20.654249,-16.792723,DESCENDING,"[VV, VH]",78.0,41.65228,72.144733


In [5]:
# Preview the first five Sentinel-2 rows.
s2.head(n=5)

Unnamed: 0,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,ID,cloud_pct,solar_azimuth,solar_zenith,translated_lat,translated_lon
0,2169,1820,1328,1610,1670,1985,2446,2628,2598,2638,ID_ZHZRHO,6.980395,139.093139,22.625533,40.935173,71.617062
1,2151,1770,1306,1586,1640,1961,2495,2691,2684,2732,ID_ZHZRHO,6.980395,139.093139,22.625533,40.935171,71.61718
2,2169,1820,1456,1674,1808,1985,2446,2628,2486,2638,ID_ZHZRHO,6.980395,139.093139,22.625533,40.935085,71.61694
3,2169,1820,1284,1604,1658,1985,2446,2628,2658,2638,ID_ZHZRHO,6.980395,139.093139,22.625533,40.935083,71.617059
4,2151,1770,1242,1522,1564,1961,2495,2691,2696,2732,ID_ZHZRHO,6.980395,139.093139,22.625533,40.935081,71.617177


### Data Preparation

In [6]:
# Print the (rows, columns) shape of each Sentinel table, with a blank gap
# between the two reports.
for position, (caption, frame) in enumerate((('s1 shape:', s1), ('s2 shape:', s2))):
    if position:
        print('\n')
    print(caption, frame.shape)

s1 shape: (1752570, 8)


s2 shape: (5610393, 16)


In [7]:
# Column dtypes and memory footprint of both Sentinel tables, separated by
# a 50-character rule.
divider = '-'*50
s1.info()
print(divider)
s2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752570 entries, 0 to 1752569
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ID              object 
 1   VH              float64
 2   VV              float64
 3   orbit           object 
 4   polarization    object 
 5   rel_orbit       float64
 6   translated_lat  float64
 7   translated_lon  float64
dtypes: float64(5), object(3)
memory usage: 107.0+ MB
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5610393 entries, 0 to 5610392
Data columns (total 16 columns):
 #   Column          Dtype  
---  ------          -----  
 0   B11             int64  
 1   B12             int64  
 2   B2              int64  
 3   B3              int64  
 4   B4              int64  
 5   B5              int64  
 6   B6              int64  
 7   B7              int64  
 8   B8              int64  
 9   B8A             int64  
 10  ID              object 
 11

In [8]:
# Count missing values per column in each Sentinel table.
for position, (tag, frame) in enumerate((('s1', s1), ('s2', s2))):
    if position:
        print('-'*50)
    print(tag)
    print(frame.isna().sum())

s1
ID                0
VH                0
VV                0
orbit             0
polarization      0
rel_orbit         0
translated_lat    0
translated_lon    0
dtype: int64
--------------------------------------------------
s2
B11               0
B12               0
B2                0
B3                0
B4                0
B5                0
B6                0
B7                0
B8                0
B8A               0
ID                0
cloud_pct         0
solar_azimuth     0
solar_zenith      0
translated_lat    0
translated_lon    0
dtype: int64


### Extract Labelled GeoData

In [9]:
# Load the two regional training shapefiles and keep only the label column
# and the geometry.
fergana_gdf = gpd.read_file("/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Fergana_training_samples.shp")
orenburg_gdf = gpd.read_file("/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Train/Orenburg_training_samples.shp")

# ignore_index=True gives the combined frame one unique 0..n-1 index instead
# of repeating each file's own row labels.
train_gdf = pd.concat([fergana_gdf, orenburg_gdf], ignore_index=True)

# .copy() makes the subset an independent frame, so the column assignments
# below never write into a view of the concatenated frame.
train_gdf = train_gdf[['Cropland', 'geometry']].copy()

# NOTE(review): .x / .y assume every geometry is a Point and that the
# coordinates are lon/lat degrees comparable to the Sentinel
# translated_lon / translated_lat columns — confirm the shapefile CRS.
train_gdf["lon"] = train_gdf.geometry.x
train_gdf["lat"] = train_gdf.geometry.y

In [10]:
# Give every Sentinel-2 ID the Cropland value of its nearest labelled point.
# The ID position is the mean of its pixels' translated_lat / translated_lon.
# NOTE(review): every ID present in s2 receives a label; there is no cutoff on
# `dist`, so an ID far from any labelled point is still labelled. Distances
# are plain Euclidean in (lat, lon) degrees, not metres — confirm both are
# acceptable.
label_coords = train_gdf[["lat", "lon"]].values
tree = KDTree(label_coords)

s2_points = s2.groupby("ID")[["translated_lat", "translated_lon"]].mean().reset_index()
id_centroids = s2_points[["translated_lat", "translated_lon"]].values

dist, idx = tree.query(id_centroids, k=1)
nearest_rows = idx.flatten()
s2_points["label"] = train_gdf.iloc[nearest_rows].Cropland.values
s2_labels = s2_points[["ID", "label"]]

In [11]:
# Per-ID summary statistics (the ID column is the grouping key, not a feature)
def aggregate_features(df, id_col="ID"):
    """Summarise every non-key column of `df` per `id_col` group.

    The coordinate columns "translated_lat" and "translated_lon" are removed
    first when present. Each remaining column is reduced to its mean, std,
    min and max within each group.

    Returns a DataFrame with one row per group and two-level columns of the
    form (column, statistic); the group key comes back as a regular column.
    """
    stats = ['mean', 'std', 'min', 'max']
    without_coords = df.drop(columns=["translated_lat", "translated_lon"], errors='ignore')
    per_id = without_coords.groupby(id_col)
    return per_id.agg(stats).reset_index()

# Per-ID statistics over the Sentinel-1 backscatter and Sentinel-2 band columns.
# This runs over every ID present in s1 / s2.
s1_columns = ["ID", "VH", "VV"]
s2_columns = ["ID", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B11", "B12"]
s1_feats = aggregate_features(s1[s1_columns])
s2_feats = aggregate_features(s2[s2_columns])

# Collapse the two-level (column, statistic) headers into flat names such as
# "VH_mean"; the key column ("ID", "") becomes plain "ID".
for feats in (s1_feats, s2_feats):
    feats.columns = ['_'.join(col).strip("_") for col in feats.columns.values]

#### Merge training features and labels

In [12]:
# Join Sentinel-2 and Sentinel-1 features per ID, attach the labels, and
# discard any row that still has a missing value.
train_df = (
    s2_feats
    .merge(s1_feats, on="ID", how="outer")
    .merge(s2_labels, on="ID", how="inner")
    .dropna()
)

In [13]:
# Preview the first five rows of the merged training table.
train_df.head(n=5)

Unnamed: 0,ID,B2_mean,B2_std,B2_min,B2_max,B3_mean,B3_std,B3_min,B3_max,B4_mean,...,B12_max,VH_mean,VH_std,VH_min,VH_max,VV_mean,VV_std,VV_min,VV_max,label
0,ID_ABQOQT,2678.931516,2533.538818,749,16975,3087.956733,2446.428918,1015,16883,3313.985663,...,4938,-20.59984,3.788596,-48.104045,-11.83779,-10.241005,2.984731,-20.647057,-0.260766,0
1,ID_ADDROF,2180.467148,793.942887,314,10032,2551.1519,745.78422,684,9640,2725.154697,...,8095,-19.009345,3.215299,-37.340685,-9.947574,-9.077417,3.564367,-20.568176,6.495257,0
2,ID_AFIWZH,1760.09741,735.938649,76,6572,2015.75032,689.742879,342,6560,2055.006831,...,6238,-17.171517,2.445001,-33.776153,-10.090858,-7.539021,3.765559,-18.14654,4.695586,1
3,ID_AFQOFP,2459.410762,1700.451288,789,13704,2913.663884,1596.009589,1090,13088,3316.157296,...,7570,-24.251169,4.94117,-49.731128,-12.438228,-14.699856,3.231105,-25.396003,-2.288849,0
4,ID_AHRONV,2965.180911,3141.400874,439,10392,3160.444765,2983.807369,855,10456,3339.176704,...,7629,-25.955984,5.098076,-47.658529,-16.043004,-15.333656,3.817844,-26.721309,-5.322858,0


In [14]:
# Hold out 20% of the rows for validation, keeping the label proportions
# equal in both parts; random_state fixes the split.
y = train_df["label"]
X = train_df.drop(columns=["ID", "label"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### CNN Model

In [15]:
# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# masks and batch shuffling repeat on Restart & Run All.
# NOTE(review): some GPU kernels may still be non-deterministic.
set_random_seed(42)

# Standardise features; the scaler is fitted on the training split only and
# reused unchanged for validation (and must be reused for any later inference).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Conv1D expects a trailing channel axis: (samples, features, 1)
X_train_cnn = np.expand_dims(X_train_scaled, axis=2)
X_val_cnn = np.expand_dims(X_val_scaled, axis=2)

# One-hot encode the labels. No re-encoding happens here: to_categorical
# treats the values as integer class ids, so labels must already be
# 0..num_classes-1.
num_classes = len(np.unique(y_train))
y_train_cnn = to_categorical(y_train, num_classes)
y_val_cnn = to_categorical(y_val, num_classes)

# Two Conv1D layers over the feature vector, then a dense softmax classifier
model = Sequential([
    Input(shape=(X_train_cnn.shape[1], 1)),
    Conv1D(32, kernel_size=3, activation='relu'),
    Dropout(0.2),
    Conv1D(64, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

I0000 00:00:1752510241.671633      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [16]:
# Write the model to disk only on epochs where validation accuracy improves
checkpoint_path = 'best_cnn_model.h5'
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Fit for 20 epochs with the checkpoint callback watching the validation split
model.fit(
    x=X_train_cnn,
    y=y_train_cnn,
    validation_data=(X_val_cnn, y_val_cnn),
    epochs=20,
    batch_size=32,
    callbacks=[checkpoint],
    verbose=2,
)

# Restore the best saved weights so evaluation does not use the last epoch
model.load_weights(checkpoint_path)

Epoch 1/20


I0000 00:00:1752510244.973601      57 service.cc:148] XLA service 0x7e31f000a330 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1752510244.974344      57 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1752510245.253176      57 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1752510247.320055      57 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



Epoch 1: val_accuracy improved from -inf to 0.64167, saving model to best_cnn_model.h5
15/15 - 5s - 334ms/step - accuracy: 0.6187 - loss: 0.6589 - val_accuracy: 0.6417 - val_loss: 0.5957
Epoch 2/20

Epoch 2: val_accuracy improved from 0.64167 to 0.67500, saving model to best_cnn_model.h5
15/15 - 0s - 9ms/step - accuracy: 0.6417 - loss: 0.6248 - val_accuracy: 0.6750 - val_loss: 0.5892
Epoch 3/20

Epoch 3: val_accuracy did not improve from 0.67500
15/15 - 0s - 7ms/step - accuracy: 0.6521 - loss: 0.6037 - val_accuracy: 0.6667 - val_loss: 0.5685
Epoch 4/20

Epoch 4: val_accuracy did not improve from 0.67500
15/15 - 0s - 7ms/step - accuracy: 0.6458 - loss: 0.5992 - val_accuracy: 0.6417 - val_loss: 0.5858
Epoch 5/20

Epoch 5: val_accuracy improved from 0.67500 to 0.70000, saving model to best_cnn_model.h5
15/15 - 0s - 8ms/step - accuracy: 0.6646 - loss: 0.5869 - val_accuracy: 0.7000 - val_loss: 0.5706
Epoch 6/20

Epoch 6: val_accuracy did not improve from 0.70000
15/15 - 0s - 7ms/step - acc

In [17]:
# Score the restored model on the validation split. The model was compiled
# with a single metric, so evaluate returns loss followed by accuracy.
val_metrics = model.evaluate(X_val_cnn, y_val_cnn, verbose=0)
val_loss, val_acc = val_metrics
print("Validation Accuracy:", val_acc)

Validation Accuracy: 0.699999988079071


In [18]:
# Keep only the Sentinel rows whose ID is listed in Test.csv
test_meta = pd.read_csv("/kaggle/input/geoai-challenge-for-cropland-mapping-dry-dataset/Test.csv")
test_ids = test_meta["ID"].unique()

s1_is_test = s1["ID"].isin(test_ids)
s2_is_test = s2["ID"].isin(test_ids)
s1_test = s1[s1_is_test]
s2_test = s2[s2_is_test]

In [19]:
# Aggregate the test rows per ID with the same statistics used for training.
# The 'date' column was already removed when the tables were loaded.
s1_test_columns = ["ID", "VH", "VV"]
s2_test_columns = ["ID", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B11", "B12"]
s1_test_feats = aggregate_features(s1_test[s1_test_columns])
s2_test_feats = aggregate_features(s2_test[s2_test_columns])

# Flatten the (column, statistic) headers to names like "B2_mean"
for test_feats in (s1_test_feats, s2_test_feats):
    test_feats.columns = ['_'.join(col).strip("_") for col in test_feats.columns.values]

In [20]:
# Merge test features. Missing values are left as NaN here and imputed after
# scaling, because a raw 0 is far outside the range of these features.
test_df = s2_test_feats.merge(s1_test_feats, on="ID", how="outer")

# Select exactly the training feature columns, in training order; a missing
# column raises a KeyError instead of silently shifting features.
X_test = test_df[list(X.columns)]

# Apply the scaler fitted on the training split: the network was trained on
# standardised inputs, so raw values would give meaningless predictions.
# StandardScaler passes NaN through; replacing it with 0 afterwards imputes
# the training mean of that feature.
X_test_scaled = np.nan_to_num(scaler.transform(X_test), nan=0.0)

# Add the channel axis the Conv1D input layer was built with: (samples, features, 1)
X_test_cnn = np.expand_dims(X_test_scaled, axis=2)
test_preds = model.predict(X_test_cnn)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step


In [21]:
# Turn the per-class probabilities into a class id per row (index of the
# largest probability) and write ID/label pairs to submission.csv.
predicted_labels = test_preds.argmax(axis=1)
submission = test_df[['ID']].assign(label=predicted_labels)
submission.to_csv('submission.csv', index=False)