### **LIBRARIES USED**

In [1]:
import os
from pathlib import Path
import shutil
import pandas as pd
import rasterio
import geopandas as gpd
import gc

### **LOADING DATASETS**

In [6]:
# dataset
data_path = Path("./s2_images")

train = pd.read_csv("data/Train.csv")
test = pd.read_csv("data/Test.csv")

In [7]:
train.head()

Unnamed: 0,ID,year,month,tifPath,Target,class
0,ID_h14T0B_Jan,2024,Jan,/content/drive/MyDrive/train_s2_images/s2_Rubb...,Rubber,3
1,ID_KbyKOr_Jan,2024,Jan,/content/drive/MyDrive/train_s2_images/s2_Rubb...,Rubber,3
2,ID_t4Tmmn_Jan,2024,Jan,/content/drive/MyDrive/train_s2_images/s2_Rubb...,Rubber,3
3,ID_yipWoC_Jan,2024,Jan,/content/drive/MyDrive/train_s2_images/s2_Rubb...,Rubber,3
4,ID_XKiksa_Jan,2024,Jan,/content/drive/MyDrive/train_s2_images/s2_Rubb...,Rubber,3


In [8]:
test.head()

Unnamed: 0,ID,year,month,tifPath
0,ID_731818_Jan,2024,Jan,/content/drive/MyDrive/test_s2_images_/s2_Unkn...
1,ID_790093_Jan,2024,Jan,/content/drive/MyDrive/test_s2_images_/s2_Unkn...
2,ID_931033_Jan,2024,Jan,/content/drive/MyDrive/test_s2_images_/s2_Unkn...
3,ID_079024_Jan,2024,Jan,/content/drive/MyDrive/test_s2_images_/s2_Unkn...
4,ID_691532_Jan,2024,Jan,/content/drive/MyDrive/test_s2_images_/s2_Unkn...


In [5]:
test.shape

(3384, 4)

In [7]:
def find_matching_tiffs(df, data_path):
    """
    Find and collect TIFF file paths from a directory that match IDs in a GeoDataFrame.

    Parameters:
    - df (DataFrame): A DataFrame containing an 'ID' column.
    - data_path (Path or str): Directory path containing TIFF files.

    Returns:
    - List[Path]: List of matching TIFF file paths.
    """
    data_path = Path(data_path)
    matching_tiffs = []

    for idx, row in df.iterrows():
        field_id = row['ID']
        for tif_file in os.listdir(data_path):
            if field_id in tif_file:
                tif_path = data_path / tif_file
                matching_tiffs.append(tif_path)

    return matching_tiffs

In [8]:
# Get the paths of train/test images
train_tiffs = find_matching_tiffs(train, data_path)
test_tiffs = find_matching_tiffs(test, data_path)

In [85]:
train_tiffs[:5]

[]

### **DATAFRAME WITH THE TIF DATA**

In [11]:
def get_tiff_path(idx, month_idx, year, tif_paths):
    month_str = f"{month_idx:02d}"  # ensures 01, 02, ..., 12
    identifier = f"{idx}_{year}_{month_str}"

    for tif_path in tif_paths:
        if identifier in tif_path.name:
            return tif_path

    return None  # Only return None if no match found

In [12]:
YEAR = 2024
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

def build_tiff_dataframe(gdf, tif_paths, year=2024, is_train=True, month_limit=12):
    """
    Constructs a DataFrame mapping each ID and month to its corresponding TIFF path.

    Parameters:
    - gdf: GeoDataFrame with 'ID' (and optionally 'crop', 'class' if is_train)
    - tif_paths: list or path collection of TIFF files
    - year: year to use for matching filenames
    - is_train: flag to indicate if it's for training (includes 'crop' and 'class')
    - month_range: limit the data to generating the dataframe for a specific range of months

    Returns:
    - A pandas DataFrame with columns: ID, year, month, tifPath, (optional: crop, class)
    """

    data = []
    for month_idx in range(1, month_limit+1):
        month = MONTHS[month_idx - 1]
        for idx in gdf['ID'].values:
            tif_path = get_tiff_path(idx, month_idx, year, tif_paths)
            entry = {
                "ID": f"{idx}_{month}",
                "year": year,
                "month": month,
                "tifPath": str(tif_path) if tif_path else None,
            }
            if is_train:
                entry["crop"] = gdf.loc[gdf['ID'] == idx, 'crop'].values[0]
                entry["class"] = gdf.loc[gdf['ID'] == idx, 'class'].values[0]
            data.append(entry)

    return pd.DataFrame(data)


def build_tiff_dataframe_single_month(gdf, tif_paths, year=2024, is_train=True, month_idx=1):
    """
    Constructs a DataFrame mapping each ID and month to its corresponding TIFF path.

    Parameters:
    - gdf: GeoDataFrame with 'ID' (and optionally 'crop', 'class' if is_train)
    - tif_paths: list or path collection of TIFF files
    - year: year to use for matching filenames
    - is_train: flag to indicate if it's for training (includes 'crop' and 'class')
    - month_range: limit the data to generating the dataframe for a specific range of months

    Returns:
    - A pandas DataFrame with columns: ID, year, month, tifPath, (optional: crop, class)
    """

    data = []
    month = MONTHS[month_idx - 1]
    for idx in gdf['ID'].values:
      tif_path = get_tiff_path(idx, month_idx, year, tif_paths)
      entry = {
          "ID": f"{idx}_{month}",
          "year": year,
          "month": month,
          "tifPath": str(tif_path) if tif_path else None,
          }
      if is_train:
        entry["crop"] = gdf.loc[gdf['ID'] == idx, 'crop'].values[0]
        entry["class"] = gdf.loc[gdf['ID'] == idx, 'class'].values[0]
      data.append(entry)

    return pd.DataFrame(data)

In [13]:
## Build a toy train dataframe for a single month (January)
df_train = build_tiff_dataframe_single_month(train, 'sentinel-2', 
                                             year=YEAR, is_train=True, month_idx=1)

AttributeError: 'str' object has no attribute 'name'