# Preprocessing data for TimeSen2Crop

Raw time series data:

    + Convert to 12 monthly composites using the median of clear pixels.

    + Remove cloudy/snowy/shadowed values.

    + If no clear value exists in a month, that month’s data is set to zero.

Output:

    + Clean data
    
    + Labels for each sample: the class (0, ..., 15) that each sample (pixel) belongs to

In [None]:
data_root = "/scratch/users/dtran/croptype/dataset/TimeSen2Crop"

# === Main process ===
# Walk data_root/<tile>/<class>/<sample>.csv, turn every sample into a monthly
# composite with extract_monthly_median, and collect the composites together
# with their class labels.

# Initialize variables for output: one composite and one label per sample.
all_data, all_label = [], []
label_map = build_label_map(data_root)

# Tiles are visited in sorted order so the output ordering is reproducible.
# desc="Tiles" only sets the text shown in front of the tqdm progress bar.
for tile in tqdm(sorted(os.listdir(data_root)), desc="Tiles"):
    tile_path = os.path.join(data_root, tile)
    if not os.path.isdir(tile_path):
        continue

    # Load acquisition dates. dates.csv is a regular file, so its presence has
    # to be tested with isfile(); an isdir() test is never true for it and
    # would cause every tile to be skipped.
    dates_path = os.path.join(tile_path, "dates.csv")
    if not os.path.isfile(dates_path):
        print(f"Missing {dates_path}, skipping...")
        continue
    try:
        # One entry per acquisition, taken from the "acquisition_date" column.
        date_list = pd.read_csv(dates_path)["acquisition_date"].tolist()
    except Exception as e:
        print(f"Could not read dates.csv file in {tile}: {e}")
        continue

    # Go into each class in the tile; only all-digit directory names count.
    for cls in sorted(os.listdir(tile_path)):
        cls_path = os.path.join(tile_path, cls)
        if not os.path.isdir(cls_path) or not cls.isdigit():
            continue
        label = label_map[cls]

        # Go into each sample file of class cls.
        for file in sorted(os.listdir(cls_path)):
            if not file.endswith(".csv"):
                continue
            # A single unreadable or malformed file must not abort a run over
            # the whole dataset, so failures are reported and the file skipped.
            try:
                df = pd.read_csv(os.path.join(cls_path, file))
                # Rows are matched to dates by position, so the two lengths
                # must agree; report the mismatch instead of skipping silently.
                if len(df) != len(date_list):
                    print(
                        f"Skipping {cls}/{file}: {len(df)} rows "
                        f"but {len(date_list)} dates"
                    )
                    continue
                result = extract_monthly_median(df, date_list)
                all_data.append(result)
                all_label.append(label)
            except Exception as e:
                print(f"Error in {cls}/{file}: {e}")

# Save final tensor

# Build label map, extract_monthly_median
## Build label map function

In [None]:
def build_label_map(data_root: str) -> dict:
    """Map every class directory name found under *data_root* to its integer value.

    The layout scanned is ``data_root/<tile>/<class>``. Only sub-directories
    whose name consists solely of digits are treated as classes, so a folder
    named ``"3"`` yields the entry ``{"3": 3}``. A class that appears in
    several tiles is recorded once.

    Args:
        data_root: Path of the directory that contains the tile directories.

    Returns:
        Dictionary from class directory name (``str``) to class id (``int``).
    """
    mapping = {}
    for tile_name in os.listdir(data_root):
        tile_dir = os.path.join(data_root, tile_name)
        if os.path.isdir(tile_dir):
            for entry in os.listdir(tile_dir):
                is_class_dir = (
                    os.path.isdir(os.path.join(tile_dir, entry))
                    and entry.isdigit()
                )
                if is_class_dir:
                    # setdefault keeps the first registration of a class name.
                    mapping.setdefault(entry, int(entry))
    return mapping

## extract_monthly_median function

In [None]:
from datetime import datetime

n_bands = 9
n_months = 12
# Positional indices of the reflectance columns: the first n_bands columns.
band_indices = list(range(n_bands))
flag_col = 'Flag'

# Date layouts accepted for acquisition dates, tried in this order.
_DATE_FORMATS = ("%Y%m%d", "%Y-%m-%d")


def _month_of(date_value) -> int:
    """Return the calendar month (1-12) of one acquisition date."""
    # pandas parses an all-digit date column as integers, while strptime only
    # accepts strings, so normalise to text before parsing.
    text = str(date_value).strip()
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt).month
        except ValueError:
            continue
    raise ValueError(f"Unrecognized acquisition date: {date_value!r}")


def extract_monthly_median(data: pd.DataFrame, dates: list) -> np.ndarray:
    """Compute monthly medians for clear pixels.

    Rows of *data* are matched to *dates* by position. For every calendar
    month, the per-band median is taken over the rows acquired in that month
    whose ``flag_col`` value equals 0 (treated here as "clear"). A month
    without any such row is left at zero.

    Args:
        data: One sample's time series. The first ``n_bands`` columns are read
            as reflectance values and the ``flag_col`` column as the flag.
        dates: One acquisition date per row of *data*, given as ``YYYYMMDD``
            or ``YYYY-MM-DD`` (strings or integers).

    Returns:
        ``float32`` array of shape ``(n_months, n_bands)``; row ``m - 1`` holds
        the composite for month ``m``.

    Raises:
        ValueError: If a date cannot be parsed, or if *dates* and *data* do
            not have the same length.
    """
    if len(dates) != len(data):
        raise ValueError(
            f"Got {len(dates)} dates for {len(data)} rows; they must match"
        )

    reflectance = data.iloc[:, band_indices].values
    clear = data[flag_col].values == 0
    months = np.array([_month_of(d) for d in dates], dtype=np.int64)

    # Months with no clear acquisition keep this zero initialisation.
    monthly = np.zeros((n_months, n_bands), dtype=np.float32)
    for m in range(1, n_months + 1):
        selected = clear & (months == m)
        if selected.any():
            monthly[m - 1] = np.median(reflectance[selected], axis=0)
    return monthly