# Split the data
---

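The dataset does not come pre-split into training and validation sets, so this notebook creates that split. Samples are grouped by plant location, so that all samples from a given plant end up in the same set.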

## Setup

### Import libraries

In [None]:
import os
import pandas as pd

### Set parameters

In [None]:
# Target fraction of the samples to assign to the validation set
val_perc = 0.2

In [None]:
# Root folder of the dataset, which holds the metadata CSV
meta_path = "/Users/andreferreira/Documents_Offline/Datasets/WorkResearch/MultitaskNeurIPS2021"
# The segmentation labels live in a subfolder of the dataset root
seg_path = f"{meta_path}/segmentation_labels"

## Load the metadata

In [None]:
# Read the metadata table from the dataset root folder
meta_csv_path = os.path.join(meta_path, "reg_co2_data.csv")
meta_df = pd.read_csv(meta_csv_path)
meta_df

In [None]:
# Share of rows that have no generation output value
null_gen_count = meta_df["gen_output"].isna().sum()
perc_null_gen = null_gen_count / len(meta_df)
print(f"Percentage of null values in generation output: {perc_null_gen * 100:.2f}%")

In [None]:
# Count the rows whose generation output is exactly zero
meta_df["gen_output"].eq(0).sum()

In [None]:
# Keep only the rows that have a generation output value.
# The explicit copy makes the result an independent dataframe, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
num_rows_orig = len(meta_df)
meta_df = meta_df[meta_df["gen_output"].notnull()].copy()
num_rows_new = len(meta_df)
print(f"Number of rows removed: {num_rows_orig - num_rows_new}")

## Split the data

### Define the sets

In [None]:
# Assign a sequential integer ID to every unique (lat, lon) pair;
# each distinct coordinate pair is treated as one plant.
plant_ids = {
    coords: plant_id
    for plant_id, coords in enumerate(meta_df.groupby(["lat", "lon"]).groups)
}
plant_ids

In [None]:
# Look up the plant ID of each row from its (lat, lon) coordinates
meta_df["plant_id"] = [
    plant_ids[coords] for coords in zip(meta_df["lat"], meta_df["lon"])
]
meta_df

In [None]:
# Number of samples (non-null filenames) available for each plant
samples_per_plant = meta_df.groupby("plant_id")["filename"].count()
samples_per_plant

In [None]:
# Target size of the validation set, truncated to a whole number of samples
total_samples = len(meta_df)
num_val_samples = int(total_samples * val_perc)
num_val_samples

In [None]:
# Pick whole plants for the validation set until it holds at least
# num_val_samples samples.
# Fixed seed so that the same split is obtained every time this cell runs.
split_seed = 42
# Shuffle the rows once and keep the first occurrence of each plant ID. As with
# drawing one random row at a time, a plant with more samples is more likely to
# be picked early, but this cannot loop forever and needs no membership checks.
shuffled_plants = (
    meta_df["plant_id"].sample(frac=1, random_state=split_seed).drop_duplicates()
)
cur_num_val_samples = 0
val_samples_ids = list()
for cur_plant in shuffled_plants:
    if cur_num_val_samples >= num_val_samples:
        break
    val_samples_ids.append(cur_plant)
    # Plants are added as a whole, so the final count may exceed the target
    cur_num_val_samples += samples_per_plant[cur_plant]

In [None]:
# Number of samples actually selected for validation; it can exceed
# num_val_samples because all the samples of a plant are added at once
cur_num_val_samples

In [None]:
# Names (without the ".tif" extension) of the samples that belong to the
# plants selected for validation.
is_val_row = meta_df["plant_id"].isin(val_samples_ids)
# regex=False makes ".tif" a literal string; when the pattern is treated as a
# regular expression (the default in older pandas versions), the "." matches
# any character.
val_samples_files = (
    meta_df.loc[is_val_row, "filename"].str.replace(".tif", "", regex=False).values
)
val_samples_files

### Move the segmentation files to the set folders

In [None]:
# List the names of all the entries currently inside the segmentation labels folder
seg_files = os.listdir(seg_path)
seg_files

In [None]:
# Move each segmentation file into the "validation" or "training" subfolder,
# depending on whether its sample was selected for validation.
val_dir = os.path.join(seg_path, "validation")
train_dir = os.path.join(seg_path, "training")
os.makedirs(val_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)
# Build the lookup once; set membership avoids scanning the whole array per file
val_samples_lookup = set(val_samples_files)
for seg_file in seg_files:
    src_path = os.path.join(seg_path, seg_file)
    # Skip anything that is not a regular file in the source folder, such as
    # the set folders themselves or files already moved by a previous run
    if not os.path.isfile(src_path):
        continue
    # The sample name is the part of the file name before "_features"
    sample_name = seg_file.split("_features")[0]
    dst_dir = val_dir if sample_name in val_samples_lookup else train_dir
    os.rename(src_path, os.path.join(dst_dir, seg_file))