# Split data for machine learning
---

Experimenting with splitting the data for machine learning model training.

## Setup

### Imports

In [None]:
import numpy as np
import geopandas as gpd

In [None]:
from coal_emissions_monitoring.data_cleaning import get_final_dataset
from coal_emissions_monitoring.ml_utils import get_facility_set_mapper, split_data_in_sets

### Parameters

In [None]:
# Calendar year handed to `split_data_in_sets` as `test_year` when rows are
# assigned to a data set.
test_data_year: int = 2023

# Train/validation ratio. Not referenced by the cells visible in this
# notebook -- presumably the share of non-test data used for training
# (TODO confirm against `get_facility_set_mapper`).
train_val_ratio: float = 0.8

## Load data

In [None]:
# Input CSVs handed to `get_final_dataset`: image metadata, CAMPD facility
# attributes and CAMPD daily emissions aggregated per facility.
# NOTE(review): these are absolute, machine-specific paths.
image_metadata_csv = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv"
campd_facilities_csv = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv"
campd_emissions_csv = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv"

# Build the working dataframe used by every later cell.
df = get_final_dataset(
    image_metadata_path=image_metadata_csv,
    campd_facilities_path=campd_facilities_csv,
    campd_emissions_path=campd_emissions_csv,
)

# Trailing expression so the notebook renders the dataframe.
df

In [None]:
# Persist the assembled dataframe as CSV; `index=False` keeps the row index
# out of the file.
final_dataset_csv = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/final_dataset.csv"
df.to_csv(final_dataset_csv, index=False)

In [None]:
# Frequency of each distinct value in the `co2_mass_short_tons` column.
df["co2_mass_short_tons"].value_counts()

In [None]:
# Number of missing entries per column.
df.isnull().sum()

## Split data

In [None]:
# Build the facility -> data set mapper from the facility attributes CSV.
facility_attributes_csv = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv"
facility_set_mapper = get_facility_set_mapper(facility_attributes_csv)


def _assign_data_set(row):
    """Return the data set label `split_data_in_sets` gives to one row."""
    return split_data_in_sets(
        row=row,
        data_set_mapper=facility_set_mapper,
        test_year=test_data_year,
    )


# Label every row (axis=1 -> the helper is called once per row).
df["data_set"] = df.apply(_assign_data_set, axis=1)

# Trailing expression so the notebook renders the labelled dataframe.
df

In [None]:
# Number of rows assigned to each data set.
df["data_set"].value_counts()

In [None]:
# Share of all rows (including any without a label) that falls in each data set.
df["data_set"].value_counts() / len(df)

In [None]:
# For each data set label, print the label followed by the number of rows per
# calendar year of the `ts` column, then a blank separator line.
for data_set in df["data_set"].unique():
    in_this_set = df["data_set"] == data_set
    rows_per_year = df.loc[in_this_set, "ts"].dt.year.value_counts()
    print(data_set, rows_per_year, "", sep="\n")