### Code Description

This notebook prepares the model's training datasets. Each dataset is written in JSON Lines format (`.jsonl`, one JSON object per line), which suits training on large datasets while keeping the data structure lightweight and easy to process.

### Imports

In [1]:
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os

### Dataset Global

In [None]:
# Build the "global" training dataset: one JSON line per (state, product)
# pair, of the form {"<product>_<state>": [<m3 values in file order>]}.
all_data = pd.read_csv('../database/combined_data.csv', sep=";")

# Products observed for each state, both in order of first appearance.
state_product_dict = {
    state: list(all_data[all_data['state'] == state]['product'].unique())
    for state in all_data['state'].unique()
}

output_file = 'dataset_global/dataset_global.jsonl'

# State whose series are left out of the global dataset.
EXCLUDED_STATE = "sp"

# NOTE: two alternative variants used to live in this cell as commented-out
# code and were removed:
#   * "Raw Data": the same loop as below, without skipping any state;
#   * "MinMaxScaler": scaled each series with
#     MinMaxScaler(feature_range=(-1, 1)) and wrote {"sequence": [...]}
#     lines; it called a `rolling_window(series, 12)` helper that is not
#     defined in this cell.

os.makedirs(os.path.dirname(output_file), exist_ok=True)

# json.dumps escapes non-ASCII characters by default, so the explicit encoding
# only makes the file independent of the platform's default.
with open(output_file, 'w', encoding='utf-8') as file:
    for state, products in state_product_dict.items():
        if state == EXCLUDED_STATE:
            continue

        # Filter by state once instead of re-scanning the whole frame for
        # every product of that state.
        state_rows = all_data[all_data['state'] == state]

        for product in products:
            sequence = state_rows[state_rows['product'] == product]['m3'].tolist()
            json_line = {f'{product}_{state}': sequence}
            file.write(json.dumps(json_line) + '\n')

print(f"Filtered data has been saved to {output_file}")

### All datasets (one per excluded state)

In [3]:
# Build one leave-one-state-out dataset per state: the file
# all_datasets_global/dataset_<state>.jsonl holds a {"sequence": [...]} line
# for every (state, product) pair of every OTHER state.
all_data = pd.read_csv('../database/combined_data.csv', sep=';')

states = all_data['state'].unique()

# A pair's serialized line does not depend on which state is excluded, so each
# line is built once here and reused for every output file. lines_per_state[i]
# holds the lines of states[i], one per product in order of first appearance.
lines_per_state = []
for state in states:
    state_rows = all_data[all_data['state'] == state]
    lines_per_state.append([
        json.dumps(
            {'sequence': state_rows[state_rows['product'] == product]['m3'].tolist()}
        ) + '\n'
        for product in state_rows['product'].unique()
    ])

for excluded_index, excluded_state in enumerate(states):
    output_file = f'all_datasets_global/dataset_{excluded_state}.jsonl'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # json.dumps escapes non-ASCII characters by default, so the explicit
    # encoding only makes the file independent of the platform's default.
    with open(output_file, 'w', encoding='utf-8') as file:
        for state_index, state_lines in enumerate(lines_per_state):
            # `states` holds unique values, so the position identifies the
            # excluded state; every other state is written in original order.
            if state_index != excluded_index:
                file.writelines(state_lines)