In [None]:
import os
import sys
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# setting warnings
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# import modules and components
from FAIRS.commons.utils.preprocessing import PreProcessing
from FAIRS.commons.utils.validation import ModelValidation
from FAIRS.commons.pathfinder import DATA_PATH, CHECKPOINT_PATH
import FAIRS.commons.configurations as cnf

# Load and prepare data

In [None]:
# Read the raw dataset (semicolon-separated CSV) from the project data folder
filepath = os.path.join(DATA_PATH, 'FAIRS_dataset.csv')
df_FAIRS = pd.read_csv(filepath, sep=';', encoding='utf-8')

# Keep only the trailing fraction of rows selected by cnf.data_size
num_samples = int(df_FAIRS.shape[0] * cnf.data_size)
df_FAIRS = df_FAIRS[(df_FAIRS.shape[0] - num_samples):]

preprocessor = PreProcessing()

# Map numbers to roulette colors, then ordinal-encode the colors as
# green=0, black=1, red=2 (any other label is encoded as -1).
# NOTE(review): roulette_colormapping is expected to provide the 'encoding'
# column read below - confirm against the PreProcessing implementation.
categories = [['green', 'black', 'red']]
categorical_encoder = OrdinalEncoder(categories=categories,
                                     handle_unknown='use_encoded_value',
                                     unknown_value=-1)
df_FAIRS = preprocessor.roulette_colormapping(df_FAIRS, no_mapping=False)
timeseries = categorical_encoder.fit_transform(df_FAIRS['encoding'].values.reshape(-1, 1))
timeseries = pd.DataFrame(timeseries, columns=['encoding'])

# Split the series into train and test partitions and build the windowed
# (input, label) datasets using the configured window size
train_data, test_data = preprocessor.split_timeseries(timeseries, cnf.test_size, inverted=cnf.invert_test)
train_samples, test_samples = train_data.shape[0], test_data.shape[0]
X_train, Y_train = preprocessor.timeseries_labeling(train_data, cnf.window_size)
X_test, Y_test = preprocessor.timeseries_labeling(test_data, cnf.window_size)

# One-hot encode the labels for softmax training; the encoder is fitted on the
# train labels only and then reused for the test labels.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed `sparse`
# in 1.4, so try the new keyword first and fall back for older installations.
try:
    OH_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(sparse=False)
Y_train_OHE = OH_encoder.fit_transform(Y_train.reshape(Y_train.shape[0], -1))
Y_test_OHE = OH_encoder.transform(Y_test.reshape(Y_test.shape[0], -1))

# Data validation

### 1. Distribution of classes

In [None]:
# Most frequent encoded class in each partition. These names are not defined
# anywhere earlier in the notebook, so they are computed here to keep the cell
# runnable on a fresh kernel. Series.mode() returns the modal values sorted,
# so on ties the smallest encoded class is reported.
most_freq_train = train_data['encoding'].mode().iloc[0]
most_freq_test = test_data['encoding'].mode().iloc[0]

# Summary of the class distribution in the train and test partitions
print(f'''DISTRIBUTION OF CLASSES
-------------------------------------------------------------------------------  
Most frequent class in train dataset: {most_freq_train}
Most frequent class in test dataset:  {most_freq_test}
Number of classes in train dataset:   {train_data.nunique()['encoding']}
Number of classes in test dataset:    {test_data.nunique()['encoding']}
-------------------------------------------------------------------------------''')