In [None]:
import pandas as pd
import math
from sklearn.preprocessing import MinMaxScaler
import re
import numpy as np
from tqdm import tqdm
# pytorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from ChessDataset import ChessDataset
from ChessNet import Net
from torch.optim import lr_scheduler
import optuna
import matplotlib.pyplot as plt
import util as ut

from optuna.trial import TrialState

In [None]:
# Load the raw dataset; later cells read its 'Evaluation' and 'FEN' columns.
df = pd.read_csv('../data/01_raw/small_all_data.csv')

In [None]:
# Summary statistics for the columns of the loaded frame.
df.describe()

In [None]:
def process_data(df, random_state=None):
    """Rescale evaluations, filter the rows, and split into train/val/test.

    Steps performed:
      1. Convert the ``Evaluation`` column to numbers with ``ut.eval_to_number``.
      2. Min-max scale those numbers into [-50, 50] and store the result in
         ``Normalized Evaluation``.
      3. Rows whose normalized value lies strictly inside (-5, 5) are min-max
         scaled a second time so that they span [-30, 30].
      4. Of all rows from step 2, those with |value| >= 30 are kept unchanged;
         rows with 5 <= |value| < 30 are dropped.
      5. The surviving rows are shuffled and split 80% / 10% / 10%.

    Args:
        df: DataFrame containing an ``Evaluation`` column. The frame passed in
            is not modified; all work happens on a copy.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle (and therefore the split) can be reproduced. ``None``
            (the default) keeps the shuffle unseeded.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of DataFrames.
    """
    column = 'Normalized Evaluation'

    # Work on a copy so the caller's frame is left untouched and re-running the
    # calling cell does not feed already-converted values back in.
    df = df.copy()
    df['Evaluation'] = df['Evaluation'].apply(ut.eval_to_number)

    # First pass: normalize the evaluation column to lie between -50 and 50.
    scaler = MinMaxScaler(feature_range=(-50, 50))
    df[column] = scaler.fit_transform(df['Evaluation'].values.reshape(-1, 1)).ravel()

    # Second pass: rows strictly between -5 and 5 are stretched to [-30, 30].
    # The explicit copy avoids assigning into a slice of ``df``
    # (SettingWithCopyWarning).
    inner_mask = (df[column] > -5) & (df[column] < 5)
    df_only_between = df[inner_mask].copy()
    if not df_only_between.empty:  # the scaler cannot be fitted on zero rows
        scaler = MinMaxScaler(feature_range=(-30, 30))
        df_only_between[column] = scaler.fit_transform(
            df_only_between[column].values.reshape(-1, 1)
        ).ravel()

    # Outside the inner band only rows with |value| >= 30 are kept. Any such
    # row is automatically outside (-5, 5), so one filter is enough.
    df_not_between = df[(df[column] <= -30) | (df[column] >= 30)]

    df = pd.concat([df_only_between, df_not_between])

    # Shuffle, then split 80/10/10 into train, validation and test.
    df = df.sample(frac=1, random_state=random_state)
    n_rows = len(df)
    train_end = int(n_rows * 0.8)
    val_end = int(n_rows * 0.9)
    df_train = df.iloc[:train_end]
    df_val = df.iloc[train_end:val_end]
    df_test = df.iloc[val_end:]

    return df_train, df_val, df_test

In [None]:
# process_data returns a (train, val, test) tuple, so unpack it instead of
# binding the whole tuple to `df`.
df_train, df_val, df_test = process_data(df)
# The exploration cells below inspect the full processed set through `df`,
# so rebuild a single frame from the three splits.
df = pd.concat([df_train, df_val, df_test])

In [None]:
# Preview the first rows of `df`.
df.head()

In [None]:
# Plot the distribution of the 'Normalized Evaluation' column (100 bins).
fig, ax = plt.subplots()
ax.hist(df['Normalized Evaluation'], bins=100)
# Label the figure so it can be read on its own; the trailing ';' keeps the
# return value's repr out of the cell output.
ax.set(
    title='Distribution of normalized evaluations',
    xlabel='Normalized Evaluation',
    ylabel='Count',
);

In [None]:
# Count the rows whose normalized evaluation lies strictly between -10 and 10.
print(len(df.loc[df['Normalized Evaluation'].gt(-10) & df['Normalized Evaluation'].lt(10)]))

In [None]:
# Show the largest normalized evaluation, then the number of rows whose
# normalized evaluation is at or beyond +/-15.
print(df['Normalized Evaluation'].max())
print(len(df.loc[df['Normalized Evaluation'].abs().ge(15)]))

In [None]:
# FEN of the first row whose normalized evaluation is strictly within 0.1 of zero.
df.loc[df['Normalized Evaluation'].abs().lt(0.1), 'FEN'].iloc[0]

In [None]:
# Report how many entries `df` holds.
len(df)