In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import cross_val_score
import math

In [2]:
# Load the FEN/evaluation table from the CSV file in the working directory
# and preview its first rows.
csv_path = "chessData.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,FEN,Evaluation
0,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,-10
1,rnbqkbnr/pppp1ppp/4p3/8/4P3/8/PPPP1PPP/RNBQKBN...,56
2,rnbqkbnr/pppp1ppp/4p3/8/3PP3/8/PPP2PPP/RNBQKBN...,-9
3,rnbqkbnr/ppp2ppp/4p3/3p4/3PP3/8/PPP2PPP/RNBQKB...,52
4,rnbqkbnr/ppp2ppp/4p3/3p4/3PP3/8/PPPN1PPP/R1BQK...,-26


In [3]:
# Print the frame summary: index range, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12958035 entries, 0 to 12958034
Data columns (total 2 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   FEN         object
 1   Evaluation  object
dtypes: object(2)
memory usage: 197.7+ MB


In [4]:
# Display the FEN column on its own (pandas shortens the repr of a long Series).
df["FEN"]

0           rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...
1           rnbqkbnr/pppp1ppp/4p3/8/4P3/8/PPPP1PPP/RNBQKBN...
2           rnbqkbnr/pppp1ppp/4p3/8/3PP3/8/PPP2PPP/RNBQKBN...
3           rnbqkbnr/ppp2ppp/4p3/3p4/3PP3/8/PPP2PPP/RNBQKB...
4           rnbqkbnr/ppp2ppp/4p3/3p4/3PP3/8/PPPN1PPP/R1BQK...
                                  ...                        
12958030    r1bqkb1r/pp3ppp/1nn1p3/3pP3/3P1P2/1B3N2/PP2Q1P...
12958031    r2qkb1r/pp1b1ppp/1nn1p3/3pP3/3P1P2/1B3N2/PP2Q1...
12958032    r2qkb1r/pp1b1ppp/1nn1p3/3pP3/3P1P2/1BN2N2/PP2Q...
12958033    r2qkb1r/pp1b1ppp/1n2p3/n2pP3/3P1P2/1BN2N2/PP2Q...
12958034    r2qkb1r/pp1b1ppp/1n2p3/n2pP3/3P1P2/2N2N2/PPB1Q...
Name: FEN, Length: 12958035, dtype: object

In [5]:
# Drop every row whose Evaluation string starts with '#'
# (presumably mate annotations rather than numeric scores -- confirm against
# the dataset description). The explicit .copy() makes the filtered frame an
# independent object, so the column assignment below does not raise
# SettingWithCopyWarning.
df = df[~df['Evaluation'].str.startswith('#')].copy()

# Strip every character that is neither a digit nor '-' (e.g. a leading '+'),
# then convert the remaining text to integers. The pattern is a raw string so
# that `\d` reaches the regex engine instead of being an invalid Python escape.
df['Evaluation'] = df['Evaluation'].str.replace(r'[^\d-]+', '', regex=True).astype(int)

# Show how often each evaluation score occurs.
df["Evaluation"].value_counts()

Evaluation
 0       1261356
 13       157897
-13       148689
 46        41674
 53        41404
          ...   
 5858          1
-4756          1
-3586          1
 8580          1
-4516          1
Name: count, Length: 13432, dtype: int64

In [6]:
# Split the FEN string on '/' into eight pieces: seven board rows plus a final
# piece that still contains the last row and the space-separated trailing fields.
fen_columns = ["row1", "row2", "row3", "row4", "row5", "row6", "row7", "leftover_data"]
df[fen_columns] = df["FEN"].str.split(pat='/', expand=True)

In [7]:
# Remove the raw FEN column and preview the result. Reassigning the returned
# frame (instead of inplace=True) follows pandas guidance: inplace offers no
# performance benefit and hides the mutation from readers of earlier cells.
df = df.drop(columns=['FEN'])
df.head()

Unnamed: 0,Evaluation,row1,row2,row3,row4,row5,row6,row7,leftover_data
0,-10,rnbqkbnr,pppppppp,8,8,4P3,8,PPPP1PPP,RNBQKBNR b KQkq - 0 1
1,56,rnbqkbnr,pppp1ppp,4p3,8,4P3,8,PPPP1PPP,RNBQKBNR w KQkq - 0 2
2,-9,rnbqkbnr,pppp1ppp,4p3,8,3PP3,8,PPP2PPP,RNBQKBNR b KQkq - 0 2
3,52,rnbqkbnr,ppp2ppp,4p3,3p4,3PP3,8,PPP2PPP,RNBQKBNR w KQkq - 0 3
4,-26,rnbqkbnr,ppp2ppp,4p3,3p4,3PP3,8,PPPN1PPP,R1BQKBNR b KQkq - 1 3


In [8]:
# Split the remaining piece on single spaces into the last board row and the
# five trailing fields that follow it.
trailing_columns = ["row8", "active_color", "castle", "en_passant", "halfmove_clock", "fullmove_num"]
df[trailing_columns] = df["leftover_data"].str.split(pat=" ", expand=True)

In [9]:
# Remove the helper column by reassigning the returned frame; pandas guidance
# discourages inplace=True because it offers no performance benefit.
df = df.drop(columns=['leftover_data'])

In [10]:
# Encode the active_color letter as an integer: "b" -> 1, "w" -> 0.
# Series.map turns any value missing from the mapping into NaN.
color_codes = {"w": 0, "b": 1}
df["active_color"] = df["active_color"].map(color_codes)
df.head()

Unnamed: 0,Evaluation,row1,row2,row3,row4,row5,row6,row7,row8,active_color,castle,en_passant,halfmove_clock,fullmove_num
0,-10,rnbqkbnr,pppppppp,8,8,4P3,8,PPPP1PPP,RNBQKBNR,1,KQkq,-,0,1
1,56,rnbqkbnr,pppp1ppp,4p3,8,4P3,8,PPPP1PPP,RNBQKBNR,0,KQkq,-,0,2
2,-9,rnbqkbnr,pppp1ppp,4p3,8,3PP3,8,PPP2PPP,RNBQKBNR,1,KQkq,-,0,2
3,52,rnbqkbnr,ppp2ppp,4p3,3p4,3PP3,8,PPP2PPP,RNBQKBNR,0,KQkq,-,0,3
4,-26,rnbqkbnr,ppp2ppp,4p3,3p4,3PP3,8,PPPN1PPP,R1BQKBNR,1,KQkq,-,1,3


In [11]:
# Both move counters came out of a string split, so cast each one to integers,
# then print the frame summary to confirm the resulting dtypes.
for counter_col in ("halfmove_clock", "fullmove_num"):
    df[counter_col] = df[counter_col].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12767881 entries, 0 to 12958034
Data columns (total 14 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   Evaluation      int32 
 1   row1            object
 2   row2            object
 3   row3            object
 4   row4            object
 5   row5            object
 6   row6            object
 7   row7            object
 8   row8            object
 9   active_color    int64 
 10  castle          object
 11  en_passant      object
 12  halfmove_clock  int32 
 13  fullmove_num    int32 
dtypes: int32(3), int64(1), object(10)
memory usage: 1.3+ GB


In [13]:
# One-hot encode the "castle" column: every distinct value becomes its own
# boolean "castle_<value>" column and the original string column is replaced.
df = pd.get_dummies(
    data=df,
    columns=["castle"],
    prefix=["castle"],
)
df.head()

Unnamed: 0,Evaluation,row1,row2,row3,row4,row5,row6,row7,row8,active_color,...,castle_Kk,castle_Kkq,castle_Kq,castle_Q,castle_Qk,castle_Qkq,castle_Qq,castle_k,castle_kq,castle_q
0,-10,rnbqkbnr,pppppppp,8,8,4P3,8,PPPP1PPP,RNBQKBNR,1,...,False,False,False,False,False,False,False,False,False,False
1,56,rnbqkbnr,pppp1ppp,4p3,8,4P3,8,PPPP1PPP,RNBQKBNR,0,...,False,False,False,False,False,False,False,False,False,False
2,-9,rnbqkbnr,pppp1ppp,4p3,8,3PP3,8,PPP2PPP,RNBQKBNR,1,...,False,False,False,False,False,False,False,False,False,False
3,52,rnbqkbnr,ppp2ppp,4p3,3p4,3PP3,8,PPP2PPP,RNBQKBNR,0,...,False,False,False,False,False,False,False,False,False,False
4,-26,rnbqkbnr,ppp2ppp,4p3,3p4,3PP3,8,PPPN1PPP,R1BQKBNR,1,...,False,False,False,False,False,False,False,False,False,False
