# Data selection

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)

## Games as white

### Data cleaning

In [3]:
raw_white = pd.read_parquet("data\snAp_freAk-white.parquet")
raw_white.head(5)

Unnamed: 0,Black,BlackElo,CurrentPosition,Date,ECO,ECOUrl,EndDate,EndTime,Event,FEN,Link,Result,Round,SetUp,Site,StartTime,Termination,TimeControl,Timezone,UTCDate,UTCTime,White,WhiteElo,mainline_moves,Date_clean,Online
0,RobertV44,1220,1k1rr3/p1pb3p/B5p1/5p2/3Pp3/p3P1P1/KPQ2PP1/1N1...,2024.02.03,A40,https://www.chess.com/openings/Queens-Pawn-Ope...,2024.02.03,07:45:12,Live Chess,,https://www.chess.com/game/live/100684995037,1-0,-,,Chess.com,07:41:26,snAp_freAk won by resignation,120+1,UTC,2024.02.03,07:41:26,snAp_freAk,1257,1. d4 e6 2. Bf4 d5 3. Nf3 Bd6 4. Bg3 Nc6 5. e3...,2024-02-03,False
1,MShaker1944,1311,3k3Q/1pp3R1/pq1r1p2/3p4/3P1P2/2P1P3/PP6/2K5 b - -,2024.02.03,D00,https://www.chess.com/openings/Queens-Pawn-Ope...,2024.02.03,07:49:42,Live Chess,,https://www.chess.com/game/live/100685510145,1-0,-,,Chess.com,07:45:22,snAp_freAk won by checkmate,120+1,UTC,2024.02.03,07:45:22,snAp_freAk,1271,1. d4 d5 2. Bf4 Bf5 3. Nf3 e6 4. e3 a6 5. Nbd2...,2024-02-03,False
2,Digocuaca,1710,2kr4/1pp2pp1/4pnp1/p2p4/q1nP1PP1/1N2P3/1PPQ4/2...,2024.02.05,D00,https://www.chess.com/openings/Queens-Pawn-Ope...,2024.02.05,08:05:54,Live Chess,,https://www.chess.com/game/live/100858967271,0-1,-,,Chess.com,07:59:44,Digocuaca won by resignation,600,UTC,2024.02.05,07:59:44,snAp_freAk,1641,1. d4 d5 2. Bf4 Nf6 3. Nf3 Bf5 4. e3 Nc6 5. Nh...,2024-02-05,False
3,Adler-Homs,1706,r3k2r/ppq2pp1/6p1/3p4/3P1Q2/2Pn3P/PPB2PPR/R3K3...,2024.02.05,D00,https://www.chess.com/openings/Queens-Pawn-Ope...,2024.02.05,12:08:29,Live Chess,,https://www.chess.com/game/live/100873472495,0-1,-,,Chess.com,12:04:03,Adler-Homs won by resignation,600,UTC,2024.02.05,12:04:03,snAp_freAk,1682,1. d4 d5 2. Bf4 Bf5 3. Nf3 Nf6 4. e3 e6 5. Nh4...,2024-02-05,False
4,stephiroth8,1630,q2r2r1/6p1/p6p/3pP3/p1kQ1BP1/2P1P3/5KP1/1R6 b - -,2024.02.05,A45,https://www.chess.com/openings/Indian-Game-2.B...,2024.02.05,12:26:55,Live Chess,,https://www.chess.com/game/live/100874570831,1-0,-,,Chess.com,12:18:37,snAp_freAk won by checkmate,600,UTC,2024.02.05,12:18:37,snAp_freAk,1680,1. d4 Nf6 2. Bf4 d5 3. Nf3 Nc6 4. e3 Bg4 5. Nb...,2024-02-05,False


In [4]:
raw_white.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231 entries, 0 to 2230
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Black            2231 non-null   object        
 1   BlackElo         2231 non-null   object        
 2   CurrentPosition  2231 non-null   object        
 3   Date             2231 non-null   object        
 4   ECO              2231 non-null   object        
 5   ECOUrl           2231 non-null   object        
 6   EndDate          2231 non-null   object        
 7   EndTime          2231 non-null   object        
 8   Event            2231 non-null   object        
 9   FEN              1 non-null      object        
 10  Link             2231 non-null   object        
 11  Result           2231 non-null   object        
 12  Round            2231 non-null   object        
 13  SetUp            1 non-null      object        
 14  Site             2231 non-null   object 

### Changing TimeControl format to enclude minutes 

In [5]:
time_control = raw_white['TimeControl'].value_counts()
time_control

TimeControl
300        1490
120+1       346
600         265
60          114
180           9
1/86400       6
900+10        1
Name: count, dtype: int64

In [6]:
white = raw_white.copy()
white['TimeControl'] = white['TimeControl'].map({'300': '5m', 
                          '120+1': '2m+1s', 
                          '600': '10m', 
                          '60': '1m', 
                          '180': '3m', 
                          '1/86400': '1d', 
                          '900+10': '15m+10s'})
new_time_control = white['TimeControl'].value_counts()
new_time_control

TimeControl
5m         1490
2m+1s       346
10m         265
1m          114
3m            9
1d            6
15m+10s       1
Name: count, dtype: int64

In [7]:
columns_of_interest = ['White', 'WhiteElo', 'Result', 'Black', 'BlackElo', 'TimeControl', 'Date_clean', 'CurrentPosition', 'Termination', 'mainline_moves']
white = white[columns_of_interest]
white.head(5)


Unnamed: 0,White,WhiteElo,Result,Black,BlackElo,TimeControl,Date_clean,CurrentPosition,Termination,mainline_moves
0,snAp_freAk,1257,1-0,RobertV44,1220,2m+1s,2024-02-03,1k1rr3/p1pb3p/B5p1/5p2/3Pp3/p3P1P1/KPQ2PP1/1N1...,snAp_freAk won by resignation,1. d4 e6 2. Bf4 d5 3. Nf3 Bd6 4. Bg3 Nc6 5. e3...
1,snAp_freAk,1271,1-0,MShaker1944,1311,2m+1s,2024-02-03,3k3Q/1pp3R1/pq1r1p2/3p4/3P1P2/2P1P3/PP6/2K5 b - -,snAp_freAk won by checkmate,1. d4 d5 2. Bf4 Bf5 3. Nf3 e6 4. e3 a6 5. Nbd2...
2,snAp_freAk,1641,0-1,Digocuaca,1710,10m,2024-02-05,2kr4/1pp2pp1/4pnp1/p2p4/q1nP1PP1/1N2P3/1PPQ4/2...,Digocuaca won by resignation,1. d4 d5 2. Bf4 Nf6 3. Nf3 Bf5 4. e3 Nc6 5. Nh...
3,snAp_freAk,1682,0-1,Adler-Homs,1706,10m,2024-02-05,r3k2r/ppq2pp1/6p1/3p4/3P1Q2/2Pn3P/PPB2PPR/R3K3...,Adler-Homs won by resignation,1. d4 d5 2. Bf4 Bf5 3. Nf3 Nf6 4. e3 e6 5. Nh4...
4,snAp_freAk,1680,1-0,stephiroth8,1630,10m,2024-02-05,q2r2r1/6p1/p6p/3pP3/p1kQ1BP1/2P1P3/5KP1/1R6 b - -,snAp_freAk won by checkmate,1. d4 Nf6 2. Bf4 d5 3. Nf3 Nc6 4. e3 Bg4 5. Nb...


### Doing same for black

In [8]:
raw_black = pd.read_parquet("data\snAp_freAk-black.parquet")
time_control = raw_black['TimeControl'].value_counts()
black = raw_black.copy()
black['TimeControl'] = black['TimeControl'].map({'300': '5m', 
                          '120+1': '2m+1s', 
                          '600': '10m', 
                          '60': '1m', 
                          '180': '3m', 
                          '1/86400': '1d', 
                          '900+10': '15m+10s'})
black = black[columns_of_interest]
black.head(5)

Unnamed: 0,White,WhiteElo,Result,Black,BlackElo,TimeControl,Date_clean,CurrentPosition,Termination,mainline_moves
0,Rizxtar,1537,1-0,snAp_freAk,1246,2m+1s,2024-02-03,6k1/pp3pp1/1b2p3/3B4/8/8/PP4PP/4RK1R b - -,Rizxtar won by resignation,1. d4 d5 2. c4 c6 3. Nc3 Bf5 4. f3 e6 5. e4 Bg...
1,nebpetrovic,1619,0-1,snAp_freAk,1681,10m,2024-02-05,8/1r6/3R1p2/3K1kp1/3P4/2r5/8/8 w - -,snAp_freAk won by resignation,1. b4 d5 2. a4 Bf5 3. Bb2 c6 4. e3 e6 5. b5 Nf...
2,Mgr2050,1672,0-1,snAp_freAk,1718,10m,2024-02-05,2rq1rk1/pp1n1ppp/2n1p2B/3pPb2/1P1P3b/P7/3NBPPP...,snAp_freAk won by resignation,1. e4 c6 2. d4 d5 3. e5 Bf5 4. c3 e6 5. Nf3 Nd...
3,Vivek2810N,1712,1-0,snAp_freAk,1653,10m,2024-02-05,3Q2k1/p2n1ppp/8/1pr5/8/4P2P/Pb1PB3/3K2R1 b - -,Vivek2810N won by resignation,1. e3 c6 2. b3 d5 3. g3 Bf5 4. Bb2 e6 5. Bg2 N...
4,mohcineee13,1641,1-0,snAp_freAk,1650,10m,2024-02-05,3R2k1/2r2ppp/2q5/p3PbB1/Pp6/1B5P/1PP1Q3/2K5 b - -,mohcineee13 won by resignation,1. e4 c6 2. Bc4 d5 3. exd5 cxd5 4. Bb5+ Nc6 5....


In [9]:
combined = pd.concat([black, white], axis=0)
combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4465 entries, 0 to 2230
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   White            4465 non-null   object        
 1   WhiteElo         4465 non-null   object        
 2   Result           4465 non-null   object        
 3   Black            4465 non-null   object        
 4   BlackElo         4465 non-null   object        
 5   TimeControl      4465 non-null   object        
 6   Date_clean       4465 non-null   datetime64[ns]
 7   CurrentPosition  4465 non-null   object        
 8   Termination      4465 non-null   object        
 9   mainline_moves   4465 non-null   object        
dtypes: datetime64[ns](1), object(9)
memory usage: 383.7+ KB


In [10]:
white.to_csv('Data/games_as_white_final.csv', index=False)
black.to_csv('Data/games_as_black_final.csv', index=False)
combined.to_csv('Data/all_games_final.csv', index=False)