In [9]:
# Cleaning
import pandas as pd

def clean_df_new(df):
    """Return a cleaned, typed copy of a raw games DataFrame.

    Steps, in order:
      * drop the 'Event', 'Site', 'Round', 'WhiteClock' and 'BlackClock' columns;
      * cast rating-deviation / eval / move-time columns to float and
        game number / Elo / ply count columns to int;
      * parse 'Date' and 'Time' with ``pd.to_datetime`` (element-wise);
      * map 'Result' from PGN notation ('1-0', '0-1', '1/2-1/2') to
        'White' / 'Black' / 'Draw' (any other value is kept as-is);
      * strip a leading "...] " prefix from 'ResultComment';
      * convert every remaining column to ``category``;
      * turn 'WhiteIsComp' / 'BlackIsComp' into booleans (True only for 'Yes').

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the monthly CSV. It is not modified; the
        cleaned data is returned as a new frame.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    KeyError
        If one of the dropped columns, 'WhiteIsComp' or 'BlackIsComp' is missing.
    ValueError
        Propagated from ``astype`` when a numeric column cannot be cast
        (e.g. an int column that contains NaN).
    """
    result_names = {'1-0': 'White', '0-1': 'Black', '1/2-1/2': 'Draw'}

    def res_map(res):
        # Unknown results (e.g. '*') fall through unchanged.
        return result_names.get(res, res)

    def clean_comment(text):
        # Missing comments arrive as NaN/None rather than str; calling
        # .find() on them used to raise AttributeError, so pass them through.
        if not isinstance(text, str):
            return text
        parts = text.split("] ")
        # A ']' that is not followed by a space yields a single part; keep the
        # original text in that case instead of raising IndexError.
        return parts[1] if len(parts) > 1 else text

    # 'WhiteClock' / 'BlackClock' are dropped below, so only these two remain.
    # NOTE(review): the recorded output shows 'Time' values dated 2023-03-21
    # for games dated 2023-01-31, i.e. a time-only string seems to get a
    # default date from pd.to_datetime -- confirm whether Time should be
    # combined with Date instead.
    datetime_columns = ['Date', 'Time']
    float_columns = ['WhiteRD', 'BlackRD', 'AvgEvalOpening', 'AvgEvalMiddle', 'AvgEvalEnd', 'AvgEmtOpening', 'AvgEmtMiddle', 'AvgEmtEnd']
    int_columns = ['FICSGamesDBGameNo', 'WhiteElo', 'BlackElo', 'PlyCount']

    # drop() returns a new frame, so the caller's df is left untouched.
    df = df.drop(['Event', 'Site', 'Round', 'WhiteClock', 'BlackClock'], axis=1)

    for column in df.columns:
        if column in float_columns:
            df[column] = df[column].astype(float)
        elif column in int_columns:
            df[column] = df[column].astype(int)
        elif column in datetime_columns:
            df[column] = df[column].apply(pd.to_datetime)
        else:
            if column == 'Result':
                df[column] = df[column].apply(res_map)
            if column == 'ResultComment':
                df[column] = df[column].apply(clean_comment)
            df[column] = df[column].astype('category')

    # The flag columns were made categorical above; comparing against 'Yes'
    # converts them to plain booleans.
    df['WhiteIsComp'] = df['WhiteIsComp'] == 'Yes'
    df['BlackIsComp'] = df['BlackIsComp'] == 'Yes'

    return df

In [10]:
# Load a single month of game data from CSV.
imp_df = pd.read_csv('data-by-month/January.csv') # Whichever month of data you want to work with. TODO: support importing a whole folder of monthly files at once.

# Clean the raw frame, then show the resulting dtypes and the first rows.
cleaned = clean_df_new(imp_df)
print (cleaned.dtypes)
cleaned.head()

Date                 datetime64[ns]
White                      category
Black                      category
Result                     category
BlackElo                      int64
BlackRD                     float64
ECO                        category
FICSGamesDBGameNo             int64
PlyCount                      int64
Time                 datetime64[ns]
TimeControl                category
WhiteElo                      int64
WhiteIsComp                    bool
WhiteRD                     float64
AvgEvalOpening              float64
AvgEvalMiddle               float64
AvgEvalEnd                  float64
AvgEmtOpening               float64
AvgEmtMiddle                float64
AvgEmtEnd                   float64
ResultComment              category
BlackIsComp                    bool
dtype: object


Unnamed: 0,Date,White,Black,Result,BlackElo,BlackRD,ECO,FICSGamesDBGameNo,PlyCount,Time,...,WhiteIsComp,WhiteRD,AvgEvalOpening,AvgEvalMiddle,AvgEvalEnd,AvgEmtOpening,AvgEmtMiddle,AvgEmtEnd,ResultComment,BlackIsComp
0,2023-01-31,konozrout,Geforce,Draw,1957,15.9,B00,530203389,55,2023-03-21 23:27:00,...,True,27.4,98.526316,117.055556,-186.277778,0.460778,0.896167,1.133889,Game drawn by repetition,False
1,2023-01-31,Geforce,pikozrout,Draw,2137,39.5,B40,530203352,69,2023-03-21 23:17:00,...,False,15.9,-52.958333,235.478261,484.090909,0.465087,0.490348,0.850773,Game drawn by mutual agreement,True
2,2023-01-31,Geforce,konozrout,Black,2054,27.5,B52,530203324,134,2023-03-21 23:03:00,...,False,15.9,-71.088889,-192.340909,-262.727273,0.481045,0.553864,0.317295,White forfeits on time,True
3,2023-01-31,GimmeDatKing,Frubes,White,1796,49.4,B90,530203266,55,2023-03-21 22:43:00,...,False,74.1,60.578947,246.222222,367.111111,0.882167,1.149944,1.762,Black resigns,False
4,2023-01-31,playoften,GimmeDatKing,Black,2290,74.5,A04,530203175,42,2023-03-21 22:22:00,...,False,24.8,43.866667,-18.571429,-105.307692,0.396786,0.546714,1.321692,White resigns,False


In [15]:
def construct_df_train(df):
    """Build a training frame of per-player averages plus one-hot columns.

    For every game (row) in ``df`` the result contains:

      * ``White*`` columns: the mean of 'WhiteRD', the eval / move-time
        columns and 'PlyCount' over all games in ``df`` where that row's
        White player had the white pieces;
      * ``Black*`` columns: the same with 'BlackRD', over all games where that
        row's Black player had the black pieces;
      * one-hot columns for 'Result' (prefix ``A_``), 'ResultComment'
        (prefix ``B_``) and 'ECO' (prefix ``C_``).

    The averages are taken over the whole of ``df``, including the row's own
    game, and NaN values are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned games frame with the columns named above plus 'White' and
        'Black'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row, indexed 0..n-1, with the average columns first
        (White block, then Black block) followed by the one-hot columns.
    """
    # Game-level source column -> suffix of the per-player average column.
    shared_stats = {
        'AvgEvalOpening': 'AvgEvalForOpenings',
        'AvgEvalMiddle': 'AvgEvalForMiddlegames',
        'AvgEvalEnd': 'AvgEvalForEndgames',
        'AvgEmtOpening': 'AvgEmtForOpenings',
        'AvgEmtMiddle': 'AvgEmtForMiddlegames',
        'AvgEmtEnd': 'AvgEmtForEndgames',
        'PlyCount': 'AvgPlyCount',
    }
    # Add 'White' / 'Black' here to train on player usernames as features.
    cols_to_onehot = ['ECO']
    ground_truth_labels = ['Result', 'ResultComment']
    encoded_cols = ground_truth_labels + cols_to_onehot
    # One uppercase letter per encoded column: 'A', 'B', 'C', ...
    prefixes = [chr(65 + i) for i in range(len(encoded_cols))]

    # Work on a 0..n-1 index so the pieces below line up row for row and the
    # result is indexed like the row-by-row construction it replaces.
    df = df.reset_index(drop=True)

    def side_averages(side):
        # Per-row mean of each stat over the games the row's `side` player
        # played with that colour. Columns are selected by label, not by
        # position, and computed in one grouped pass instead of one boolean
        # filter per player.
        rename = {side + 'RD': side + 'AvgRD'}
        rename.update({src: side + dst for src, dst in shared_stats.items()})
        per_row = df.groupby(side, observed=True)[list(rename)].transform('mean')
        return per_row.rename(columns=rename)

    # Cast the label columns to object so get_dummies only emits values that
    # actually occur, regardless of any unused categories on the input.
    df_new = pd.concat(
        [df[encoded_cols].astype(object), side_averages('White'), side_averages('Black')],
        axis=1,
    )

    # One-hot encode the ground truth labels and cols_to_onehot. The encoded
    # columns come out already prefixed, so no further renaming is needed.
    return pd.get_dummies(df_new, columns=encoded_cols, prefix=prefixes)

In [16]:
# Build the training frame from the cleaned games.
df_train = construct_df_train(cleaned)
# NOTE(review): the two lines below display `cleaned` again rather than the
# freshly built `df_train` -- presumably copied from the earlier cell; confirm
# whether `df_train.dtypes` / `df_train.head()` were intended.
print (cleaned.dtypes)
cleaned.head()

Date                 datetime64[ns]
White                      category
Black                      category
Result                     category
BlackElo                      int64
BlackRD                     float64
ECO                        category
FICSGamesDBGameNo             int64
PlyCount                      int64
Time                 datetime64[ns]
TimeControl                category
WhiteElo                      int64
WhiteIsComp                    bool
WhiteRD                     float64
AvgEvalOpening              float64
AvgEvalMiddle               float64
AvgEvalEnd                  float64
AvgEmtOpening               float64
AvgEmtMiddle                float64
AvgEmtEnd                   float64
ResultComment              category
BlackIsComp                    bool
dtype: object


Unnamed: 0,Date,White,Black,Result,BlackElo,BlackRD,ECO,FICSGamesDBGameNo,PlyCount,Time,...,WhiteIsComp,WhiteRD,AvgEvalOpening,AvgEvalMiddle,AvgEvalEnd,AvgEmtOpening,AvgEmtMiddle,AvgEmtEnd,ResultComment,BlackIsComp
0,2023-01-31,konozrout,Geforce,Draw,1957,15.9,B00,530203389,55,2023-03-21 23:27:00,...,True,27.4,98.526316,117.055556,-186.277778,0.460778,0.896167,1.133889,Game drawn by repetition,False
1,2023-01-31,Geforce,pikozrout,Draw,2137,39.5,B40,530203352,69,2023-03-21 23:17:00,...,False,15.9,-52.958333,235.478261,484.090909,0.465087,0.490348,0.850773,Game drawn by mutual agreement,True
2,2023-01-31,Geforce,konozrout,Black,2054,27.5,B52,530203324,134,2023-03-21 23:03:00,...,False,15.9,-71.088889,-192.340909,-262.727273,0.481045,0.553864,0.317295,White forfeits on time,True
3,2023-01-31,GimmeDatKing,Frubes,White,1796,49.4,B90,530203266,55,2023-03-21 22:43:00,...,False,74.1,60.578947,246.222222,367.111111,0.882167,1.149944,1.762,Black resigns,False
4,2023-01-31,playoften,GimmeDatKing,Black,2290,74.5,A04,530203175,42,2023-03-21 22:22:00,...,False,24.8,43.866667,-18.571429,-105.307692,0.396786,0.546714,1.321692,White resigns,False


In [None]:
# Report how many rows would be lost by dropping every row that contains a NaN.
# NOTE(review): `df_train` in the earlier cell was built from `cleaned`, not
# from `dropped` -- confirm whether the NaN rows should be removed first.
before = cleaned.shape[0]

# dropna() returns a new frame; `cleaned` itself is left unchanged.
dropped = cleaned.dropna()

after = dropped.shape[0]

print ('{} / {}'.format(after,before))
# Percentage of rows removed; this divides by `before`, so it assumes `cleaned` is non-empty.
print ('Data Loss : {:.2f} %  \n(Mostly due to NaN eval scores. Try increasing engine computation time if this is too high.)'.format(((before - after)*100)/before))

# Optional export of the cleaned frame (disabled).
#cleaned.to_csv('out.csv')

2453 / 2460
Data Loss : 0.28 %  
(Mostly due to NaN eval scores. Try increasing engine computation time if this is too high.)
