In [1]:
import pandas as pd
import numpy as np

# Tab-separated export of the player's games; the displayed rows show one game
# per row with links to both chess.com and lichess.
games_tsv_path = "/home/courtino/.chess-stats/game-synthesis/DiciDicee/games.tsv"
df = pd.read_csv(games_tsv_path, delimiter="\t")
df

Unnamed: 0,link,date,color,elo,time_control,variant,termination,result,opening_family,eco,moves
0,https://www.chess.com/game/live/124184918193,2024-11,White,1419,Rapid,,DiciDicee won by resignation,1-0,"Four knights game, Scotch variation",C48,1. e4 {[%clk 0:09:59.3]} 1... e5 {[%clk 0:09:5...
1,https://www.chess.com/game/daily/721873215,2024-11,Black,1313,Daily,,DiciDicee won by resignation,0-1,Caro-Kann defence,B12,1. e4 {[%clk 0:00:00.9]} 1... c6 {[%clk 0:00:0...
2,https://www.chess.com/game/daily/724666567,2024-11,Black,1314,Daily,,DiciDicee won by resignation,0-1,Caro-Kann defence,B12,1. e4 {[%clk 0:00:39]} 1... c6 {[%clk 0:02:38....
3,https://www.chess.com/game/daily/724666593,2024-11,White,1316,Daily,,DiciDicee won on time,1-0,Queen's pawn,A40,1. d4 {[%clk 0:00:01.6]} 1... h5 {[%clk 0:00:4...
4,https://www.chess.com/game/daily/725428271,2024-11,White,1318,Daily,,DiciDicee won by resignation,1-0,Ruy Lopez (Spanish opening),C70,1. e4 {[%clk 0:00:02.5]} 1... e5 {[%clk 0:00:2...
...,...,...,...,...,...,...,...,...,...,...,...
16527,https://lichess.org/rpQZUKem,2024-12,White,1617,Bullet,Standard,Normal,1-0,Scandinavian (centre counter) defence,B01,1. e4 { [%clk 0:02:00] } 1... d5 { [%clk 0:02:...
16528,https://lichess.org/k9hZ4C3m,2024-12,Black,1613,Bullet,Standard,Normal,0-1,Caro-Kann defence,B12,1. e4 { [%eval 0.18] [%clk 0:02:00] } 1... c6 ...
16529,https://lichess.org/a29idTn3,2024-12,White,1606,Bullet,Standard,Time forfeit,1-0,Scandinavian (centre counter) defence,B01,1. e4 { [%clk 0:02:00] } 1... d5 { [%clk 0:02:...
16530,https://lichess.org/5WzxR7u4,2024-12,Black,1613,Bullet,Standard,Normal,1-0,Queen's gambit declined,D35,1. d4 { [%clk 0:02:00] } 1... d5 { [%clk 0:02:...


In [2]:
# Restrict the analysis to games dated 2024-01 or later that were not played
# at bullet speed, keeping only the columns the opening statistics need.
# The displayed 'date' values look like "YYYY-MM", so a plain string
# comparison orders them chronologically.
wanted_columns = ["date", "color", "result", "opening_family", "time_control"]
is_recent = df["date"] >= "2024-01"
is_not_bullet = df["time_control"] != "Bullet"
opening_outcomes = df.loc[is_recent & is_not_bullet, wanted_columns]
opening_outcomes

Unnamed: 0,date,color,result,opening_family,time_control
0,2024-11,White,1-0,"Four knights game, Scotch variation",Rapid
1,2024-11,Black,0-1,Caro-Kann defence,Daily
2,2024-11,Black,0-1,Caro-Kann defence,Daily
3,2024-11,White,1-0,Queen's pawn,Daily
4,2024-11,White,1-0,Ruy Lopez (Spanish opening),Daily
...,...,...,...,...,...
16262,2024-12,Black,0-1,Caro-Kann defence,Blitz
16263,2024-12,White,1-0,Philidor's defence,Blitz
16264,2024-12,White,1-0,Scandinavian (centre counter) defence,Blitz
16265,2024-12,White,1-0,Sicilian defence,Blitz


In [3]:
# One row per (color, opening, raw score) with the number of games observed.
opening_outcomes_count = (
    opening_outcomes
    .groupby(['color', 'opening_family', 'result'])
    .size()
    .reset_index(name='count')
)

# Total games per (color, opening), broadcast back onto every outcome row.
opening_family_totals = (
    opening_outcomes_count
    .groupby(['color', 'opening_family'])['count']
    .transform('sum')
)

# Share of each outcome within its opening, plus the opening's total game count.
opening_outcomes_count = opening_outcomes_count.assign(
    percentage=opening_outcomes_count['count'] / opening_family_totals * 100,
    games_count=opening_family_totals,
)

def convert_score_to_result(df, score_col='result', color_col='color'):
    """Label each row as 'Win', 'Loss', 'Draw' or 'Unknown' from the player's side.

    The score in ``score_col`` ('1-0', '0-1' or '1/2-1/2') is combined with the
    color the player had in ``color_col`` ('White' or 'Black'). Any other
    combination is labelled 'Unknown'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``score_col`` and ``color_col``. It is not modified.
    score_col : str, default 'result'
        Column holding the raw score string.
    color_col : str, default 'color'
        Column holding the color played.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose 'result' column holds the outcome label. With
        the default ``score_col`` this replaces the raw score in the copy.
    """
    played_white = df[color_col] == 'White'
    played_black = df[color_col] == 'Black'
    white_won = df[score_col] == '1-0'
    black_won = df[score_col] == '0-1'

    conditions = [
        # Win: the side the player had is the side that scored the point.
        (played_white & white_won) | (played_black & black_won),
        # Loss: the opposite side scored the point.
        (played_white & black_won) | (played_black & white_won),
        # Draw: same label whichever color was played.
        df[score_col] == '1/2-1/2',
    ]
    choices = ['Win', 'Loss', 'Draw']

    # Work on a copy: writing into the caller's frame would destroy its raw
    # scores, and a second call on the same frame would then yield only 'Unknown'.
    converted = df.copy()
    converted['result'] = np.select(conditions, choices, default='Unknown')
    return converted


# Relabel raw scores from the player's point of view, then order the table by
# games_count, opening_family and result, all descending.
opening_outcomes_count = (
    convert_score_to_result(opening_outcomes_count)
    .sort_values(by=['games_count', 'opening_family', 'result'], ascending=False)
)

# Removes the row cap globally (not just for this cell) so the table prints in full.
pd.options.display.max_rows = None
opening_outcomes_count

Unnamed: 0,color,opening_family,result,count,percentage,games_count
5,Black,Caro-Kann defence,Win,671,52.421875,1280
6,Black,Caro-Kann defence,Loss,561,43.828125,1280
7,Black,Caro-Kann defence,Draw,48,3.75,1280
145,White,Ruy Lopez (Spanish opening),Win,408,54.327563,751
144,White,Ruy Lopez (Spanish opening),Loss,319,42.476698,751
146,White,Ruy Lopez (Spanish opening),Draw,24,3.195739,751
153,White,Sicilian defence,Win,211,47.41573,445
152,White,Sicilian defence,Loss,218,48.988764,445
154,White,Sicilian defence,Draw,16,3.595506,445
64,Black,Queen's pawn game,Win,201,50.502513,398


In [4]:


def identify_weak_openings(df):
    """Return the rows of openings whose win rate is below their loss rate.

    Rows are grouped by ('color', 'opening_family'). Within each group the win
    rate is the 'percentage' of the row whose 'result' is 'Win', and the loss
    rate is the 'percentage' of the row whose 'result' is 'Loss'. An outcome
    with no row in a group counts as 0 %, so an opening that was never won but
    was lost at least once is reported as weak.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'color', 'opening_family', 'result' and 'percentage'
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df`` (every outcome, original columns and index labels)
        that belong to an opening with win rate < loss rate.
    """
    group_keys = [df['color'], df['opening_family']]

    # Keep the percentage only on the rows of the outcome of interest; the
    # grouped sum then broadcasts that opening's rate onto all of its rows and
    # naturally yields 0 when the outcome never occurred.
    win_share = df['percentage'].where(df['result'] == 'Win', 0.0)
    loss_share = df['percentage'].where(df['result'] == 'Loss', 0.0)
    opening_win_rate = win_share.groupby(group_keys).transform('sum')
    opening_loss_rate = loss_share.groupby(group_keys).transform('sum')

    # Positional mask, so the selection does not depend on index alignment.
    is_weak = (opening_win_rate < opening_loss_rate).to_numpy()
    return df[is_weak]

# Keep only the openings whose win rate is lower than their loss rate and
# display them (all outcome rows of each such opening are retained).
weak_openings = identify_weak_openings(opening_outcomes_count)    
weak_openings

Unnamed: 0,color,opening_family,result,count,percentage,games_count
153,White,Sicilian defence,Win,211,47.41573,445
152,White,Sicilian defence,Loss,218,48.988764,445
154,White,Sicilian defence,Draw,16,3.595506,445
50,Black,Queen's gambit declined,Win,65,46.428571,140
51,Black,Queen's gambit declined,Loss,68,48.571429,140
52,Black,Queen's gambit declined,Draw,7,5.0,140
14,Black,English opening,Win,32,43.835616,73
15,Black,English opening,Loss,39,53.424658,73
16,Black,English opening,Draw,2,2.739726,73
28,Black,Italian game,Win,21,43.75,48
