In [9]:
#!pip install python-chess

In [None]:
import os
import zipfile
import pandas as pd
import chess.pgn
from io import TextIOWrapper

# === Configuración ===
zip_folder = "pgns"
os.makedirs("pgns_extracted", exist_ok=True)  # No usado, pero por si luego se quiere guardar
game_counter = 0

# === Contenedores de resultados ===
all_games_info = []
player_moves_data = []

# === Proceso de extracción ===
for zip_name in sorted(os.listdir(zip_folder)):
    if not zip_name.endswith(".zip"):
        continue

    player_lastname = os.path.splitext(zip_name)[0]  # "Carlsen", "Kasparov", etc.
    zip_path = os.path.join(zip_folder, zip_name)

    with zipfile.ZipFile(zip_path, 'r') as archive:
        for pgn_filename in archive.namelist():
            if not pgn_filename.endswith(".pgn"):
                continue

            with archive.open(pgn_filename) as file:
                text_io = TextIOWrapper(file, encoding='utf-8', errors='ignore')
                while True:
                    game = chess.pgn.read_game(text_io)
                    if game is None:
                        break

                    board = game.board()
                    moves = []
                    for move in game.mainline_moves():
                        try:
                            san = board.san(move)
                        except Exception:
                            san = None  # jugada inválida
                        if san:
                            moves.append(san)
                        board.push(move)

                    white = game.headers.get("White", "").lower()
                    black = game.headers.get("Black", "").lower()
                    lastname_lower = player_lastname.lower()

                    # Determinar si el jugador del archivo es blanco o negro
                    if lastname_lower in white:
                        player_color = "white"
                        player_moves = moves[::2]
                    elif lastname_lower in black:
                        player_color = "black"
                        player_moves = moves[1::2]
                    else:
                        continue  # Si el jugador no aparece como blanco ni negro, saltamos

                    # === Guardar en ambos dataframes ===
                    game_id = game_counter

                    all_games_info.append({
                        "id": game_id,
                        "white": game.headers.get("White", ""),
                        "black": game.headers.get("Black", ""),
                        "result": game.headers.get("Result", ""),
                        "event": game.headers.get("Event", ""),
                        "date": game.headers.get("Date", ""),
                        "player_file": player_lastname,
                        "full_moves": " ".join(moves),  # <-- todos los movimientos SAN
})

                    player_moves_data.append({
                        "id": game_id,
                        "player": player_lastname,
                        "color": player_color,
                        "moves": " ".join(player_moves),
                    })

                    game_counter += 1

# === Crear DataFrames ===
df_all_games = pd.DataFrame(all_games_info)
df_player_moves = pd.DataFrame(player_moves_data)


# === Guardar a CSV (opcional) ===
df_all_games.to_csv("csvs/games.csv", index=False)
df_player_moves.to_csv("csvs/player_moves.csv", index=False)


print(f"Partidas procesadas: {len(df_all_games)}")


illegal san: 'Qxe1' in r2k3r/2pPp3/p4n2/3b2B1/1p5P/2qP4/3RQ1P1/4K2R w - - 2 31 while parsing <Game at 0x19cae4ce180 ('Gelfand,B' vs. 'Gareev,T', '2019.12.29' at 'Moscow RUS')>


Partidas procesadas: 99897


In [11]:
df_all_games

Unnamed: 0,id,white,black,result,event,date,player_file,full_moves
0,0,Savrov,"Alekhine, Alexander",0-1,Earl tourn,1906.??.??,Alekhine,e4 e5 f4 Bc5 Nf3 d6 c3 Bg4 Be2 Bxf3 Bxf3 Nc6 b...
1,1,Giese,"Alekhine, Alexander",0-1,Earl tourn,1906.??.??,Alekhine,e4 e5 Nf3 Nc6 d4 exd4 Nxd4 Nf6 Nc3 Bb4 Nxc6 bx...
2,2,"Alekhine, Alexander","Ljubimov, T.",1-0,Earl tourn,1906.??.??,Alekhine,e4 e5 Nf3 Nc6 Bc4 Nf6 Ng5 d5 exd5 Na5 Bb5+ c6 ...
3,3,"Alekhine, Alexander",Romaskevic,1-0,Earl tourn,1906.??.??,Alekhine,e4 e5 Ne2 f5 Ng3 Qh4 Nc3 Bc5 Qf3 d6 exf5 Ne7 N...
4,4,"Manko, V.","Alekhine, Alexander",1-0,Earl tourn,1906.??.??,Alekhine,e4 e5 Nf3 Nc6 Bc4 Bc5 b4 Bxb4 c3 Ba5 d4 exd4 O...
...,...,...,...,...,...,...,...,...
99892,99892,"Zukertort, Johannes Hermann","Blackburne, Joseph Henry",0-1,London m,1887.??.??,Zukertort,d4 d5 e3 Nf6 c4 e6 Nf3 c5 b3 Nc6 Bb2 cxd4 exd4...
99893,99893,"Blackburne, Joseph Henry","Zukertort, Johannes Hermann",1/2-1/2,London m,1887.??.??,Zukertort,e4 e5 Nf3 Nc6 Bb5 Nf6 d3 d6 O-O g6 Nc3 Bd7 Bg5...
99894,99894,"Zukertort, Johannes Hermann","Blackburne, Joseph Henry",1/2-1/2,London m,1887.??.??,Zukertort,c4 e5 e3 Nc6 a3 g6 Nc3 Bg7 Nf3 Nge7 Be2 d5 cxd...
99895,99895,"Blackburne, Joseph Henry","Zukertort, Johannes Hermann",1/2-1/2,London m,1887.??.??,Zukertort,d4 d5 c4 e6 Nc3 Nf6 Bf4 c5 e3 Nc6 Nf3 cxd4 exd...


In [12]:
df_player_moves

Unnamed: 0,id,player,color,moves
0,0,Alekhine,black,e5 Bc5 d6 Bg4 Bxf3 Nc6 Bb6 Nce7 exd4 Nf6 O-O d...
1,1,Alekhine,black,e5 Nc6 exd4 Nf6 Bb4 bxc6 Qe7 d5 O-O Bc5 gxf6 B...
2,2,Alekhine,white,e4 Nf3 Bc4 Ng5 exd5 Bb5+ dxc6 Qf3 Bd3 Ne4 Ng3 ...
3,3,Alekhine,white,e4 Ne2 Ng3 Nc3 Qf3 exf5 Nce4 c3 d4 Nh5 Nhf6+ g...
4,4,Alekhine,black,e5 Nc6 Bc5 Bxb4 Ba5 exd4 dxc3 Qf6 Qg6 Nge7 Rb8...
...,...,...,...,...
99892,99892,Zukertort,white,d4 e3 c4 Nf3 b3 Bb2 exd4 bxc4 Nbd2 Qa4 Rd1 Bd3...
99893,99893,Zukertort,black,e5 Nc6 Nf6 d6 g6 Bd7 Bg7 h6 Bxf6 Bg7 O-O Ne7 N...
99894,99894,Zukertort,white,c4 e3 a3 Nc3 Nf3 Be2 cxd5 O-O Qc2 bxc3 d3 e4 h...
99895,99895,Zukertort,black,d5 e6 Nf6 c5 Nc6 cxd4 dxc4 Be7 O-O Bd7 Rc8 Qa5...


In [13]:
df_player_moves["player"].value_counts()

player
Kamsky         7035
Carlsen        6615
Caruana        5340
Aronian        5107
Ivanchuk       4950
Kramnik        4324
Anand          4204
Gelfand        3983
Timman         3621
Karjakin       3536
Karpov         3529
Portisch       3030
Leko           2681
Smyslov        2627
Topalov        2613
Tal            2431
Larsen         2383
Spassky        2231
Geller         2198
Kasparov       2128
Bronstein      1930
Petrosian      1893
Polugaevsky    1890
Alekhine       1661
Najdorf        1604
Keres          1571
Reshevsky      1267
Euwe           1122
Korchnoi       1038
Bogoljubow      973
Lasker          900
Botvinnik       891
Fischer         827
Rubinstein      797
Maroczy         756
Schlechter      739
Tarrasch        704
Chigorin        688
Anderssen       681
Capablanca      597
Steinitz        590
Nimzowitsch     512
Pillsbury       388
Fine            305
Staunton        284
Zukertort       265
Winawer         241
Morphy          211
Philidor          6
Name: count, 

In [None]:
#!pip install sentence-transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import sentence_transformers
import pandas as pd
import os

In [8]:
# Move one directory up to access the 'csvs' folder
os.chdir("..")

### SBERT

In [3]:
def vectorizar(jugadas):
    modelo = sentence_transformers.SentenceTransformer("bert-base-nli-mean-tokens")
    return  modelo.encode(jugadas)

In [10]:
df_player_moves = pd.read_csv("csvs/player_moves.csv")
df_player_moves["moves"] = df_player_moves["moves"].apply(vectorizar)
df_player_moves

TypeError: 'float' object is not subscriptable

In [None]:
df_player_moves.to_csv("vectorized_player_moves.csv", index=False)