In [55]:
import os
import chess.pgn
from datetime import datetime

# Load up to `limit` games from every PGN file in ./datasets into plain dicts
# that can be sent to the search engine as documents.
print('Preloading games...')
limit = 10_000
games = []

# os.listdir returns entries in arbitrary order; sorting keeps the loaded sample
# identical between runs and machines.
for path in sorted(os.listdir("datasets")):
    if len(games) >= limit:
        break  # limit reached: no need to open the remaining files

    # The PGN parser reads from a text stream, so the encoding is chosen here;
    # it is made explicit instead of depending on the platform locale.
    with open(os.path.join("datasets", path), encoding="utf-8") as file:
        while len(games) < limit:
            game = chess.pgn.read_game(file)
            if game is None:
                break  # end of this file

            date = game.headers.get('UTCDate')  # 2012.12.31
            time = game.headers.get('UTCTime')  # 23:04:12

            # The headers are expressed in UTC. Parsing them with an explicit
            # "+0000" offset yields an aware datetime, so .timestamp() is a true
            # UTC epoch value (a naive datetime would be read as local time).
            played_at = datetime.strptime(f'{date} {time} +0000', '%Y.%m.%d %H:%M:%S %z')

            ts_game = {
                'id': game.headers.get('Site').split('/')[-1],  # [Site "https://lichess.org/j1dkb5dw"]
                'link': game.headers.get('Site'),
                'timestamp_utc': int(played_at.timestamp()),
                'event': game.headers.get('Event'),
                'white': game.headers.get('White'),
                'black': game.headers.get('Black'),
                'opening': game.headers.get('Opening'),
                'termination': game.headers.get('Termination'),
                'mainline_moves': str(game.mainline_moves()),
            }

            games.append(ts_game)
print('Loaded {} games'.format(len(games)))


def print_game(game):
    """Print a short summary of one game dict.

    Writes three lines (players with event, link, move list) followed by an
    empty line. `game` must provide the keys 'white', 'black', 'event',
    'link' and 'mainline_moves'.
    """
    headline = "{white} vs {black} ({event})".format_map(game)
    # A single print call: lines separated by newlines, then one blank line.
    print(headline, game['link'], game['mainline_moves'], sep='\n', end='\n\n')

Preloading games...
Loaded 10000 games


# Typesense

In [56]:
import requests
import typesense

# Connection settings, defined once and shared by the raw HTTP calls (requests)
# and the typesense client so the two can never point at different servers.
typesense_protocol = "http"
typesense_host = "localhost"
typesense_port = "8108"
typesense_url = f"{typesense_protocol}://{typesense_host}:{typesense_port}"

# The API key can be overridden through the environment; the fallback keeps the
# previous hard-coded development value.
typesense_api_key = os.environ.get("TYPESENSE_API_KEY", "xyz")
typesense_api_key_header = "X-TYPESENSE-API-KEY"

typesense_client = typesense.Client({
    'nodes': [{
        'host': typesense_host,
        'port': typesense_port,
        'protocol': typesense_protocol
    }],
    'api_key': typesense_api_key,
    'connection_timeout_seconds': 2
})

# Name of the collection that every benchmark cell creates, queries and deletes.
collection_name = 'chess'

# Metric snapshots, appended to by measure_metrics().
measurements = []


# noinspection PyShadowingBuiltins
def human_size(bytes, units=None):
    """Format a byte count using binary (1024-based) units.

    The value is truncated, not rounded: 1536 is rendered as '1KB'. Negative
    values (e.g. differences between two measurements) are formatted by
    magnitude and prefixed with '-'.

    :param bytes: integer number of bytes, may be negative
    :param units: unit suffixes from smallest to largest; defaults to
        ' bytes', 'KB', ... 'EB'
    :return: the formatted string, e.g. '512 bytes', '3MB', '-2KB'
    """
    if units is None:
        units = [' bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']

    negative = bytes < 0
    magnitude = -bytes if negative else bytes

    # Step up one unit per factor of 1024, but never past the largest unit:
    # values too big for the list are expressed in the last unit instead of
    # raising IndexError.
    index = 0
    while magnitude >= 1024 and index < len(units) - 1:
        magnitude >>= 10
        index += 1

    formatted = str(magnitude) + units[index]
    return '-' + formatted if negative else formatted


def measure_metrics(timeout=5):
    """Fetch the server's /metrics.json, normalise it and record it.

    The metric values listed below are cast to float/int (the casts suggest the
    endpoint reports them as strings; confirm against the server version in
    use). The resulting dict is appended to the global `measurements` list and
    returned.

    :param timeout: seconds to wait for the HTTP response before giving up
    :return: the measurement dict with numeric values
    :raises requests.HTTPError: if the server answers with an error status
    :raises KeyError: if an expected metric is missing from the response
    """
    float_metrics = (
        'system_cpu_active_percentage',
        'typesense_memory_fragmentation_ratio',
    )
    int_metrics = (
        'system_disk_used_bytes',
        'system_memory_used_bytes',
        'system_network_received_bytes',
        'system_network_sent_bytes',
        'typesense_memory_active_bytes',
        'typesense_memory_allocated_bytes',
        'typesense_memory_mapped_bytes',
        'typesense_memory_metadata_bytes',
        'typesense_memory_resident_bytes',
        'typesense_memory_retained_bytes',
    )

    response = requests.get(f"{typesense_url}/metrics.json",
                            headers={typesense_api_key_header: typesense_api_key},
                            timeout=timeout)
    # Fail here with a clear HTTP error (e.g. wrong API key) instead of a
    # confusing KeyError on the error payload further down.
    response.raise_for_status()
    measurement = response.json()

    for metric in float_metrics:
        measurement[metric] = float(measurement[metric])
    for metric in int_metrics:
        measurement[metric] = int(measurement[metric])

    measurements.append(measurement)
    return measurement


def subtract_measurements(m1, m2):
    """Return the per-metric difference `m1 - m2` of two measurements.

    Only the twelve numeric metrics listed below are included in the result;
    any other keys present in the inputs are ignored.
    """
    metric_names = (
        'system_cpu_active_percentage',
        'system_disk_used_bytes',
        'system_memory_used_bytes',
        'system_network_received_bytes',
        'system_network_sent_bytes',
        'typesense_memory_active_bytes',
        'typesense_memory_allocated_bytes',
        'typesense_memory_mapped_bytes',
        'typesense_memory_metadata_bytes',
        'typesense_memory_fragmentation_ratio',
        'typesense_memory_resident_bytes',
        'typesense_memory_retained_bytes',
    )
    return {metric: m1[metric] - m2[metric] for metric in metric_names}


def display_metrics_text(measurement):
    """Print one measurement as an aligned, human-readable table.

    Byte counters are rendered through human_size(); the CPU percentage and
    the fragmentation ratio are printed as raw numbers.
    """
    # (label, key in the measurement, renderer for the value)
    layout = (
        ('CPU', 'system_cpu_active_percentage', '{}%'.format),
        ('system_disk_used_bytes', 'system_disk_used_bytes', human_size),
        ('system_memory_used_bytes', 'system_memory_used_bytes', human_size),
        ('system_network_received_bytes', 'system_network_received_bytes', human_size),
        ('system_network_sent_bytes', 'system_network_sent_bytes', human_size),
        ('typesense_memory_active_bytes', 'typesense_memory_active_bytes', human_size),
        ('typesense_memory_allocated_bytes', 'typesense_memory_allocated_bytes', human_size),
        ('typesense_memory_mapped_bytes', 'typesense_memory_mapped_bytes', human_size),
        ('typesense_memory_metadata_bytes', 'typesense_memory_metadata_bytes', human_size),
        ('typesense_memory_fragmentation_ratio', 'typesense_memory_fragmentation_ratio', '{}'.format),
        ('typesense_memory_resident_bytes', 'typesense_memory_resident_bytes', human_size),
        ('typesense_memory_retained_bytes', 'typesense_memory_retained_bytes', human_size),
    )
    for label, key, render in layout:
        # Labels (including the colon) are left-aligned in a 38-character
        # column, which is the width of the longest label plus ": ".
        print(f"{label + ':':<38}{render(measurement[key])}")

1. Średni czas przetwarzania zadania zawierającego 10000 dokumentów (porównanie będzie się zaczynać na pustym indeksie)

In [57]:
print("Re/Creating collection...")

# (field name, field type) pairs of the collection; the trailing comments show
# the PGN header or content each field is filled from.
field_types = [
    ('link', 'string'),  # [Site "https://lichess.org/j1dkb5dw"]
    ('timestamp_utc', 'int32'),  # [UTCDate "2012.12.31"] [UTCTime "23:04:12"]
    ('event', 'string'),  # [Event "Rated Classical game"]
    ('white', 'string'),  # [White "BFG9k"]
    ('black', 'string'),  # [Black "mamalak"]
    ('opening', 'string'),  # [Opening "French Defense: Normal Variation"]
    ('termination', 'string'),  # [Termination "Normal"]
    ('mainline_moves', 'string'),  # 1. e4 e6 2. d4 b6 3. a3 Bb7 4. Nc3 Nh6 5. Bxh6 gxh6 6. Be2...
]

schema = {
    'name': collection_name,
    'fields': [{'name': field, 'type': kind} for field, kind in field_types],
    'default_sorting_field': 'timestamp_utc',
}

# Drop a collection left over from a previous run so the baseline snapshot is
# taken without it.
names = [existing['name'] for existing in typesense_client.collections.retrieve()]
collection_exists = collection_name in names
if collection_exists:
    typesense_client.collections[collection_name].delete()

# Baseline metrics, taken before any document is indexed.
before = measure_metrics()

Re/Creating collection...


In [58]:
%%timeit
# Benchmark: every timed loop drops the collection if it exists, recreates it
# from `schema` and bulk-imports all preloaded games, so each run starts from
# an empty index. The reported time therefore includes the delete/create calls.
#
# %%timeit runs the cell body inside a generated function, so `collection` is
# declared global to stay available to the cells that follow.
global collection

names = [x['name'] for x in typesense_client.collections.retrieve()]

if collection_name in names:
    typesense_client.collections[collection_name].delete()

typesense_client.collections.create(schema)

collection = typesense_client.collections[collection_name]

# NOTE(review): the return value of import_ is discarded, so documents rejected
# by the server would go unnoticed here - confirm whether that is acceptable.
collection.documents.import_(games, {
    'action': 'upsert',
})

733 ms ± 9.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [59]:
# Metrics snapshot taken once the timed import has finished; it is compared
# against `before` in the resource-usage report further down.
after = measure_metrics()

2. Czas odpowiedzi na pojedyncze zapytania

In [133]:
# Opening sequence used as the full-text query against the move lists.
query = "1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6"
print(f"Searching for '{query}'...")

# Search request: match on the move list only and fetch the first page of ten hits.
params = dict(
    q=query,
    query_by='mainline_moves',
    per_page=10,
    page=1,
)

Searching for '1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6'...


In [None]:
%%timeit
# Benchmark: latency of a single search request built from `params`.
# %%timeit runs the cell body inside a generated function, so the names are
# declared global: `collection` is read from the notebook namespace and
# `results` (the last response) is kept for the evaluation cell that follows.
global collection
global results
results = collection.documents.search(params)

3. Porównanie wyników wyszukiwania dla jednakowych zapytań

In [135]:
# Quality of the first result page (`results`) for `query`.
# A hit counts as relevant when its move list contains the query verbatim.
tp = 0
fp = 0
relevant = 0

# Count the relevant documents among *all* hits the engine returns for the
# query by walking through the result pages.
page_size = 250
page = 1
while True:
    hits = collection.documents.search({
        'q': query,
        'query_by': 'mainline_moves',
        'per_page': page_size,
        'page': page,
    })['hits']
    relevant += sum(1 for hit in hits if query in hit['document']['mainline_moves'])
    # Stop on a short (or empty) page. The decision must use the number of hits
    # returned, not the number of relevant ones: a full page with few relevant
    # hits is not the last page.
    if len(hits) < page_size:
        break
    page += 1

# Classify the hits of the first result page and show at most ten of them.
for result in results['hits']:
    if query in result['document']['mainline_moves']:
        tp += 1
    else:
        fp += 1
    if tp + fp <= 10:
        print_game(result['document'])

# Guard every ratio against an empty denominator (no hits / nothing relevant)
# so the report prints 0.00 instead of raising ZeroDivisionError.
precision = tp / (tp + fp) if tp + fp else 0.0
recall = tp / relevant if relevant else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

print()
print("Total relevant: {}".format(relevant))
print("False positives: {:.02f}".format(fp))
print("True positives: {:.02f}".format(tp))
print("Precision: {:.02f}".format(precision))
print("Recall: {:.02f}".format(recall))
print("F1: {:.02f}".format(f1))

Fisher62 vs bert (Rated Classical game)
https://lichess.org/cndw3ifd
1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6 5. O-O Nxe4 6. Re1 d5 7. d3 Nd6 8. Nxe5 Be7 9. Bxc6+ Kf8 10. Bxd5 Be6 11. Bxe6 fxe6 12. Qf3+ Bf6 13. Bg5 h6 14. Ng6+ Kf7 15. Nxh8+ Qxh8 16. Bxf6 gxf6 17. Nc3 h5 18. Ne4 Nf5 19. Ng3 Ng7 20. Qxb7 Rc8 21. Qxa6 h4 22. Ne4 h3 23. g3 Re8 24. Qc6 Re7 25. a4 Qh5 26. Nd2 Qf5 27. Qf3 Qa5 28. c3 c5 29. Reb1 e5 30. b4 cxb4 31. cxb4 Qc7 32. Nc4 Ne6 33. Qh5+ Kg8 34. Qxh3 Nd4 35. Qg4+ Rg7 36. Qd1 Qc6 37. Nd2 Rh7 38. b5 Qd7 39. Qf1 Qg4 40. Qg2 Ne2+ 41. Kf1 Nd4 42. Re1 Qe6 43. Nf3 Nc2 44. Rac1 Nxe1 45. Nxe1 Qa2 46. Rc8+ Kg7 47. Rc7+ Kg6 48. Rxh7 Kxh7 49. b6 Kg7 50. b7 Qb3 51. a5

Fisher62 vs Rodney (Rated Classical game)
https://lichess.org/3vnli7c6
1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 b5 5. Bb3 d6 6. h3 Nf6 7. O-O Nxe4 8. Re1 Nf6 9. d4 Be7 10. dxe5 dxe5 11. Qxd8+ Bxd8 12. Nxe5 Nxe5 13. Rxe5+ Be7 14. Bd5 Rb8 15. Bc6+ Bd7 16. Bxd7+ Nxd7 17. Re3 Rb6 18. Bd2 Re6 19. Rxe6 fxe6 20. Nc3 O-O 21. Re1 B

4. Zużycie procesora i RAMu w momencie przetwarzania zadania zawierającego 10000 dokumentów

In [None]:
def _show_section(title, snapshot):
    """Print a section title followed by the formatted measurement."""
    print(title)
    display_metrics_text(snapshot)


# Snapshots taken before and after the timed import, then their difference.
_show_section("BEFORE", before)
print()

_show_section("AFTER", after)
print()

print("DIFF")
display_metrics_text(subtract_measurements(after, before))

5. Średni czas odpowiedzi przy wielu jednoczesnych wyszukiwaniach - analiza przeprowadzona będzie na jednej maszynie, przy użyciu jednego skryptu wysyłającego zapytania HTTP, aby zminimalizować wpływ innych czynników niż silnik

In [None]:
import requests
from timeit import default_timer as timer
from concurrent.futures import ThreadPoolExecutor


def timed_search(_):
    """Run one search with the shared `params` and return its duration in seconds.

    The argument is ignored; it only exists so the function can be used with
    `pool.map` over a range. The function is deliberately not called `query`:
    that name holds the search string used by the search and evaluation cells,
    and redefining it would break them when they are run again.
    """
    start = timer()
    collection.documents.search(params)
    return timer() - start


# Submit 10000 searches to a pool allowed to grow to 10000 worker threads and
# report the mean per-request time.
with ThreadPoolExecutor(max_workers=10000) as pool:
    r = list(pool.map(timed_search, range(10000)))
    print(f"Average time: {sum(r) / len(r):.2f} seconds")

6. Wzrost wielkości indeksu w zależności od ilości zaindeksowanych dokumentów

In [None]:
import matplotlib.pyplot as plt

# Number of games sent per import request in the incremental indexing loop
# below; also shown in the x-axis label of the plots.
batch_size = 1000


def plot(data, name):
    """Show a scatter plot of `data` against its positional index.

    The x axis is the index of each value (one value per recorded measurement
    when called from plot_metric), the y axis is labelled with `name`.
    """
    positions = []
    for position, _value in enumerate(data):
        positions.append(position)

    plt.xlabel(f'batch number [every {batch_size} games]')
    plt.ylabel(name)
    plt.scatter(positions, data)
    plt.show()


def plot_metric(metric, name, transform=lambda x: x):
    """Plot one metric across all recorded measurements.

    :param metric: key to read from every entry of the global `measurements`
    :param name: y-axis label passed on to plot()
    :param transform: applied to each raw value before plotting (identity by default)
    """
    series = []
    for snapshot in measurements:
        series.append(transform(snapshot[metric]))
    plot(series, name)


def _bytes_to_mb(value):
    """Convert a byte count to megabytes (1024 * 1024 bytes) as a float."""
    return float(value) / 1024.0 / 1024.0


# Start a fresh measurement series for this experiment.
measurements.clear()

# Recreate the collection so indexing starts from an empty index.
names = [existing['name'] for existing in typesense_client.collections.retrieve()]
collection_exists = collection_name in names
if collection_exists:
    typesense_client.collections[collection_name].delete()

typesense_client.collections.create(schema)
collection = typesense_client.collections[collection_name]

# One snapshot for the empty index, then one after every imported batch.
measure_metrics()
batch_starts = range(0, len(games), batch_size)
for batch_number, offset in enumerate(batch_starts, start=1):
    print(f"Indexing batch {batch_number}...")
    batch = games[offset:offset + batch_size]
    collection.documents.import_(batch, {'action': 'upsert'})
    measure_metrics()

plot_metric("system_disk_used_bytes", "DISK MB", _bytes_to_mb)
plot_metric("system_cpu_active_percentage", "CPU %", float)
plot_metric("system_memory_used_bytes", "MEMORY MB", _bytes_to_mb)

7. Porównanie wsparcia dla popularnych języków programowania oraz próba określenia łatwości korzystania z dokumentacji i jej kompletności