In [8]:
import os
from datetime import datetime, timezone

import chess.pgn

print('Preloading games...')
limit = 10000  # maximum number of games kept in memory for the benchmark
games = []

# os.listdir() returns entries in arbitrary order; sorting makes the selected
# games reproducible across runs and machines.
for path in sorted(os.listdir("datasets")):
    if len(games) >= limit:
        break  # enough games collected, no need to open the remaining files

    with open(f'datasets/{path}') as file:
        while len(games) < limit:
            game = chess.pgn.read_game(file)
            if game is None:
                # read_game() returned nothing more for this file
                break

            date = game.headers.get('UTCDate')  # 2012.12.31
            time = game.headers.get('UTCTime')  # 23:04:12

            # The headers are UTC wall-clock values. A naive datetime would be
            # interpreted as *local* time by .timestamp(), shifting the result by
            # the machine's UTC offset, so the UTC zone is attached explicitly.
            played_at = datetime.strptime(f'{date} {time}', '%Y.%m.%d %H:%M:%S').replace(tzinfo=timezone.utc)

            ts_game = {
                'id': game.headers.get('Site').split('/')[-1],  # [Site "https://lichess.org/j1dkb5dw"]
                'link': game.headers.get('Site'),
                'timestamp_utc': int(played_at.timestamp()),
                'event': game.headers.get('Event'),
                'white': game.headers.get('White'),
                'black': game.headers.get('Black'),
                'opening': game.headers.get('Opening'),
                'termination': game.headers.get('Termination'),
                'mainline_moves': str(game.mainline_moves()),
            }

            games.append(ts_game)
print('Loaded {} games'.format(len(games)))


def print_game(game):
    """Print a short, human-readable summary of one preloaded game.

    Emits the players and event on the first line, then the game link,
    then the move list, followed by an empty separator line.

    :param game: mapping with the keys 'white', 'black', 'event', 'link'
        and 'mainline_moves'.
    """
    headline = "{} vs {} ({})".format(game['white'], game['black'], game['event'])
    print(headline)
    for field in ('link', 'mainline_moves'):
        print(game[field])
    print()

Preloading games...
Loaded 10000 games


# Typesense

In [9]:
import requests
import typesense

# Settings for raw HTTP calls against the Typesense node.
typesense_url = "http://localhost:8108"
typesense_api_key = "xyz"
typesense_api_key_header = "X-TYPESENSE-API-KEY"

# Client library handle, configured for the same node and the same API key.
typesense_client = typesense.Client(dict(
    nodes=[dict(host='localhost', port='8108', protocol='http')],
    api_key=typesense_api_key,
    connection_timeout_seconds=2,
))

collection_name = 'chess'

# Metric snapshots; measure_metrics() appends one entry per call.
measurements = []

# noinspection PyShadowingBuiltins
def human_size(bytes, units=None):
    """Format an integer byte count using binary (1024-based) units.

    The value is divided by 1024 (integer shift, so the result is truncated,
    not rounded) until it drops below 1024 or the largest available unit is
    reached. Values too large for the last unit are expressed in that unit
    instead of running off the end of the unit list.

    :param bytes: integer number of bytes; may be negative.
    :param units: optional non-empty list of unit suffixes, smallest first.
        Defaults to ' bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'.
    :return: formatted string, e.g. '512 bytes', '3MB' or '-2KB'.
    """
    if units is None:
        units = [' bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']

    minus = bytes < 0
    magnitude = -bytes if minus else bytes

    unit_index = 0
    # Stop at the last unit so huge values (or short custom unit lists) cannot
    # index past the end of `units`.
    while magnitude >= 1024 and unit_index < len(units) - 1:
        magnitude >>= 10
        unit_index += 1

    formatted = str(magnitude) + units[unit_index]
    if minus:
        formatted = '-' + formatted
    return formatted


def measure_metrics():
    """Fetch the current metrics from the Typesense node and record them.

    Requests ``{typesense_url}/metrics.json`` with the API key header, converts
    the known numeric fields to ``float``/``int``, appends the resulting dict
    to the module-level ``measurements`` list and returns it.

    :return: the metrics dict with numeric fields converted.
    :raises requests.HTTPError: if the node answers with an error status.
    :raises KeyError: if an expected metric is missing from the response.
    """
    response = requests.get(f"{typesense_url}/metrics.json",
                            headers={typesense_api_key_header: typesense_api_key})
    # Fail with the real HTTP error (e.g. a rejected API key) instead of a
    # misleading KeyError further down when the error body lacks the metrics.
    response.raise_for_status()
    measurement = response.json()

    float_metrics = (
        'system_cpu_active_percentage',
        'typesense_memory_fragmentation_ratio',
    )
    int_metrics = (
        'system_disk_used_bytes',
        'system_memory_used_bytes',
        'system_network_received_bytes',
        'system_network_sent_bytes',
        'typesense_memory_active_bytes',
        'typesense_memory_allocated_bytes',
        'typesense_memory_mapped_bytes',
        'typesense_memory_metadata_bytes',
        'typesense_memory_resident_bytes',
        'typesense_memory_retained_bytes',
    )

    # Normalise the values to numbers so snapshots can be subtracted and plotted
    # (presumably the endpoint reports them as strings - TODO confirm).
    for key in float_metrics:
        measurement[key] = float(measurement[key])
    for key in int_metrics:
        measurement[key] = int(measurement[key])

    measurements.append(measurement)
    return measurement


def subtract_measurements(m1, m2):
    """Return the per-metric difference ``m1 - m2`` of two metric snapshots.

    Only the twelve known metrics are included in the result; any other keys
    present in the inputs are ignored.

    :param m1: snapshot to subtract from (minuend).
    :param m2: snapshot to subtract (subtrahend).
    :return: new dict mapping each metric name to ``m1[name] - m2[name]``.
    :raises KeyError: if either snapshot lacks one of the metrics.
    """
    metric_names = (
        'system_cpu_active_percentage',
        'system_disk_used_bytes',
        'system_memory_used_bytes',
        'system_network_received_bytes',
        'system_network_sent_bytes',
        'typesense_memory_active_bytes',
        'typesense_memory_allocated_bytes',
        'typesense_memory_mapped_bytes',
        'typesense_memory_metadata_bytes',
        'typesense_memory_fragmentation_ratio',
        'typesense_memory_resident_bytes',
        'typesense_memory_retained_bytes',
    )
    return {metric: m1[metric] - m2[metric] for metric in metric_names}


def display_metrics_text(measurement):
    """Print one metrics snapshot as an aligned, human-readable table.

    Byte counts are rendered through ``human_size``; the CPU percentage and
    the fragmentation ratio are printed as-is.

    :param measurement: metrics dict holding the twelve known metric keys.
    """
    def emit(label, value):
        # Labels are left-aligned in a 38-character column so the values line up.
        print(f"{label:<38}{value}")

    def size_of(metric):
        return human_size((measurement[metric]))

    emit("CPU:", f"{measurement['system_cpu_active_percentage']}%")
    emit("system_disk_used_bytes:", size_of('system_disk_used_bytes'))
    emit("system_memory_used_bytes:", size_of('system_memory_used_bytes'))
    emit("system_network_received_bytes:", size_of('system_network_received_bytes'))
    emit("system_network_sent_bytes:", size_of('system_network_sent_bytes'))
    emit("typesense_memory_active_bytes:", size_of('typesense_memory_active_bytes'))
    emit("typesense_memory_allocated_bytes:", size_of('typesense_memory_allocated_bytes'))
    emit("typesense_memory_mapped_bytes:", size_of('typesense_memory_mapped_bytes'))
    emit("typesense_memory_metadata_bytes:", size_of('typesense_memory_metadata_bytes'))
    emit("typesense_memory_fragmentation_ratio:", measurement['typesense_memory_fragmentation_ratio'])
    emit("typesense_memory_resident_bytes:", size_of('typesense_memory_resident_bytes'))
    emit("typesense_memory_retained_bytes:", size_of('typesense_memory_retained_bytes'))

1. Średni czas przetwarzania zadania zawierającego 10000 dokumentów (porównanie będzie się zaczynać na pustym indeksie)

In [10]:
print("Re/Creating collection...")

# Collection layout: one document per game, sorted by default on the game timestamp.
# The trailing comments show the PGN header each field is taken from.
schema = dict(
    name=collection_name,
    fields=[
        dict(name='link', type='string'),  # [Site "https://lichess.org/j1dkb5dw"]
        dict(name='timestamp_utc', type='int32'),  # [UTCDate "2012.12.31"] [UTCTime "23:04:12"]
        dict(name='event', type='string'),  # [Event "Rated Classical game"]
        dict(name='white', type='string'),  # [White "BFG9k"]
        dict(name='black', type='string'),  # [Black "mamalak"]
        dict(name='opening', type='string'),  # [Opening "French Defense: Normal Variation"]
        dict(name='termination', type='string'),  # [Termination "Normal"]
        dict(name='mainline_moves', type='string'),  # 1. e4 e6 2. d4 b6 3. a3 Bb7 4. Nc3 Nh6 5. Bxh6 gxh6 6. Be2...
    ],
    default_sorting_field='timestamp_utc',
)

# Remove a collection left over from a previous run, if there is one.
names = [existing['name'] for existing in typesense_client.collections.retrieve()]
if collection_name in names:
    typesense_client.collections[collection_name].delete()

# Baseline metrics snapshot, taken after the cleanup above.
before = measure_metrics()

Re/Creating collection...


In [35]:
%%timeit
# Timed end-to-end indexing run: drop the collection if it exists, recreate it
# from `schema`, then bulk-import all preloaded games.
# `global` keeps the collection handle available to later cells (presumably
# needed because %%timeit executes the cell body in its own scope).
global collection

names = [x['name'] for x in typesense_client.collections.retrieve()]

if collection_name in names:
    typesense_client.collections[collection_name].delete()

typesense_client.collections.create(schema)

collection = typesense_client.collections[collection_name]

# NOTE(review): the value returned by import_ is discarded, so per-document
# import failures (if any) would go unnoticed here.
collection.documents.import_(games, {
    'action': 'upsert',
})

702 ms ± 8.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
# Metrics snapshot taken after the import; compared with `before` in the resource-usage section.
after = measure_metrics()

2. Czas odpowiedzi na pojedyncze zapytania

In [45]:
# Search phrase and request parameters shared by the search cells below.
query = "e4 e5"
print(f"Searching for '{query}'...")
params = dict(
    q=query,
    query_by='mainline_moves',
    sort_by='timestamp_utc:desc',
    per_page=10,
    page=1,
)

Searching for 'e4 e5'...


In [46]:
%%timeit
# Timed single search using the shared `params`.
# `results` is declared global so the last response can be inspected by the
# following cell (presumably required because %%timeit runs the body in its own scope).
global collection
global results
results = collection.documents.search(params)

5.21 ms ± 350 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


3. Porównanie wyników wyszukiwania dla jednakowych zapytań

In [52]:
# Classify every returned hit: a true positive contains the query text verbatim
# in its move list, anything else counts as a false positive.
tp = 0
fp = 0
for result in results['hits']:
    if query in result['document']['mainline_moves']:
        tp += 1
    else:
        fp += 1
    print_game(result['document'])

# An empty result page gives tp + fp == 0; report a precision of 0 instead of
# failing with ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
# Recall is hard-coded to 1, as in the original report (false negatives are not counted here).
recall = 1
f1 = 2 * precision * recall / (precision + recall)

print()
print("False positives: {:.02f}".format(fp))
print("True positives: {:.02f}".format(tp))
print("Precision: {:.02f}".format(precision))
print("Recall: {:.02f}".format(recall))
print("F1: {:.02f}".format(f1))

CrazyBullet vs Paker (Rated Bullet game)
https://lichess.org/rb1lnek9
1. e3 b6 2. d3 c5 3. f3 Bb7 4. e4 d5 5. e5 f6 6. exf6 Nxf6 7. Qe2 Qd6 8. Bg5 e5 9. Bxf6 gxf6 10. Nc3 Bh6 11. Nb5 Qc6 12. a4 a6 13. Nc3

pronik vs elenin012 (Rated Classical game)
https://lichess.org/7s6f3zgm
1. d4 d5 2. Nf3 Bf5 3. Bd2 e6 4. Nc3 Bd6 5. e3 Ne7 6. Be2 h6 7. O-O c6 8. a3 Bc7 9. h3 Qd6 10. Bd3 Bg6 11. Ne5 Nd7 12. f4 Nf5 13. Ne2 O-O 14. c4 dxc4 15. Bxc4 b5 16. Bd3 a5 17. Nxg6 fxg6 18. Bc1 Rf7 19. g4 Nh4 20. Qe1 g5 21. Qg3 Raf8 22. e4 Qe7 23. e5 Nb6 24. b3 Nd5 25. Bd2 b4 26. axb4 axb4 27. Be4 Nxf4 28. Nxf4 gxf4 29. Rxf4 Rxf4 30. Bxf4 g5 31. Be3 Qf7 32. Bxc6 Nf3+ 33. Bxf3 Qxf3 34. Qxf3 Rxf3 35. Re1 Rxh3 36. Kg2 Rh4 37. Kg3 Kg7 38. Rc1 Bd8 39. Rc8 Be7 40. Rc7 Kf7 41. Bd2 Ke8 42. Rb7 Rh1 43. Bxb4 Bxb4 44. Rxb4 Rd1 45. Kf3 Kd7 46. Rb7+ Kc6 47. Rh7 Rxd4 48. Rxh6 Kd5 49. Rg6 Kxe5 50. Rxg5+ Kf6 51. Rb5 Rd3+ 52. Ke4 Rg3 53. g5+ Kg6 54. Re5 Rxb3 55. Rxe6+ Kxg5 56. Kd5 Kf5 57. Re5+ Kf4 58. Re8 Rd3+ 59. Kc4 Re3 60. Rf

4. Zużycie procesora i RAMu w momencie przetwarzania zadania zawierającego 10000 dokumentów

In [None]:
# Show both raw snapshots first, each followed by a blank separator line ...
for title, snapshot in (("BEFORE", before), ("AFTER", after)):
    print(title)
    display_metrics_text(snapshot)
    print()

# ... then the change caused by the import (after - before).
print("DIFF")
display_metrics_text(subtract_measurements(after, before))

5. Średni czas odpowiedzi przy wielu jednoczesnych wyszukiwaniach - analiza przeprowadzona będzie na jednej maszynie, przy użyciu jednego skryptu wysyłającego zapytania HTTP, aby zminimalizować wpływ innych czynników niż silnik

In [None]:
import requests
from timeit import default_timer as timer
from concurrent.futures import ThreadPoolExecutor

def query(n):
    """Run one search with the shared ``params`` and return its duration.

    NOTE: defining this function rebinds the global name ``query``, which an
    earlier cell uses for the search phrase string; re-running that earlier
    cell restores the string.

    :param n: task index supplied by the caller; not used.
    :return: elapsed wall-clock time of the search call, in seconds.
    """
    started_at = timer()
    collection.documents.search(params)
    finished_at = timer()
    return finished_at - started_at

# Submit 10000 searches to a thread pool allowed to grow to 10000 workers and
# average the per-request durations returned by query().
# NOTE(review): each duration is measured inside the worker, so time spent
# waiting for a free worker is not included.
with ThreadPoolExecutor(max_workers=10000) as pool:
    r = list(pool.map(query,range(10000)))
    print(f"Average time: {sum(r)/len(r):.2f} seconds")

6. Wzrost wielkości indeksu w zależności od ilości zaindeksowanych dokumentów

In [None]:
import matplotlib.pyplot as plt

# Number of games sent per import call in the batch loop below; also shown in the plot's x-axis label.
batch_size = 1000

def plot(data, name):
    """Scatter-plot a series of values against their position in the series.

    :param data: sequence of y values, one per recorded snapshot.
    :param name: label for the y axis.
    """
    positions = list(range(len(data)))

    plt.scatter(positions, data)
    plt.xlabel(f'batch number [every {batch_size} games]')
    plt.ylabel(name)
    plt.show()


def plot_metric(metric, name, transform=lambda x: x):
    """Plot one metric across all snapshots stored in ``measurements``.

    :param metric: key of the metric to read from every snapshot.
    :param name: y-axis label passed on to ``plot``.
    :param transform: callable applied to each raw value before plotting
        (identity by default).
    """
    series = []
    for snapshot in measurements:
        series.append(transform(snapshot[metric]))
    plot(series, name)

# Start from a clean slate: forget earlier snapshots and recreate the collection empty.
measurements.clear()

names = [x['name'] for x in typesense_client.collections.retrieve()]

if collection_name in names:
    typesense_client.collections[collection_name].delete()

typesense_client.collections.create(schema)

collection = typesense_client.collections[collection_name]

# One snapshot for the empty index, then one after every imported batch, so
# measurements[k] describes the state after k batches.
measure_metrics()
for i in range(0, len(games), batch_size):
    print(f"Indexing batch {i // batch_size + 1}...")
    collection.documents.import_(games[i:i+batch_size], {
        'action': 'upsert',
    })
    measure_metrics()

# Byte counts are converted to megabytes (divide by 1024 twice) for plotting.
plot_metric("system_disk_used_bytes", "DISK MB", lambda x: float(x) / 1024.0 / 1024.0)
plot_metric("system_cpu_active_percentage", "CPU %", lambda x: float(x))
plot_metric("system_memory_used_bytes", "MEMORY MB", lambda x: float(x) / 1024.0 / 1024.0)

7. Porównanie wsparcia dla popularnych języków programowania oraz próba określenia łatwości korzystania z dokumentacji i jej kompletności