In [1]:
results_path = "search-results/"
xp_name = "query"

job_managers_qty= 2
kafka_qty= 8
kafka_partitions= 32
kafka_replicas= 8
kafka_node_selector= "kafka"
# eval
events_num= 12000000000
source_parallelism= 32

memory_granularity = 32768
memory_range = [32768, 65536]
task_slots_minimal = 9
task_slots_limits = [80]
initial_parallelism = 1
reset_kafka_data = True
limit_backpressure_source = 0
monitoring_step = "5s"
notebooks = ["/xp_intro_q5_kafka_custom_ratelimit"]
cpu = 16
task_slots_per_task_manager = 16
task_managers_qty = 8
run = 0
warmup = 450
timeout = 600
nb_runs_throughput = 1
nb_runs_parallelism = 1
kind = False
g5k = True
proxmox = False
results = ""
previous_results = [] 
filter_data = ""
dichotomic_mst_tuning = { #q5/q8! beware
    "initial_rate": 10**8,
    "slide_window": 75,
    "size_window": 60,
    "observation_size": 30,
    "timeout": 900,
    "mean_threshold": 0.01,
    "higher_bound_ratio": 2,
    "cooldown_throughput": 200,
    #"warmup": 120 # usage of higher level parameter
    #"nb_sources": usage of of higher level parameter
}

optimization = {
    "nb_iterations": 20,
    "base_estimator": "gp",
    "n_initial_points": 6,
    "initial_point_generator": "grid",
    "acq_func": "EI",
    "random_state": 42,
}

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.pipeline import make_pipeline
import seaborn as sns
from matplotlib import pyplot as plt
import logging
import numpy as np
from time import sleep
import sys
%run ../common/scripts.ipynb
if "../streambed" not in sys.path:
    sys.path.append("../streambed")
import streambed
streambed.setup_logging(default_path="./logging.yaml", default_level=logging.WARN)
logger = logging.getLogger('streambed')
if kind:
    streambed.init_kind()
elif g5k:
    streambed.init_g5k()
else:
    streambed.init_remote()

logging.info("Execute notebook")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [3]:
#papermill_description=Reset_Kafka
if reset_kafka_data:
    #papermill_description=Initialize libraries
    init_label_nodes(jobmanagers_qty=job_managers_qty, kafka_qty=kafka_qty)

    address = "127-0-0-1"
    port=30081

    !kubectl delete -n kafka kafka/my-cluster
    sleep(10)
    #run_command("kubectl create -f ./kafka.yaml", shell=False)
    streambed.deploy_kafka(kafka_partitions, kafka_replicas, node_selector=kafka_node_selector, antiaffinity=g5k)
    run_command("kubectl wait kafka/my-cluster --for=condition=Ready --timeout=300s -n kafka", shell=False)
    # Kafka UI
    run_command("helm install -f values-kowl.yaml -n kafka kowl cloudhut/kowl", shell=False)
    #(ip_url, dns_url) = get_service_public_address("kafka", "manager", "kowl", 80)
    (manager_node, jobmanager_node, taskmanager_node) = get_label_nodes()
    print("Kafka Kowl: {} - Ingress: {}".format(manager_node, "http://kowl.{}.sslip.io:{}".format(address,port)))    
    !kubectl apply -f ./kafka-bridge.yaml
    !kubectl apply -f ./kafka-bridge-service.yaml

gros-1.nancy.grid5000.fr	{'beta.kubernetes.io/arch': 'amd64', 'beta.kubernetes.io/os': 'linux', 'kubernetes.io/arch': 'amd64', 'kubernetes.io/hostname': 'gros-1.nancy.grid5000.fr', 'kubernetes.io/os': 'linux', 'node-role.kubernetes.io/controlplane': 'true', 'node-role.kubernetes.io/etcd': 'true'}
gros-10.nancy.grid5000.fr	{'beta.kubernetes.io/arch': 'amd64', 'beta.kubernetes.io/os': 'linux', 'kubernetes.io/arch': 'amd64', 'kubernetes.io/hostname': 'gros-10.nancy.grid5000.fr', 'kubernetes.io/os': 'linux', 'node-role.kubernetes.io/worker': 'true', 'tier': 'manager'}
gros-100.nancy.grid5000.fr	{'beta.kubernetes.io/arch': 'amd64', 'beta.kubernetes.io/os': 'linux', 'kubernetes.io/arch': 'amd64', 'kubernetes.io/hostname': 'gros-100.nancy.grid5000.fr', 'kubernetes.io/os': 'linux', 'node-role.kubernetes.io/worker': 'true', 'tier': 'jobmanager'}
gros-101.nancy.grid5000.fr	{'beta.kubernetes.io/arch': 'amd64', 'beta.kubernetes.io/os': 'linux', 'kubernetes.io/arch': 'amd64', 'kubernetes.io/hostnam

In [5]:
#papermill_description=Initialize_data
if reset_kafka_data:
    streambed.deploy("nexmark", streambed.flink_configuration(cpu=cpu,memory=max(memory_range),task_managers=task_managers_qty, task_slots=task_slots_per_task_manager, evenly_spread="true", custom_memory=None))
    if streambed.common.dynamic_flink_url:
        (_, base_url) = get_service_public_address("default", "jobmanager", "nexmark-flink-jobmanager", 8081)
        streambed.common.flink_base_url = base_url
    streambed.wait_for_task_managers(task_managers_qty=task_managers_qty, base_url=streambed.common.flink_base_url)

    streambed.delete_kafka_topic("nexmark")
    sleep(10)
    streambed.launch_job_async(notebook="/xp_intro_init_kafka", params_datagen={
        "params":{
            "TOPIC" : "nexmark",
            "BOOTSTRAP_SERVERS": "my-cluster-kafka-bootstrap.kafka:9092",
            "TPS": "1000000000",
            "EVENTS_NUM": str(events_num),
            "PERSON_PROPORTION" : "1",
            "AUCTION_PROPORTION": "3",
            "BID_PROPORTION": "46",
            "NEXMARK_TABLE": "kafka"
        }}, params_query = {}, timeout=1200)

  0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
list_xp=[]
for notebook in notebooks:
    for memory in memory_range:
        for task_slots_limit in task_slots_limits:
            list_xp.append(
                {"cpu": cpu, 
                "memory": memory, 
                "run": run, 
                "task_slots_per_task_manager": task_slots_per_task_manager, 
                "task_managers_qty": task_managers_qty,
                "source_parallelism": source_parallelism, 
                "parallelism": initial_parallelism, 
                "evenly_spread": "true", 
                "warmup": warmup,
                "custom_memory": None, 
                "dichotomic_mst_tuning": dichotomic_mst_tuning,
                "task_slots_limit": task_slots_limit,
                "throughputs": [10**8],
                "timeout": timeout,
                "limit_backpressure_source": limit_backpressure_source,
                "monitoring_step": monitoring_step,
                "nb_runs_throughput": nb_runs_throughput,
                "nb_runs_parallelism": nb_runs_parallelism,
                "notebook": notebook})
import datetime            
now = datetime.datetime.now()
now_str = now.strftime("%Y%m%d%H%M%S")     
path= "{}/{}-{}".format(results_path, xp_name, now_str)
display(list_xp)


[{'cpu': 16,
  'memory': 32768,
  'run': 0,
  'task_slots_per_task_manager': 16,
  'task_managers_qty': 8,
  'source_parallelism': 32,
  'parallelism': 1,
  'evenly_spread': 'true',
  'warmup': 180,
  'custom_memory': None,
  'dichotomic_mst_tuning': {'initial_rate': 100000000,
   'slide_window': 75,
   'size_window': 60,
   'observation_size': 30,
   'timeout': 900,
   'mean_threshold': 0.01,
   'higher_bound_ratio': 2,
   'cooldown_throughput': 200},
  'task_slots_limit': 80,
  'throughputs': [100000000],
  'timeout': 600,
  'limit_backpressure_source': 250,
  'nb_runs_throughput': 1,
  'nb_runs_parallelism': 1,
  'notebook': '/xp_intro_q5_kafka_custom_ratelimit'},
 {'cpu': 16,
  'memory': 65536,
  'run': 0,
  'task_slots_per_task_manager': 16,
  'task_managers_qty': 8,
  'source_parallelism': 32,
  'parallelism': 1,
  'evenly_spread': 'true',
  'warmup': 180,
  'custom_memory': None,
  'dichotomic_mst_tuning': {'initial_rate': 100000000,
   'slide_window': 75,
   'size_window': 60,


In [7]:
#papermill_description=Load precomputed data
reg = streambed.InfrastructureRegression()
if len(previous_results) == 0:
    results_estimation = streambed.loop_estimation(list_xp, results_path=path)
    reg.load_full([path])
    #reg.load(path + ".csv")
    #reg.load_details(path + ".yaml")
else:
    reg.load_full(previous_results)
    #for p in previous_results:
    #    reg.load(p + ".csv")
    #    reg.load_details(p + "-details.yaml")
min_memory = min(memory_range)
max_memory = max(memory_range)
min_ts = task_slots_minimal
max_ts = max(task_slots_limits) - source_parallelism
reg.filter_data("((memory=={})|(memory=={}))&((used_task_slots=={})|(used_task_slots=={}))".format(min_memory,max_memory,min_ts,max_ts), drop_duplicates=True) # task_slots_limit & memory
reg.data

Unnamed: 0,notebook,timeout,params.TPS,params.EVENTS_NUM,params.PERSON_PROPORTION,params.AUCTION_PROPORTION,params.BID_PROPORTION,params.NEXMARK_TABLE,params.TOPIC,params.BOOTSTRAP_SERVERS,...,"query.GroupWindowAggregate(groupBy=[\$f0], window=[SlidingGroupWindow('w\$, processingTime, 10000, 2000)], properties=[w\$start, w\$end, w\$proctime], select=[\$f0, COUNT(*) AS num, start('w\$) AS w\$start, end('w\$) AS w\$end, proctime('w\$) AS w\$proctime])","query.Calc(select=[\$f0 AS auction, num, w\$start AS starttime, w\$end AS endtime])","query.Calc(select=[w\$start AS starttime, w\$end AS endtime, num])","query.GroupAggregate(groupBy=[starttime, endtime], select=[starttime, endtime, MAX(num) AS maxn])","query.Calc(select=[maxn, starttime, endtime])","query.Join(joinType=[InnerJoin], where=[((starttime = starttime0) AND (endtime = endtime0) AND (num >= maxn))], select=[auction, num, starttime, endtime, maxn, starttime0, endtime0], leftInputSpec=[HasUniqueKey], rightInputSpec=[JoinKeyContainsUniqueKey])","query.Calc(select=[auction, num])","query.Sink: Sink(table=[default_catalog.default_database.discard_sink], fields=[auction, num])",used_task_slots,task_slot_memory
11,/xp_intro_q5_kafka_custom_ratelimit,120,100000000,0,1,3,46,kafka,nexmark;nexmark-control,my-cluster-kafka-bootstrap.kafka:9092,...,35,1,1,2,1,3,1,1,48,4096.0
0,/xp_intro_q5_kafka_custom_ratelimit,120,100000000,0,1,3,46,kafka,nexmark;nexmark-control,my-cluster-kafka-bootstrap.kafka:9092,...,1,1,1,1,1,1,1,1,9,4096.0
35,/xp_intro_q5_kafka_custom_ratelimit,120,100000000,0,1,3,46,kafka,nexmark;nexmark-control,my-cluster-kafka-bootstrap.kafka:9092,...,35,1,1,2,1,3,1,1,48,2048.0
24,/xp_intro_q5_kafka_custom_ratelimit,120,100000000,0,1,3,46,kafka,nexmark;nexmark-control,my-cluster-kafka-bootstrap.kafka:9092,...,1,1,1,1,1,1,1,1,9,2048.0


In [8]:
# Function to evaluate the model performance
def evaluate_model(model, X_val, y_val):
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    return mse

sqrt_regression = make_pipeline(FunctionTransformer(np.sqrt, validate=True), LinearRegression())
_, score = reg.generic_model("sqrt", sqrt_regression)

logarithmic_regression = make_pipeline(FunctionTransformer(np.log1p, validate=True), LinearRegression())
_, score = reg.generic_model("log", logarithmic_regression)

linear_regression = make_pipeline(PolynomialFeatures(degree=1), LinearRegression())
_, score = reg.generic_model("linear", linear_regression)

In [9]:
if len(task_slots_limits) == 1: 
    # get minimal number of task slots
    task_slots_range = np.array([source_parallelism + task_slots_minimal, task_slots_limits[0]]).astype(int)
else:
    task_slots_range = np.array(task_slots_limits).astype(int)
memory_range_reduced = (np.array(memory_range) / memory_granularity).astype(int)
display(task_slots_range)
display(memory_range_reduced)

array([41, 80])

array([1, 2])

In [10]:
def get_configuration(task_slots_qty, memory_qty):
    return [{"cpu": cpu, 
            "memory": memory_qty, 
            "run": run, 
            "task_slots_per_task_manager": task_slots_per_task_manager, 
            "task_managers_qty": task_managers_qty,
            "source_parallelism": source_parallelism, 
            "parallelism": initial_parallelism, 
            "evenly_spread": "true", 
            "warmup": warmup,
            "custom_memory": None, 
            "dichotomic_mst_tuning": dichotomic_mst_tuning,
            "task_slots_limit": task_slots_qty,
            "throughputs": [10**8],
            "timeout": timeout,
            "limit_backpressure_source": limit_backpressure_source,
            "monitoring_step": monitoring_step,
            "nb_runs_throughput": nb_runs_throughput,
            "nb_runs_parallelism": nb_runs_parallelism,
            "notebook": notebook}]

In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from skopt import Optimizer
from skopt.space import Integer, Categorical
from functools import partial

def objective_function(models, X, y):
    for model_name in models:
        models[model_name].fit(X, y)
   
    mse_by_model = {}
    min_mse = float('inf')
    for model_name in models:
        y_pred = models[model_name].predict(X)    
        mse = mean_squared_error(y_pred, y)
        mse_by_model[model_name] = mse
        if mse < min_mse:
            min_mse = mse
    return min_mse ,mse_by_model

def objective_function_loo(models, X, y):
    loo = LeaveOneOut()
    
    min_mse = float('inf')
    mse_by_model = {}
    if len(X) > 1:
        for model_name in models:
            model = models[model_name]
            scores = cross_val_score(model, X, y, cv=loo, scoring='neg_root_mean_squared_error')
            
            # Compute the average MSE (scores are negative, so we negate the result)
            mse_avg = -1 * scores.mean()
            mse_by_model[model_name] = mse_avg

            if mse_avg < min_mse:
                min_mse = mse_avg
    else:
        return 0, {}
    return min_mse, mse_by_model

df = reg.data[["used_task_slots", "memory", "observed_source_rate"]]
df["used_task_slots"] = df["used_task_slots"] + source_parallelism
df["memory"] = df["memory"] / memory_granularity
X = df[["used_task_slots","memory"]].values 
y = df[["observed_source_rate"]].values 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["used_task_slots"] = df["used_task_slots"] + source_parallelism
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


In [12]:
sqrt_regression = make_pipeline(FunctionTransformer(np.sqrt, validate=True), LinearRegression())
_, score = reg.generic_model("sqrt", sqrt_regression)
logarithmic_regression = make_pipeline(FunctionTransformer(np.log1p, validate=True), LinearRegression())
_, score = reg.generic_model("log", logarithmic_regression)
linear_regression = make_pipeline(PolynomialFeatures(degree=1), LinearRegression())
_, score = reg.generic_model("linear", linear_regression)


In [13]:
x_array = []
y_array = []
for i,x in enumerate(X):
    x_reshaped = x.reshape(1, 2)
    x_array.append(x)
    y_array.append(y[i])
    _, mse_by_model = objective_function_loo(reg.models, x_array, y_array)
    print("{} - {}".format(x, mse_by_model))

[80.  2.] - {}
[41.  2.] - {'sqrt': 823592.2857142858, 'log': 823592.2857142858, 'linear': 823592.2857142858}
[80.  1.] - {'sqrt': 299357.49946493667, 'log': 451681.1707379395, 'linear': 306690.5972529878}
[41.  1.] - {'sqrt': 51022.6282104866, 'log': 51022.62821048677, 'linear': 51022.62821048639}


In [14]:
x_array = []
y_array = []
for i,x in enumerate(X):
    x_reshaped = x.reshape(1, 2)
    x_array.append(x)
    y_array.append(y[i])
    _, mse_by_model = objective_function(reg.models, x_array, y_array)
    print("{} - {}".format(x, mse_by_model))

[80.  2.] - {'sqrt': 0.0, 'log': 0.0, 'linear': 0.0}
[41.  2.] - {'sqrt': 6.88214269644119e-21, 'log': 1.2207862352302604e-19, 'linear': 3.7375328797596e-20}
[80.  1.] - {'sqrt': 3.524362921367268e-19, 'log': 1.1926929758129927e-18, 'linear': 4.588095130960794e-21}
[41.  1.] - {'sqrt': 162706786.8440961, 'log': 162706786.84409612, 'linear': 162706786.84409535}


In [15]:
#papermill_description=
space = [Integer(min(task_slots_range), max(task_slots_range), name="task_slots"), Integer(min(memory_range_reduced), max(memory_range_reduced), name="memory")]
optimizer = Optimizer(dimensions=space, base_estimator="gp", n_initial_points=6, initial_point_generator="grid", acq_func="EI", random_state=42)

for model in reg.models:
    reg.models[model].fit(X, y)
for i,x in enumerate(X):
    x_reshaped = x.reshape(1, 2)
    mse,_ = objective_function_loo(reg.models, x_reshaped, y[i])
    print("Tell:{} {}".format(x.tolist(), mse))
    optimizer.tell(x.tolist(), mse)
for i in range(optimization["nb_iterations"]):
    val = optimizer.ask()
    display(val)
    optimizer.tell(val,10000)

Tell:[80.0, 2.0] 0
Tell:[41.0, 2.0] 0
Tell:[80.0, 1.0] 0
Tell:[41.0, 1.0] 0


[41, 2]

[67, 2]



[80, 1]



[80, 2]



[41, 1]

[45, 1]

[78, 1]

[42, 2]

[79, 2]

[42, 1]



[80, 1]



[41, 2]



[80, 2]



[41, 1]



[80, 1]



[41, 2]



[80, 2]

[51, 1]

[48, 2]



[80, 1]

In [16]:
#papermill_description=Streambed_estimation
reg.save(path)
space = [Integer(min(task_slots_range), max(task_slots_range), name="task_slots"), Integer(min(memory_range_reduced), max(memory_range_reduced), name="memory")]
optimizer = Optimizer(dimensions = space, 
                      base_estimator = optimization["base_estimator"], 
                      n_initial_points = optimization["n_initial_points"], 
                      initial_point_generator = optimization["initial_point_generator"], 
                      acq_func = optimization["acq_func"], 
                      random_state = optimization["random_state"])

for model in reg.models:
    reg.models[model].fit(X, y)
for i,x in enumerate(X):
    x_reshaped = x.reshape(1, 2)
    mse,_ = objective_function_loo(reg.models, x_reshaped, y[i])
    optimizer.tell(x.tolist(), mse)


for i in range(optimization["nb_iterations"]):
    x = optimizer.ask()
    print(x)
    logger.debug("BO iteration {} gives solutions: {} ".format(i, x))
    # evaluate this x
    task_slots = x[0]
    memory = x[1] * memory_granularity
    list_xp = get_configuration(task_slots, memory)
    print("{}".format(list_xp))
    results_estimation = streambed.loop_estimation(list_xp, results_path=path, details=reg.data_details)
    reg = streambed.InfrastructureRegression()
    reg.load_full([path])
    df = reg.data[["used_task_slots", "memory", "observed_source_rate"]]
    df["memory"] = df["memory"] / memory_granularity 
    X = df[["used_task_slots","memory"]].values
    y = df[["observed_source_rate"]].values 
    # Evaluate the objective function
    mse_val, mse_by_model = objective_function_loo(reg.models, X, y)  
    logger.info("Found MSE: {}".format(mse_by_model))
    print("Found MSE: {}".format(mse_by_model))
    optimizer.tell(x, mse_val)

[41, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 41, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

Found MSE: {'sqrt': 35571.60974734051, 'log': 35571.60974734063, 'linear': 35571.60974734051}
[67, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 67, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 39227.98912964317, 'log': 27142.699751318247, 'linear': 62515.91561560325}
[41, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 41, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]




  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

Found MSE: {'sqrt': 32009.43592532823, 'log': 21971.231867871385, 'linear': 51515.42487467656}
[80, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 80, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 34171.22422663578, 'log': 19830.605100107903, 'linear': 53490.413765977144}
[80, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 80, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]




  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 42847.552901894254, 'log': 32761.718213989036, 'linear': 54845.46137057054}
[80, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 80, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]




  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

Found MSE: {'sqrt': 39095.62655095762, 'log': 29717.60324668505, 'linear': 49976.619923242215}
[41, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 41, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 35652.49013083169, 'log': 27157.63764198683, 'linear': 45448.85636086828}
[41, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 41, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]




  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 32720.58531479731, 'log': 24944.98578864547, 'linear': 41655.11038354017}
[76, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 76, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 39766.710258094135, 'log': 30909.951561287628, 'linear': 50777.44369062704}
[79, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 79, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 39527.015707492355, 'log': 32273.10171325895, 'linear': 50069.32730497652}
[79, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 79, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 41128.69890468622, 'log': 36792.33487277004, 'linear': 49421.94182735686}
[42, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 42, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 39344.98989149756, 'log': 34387.92637305802, 'linear': 49163.55833246682}
[42, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 42, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 36656.55095965602, 'log': 32519.1846075859, 'linear': 45651.02100879521}
[48, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 48, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 39781.99919718373, 'log': 31525.60628631835, 'linear': 52935.176383913866}
[61, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 61, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 41540.14169137548, 'log': 31550.528354106576, 'linear': 58153.14412133048}
[56, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 56, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 45519.34824587083, 'log': 30007.28619970351, 'linear': 67643.21548554672}
[69, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 69, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 47326.09424426063, 'log': 30068.167818231763, 'linear': 72285.25786738834}
[55, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 55, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 48847.35461925929, 'log': 30123.647616701866, 'linear': 77025.36545674439}
[73, 2]
[{'cpu': 16, 'memory': 65536, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 73, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 46593.111069024315, 'log': 28813.8250636035, 'linear': 74000.05226620928}
[62, 1]
[{'cpu': 16, 'memory': 32768, 'run': 0, 'task_slots_per_task_manager': 16, 'task_managers_qty': 8, 'source_parallelism': 32, 'parallelism': 1, 'evenly_spread': 'true', 'warmup': 180, 'custom_memory': None, 'dichotomic_mst_tuning': {'initial_rate': 100000000, 'slide_window': 75, 'size_window': 60, 'observation_size': 30, 'timeout': 900, 'mean_threshold': 0.01, 'higher_bound_ratio': 2, 'cooldown_throughput': 200}, 'task_slots_limit': 62, 'throughputs': [100000000], 'timeout': 600, 'limit_backpressure_source': 250, 'nb_runs_throughput': 1, 'nb_runs_parallelism': 1, 'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["memory"] = df["memory"] / memory_granularity


Found MSE: {'sqrt': 45999.79498836464, 'log': 33390.88272001807, 'linear': 72151.2251548203}


In [None]:
list_xp

[{'cpu': 16,
  'memory': 65536,
  'run': 0,
  'task_slots_per_task_manager': 16,
  'task_managers_qty': 8,
  'source_parallelism': 32,
  'parallelism': 1,
  'evenly_spread': 'true',
  'warmup': 180,
  'custom_memory': None,
  'dichotomic_mst_tuning': {'initial_rate': 100000000,
   'slide_window': 75,
   'size_window': 60,
   'observation_size': 30,
   'timeout': 900,
   'mean_threshold': 0.01,
   'higher_bound_ratio': 2,
   'cooldown_throughput': 200},
  'task_slots_limit': 40,
  'throughputs': [100000000],
  'timeout': 600,
  'limit_backpressure_source': 250,
  'nb_runs_throughput': 1,
  'nb_runs_parallelism': 1,
  'notebook': '/xp_intro_q5_kafka_custom_ratelimit'}]