In [None]:
!pip install memory-profiler openpyxl

In [None]:
import json, os
# Copy every entry under "Values" in local.settings.json into the process
# environment. The context manager closes the file handle as soon as the JSON
# is parsed instead of leaving it open until garbage collection.
with open("local.settings.json", encoding="utf-8") as settings_file:
    settings_values = json.load(settings_file)["Values"]
for k, v in settings_values.items():
    os.environ[k] = v

In [None]:
import pandas as pd
# Load the sample workbook, drop exact duplicate rows, then keep the first 1000 rows.
df = (
    pd.read_excel("real_sample_formatted.xlsx")
    .drop_duplicates()
    .head(1000)
)

# Asynchronous

In [None]:
from memory_profiler import memory_usage
from libs.utils.esquire.neighbors.logic_async import get_all_neighbors
import asyncio, concurrent.futures, numpy as np

In [None]:
def worker(chunk):
  """Run the async neighbor lookup for one chunk on a private event loop.

  A new event loop is created, installed as the calling thread's current
  loop, and used to drive ``get_all_neighbors(chunk, 5, True, -1)`` to
  completion.

  Args:
    chunk: The slice of input rows passed through to ``get_all_neighbors``.

  Returns:
    Whatever the awaitable returned by ``get_all_neighbors`` resolves to.

  Raises:
    Any exception raised while running the awaitable is propagated after
    the event loop has been closed.
  """
  loop = asyncio.new_event_loop()
  try:
    asyncio.set_event_loop(loop)
    return loop.run_until_complete(
      get_all_neighbors(chunk, 5, True, -1)
    )
  finally:
    # Close the loop on the failure path too, so its resources are released
    # even when the lookup raises.
    loop.close()

def run_async():
  """Run ``worker`` over chunks of the global ``df`` and merge the results.

  ``df`` is split into ``len(df) // 1000`` nearly equal chunks (at least
  one), each chunk is submitted to a single-thread executor, and the
  per-chunk frames are concatenated in completion order.

  Returns:
    The concatenated frame with duplicate rows dropped.
  """
  # len(df) // 1000 is 0 for fewer than 1000 rows, and np.array_split rejects
  # a section count of 0, so always keep at least one chunk.
  n_chunks = max(1, len(df) // 1000)
  # Split row positions instead of the frame itself: this yields the same
  # partition while avoiding the swapaxes FutureWarning that recent pandas
  # emits when a DataFrame is handed to np.array_split.
  chunks = [
    df.iloc[positions]
    for positions in np.array_split(np.arange(len(df)), n_chunks)
  ]
  with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
    futures = [executor.submit(worker, chunk) for chunk in chunks]
    results: pd.DataFrame = pd.concat(
      future.result()
      for future in concurrent.futures.as_completed(futures)
    )
  # NOTE(review): reindex() without arguments leaves the index unchanged; if a
  # fresh 0..n-1 index was intended this should be reset_index(drop=True) — confirm.
  return results.drop_duplicates().reindex()

In [None]:
# Sample memory every 0.1 s while run_async executes, keeping its return value
mem_usage_info_async, return_val_async = memory_usage(
    run_async,
    interval=0.1,
    timestamps=True,
    retval=True,
)

# Split each sample into its memory reading (index 0) and its timestamp (index 1)
mem_usage_async = [sample[0] for sample in mem_usage_info_async]
timestamps_async = [sample[1] for sample in mem_usage_info_async]

# Rebase the timestamps so the first sample sits at zero
relative_times_async = [stamp - timestamps_async[0] for stamp in timestamps_async]
len(return_val_async)

In [None]:
import matplotlib.pyplot as plt

# Draw the async memory trace against elapsed time on an explicit figure/axes pair
fig_async, ax_async = plt.subplots()
ax_async.plot(relative_times_async, mem_usage_async)
ax_async.set_xlabel('Time (seconds)')
ax_async.set_ylabel('Memory Usage (MiB)')
ax_async.set_title('Memory Usage Over Time (Asynchronous)')
plt.show()

# Report the peak of the sampled memory readings
print(f"Maximum memory usage: {max(mem_usage_async)} MiB")

# Synchronous

In [None]:
# Import under an alias: `worker` resolves the global name `get_all_neighbors`
# each time it runs, so rebinding that name here would make a re-run of the
# async cells silently call the vectorized implementation instead.
from libs.utils.esquire.neighbors.logic_vectorized import get_all_neighbors as get_all_neighbors_vectorized
from memory_profiler import memory_usage

# Define a wrapper function to run the profiling
def run_get_all_neighbors():
    """Call the vectorized ``get_all_neighbors`` on the global ``df``."""
    # NOTE(review): the async path passes 5 as the second argument while this
    # call passes 3 — confirm both runs are meant to use the same setting
    # before comparing their results.
    return get_all_neighbors_vectorized(df, 3)

# Profile the memory usage
mem_usage_info, return_val = memory_usage(run_get_all_neighbors, interval=0.1, timestamps=True, retval=True)

# Extract memory usage and timestamps
mem_usage = [item[0] for item in mem_usage_info]
timestamps = [item[1] for item in mem_usage_info]

# Convert timestamps to seconds relative to the first timestamp
relative_times = [(t - timestamps[0]) for t in timestamps]


In [None]:
import matplotlib.pyplot as plt

# Draw the synchronous memory trace against elapsed time on an explicit figure/axes pair
fig_sync, ax_sync = plt.subplots()
ax_sync.plot(relative_times, mem_usage)
ax_sync.set_xlabel('Time (seconds)')
ax_sync.set_ylabel('Memory Usage (MiB)')
ax_sync.set_title('Memory Usage Over Time (Synchronous)')
plt.show()

# Report the peak of the sampled memory readings
print(f"Maximum memory usage: {max(mem_usage)} MiB")

In [None]:
def _pct_change(baseline, candidate):
    """Return the percent change from `baseline` to `candidate`.

    Returns NaN when `baseline` is 0, e.g. when a run finished within a single
    sampling interval and its largest relative time is 0.
    """
    if baseline == 0:
        return float('nan')
    return ((candidate - baseline) / baseline) * 100

# Peak memory and largest relative timestamp for each run
peak_sync, peak_async = max(mem_usage), max(mem_usage_async)
elapsed_sync, elapsed_async = max(relative_times), max(relative_times_async)

print('   Synchronous |   Asynchronous |  % Change')
print(' ' + '='*45)
print(f'{peak_sync:>10.2f} MiB | {peak_async:>10.2f} MiB | {_pct_change(peak_sync, peak_async):>+7.2f} %')
print(f'{elapsed_sync:>10.2f} s   | {elapsed_async:>10.2f} s   | {_pct_change(elapsed_sync, elapsed_async):>+7.2f} %')


In [None]:
# Check that both implementations produced the same frame. DataFrame.compare
# raises ValueError when the two frames are not identically labelled, so
# report that case instead of failing with a traceback, and say so explicitly
# when the values differ rather than printing nothing.
try:
    differences = return_val.compare(return_val_async)
except ValueError as exc:
    print(f'Results are not comparable: {exc}')
else:
    if differences.empty:
        print('Same results!')
    else:
        print(f'Results differ in {len(differences)} row(s)')