In [63]:
import numpy as np

In [64]:
# Fixed seed so the sample -- and every number derived from it below -- is
# identical on each Restart & Run All.
SEED = 42
N_POINTS = 10**6

# A private RandomState keeps the seeding local instead of resetting NumPy's
# global random state for the rest of the notebook.
rng = np.random.RandomState(SEED)

# N_POINTS random 2-D points, one (x, y) pair per row, each coordinate drawn
# uniformly from [0, 1).
data = rng.rand(N_POINTS, 2)
print(data)
data.shape

[[0.39536923 0.83150378]
 [0.98854686 0.68824278]
 [0.37858042 0.77229227]
 ...
 [0.52019464 0.03057228]
 [0.68997189 0.58832385]
 [0.88121626 0.55268753]]


(1000000, 2)

In [65]:
# Centroid of the point cloud: average down the rows (axis 0), leaving one
# mean value per column.
cntr = np.mean(data, axis=0)
print(cntr)
cntr.shape

[0.50001974 0.50018125]


(2,)

Note the broadcasting technique used here:

  data - cntr -> results in a new (1e6, 2)
  array that contains the difference between
  each point's coordinates and the centroid.
  Since cntr is an array of shape (2,), it
  is broadcast across every row.

  After this, we square the x and y differences,
  and then sum the squares for each point.

  Finally, we take the square root.

In [66]:
# Euclidean distance of every point from the centroid:
# broadcast-subtract cntr from each row, square the offsets, add them up
# per row (axis 1), then take the square root.
dist = np.sqrt(np.square(data - cntr).sum(axis=1))
print(dist)
dist.shape

[0.34745698 0.52347482 0.29797973 ... 0.47004214 0.20940615 0.38479566]


(1000000,)

We have the distances; now we need to add them to the same array as a new column. We use `np.column_stack` for this.

In [68]:
# Append the distances as a third column next to the x and y coordinates.
# Slicing to the first two columns before stacking keeps this cell
# idempotent: re-running it rebuilds the same three-column array instead of
# piling one more distance column onto `data` each time.
data = np.column_stack((data[:, :2], dist))
print(data)
data.shape

[[0.39536923 0.83150378 0.34745698]
 [0.98854686 0.68824278 0.52347482]
 [0.37858042 0.77229227 0.29797973]
 ...
 [0.52019464 0.03057228 0.47004214]
 [0.68997189 0.58832385 0.20940615]
 [0.88121626 0.55268753 0.38479566]]


(1000000, 3)

In [84]:
# Row indices of the three points closest to the centroid.
# Only the distance column (column 2) is ranked; argsort-ing the whole array
# along axis 0 would also sort the x and y columns, whose orderings are
# never used.
indices = np.argsort(data[:, 2])[:3]

# Show the full (x, y, distance) rows for those three points, nearest first.
print(data[indices])

[[5.00306157e-01 4.99962886e-01 3.60158108e-04]
 [5.00239241e-01 4.99870770e-01 3.80229475e-04]
 [5.00112014e-01 4.99781287e-01 4.10465414e-04]]
