In [16]:
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics.pairwise import manhattan_distances, pairwise_distances

In [17]:
# Load the raw table from the CSV file that sits next to the notebook.
df_data = pd.read_csv("./df_data.csv")
# Keep the column labels; the encoding cell below iterates over them.
feat_list = df_data.columns
feat_list

Index(['Part', 'Reticle', 'Prev1Reticle', 'Tool', 'Prev1Tool', 'Prev2Tool',
       'ChuckID'],
      dtype='object')

In [18]:
# Integer-encode every feature column with its own LabelEncoder and keep the
# fitted encoders (in column order) in le_list.
le_list = []
encoded_cols = {}
for _feat in feat_list:
    le = LabelEncoder()
    le_list.append(le)
    encoded_cols[_feat] = le.fit_transform(df_data[_feat])
df_data_le = pd.DataFrame(encoded_cols)

# Expand the integer codes into a dense one-hot matrix.
ohe = OneHotEncoder()
res = ohe.fit_transform(df_data_le).toarray()
res.shape

(2105352, 71)

In [19]:
# uint8 can only represent 0..255. Casting a larger label code would wrap
# around silently and make distinct categories look identical, so fail loudly
# instead of corrupting the distances computed from this table.
if df_data_le.size and int(df_data_le.values.max()) > np.iinfo(np.uint8).max:
    raise ValueError(
        "label codes exceed the uint8 range (0-255); use a wider integer dtype"
    )
# Shrink the label-encoded table to one byte per cell.
df_data_le = df_data_le.astype("uint8")
df_data_le.values

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  1],
       [ 0,  0,  0, ...,  0,  1,  0],
       ...,
       [10, 10,  3, ...,  8,  7,  1],
       [10, 10,  3, ...,  8,  8,  0],
       [10, 10,  3, ...,  8,  8,  1]], dtype=uint8)

In [20]:
# Number of rows handed to a distance function per iteration.
BATCH_SIZE = 100_000
# Reference sets: the first 10,000 rows in one-hot form and in label-encoded form.
kres = res[:10_000]
u_kres = df_data_le.values[:10_000]

In [21]:
def get_max_batch(num, batch_size):
    """Return how many batches of ``batch_size`` are needed to cover ``num`` items.

    Parameters
    ----------
    num : int
        Total number of items (rows) to process.
    batch_size : int
        Maximum number of items per batch.

    Returns
    -------
    int
        ``ceil(num / batch_size)``; the last batch may be partially filled.

    Raises
    ------
    ValueError
        If ``batch_size`` is negative.
    ZeroDivisionError
        If ``batch_size`` is zero.
    """
    if batch_size < 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if isinstance(num, int) and isinstance(batch_size, int):
        # Pure integer ceiling division: exact for arbitrarily large values,
        # whereas num / batch_size goes through a float and can lose precision.
        return -(-num // batch_size)
    # Non-integer inputs keep the float-based computation.
    return math.ceil(num / batch_size)

In [22]:
def m1(res1, res2):
    """Return half the pairwise Manhattan distance between rows, as uint8.

    When both inputs are one-hot encodings with one active column per
    feature, every feature on which two rows disagree adds 2 to their
    Manhattan distance, so halving it gives the number of differing features.

    Parameters
    ----------
    res1, res2 : array-like
        Row-wise samples passed to ``manhattan_distances``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` matrix with one row per row of ``res1`` and one column per
        row of ``res2``.
    """
    l1 = manhattan_distances(res1, res2)
    halved = l1 / 2
    return halved.astype("uint8")

# Walk over the one-hot matrix in BATCH_SIZE-row slices and compare each slice
# against the reference rows in kres.
for i in range(get_max_batch(res.shape[0], BATCH_SIZE)):
    st = BATCH_SIZE * i
    # Clamp the end index so the final (possibly shorter) slice stops at the last row.
    et = min(st + BATCH_SIZE, res.shape[0])
    ures = res[st:et]
    print(f'{st}~{et}')
    out1 = m1(ures, kres)
    print(out1)

0~100000
[[0 1 1 ... 6 5 6]
 [1 0 2 ... 5 6 5]
 [1 2 0 ... 6 5 6]
 ...
 [5 4 5 ... 4 6 5]
 [4 5 4 ... 6 4 5]
 [5 4 5 ... 5 5 4]]
100000~200000
[[4 5 4 ... 6 5 6]
 [5 4 5 ... 5 6 5]
 [4 5 4 ... 6 5 6]
 ...
 [7 6 7 ... 6 7 6]
 [5 6 6 ... 7 6 7]
 [6 5 7 ... 6 7 6]]
200000~300000
[[6 7 5 ... 7 6 7]
 [7 6 6 ... 6 7 6]
 [6 7 6 ... 7 6 7]
 ...
 [7 6 7 ... 6 6 5]
 [6 7 6 ... 7 6 7]
 [7 6 7 ... 6 7 6]]
300000~400000
[[6 7 6 ... 7 6 7]
 [7 6 7 ... 6 7 6]
 [6 7 6 ... 7 6 7]
 ...
 [6 5 7 ... 5 6 5]
 [6 7 5 ... 6 5 6]
 [7 6 6 ... 5 6 5]]
400000~500000
[[6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 5 5 6]
 ...
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]]
500000~600000
[[6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 ...
 [7 6 6 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]]
600000~700000
[[6 7 6 ... 5 5 6]
 [7 6 7 ... 4 6 5]
 [6 7 6 ... 6 4 5]
 ...
 [5 4 5 ... 6 7 6]
 [4 5 4 ... 7 6 7]
 [5 4 5 ... 6 7 6]]
700000~800000
[[4 5 4 ... 7 6 7]
 [5 4 5 ... 6 7 6]
 [3 4 4 ... 7 6

In [23]:
# 15.1
# NOTE(review): the bare number above was left by the author without a unit;
# presumably a timing or memory reading for the m1 run — confirm.
# Unbind the last batch's distance matrix so its memory can be released.
del out1

In [24]:
def m2(res1, res2):
    """Count, for every pair of rows, the columns whose values differ.

    ``pairwise_distances(..., metric="hamming")`` returns the *fraction* of
    differing columns, so it is scaled back by the column count of ``res1``.

    Parameters
    ----------
    res1, res2 : array-like
        Row-wise samples with the same number of columns.

    Returns
    -------
    numpy.ndarray
        ``uint8`` matrix with one row per row of ``res1`` and one column per
        row of ``res2``.
    """
    n_features = res1.shape[1]
    mismatch_fraction = pairwise_distances(res1, res2, metric="hamming")
    # fraction * n_features is a float and may land just below the true integer
    # (e.g. 0.9999999999999999); astype truncates, so round to the nearest
    # integer first to avoid under-counting by one.
    return np.rint(mismatch_fraction * n_features).astype("uint8")

# Materialise the label-encoded matrix once instead of re-evaluating
# df_data_le.values on every iteration, and take the row count from the array
# that is actually sliced so this loop does not depend on the one-hot matrix.
u_all = df_data_le.values
n_rows = u_all.shape[0]
for i in range(get_max_batch(n_rows, BATCH_SIZE)):
    st = i * BATCH_SIZE
    # Clamp the end index so the final (possibly shorter) slice stops at the last row.
    et = min(st + BATCH_SIZE, n_rows)
    u_ures = u_all[st:et, :]
    print(f'{st}~{et}')
    out2 = m2(u_ures, u_kres)
    print(out2)

0~100000
[[0 1 1 ... 6 5 6]
 [1 0 2 ... 5 6 5]
 [1 2 0 ... 6 5 6]
 ...
 [5 4 5 ... 4 6 5]
 [4 5 4 ... 6 4 5]
 [5 4 5 ... 5 5 4]]
100000~200000
[[4 5 4 ... 6 5 6]
 [5 4 5 ... 5 6 5]
 [4 5 4 ... 6 5 6]
 ...
 [7 6 7 ... 6 7 6]
 [5 6 6 ... 7 6 7]
 [6 5 7 ... 6 7 6]]
200000~300000
[[6 7 5 ... 7 6 7]
 [7 6 6 ... 6 7 6]
 [6 7 6 ... 7 6 7]
 ...
 [7 6 7 ... 6 6 5]
 [6 7 6 ... 7 6 7]
 [7 6 7 ... 6 7 6]]
300000~400000
[[6 7 6 ... 7 6 7]
 [7 6 7 ... 6 7 6]
 [6 7 6 ... 7 6 7]
 ...
 [6 5 7 ... 5 6 5]
 [6 7 5 ... 6 5 6]
 [7 6 6 ... 5 6 5]]
400000~500000
[[6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 5 5 6]
 ...
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]]
500000~600000
[[6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 ...
 [7 6 6 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]]
600000~700000
[[6 7 6 ... 5 5 6]
 [7 6 7 ... 4 6 5]
 [6 7 6 ... 6 4 5]
 ...
 [5 4 5 ... 6 7 6]
 [4 5 4 ... 7 6 7]
 [5 4 5 ... 6 7 6]]
700000~800000
[[4 5 4 ... 7 6 7]
 [5 4 5 ... 6 7 6]
 [3 4 4 ... 7 6

In [25]:
# 15.1
# NOTE(review): the bare number above was left by the author without a unit;
# presumably a timing or memory reading for the m2 run — confirm.
# Unbind the last batch's distance matrix so its memory can be released.
del out2

In [26]:
from sklearn.metrics import DistanceMetric
def m3(res1, res2):
    """Count, for every pair of rows, the columns whose values differ.

    Uses ``DistanceMetric.get_metric('hamming')``, whose pairwise result is
    the *fraction* of differing columns, and scales it back by the column
    count of ``res2``.

    Parameters
    ----------
    res1, res2 : array-like
        Row-wise samples with the same number of columns.

    Returns
    -------
    numpy.ndarray
        ``uint8`` matrix with one row per row of ``res1`` and one column per
        row of ``res2``.
    """
    n_features = res2.shape[1]
    mismatch_fraction = DistanceMetric.get_metric('hamming').pairwise(res1, res2)
    # fraction * n_features is a float and may land just below the true integer
    # (e.g. 0.9999999999999999); astype truncates, so round to the nearest
    # integer first to avoid under-counting by one.
    return np.rint(mismatch_fraction * n_features).astype("uint8")

# Materialise the label-encoded matrix once instead of re-evaluating
# df_data_le.values on every iteration, and take the row count from the array
# that is actually sliced so this loop does not depend on the one-hot matrix.
u_all = df_data_le.values
n_rows = u_all.shape[0]
for i in range(get_max_batch(n_rows, BATCH_SIZE)):
    st = i * BATCH_SIZE
    # Clamp the end index so the final (possibly shorter) slice stops at the last row.
    et = min(st + BATCH_SIZE, n_rows)
    u_ures = u_all[st:et, :]
    print(f'{st}~{et}')
    out3 = m3(u_ures, u_kres)
    print(out3)

0~100000
[[0 1 1 ... 6 5 6]
 [1 0 2 ... 5 6 5]
 [1 2 0 ... 6 5 6]
 ...
 [5 4 5 ... 4 6 5]
 [4 5 4 ... 6 4 5]
 [5 4 5 ... 5 5 4]]
100000~200000
[[4 5 4 ... 6 5 6]
 [5 4 5 ... 5 6 5]
 [4 5 4 ... 6 5 6]
 ...
 [7 6 7 ... 6 7 6]
 [5 6 6 ... 7 6 7]
 [6 5 7 ... 6 7 6]]
200000~300000
[[6 7 5 ... 7 6 7]
 [7 6 6 ... 6 7 6]
 [6 7 6 ... 7 6 7]
 ...
 [7 6 7 ... 6 6 5]
 [6 7 6 ... 7 6 7]
 [7 6 7 ... 6 7 6]]
300000~400000
[[6 7 6 ... 7 6 7]
 [7 6 7 ... 6 7 6]
 [6 7 6 ... 7 6 7]
 ...
 [6 5 7 ... 5 6 5]
 [6 7 5 ... 6 5 6]
 [7 6 6 ... 5 6 5]]
400000~500000
[[6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 5 5 6]
 ...
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]]
500000~600000
[[6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 ...
 [7 6 6 ... 5 6 5]
 [6 7 6 ... 6 5 6]
 [7 6 7 ... 5 6 5]]
600000~700000
[[6 7 6 ... 5 5 6]
 [7 6 7 ... 4 6 5]
 [6 7 6 ... 6 4 5]
 ...
 [5 4 5 ... 6 7 6]
 [4 5 4 ... 7 6 7]
 [5 4 5 ... 6 7 6]]
700000~800000
[[4 5 4 ... 7 6 7]
 [5 4 5 ... 6 7 6]
 [3 4 4 ... 7 6

In [27]:
# 15.1
# NOTE(review): the bare number above was left by the author without a unit;
# presumably a timing or memory reading for the m3 run — confirm.
# Unbind the last batch's distance matrix so its memory can be released.
del out3