In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics.pairwise import manhattan_distances, pairwise_distances, pairwise_distances_chunked

In [2]:
# Load the raw table from a CSV file located next to the notebook.
df_data = pd.read_csv("./df_data.csv")
# Column labels of the loaded frame; the encoding cell iterates over these.
feat_list = df_data.columns
feat_list  # bare expression so the notebook displays the column Index

Index(['Part', 'Reticle', 'Prev1Reticle', 'Tool', 'Prev1Tool', 'Prev2Tool',
       'ChuckID'],
      dtype='object')

In [3]:
# Integer-encode every column of df_data with its own LabelEncoder.
# The fitted encoders are kept in le_list, in the same order as feat_list.
le_list = []
df_data_le = pd.DataFrame()
for _feat in feat_list:
    le = LabelEncoder()
    df_data_le[_feat] = le.fit_transform(df_data[_feat])
    le_list.append(le)
    
# One-hot encode the integer codes and convert the result to a dense array.
ohe = OneHotEncoder()
res = ohe.fit_transform(df_data_le).toarray()
res.shape  # displayed as the cell output

(493848, 73)

In [4]:
# Casting to uint8 wraps silently for values above 255, so verify that every
# label code fits before shrinking the frame.
assert df_data_le.to_numpy().max() <= np.iinfo(np.uint8).max, "label codes exceed the uint8 range"
df_data_le = df_data_le.astype("uint8")
df_data_le.values  # displayed as the cell output

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  1],
       [ 0,  0,  0, ...,  0,  1,  0],
       ...,
       [10,  3,  0, ..., 10,  7,  1],
       [10,  3,  0, ..., 10,  8,  0],
       [10,  3,  0, ..., 10,  8,  1]], dtype=uint8)

In [5]:
# Reference subset: the first 10000 rows, taken from the one-hot matrix (kres)
# and from the uint8 label-encoded matrix (u_kres).
kres = res[:10000,:]
u_kres = df_data_le.values[:10000,:]
# Number of rows sliced off per iteration by the batch loops in later cells.
BATCH_SIZE = 200000

In [6]:
def get_max_batch(num, batch_size):
    """Return the number of batches needed to cover ``num`` items.

    Parameters
    ----------
    num : int
        Total number of items.
    batch_size : int
        Maximum number of items per batch.

    Returns
    -------
    int
        ``ceil(num / batch_size)``; the last batch may be partial.

    Raises
    ------
    ZeroDivisionError
        If ``batch_size`` is 0.
    """
    # Ceiling via floor division of the negated count: this stays in integer
    # arithmetic, so large values are not rounded by a float division.
    return int(-(-num // batch_size))

In [7]:
def m1(res1, res2):
    """Return half the pairwise Manhattan distance between rows, as uint8.

    Computes ``manhattan_distances(res1, res2)``, divides by 2 and casts the
    result to ``uint8``. The output has one row per row of ``res1`` and one
    column per row of ``res2``.
    """
    return (manhattan_distances(res1, res2) / 2).astype("uint8")

# Walk the one-hot matrix in BATCH_SIZE-row slices and compare each slice
# against the reference rows in kres.
for i in range(get_max_batch(res.shape[0], BATCH_SIZE)):
    st = BATCH_SIZE * i
    # The final batch is clamped to the number of rows.
    et = min(st + BATCH_SIZE, res.shape[0])
    ures = res[st:et]
    print(f'{st}~{et}')
    out1 = m1(ures, kres)
    print(out1)

0~200000
[[0 1 1 ... 5 4 5]
 [1 0 2 ... 4 5 4]
 [1 2 0 ... 5 4 5]
 ...
 [6 5 6 ... 5 6 5]
 [4 5 5 ... 6 5 6]
 [5 4 6 ... 5 6 5]]
200000~400000
[[5 6 4 ... 6 5 6]
 [6 5 5 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 ...
 [5 4 6 ... 3 4 3]
 [5 6 4 ... 4 3 4]
 [6 5 5 ... 3 4 3]]
400000~493848
[[5 6 5 ... 4 3 4]
 [6 5 6 ... 3 4 3]
 [5 6 5 ... 3 3 4]
 ...
 [6 5 6 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 [6 5 6 ... 5 6 5]]


In [8]:
# 15.1  NOTE(review): bare figure left by the author with no unit or label; confirm what it records.
# Remove the name bound to the last batch's distance matrix.
del out1

In [9]:
def m2(res1, res2):
    """Count the differing features between every row pair, as uint8.

    The Hamming distance from ``pairwise_distances`` is scaled by the number
    of columns of ``res1`` to turn it into a mismatch count.

    Returns an array with one row per row of ``res1`` and one column per row
    of ``res2``. The result is cast to ``uint8``, so counts above 255 are not
    representable.
    """
    n_features = res1.shape[1]
    mismatches = pairwise_distances(res1, res2, metric="hamming") * n_features
    # fraction * n_features can land just below the true integer
    # (e.g. 1/49*49 == 0.9999999999999999), and a plain cast truncates,
    # so round to the nearest integer first.
    return np.rint(mismatches).astype("uint8")

# Walk the label-encoded matrix in BATCH_SIZE-row slices and compare each
# slice against the reference rows in u_kres.
for i in range(get_max_batch(res.shape[0], BATCH_SIZE)):
    st = BATCH_SIZE * i
    # The final batch is clamped to the number of rows.
    et = min(st + BATCH_SIZE, res.shape[0])
    u_ures = df_data_le.values[st:et]
    print(f'{st}~{et}')
    out2 = m2(u_ures, u_kres)
    print(out2)

0~200000
[[0 1 1 ... 5 4 5]
 [1 0 2 ... 4 5 4]
 [1 2 0 ... 5 4 5]
 ...
 [6 5 6 ... 5 6 5]
 [4 5 5 ... 6 5 6]
 [5 4 6 ... 5 6 5]]
200000~400000
[[5 6 4 ... 6 5 6]
 [6 5 5 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 ...
 [5 4 6 ... 3 4 3]
 [5 6 4 ... 4 3 4]
 [6 5 5 ... 3 4 3]]
400000~493848
[[5 6 5 ... 4 3 4]
 [6 5 6 ... 3 4 3]
 [5 6 5 ... 3 3 4]
 ...
 [6 5 6 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 [6 5 6 ... 5 6 5]]


In [10]:
# 15.1  NOTE(review): bare figure left by the author with no unit or label; confirm what it records.
# Remove the name bound to the last batch's distance matrix.
del out2

In [11]:
def m2c(res1, res2):
    """Chunked count of differing features between every row pair, as uint8.

    Fills a preallocated ``uint8`` matrix of shape
    ``(res1.shape[0], res2.shape[0])`` from the blocks yielded by
    ``pairwise_distances_chunked``. Each Hamming fraction is scaled by the
    number of columns of ``res1`` to give a mismatch count. Counts above 255
    are not representable in the output.
    """
    n_features = res1.shape[1]
    out = np.empty((res1.shape[0], res2.shape[0]), dtype="uint8")
    row = 0  # index of the first output row for the next chunk
    # working_memory=0 is kept from the original experiment.
    # NOTE(review): this presumably forces the smallest possible chunks; confirm.
    for _m in pairwise_distances_chunked(res1, res2, metric="hamming", working_memory=0):
        # Round before storing: assigning floats into a uint8 array truncates,
        # and fraction * n_features may sit just below the true integer.
        out[row:row + len(_m)] = np.rint(_m * n_features)
        row += len(_m)
    # out is already uint8; returning it directly avoids copying the full matrix.
    return out


# Walk the label-encoded matrix in BATCH_SIZE-row slices and compare each
# slice against the reference rows in u_kres using the chunked variant.
for i in range(get_max_batch(res.shape[0], BATCH_SIZE)):
    st = BATCH_SIZE * i
    # The final batch is clamped to the number of rows.
    et = min(st + BATCH_SIZE, res.shape[0])
    u_ures = df_data_le.values[st:et]
    print(f'{st}~{et}')
    out2c = m2c(u_ures, u_kres)
    print(out2c)

0~200000




[[0 1 1 ... 5 4 5]
 [1 0 2 ... 4 5 4]
 [1 2 0 ... 5 4 5]
 ...
 [6 5 6 ... 5 6 5]
 [4 5 5 ... 6 5 6]
 [5 4 6 ... 5 6 5]]
200000~400000




[[5 6 4 ... 6 5 6]
 [6 5 5 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 ...
 [5 4 6 ... 3 4 3]
 [5 6 4 ... 4 3 4]
 [6 5 5 ... 3 4 3]]
400000~493848




[[5 6 5 ... 4 3 4]
 [6 5 6 ... 3 4 3]
 [5 6 5 ... 3 3 4]
 ...
 [6 5 6 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 [6 5 6 ... 5 6 5]]


In [12]:
# 15.1  NOTE(review): bare figure left by the author with no unit or label; confirm what it records.
# Remove the name bound to the last batch's distance matrix.
del out2c

In [13]:
from sklearn.metrics import DistanceMetric
def m3(res1, res2):
    """Count the differing features between every row pair, as uint8.

    Uses ``DistanceMetric.get_metric('hamming').pairwise`` and scales the
    result by the number of columns of ``res2`` to turn the Hamming fraction
    into a mismatch count.

    Returns an array with one row per row of ``res1`` and one column per row
    of ``res2``. The result is cast to ``uint8``, so counts above 255 are not
    representable.
    """
    n_features = res2.shape[1]
    mismatches = DistanceMetric.get_metric('hamming').pairwise(res1, res2) * n_features
    # A plain cast truncates, and fraction * n_features can fall just below
    # the true integer, so round to the nearest integer first.
    return np.rint(mismatches).astype("uint8")

# Walk the label-encoded matrix in BATCH_SIZE-row slices and compare each
# slice against the reference rows in u_kres.
for i in range(get_max_batch(res.shape[0], BATCH_SIZE)):
    st = BATCH_SIZE * i
    # The final batch is clamped to the number of rows.
    et = min(st + BATCH_SIZE, res.shape[0])
    u_ures = df_data_le.values[st:et]
    print(f'{st}~{et}')
    out3 = m3(u_ures, u_kres)
    print(out3)

0~200000
[[0 1 1 ... 5 4 5]
 [1 0 2 ... 4 5 4]
 [1 2 0 ... 5 4 5]
 ...
 [6 5 6 ... 5 6 5]
 [4 5 5 ... 6 5 6]
 [5 4 6 ... 5 6 5]]
200000~400000
[[5 6 4 ... 6 5 6]
 [6 5 5 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 ...
 [5 4 6 ... 3 4 3]
 [5 6 4 ... 4 3 4]
 [6 5 5 ... 3 4 3]]
400000~493848
[[5 6 5 ... 4 3 4]
 [6 5 6 ... 3 4 3]
 [5 6 5 ... 3 3 4]
 ...
 [6 5 6 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 [6 5 6 ... 5 6 5]]


In [14]:
# 15.1  NOTE(review): bare figure left by the author with no unit or label; confirm what it records.
# Remove the name bound to the last batch's distance matrix.
del out3

In [16]:
def m2c(res1, res2):
    """Chunked count of differing features between every row pair, as uint8.

    This redefinition shadows the earlier ``m2c`` in this notebook. It does
    not pass ``working_memory``, so ``pairwise_distances_chunked`` uses its
    default chunk sizing.

    Fills a preallocated ``uint8`` matrix of shape
    ``(res1.shape[0], res2.shape[0])``. Each Hamming fraction is scaled by the
    number of columns of ``res1`` to give a mismatch count. Counts above 255
    are not representable in the output.
    """
    n_features = res1.shape[1]
    out = np.empty((res1.shape[0], res2.shape[0]), dtype="uint8")
    row = 0  # index of the first output row for the next chunk
    for _m in pairwise_distances_chunked(res1, res2, metric="hamming"):
        # Round before storing: assigning floats into a uint8 array truncates,
        # and fraction * n_features may sit just below the true integer.
        out[row:row + len(_m)] = np.rint(_m * n_features)
        row += len(_m)
    # out is already uint8; returning it directly avoids copying the full matrix.
    return out

# Walk the label-encoded matrix in BATCH_SIZE-row slices and compare each
# slice against the reference rows in u_kres using the chunked variant.
for i in range(get_max_batch(res.shape[0], BATCH_SIZE)):
    st = BATCH_SIZE * i
    # The final batch is clamped to the number of rows.
    et = min(st + BATCH_SIZE, res.shape[0])
    u_ures = df_data_le.values[st:et]
    print(f'{st}~{et}')
    out2c = m2c(u_ures, u_kres)
    print(out2c)

0~200000
[[0 1 1 ... 5 4 5]
 [1 0 2 ... 4 5 4]
 [1 2 0 ... 5 4 5]
 ...
 [6 5 6 ... 5 6 5]
 [4 5 5 ... 6 5 6]
 [5 4 6 ... 5 6 5]]
200000~400000
[[5 6 4 ... 6 5 6]
 [6 5 5 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 ...
 [5 4 6 ... 3 4 3]
 [5 6 4 ... 4 3 4]
 [6 5 5 ... 3 4 3]]
400000~493848
[[5 6 5 ... 4 3 4]
 [6 5 6 ... 3 4 3]
 [5 6 5 ... 3 3 4]
 ...
 [6 5 6 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 [6 5 6 ... 5 6 5]]
