In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics.pairwise import manhattan_distances, pairwise_distances, pairwise_distances_chunked

In [2]:
df_data = pd.read_csv("./df_data.csv")
feat_list = df_data.columns
BATCH_SIZE = 200000
feat_list

Index(['Part', 'Reticle', 'Prev1Reticle', 'Tool', 'Prev1Tool', 'Prev2Tool',
       'ChuckID'],
      dtype='object')

In [3]:
le_list = []
df_data_le = pd.DataFrame()
for _feat in feat_list:
    le = LabelEncoder()
    df_data_le[_feat] = le.fit_transform(df_data[_feat])
    le_list.append(le)
    
ohe = OneHotEncoder()
res = ohe.fit_transform(df_data_le).toarray()
res.shape

(493848, 72)

In [4]:
res = res.astype("uint8")
res

array([[1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)

In [5]:
known_comb = res[:2500,:]
unknown_comb = res[2500:,:]


known_data = df_data_le.iloc[:2500, :]
unknown_data = df_data_le.iloc[2500:, :]

In [6]:
def get_max_batch(num, batch_size):
    return math.ceil(num/batch_size)

def m1(res1, res2):
    out1 = manhattan_distances(res1, res2) / 2
    return out1.astype("uint8")

In [7]:
for i in range(get_max_batch(unknown_comb.shape[0], BATCH_SIZE)):
    st = i * BATCH_SIZE
    et = (i+1) * BATCH_SIZE if (i+1) * BATCH_SIZE < res.shape[0] else res.shape[0]
    batch_unknown_comb = unknown_comb[st:et,:]
    print(f'{st}~{et}')
    out1 = m1(batch_unknown_comb, known_comb)
    print(out1)

0~200000
[[3 4 3 ... 2 1 2]
 [4 3 4 ... 1 2 1]
 [3 4 3 ... 2 1 2]
 ...
 [6 5 6 ... 4 5 4]
 [5 6 5 ... 5 4 5]
 [6 5 6 ... 4 5 4]]
200000~400000
[[5 6 5 ... 5 4 5]
 [6 5 6 ... 4 5 4]
 [4 5 5 ... 6 5 6]
 ...
 [5 4 6 ... 5 6 5]
 [5 6 4 ... 6 5 6]
 [6 5 5 ... 5 6 5]]
400000~493848
[[5 6 5 ... 6 5 6]
 [6 5 6 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 ...
 [7 6 7 ... 6 7 6]
 [6 7 6 ... 7 6 7]
 [7 6 7 ... 6 7 6]]


In [8]:
def m2c(res1, res2):
    out = np.empty((res1.shape[0], res2.shape[0]), dtype="uint8")
    col = 0
    for _m in pairwise_distances_chunked(res1, res2, metric="hamming"):
        out[col:col+len(_m)] = (_m * res1.shape[1])
        col += len(_m)
    return out.astype("uint8")

In [9]:

known_data = df_data_le.iloc[:2500, :]
unknown_data = df_data_le.iloc[2500:, :]

In [10]:
for i in range(get_max_batch(unknown_data.shape[0], BATCH_SIZE)):
    st = i * BATCH_SIZE
    et = (i+1) * BATCH_SIZE if (i+1) * BATCH_SIZE < unknown_data.shape[0] else unknown_data.shape[0]
    batch_unknown_data = unknown_data.iloc[st:et,:]
    print(f'{st}~{et}')
    out2c = m2c(batch_unknown_data, known_data)
    print(out2c)

0~200000
[[3 4 3 ... 2 1 2]
 [4 3 4 ... 1 2 1]
 [3 4 3 ... 2 1 2]
 ...
 [6 5 6 ... 4 5 4]
 [5 6 5 ... 5 4 5]
 [6 5 6 ... 4 5 4]]
200000~400000
[[5 6 5 ... 5 4 5]
 [6 5 6 ... 4 5 4]
 [4 5 5 ... 6 5 6]
 ...
 [5 4 6 ... 5 6 5]
 [5 6 4 ... 6 5 6]
 [6 5 5 ... 5 6 5]]
400000~491348
[[5 6 5 ... 6 5 6]
 [6 5 6 ... 5 6 5]
 [5 6 5 ... 6 5 6]
 ...
 [7 6 7 ... 6 7 6]
 [6 7 6 ... 7 6 7]
 [7 6 7 ... 6 7 6]]
