In [3]:
%pylab inline
import numpy as np 
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

Populating the interactive namespace from numpy and matplotlib


## Data

In [4]:
file_name = '/dataset/personal_projects/lsm/20160123_1차 발주_3280.xlsx'

## Settings

In [5]:
N_DAS = 96
SHEET_NAME = 'original'
FINDING_LOCATION_TIME = 9
FINDING_PRODUCT_TIME = 12

## Programs

In [10]:
class LSM(object):
    EMPTY_LOCATION = 'Z1-1'
    
    def __init__(self, n_das=N_DAS, sheet=SHEET_NAME):
        self.data = None
        self.vectors = None
        self.N_DAS = n_das
        self.sheet = SHEET_NAME
        
        self.barcode_to_index = None
        self.index_to_barcode = None
        
        self.location_to_idx = None
        self.idx_to_location = None
        
        self.barcode_vectors = None
        self.location_vectors = None
        
        
    def load(self, file_name):
        COL_NAMES = ['date', 'order_number', 'brand_code', 'brand_name', 
                     'product', 'property', 'location', 'barcode', 'n', 'price', 
                     'status', 'orderer', 'recipient', 'address', 'postcode', 
                     'contact1', 'contact2', 'comment', 'part_delivery', 'code']
        self.data = pd.read_excel(file_name, 
                                  names=COL_NAMES,
                                  sheetname=self.sheet)
        
    def frequent_barcodes(self):
        frq_barcodes = self.data[['barcode', 'n']].groupby(by='barcode').sum()
        frq_barcodes = frq_barcodes.sort_values(by='n', ascending=False)
        codes = []
        for barcode in frq_barcodes.index:
            for code in self.data[self.data['barcode'] == barcode]['code'].values:
                if code not in codes:
                    codes.append(code)
        return codes
    
    def useless_barcodes(self):
        pass
    
    
    def frequent_code_index(self):
        # 가장 빈번한 barcode를 찾음
        summed = np.sum(self.barcode_vectors, axis=0)
        most_frequent_barcode_idx = np.argmax(summed)
        most_frequent_code_idx = np.argmax(self.barcode_vectors[:, most_frequent_barcode_idx])
        return most_frequent_code_idx
    
        
    def preprocess(self):
        # Pre-Process missing locations
        self.data['location'] = self.data['location'].fillna(self.EMPTY_LOCATION)
        
        # Split Locations to l1, l2, l3
        location_regex = '(?P<l1>[a-zA-Z]+)-?(?P<l2>\d+)-(?P<l3>\d+)'
        self.data['location'][~self.data['location'].str.contains(location_regex)][:] = self.EMPTY_LOCATION
        location_series = self.data['location'].str.extract(location_regex, expand=False)
        self.data = pd.concat([self.data, location_series], axis=1)
        
        # Create Barcode Vectors
        n_uniq_code = len(self.data['code'].unique())
        n_uniq_barcode = len(self.data['barcode'].unique())
        barcode_vectors = np.zeros((n_uniq_code, n_uniq_barcode)) # (N 주문, N 바코드 즉 상품)
        
        self.code_to_index = {code: i for i, code in enumerate(self.data['code'].unique())}
        self.index_to_code = {idx: code for code, idx in self.code_to_index.items()}
        
        self.barcode_to_index = {barcode: i for i, barcode in enumerate(self.data['barcode'].unique())}
        self.index_to_barcode = {idx: barcode for barcode, idx in self.barcode_to_index.items()}
        
        for barcode, code, n in self.data[['barcode', 'code', 'n']].values:
            code_idx = self.code_to_index[code]
            barcode_idx = self.barcode_to_index[barcode]
            barcode_vectors[code_idx, barcode_idx] += n
        
        # Create Location Vectors
        unique_locations = list()
        for l1, l2, l3 in self.data[['l1', 'l2', 'l3']].values:
            l1 = 'l1_' + str(l1)
            l2 = 'l2_' + str(l2)
            l3 = 'l3_' + str(l3)
            if l1 not in unique_locations:
                unique_locations.append(l1)
                
            if l2 not in unique_locations:
                unique_locations.append(l2)
                
            if l3 not in unique_locations:
                unique_locations.append(l3)
        
        n_unique_location = len(unique_locations)
        self.location_to_idx = {loc: i for i, loc in enumerate(unique_locations)}
        self.idx_to_location = {i: loc for loc, i in self.location_to_idx.items()}
        location_vectors = np.zeros((n_uniq_code, n_unique_location)) # 주문별 위치의 vector 값 (N 주문, N 위치)
        
        for code, l1, l2, l3 in self.data[['code', 'l1', 'l2', 'l3']].values:
            code_idx = self.code_to_index[code]
            l1_idx = self.location_to_idx['l1_' + str(l1)]
            l2_idx = self.location_to_idx['l2_' + str(l2)]
            l3_idx = self.location_to_idx['l3_' + str(l3)]
            location_vectors[code_idx, l1_idx] += 1
            location_vectors[code_idx, l2_idx] += 1
            location_vectors[code_idx, l3_idx] += 1
        
        
        # Normalization
        self.barcode_scaler = MinMaxScaler()
        self.location_scaler = MinMaxScaler()
        
        self.barcode_vectors = self.barcode_scaler.fit_transform(barcode_vectors)
        self.location_vectors = self.barcode_scaler.fit_transform(location_vectors)
    
    def get_code(self, code):
        return self.data[self.data['code'] == code]
    
    def process(self): 
        # Init
        code_idx =self.frequent_code_index()
        n_barcode = self.barcode_vectors.shape[0]
        
        # Init for Barcode Vectors
        barcode_vectors = self.barcode_vectors.copy()
        barcode_acum = np.zeros(self.barcode_vectors.shape[-1])
        barcodes_indices = [code_idx] 
        
        # Search Similar Barcodes
        for i in range(n_barcode):
            bvector = self.barcode_vectors[code_idx]
            barcode_vectors[code_idx] = 0

            n_barcode = np.sum(bvector > 0)
            barcode_acum += bvector/n_barcode

            sim_scores = self.sim(barcode_vectors, barcode_acum)
            code_idx = np.argmax(sim_scores)
            code = self.index_to_code[code_idx]
            score = sim_scores[code_idx]
            barcodes_indices.append(code_idx)
            
            print(i, score, code_idx, np.sum(barcode_acum))
        
    def sim(self, a, b):
        # Sigmoid 
        return 1./(1+ np.e**np.sum(-np.multiply(a, b), axis=1)) - 0.5
        
    
    def _get_barcode_vector(self, code_idx, g):
        return self.barcode_vectors[code_idx]
    
        
lsm = LSM()
lsm.load(file_name)
lsm.preprocess()
# frq_codes = lsm.frequent_barcodes()
# start_code = frq_codes[0] # 2849240

lsm.process()



0 0.0915484699393 620 0.777777777778
1 0.177157814602 690 1.55555555556
2 0.285210995905 781 2.22222222222
3 0.361054265874 1042 2.93055555556
4 0.374839600425 921 3.70833333333
5 0.433896760843 1460 4.48611111111
6 0.448664206885 599 5.31944444444
7 0.451624022596 452 6.31944444444
8 0.476550256692 1407 7.15277777778
9 0.485936372957 1296 7.98611111111
10 0.479011990518 1134 8.98611111111
11 0.48473284612 358 9.76388888889
12 0.489013057369 1396 10.7638888889
13 0.489882637025 291 11.7638888889
14 0.491422514586 388 12.6805555556
15 0.495577715047 1054 13.6805555556
16 0.497313073248 1265 14.6805555556
17 0.497080250208 417 15.6805555556
18 0.498923897988 485 16.6805555556
19 0.499603854725 492 17.6805555556
20 0.499759688287 508 18.6805555556
21 0.49991158083 650 19.6805555556
22 0.499967470587 1219 20.6805555556
23 0.499967470587 663 21.6805555556
24 0.499988032852 701 22.6805555556
25 0.499995597499 736 23.6805555556
26 0.499998380406 769 24.6805555556
27 0.499999168472 1167 25.680

233 0.446656818469 1645 225.568722944
234 0.454057500252 270 226.568722944
235 0.47162146758 275 227.568722944
236 0.477759164126 504 228.568722944
237 0.491701384407 545 229.568722944
238 0.496931010859 1001 230.568722944
239 0.497608245192 1015 231.568722944
240 0.499118790297 1110 232.568722944
241 0.499675640388 1236 233.568722944
242 0.499880650297 1326 234.568722944
243 0.499956090385 1428 235.568722944
244 0.499983846107 1477 236.568722944
245 0.499988425177 1480 237.568722944
246 0.499992979483 1493 238.568722944
247 0.499997417285 1508 239.568722944
248 0.499999049871 1510 240.568722944
249 0.499999650467 1555 241.568722944
250 0.499999871414 1598 242.568722944
251 0.342399428165 1528 243.568722944
252 0.37627035423 1231 244.568722944
253 0.4186573277 604 245.568722944
254 0.398673859378 295 246.568722944
255 0.411355820521 223 247.468722944
256 0.404568820135 1011 248.343722944
257 0.409600457081 956 249.218722944
258 0.422614287808 1527 250.218722944
259 0.456168262863 1320 

468 0.446173837231 1269 457.463167388
469 0.460834277203 1457 458.463167388
470 0.428810579773 1455 459.463167388
471 0.437516587724 1523 460.213167388
472 0.396444931712 644 461.213167388
473 0.39488748681 840 462.213167388
474 0.392512568041 750 463.213167388
475 0.389272682028 469 464.213167388
476 0.456200002284 735 465.213167388
477 0.483428055905 1198 466.213167388
478 0.493838982939 1381 467.213167388
479 0.387620890659 11 468.213167388
480 0.428685247935 494 469.213167388
481 0.447846436922 727 470.213167388
482 0.467704535302 871 471.213167388
483 0.383397030839 238 472.213167388
484 0.40530200416 1630 473.213167388
485 0.424141819979 294 474.213167388
486 0.444450739842 1572 475.213167388
487 0.478820889545 1652 476.213167388
488 0.478647443839 1503 477.213167388
489 0.470687769249 180 478.213167388
490 0.489013057369 696 479.213167388
491 0.495929862284 919 480.213167388
492 0.460519472009 723 481.213167388
493 0.460202241287 530 482.213167388
494 0.435030830871 216 483.2131

702 0.317574476194 356 687.125204425
703 0.424141819979 771 688.125204425
704 0.470687769249 809 689.125204425
705 0.303502933453 1361 690.125204425
706 0.362158343035 192 691.125204425
707 0.397215975083 136 692.125204425
708 0.411600322793 27 693.125204425
709 0.465554804334 40 694.125204425
710 0.478820889545 52 695.125204425
711 0.492102916493 152 696.125204425
712 0.296841069544 638 697.125204425
713 0.299525814678 1074 698.125204425
714 0.293447594318 1353 698.875204425
715 0.322493497288 202 699.625204425
716 0.291391472674 161 700.625204425
717 0.291391472674 980 701.625204425
718 0.285484152665 1045 702.625204425
719 0.283420904232 1211 703.458537759
720 0.309998433985 1047 704.358537759
721 0.420561450816 1580 705.358537759
722 0.277299861175 163 706.358537759
723 0.277299861175 210 707.358537759
724 0.304815322986 753 708.358537759
725 0.273151123081 1295 709.358537759
726 0.270339159867 1347 710.358537759
727 0.270003918741 419 711.108537759
728 0.280885367462 552 712.10853

940 0.418089422845 114 908.09464887
941 0.439913349826 496 909.09464887
942 0.47702263009 673 910.09464887
943 0.491422514586 1397 911.09464887
944 0.494779874306 1601 912.09464887
945 0.197059283965 627 913.09464887
946 0.197059283965 1028 914.09464887
947 0.362158343035 1034 915.09464887
948 0.444450739842 1080 916.09464887
949 0.478820889545 1256 917.09464887
950 0.492102916493 1466 918.09464887
951 0.197059283965 1215 919.09464887
952 0.197059283965 1318 920.09464887
953 0.23105857863 671 921.09464887
954 0.380797077978 1405 922.09464887
955 0.452574126822 1447 923.09464887
956 0.482013790038 1469 924.09464887
957 0.197059283965 1317 925.09464887
958 0.197059283965 1406 926.09464887
959 0.197059283965 1421 927.09464887
960 0.186398078552 148 928.09464887
961 0.209806497656 755 928.927982203
962 0.268524783499 343 929.761315536
963 0.186398078552 510 930.761315536
964 0.179178699175 63 931.761315536
965 0.277299861175 65 932.761315536
966 0.351952801968 90 933.761315536
967 0.404650

1169 0.168187772168 1217 1130.9279822
1170 0.0906532827009 1618 1131.9279822
1171 0.0825702064623 31 1132.9279822
1172 0.122459331202 864 1133.9279822
1173 0.0825702064623 87 1134.9279822
1174 0.0825702064623 141 1135.9279822
1175 0.0825702064623 229 1136.9279822
1176 0.0825702064623 287 1137.9279822
1177 0.197059283965 824 1138.9279822
1178 0.291391472674 895 1139.9279822
1179 0.0825702064623 299 1140.9279822
1180 0.197059283965 1360 1141.9279822
1181 0.0825702064623 307 1142.9279822
1182 0.197059283965 443 1143.9279822
1183 0.0825702064623 310 1144.9279822
1184 0.0825702064623 359 1145.9279822
1185 0.160756368766 285 1146.9279822
1186 0.0825702064623 371 1147.9279822
1187 0.0825702064623 401 1148.9279822
1188 0.0825702064623 404 1149.9279822
1189 0.0825702064623 461 1150.9279822
1190 0.0825702064623 477 1151.9279822
1191 0.291391472674 1355 1152.9279822
1192 0.0825702064623 574 1153.9279822
1193 0.0825702064623 653 1154.9279822
1194 0.197059283965 1626 1155.9279822
1195 0.08257020646

1421 0.0 0 1381.09464887
1422 0.0 0 1382.09464887
1423 0.0 0 1383.09464887
1424 0.0 0 1384.09464887
1425 0.0 0 1385.09464887
1426 0.0 0 1386.09464887
1427 0.0 0 1387.09464887
1428 0.0 0 1388.09464887
1429 0.0 0 1389.09464887
1430 0.0 0 1390.09464887
1431 0.0 0 1391.09464887
1432 0.0 0 1392.09464887
1433 0.0 0 1393.09464887
1434 0.0 0 1394.09464887
1435 0.0 0 1395.09464887
1436 0.0 0 1396.09464887
1437 0.0 0 1397.09464887
1438 0.0 0 1398.09464887
1439 0.0 0 1399.09464887
1440 0.0 0 1400.09464887
1441 0.0 0 1401.09464887
1442 0.0 0 1402.09464887
1443 0.0 0 1403.09464887
1444 0.0 0 1404.09464887
1445 0.0 0 1405.09464887
1446 0.0 0 1406.09464887
1447 0.0 0 1407.09464887
1448 0.0 0 1408.09464887
1449 0.0 0 1409.09464887
1450 0.0 0 1410.09464887
1451 0.0 0 1411.09464887
1452 0.0 0 1412.09464887
1453 0.0 0 1413.09464887
1454 0.0 0 1414.09464887
1455 0.0 0 1415.09464887
1456 0.0 0 1416.09464887
1457 0.0 0 1417.09464887
1458 0.0 0 1418.09464887
1459 0.0 0 1419.09464887
1460 0.0 0 1420.09464887


In [162]:
lsm.process(start_code)

In [151]:
d = MinMaxScaler()
d.fit_transform