In [148]:
%pylab inline
import numpy as np 
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

Populating the interactive namespace from numpy and matplotlib


## Data

In [11]:
# Path of the Excel order workbook that is handed to LSM.load() further down.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
file_name = '/dataset/personal_projects/lsm/20160123_1차 발주_3280.xlsx'

## Settings

In [12]:
# Default value of LSM's `n_das` constructor argument (stored on the instance as `N_DAS`).
N_DAS = 96
# Sheet name LSM uses when reading the workbook.
SHEET_NAME = 'original'
# NOTE(review): the two constants below are not referenced anywhere in the visible code.
# Presumably they are time costs for finding a location / finding a product; the unit is
# not stated here -- confirm before relying on them.
FINDING_LOCATION_TIME = 9
FINDING_PRODUCT_TIME = 12

## Programs

In [172]:
class LSM(object):
    """Turn an order spreadsheet into per-order barcode and location vectors.

    Usage: call ``load`` to read the workbook, then ``preprocess`` to build
    the index maps and the min-max scaled matrices.  ``data`` is populated by
    ``load``; every ``*_to_*`` map, both ``*_vectors`` matrices and both
    scalers are populated by ``preprocess`` and are ``None`` until then.
    """

    # Location substituted for rows whose location is missing or unparseable.
    EMPTY_LOCATION = 'Z1-1'

    def __init__(self, n_das=N_DAS, sheet=SHEET_NAME):
        """Create an empty instance.

        :param n_das: stored as ``self.N_DAS``; not used by any method of
            this class.
        :param sheet: name of the workbook sheet that ``load`` reads.
        """
        self.data = None
        self.vectors = None
        self.N_DAS = n_das
        # Honour the argument (the module-level SHEET_NAME is only its default).
        self.sheet = sheet

        self.code_to_index = None
        self.index_to_code = None

        self.barcode_to_index = None
        self.index_to_barcode = None

        self.location_to_idx = None
        self.idx_to_location = None

        self.barcode_vectors = None
        self.location_vectors = None

        self.barcode_scaler = None
        self.location_scaler = None

    def load(self, file_name):
        """Read sheet ``self.sheet`` of ``file_name`` into ``self.data``.

        The sheet's columns are renamed, in order, to the 20 names listed in
        ``COL_NAMES``.
        """
        COL_NAMES = ['date', 'order_number', 'brand_code', 'brand_name',
                     'product', 'property', 'location', 'barcode', 'n', 'price',
                     'status', 'orderer', 'recipient', 'address', 'postcode',
                     'contact1', 'contact2', 'comment', 'part_delivery', 'code']
        # The sheet is passed positionally on purpose: the keyword was renamed
        # from `sheetname` to `sheet_name` across pandas releases, while the
        # second positional slot has always been the sheet selector.
        self.data = pd.read_excel(file_name, self.sheet, names=COL_NAMES)

    def frequent_barcodes(self):
        """Return the distinct order codes, most frequently ordered barcodes first.

        Barcodes are ranked by the sum of their ``n`` column (descending).
        Walking the barcodes in that order, every ``code`` that appears on a
        row with that barcode is appended once, the first time it is seen.

        :return: list of ``code`` values without duplicates.
        """
        totals = self.data[['barcode', 'n']].groupby(by='barcode').sum()
        totals = totals.sort_values(by='n', ascending=False)

        # One pass over the rows instead of re-filtering the frame per barcode.
        codes_by_barcode = {
            barcode: group.values
            for barcode, group in self.data.groupby('barcode')['code']
        }

        codes = []
        seen = set()  # O(1) membership test; `codes` keeps the order
        for barcode in totals.index:
            for code in codes_by_barcode[barcode]:
                if code not in seen:
                    seen.add(code)
                    codes.append(code)
        return codes

    def preprocess(self):
        """Build the index maps and the scaled barcode/location matrices.

        Steps: normalise and split the ``location`` column, count barcodes
        per order code, count location parts per order code, then min-max
        scale both matrices with their own scaler.
        """
        self._split_locations()
        barcode_vectors = self._build_barcode_vectors()
        location_vectors = self._build_location_vectors()

        # Normalization: each matrix gets its own fitted scaler so that either
        # one can later be used for transform / inverse_transform.
        self.barcode_scaler = MinMaxScaler()
        self.location_scaler = MinMaxScaler()

        self.barcode_vectors = self.barcode_scaler.fit_transform(barcode_vectors)
        self.location_vectors = self.location_scaler.fit_transform(location_vectors)

    def _split_locations(self):
        """Fill in bad locations and add the ``l1``/``l2``/``l3`` columns to ``self.data``."""
        # <letters>, optional dash, <digits>, dash, <digits>.
        location_regex = r'(?P<l1>[a-zA-Z]+)-?(?P<l2>\d+)-(?P<l3>\d+)'

        # Pre-process missing locations.
        self.data['location'] = self.data['location'].fillna(self.EMPTY_LOCATION)

        # Rows the regex cannot parse come back as NaN from extract(); replace
        # them with .loc so the write reaches the frame itself rather than a
        # temporary copy, then parse again.
        parts = self.data['location'].str.extract(location_regex, expand=False)
        unparseable = parts['l1'].isnull()
        if unparseable.any():
            self.data.loc[unparseable, 'location'] = self.EMPTY_LOCATION
            parts = self.data['location'].str.extract(location_regex, expand=False)

        # Column assignment (instead of concat) keeps a repeated preprocess()
        # from appending duplicate l1/l2/l3 columns.
        for column in ('l1', 'l2', 'l3'):
            self.data[column] = parts[column]

    def _build_barcode_vectors(self):
        """Set the code/barcode index maps and return the unscaled count matrix."""
        unique_codes = self.data['code'].unique()
        unique_barcodes = self.data['barcode'].unique()

        self.code_to_index = {code: i for i, code in enumerate(unique_codes)}
        self.index_to_code = {idx: code for code, idx in self.code_to_index.items()}

        self.barcode_to_index = {barcode: i for i, barcode in enumerate(unique_barcodes)}
        self.index_to_barcode = {idx: barcode for barcode, idx in self.barcode_to_index.items()}

        # Shape: (n orders, n barcodes, i.e. products).
        barcode_vectors = np.zeros((len(unique_codes), len(unique_barcodes)))
        for barcode, code, n in self.data[['barcode', 'code', 'n']].values:
            code_idx = self.code_to_index[code]
            barcode_idx = self.barcode_to_index[barcode]
            barcode_vectors[code_idx, barcode_idx] += n
        return barcode_vectors

    def _build_location_vectors(self):
        """Set the location index maps and return the unscaled count matrix.

        Requires ``_split_locations`` and ``_build_barcode_vectors`` to have
        run first (it reads the ``l1``/``l2``/``l3`` columns and
        ``self.code_to_index``).
        """
        # Prefix each part with its level so that e.g. l2 '1' and l3 '1' stay distinct.
        row_keys = [
            ('l1_' + str(l1), 'l2_' + str(l2), 'l3_' + str(l3))
            for l1, l2, l3 in self.data[['l1', 'l2', 'l3']].values
        ]

        # Indices are handed out in order of first appearance.
        self.location_to_idx = {}
        for keys in row_keys:
            for key in keys:
                self.location_to_idx.setdefault(key, len(self.location_to_idx))
        self.idx_to_location = {i: loc for loc, i in self.location_to_idx.items()}

        # Per-order location counts, shape: (n orders, n location parts).
        location_vectors = np.zeros((len(self.code_to_index), len(self.location_to_idx)))
        for code, keys in zip(self.data['code'].values, row_keys):
            code_idx = self.code_to_index[code]
            for key in keys:
                location_vectors[code_idx, self.location_to_idx[key]] += 1
        return location_vectors

    def get_code(self, code):
        """Return the rows of ``self.data`` whose ``code`` column equals ``code``."""
        return self.data[self.data['code'] == code]

    def process(self, start_code):
        """Print the scaled barcode vector of ``start_code`` as a plain list."""
        print(self._get_barcode_vector(start_code).tolist())

    def _get_barcode_vector(self, code):
        """Return the row of ``self.barcode_vectors`` that belongs to ``code``.

        Raises ``KeyError`` if ``code`` was not present when ``preprocess`` ran.
        """
        code_idx = self.code_to_index[code]
        return self.barcode_vectors[code_idx]
    
        
# Driver: read the workbook, build the vectors, then print the barcode vector
# of the first order code returned by frequent_barcodes().
lsm = LSM()
lsm.load(file_name)
lsm.preprocess()
frq_codes = lsm.frequent_barcodes()
start_code = frq_codes[0] # 2849240 (value noted by the original author)

# Prints the whole vector as a list, so the cell output is very long.
lsm.process(start_code)



[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [162]:
# Repeats the process() call from the previous cell; relies on `lsm` and
# `start_code` already being defined in the kernel.
lsm.process(start_code)

In [151]:
# Scratch cell: creates an unfitted MinMaxScaler and evaluates (without calling)
# its fit_transform attribute, so the cell only displays the bound method.
d = MinMaxScaler()
d.fit_transform