In [1]:
import re
import os
import json
import h5py
import numpy as np
import pandas as pd
import sentencepiece as spm

from tqdm.notebook import tqdm

In [2]:
RAW_DATA_DIR = '../input/raw_data'
PROCESSED_DATA_DIR = '../input/processed'
VOCAB_DIR = os.path.join(PROCESSED_DATA_DIR, 'vocab')

In [3]:
train_file_list=['train.chunk.{0}'.format(str(i).zfill(2)) for i in range(1,10)]
dev_file_list = ['dev.chuck.01']
test_file_list = ['test.chuck.01','test.chuck.02']

In [4]:
train_path_list = [os.path.join(RAW_DATA_DIR, fn) for fn in train_file_list]
dev_path_list = [os.path.join(RAW_DATA_DIR, fn) for fn in dev_file_list]
test_path_list = [os.path.join(RAW_DATA_DIR, fn) for fn in test_file_list]

In [5]:
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
os.makedirs(VOCAB_DIR, exist_ok=True)

In [6]:
def get_colum_data(path_list, div, col):
    col_data = []
    for path in path_list:
        h = h5py.File(path, 'r')
        col_data.append(h[div][col][:])
        h.close()
    return np.concatenate(col_data)

In [7]:
def get_dataframe(path_list, div):
    pids = get_colum_data(path_list, div, col='pid')
    products = get_colum_data(path_list, div, col='product')
    brands = get_colum_data(path_list, div, col='product')
    makers = get_colum_data(path_list, div, col='maker')
    models = get_colum_data(path_list, div, col='model')
    prices = get_colum_data(path_list, div, col='price')
    updttms = get_colum_data(path_list, div, col='updttm')
    bcates = get_colum_data(path_list, div, col='bcateid')
    mcates = get_colum_data(path_list, div, col='mcateid')
    scates = get_colum_data(path_list, div, col='scateid')
    dcates = get_colum_data(path_list, div, col='dcateid')
    
    df = pd.DataFrame({'pid': pids, 'product': products, 'brand': brands , 'maker': makers, 'model': models,  'price': prices ,  'updttm': updttms, 'bcateid': bcates, 'mcateid': mcates, 'scateid': scates, 'dcateid': dcates})
    
    df['pid'] = df['pid'].map(lambda x: x.decode('utf-8'))
    df['product'] = df['product'].map(lambda x: x.decode('utf-8'))
    df['brand'] = df['brand'].map(lambda x: x.decode('utf-8'))
    df['maker'] = df['maker'].map(lambda x: x.decode('utf-8'))
    df['model'] = df['model'].map(lambda x: x.decode('utf-8'))
    df['updttm'] = df['updttm'].map(lambda x: x.decode('utf-8'))
    
    return df

In [8]:
%%time
train_df = get_dataframe(train_path_list, 'train')

CPU times: user 44.8 s, sys: 33.9 s, total: 1min 18s
Wall time: 1min 27s


In [9]:
cate_json = json.load(open(os.path.join(RAW_DATA_DIR, 'cate1.json')))

In [10]:
bid2nm = dict([(cid, name) for name, cid in cate_json['b'].items()])
mid2nm = dict([(cid, name) for name, cid in cate_json['m'].items()])
sid2nm = dict([(cid, name) for name, cid in cate_json['s'].items()])
did2nm = dict([(cid, name) for name, cid in cate_json['d'].items()])

In [11]:
train_df['bcatenm'] = train_df['bcateid'].map(bid2nm)
train_df['mcatenm'] = train_df['mcateid'].map(mid2nm)
train_df['scatenm'] = train_df['scateid'].map(sid2nm)
train_df['dcatenm'] = train_df['dcateid'].map(did2nm)

In [12]:
def get_vc_df(df, col):
    vc_df = df[col].value_counts().reset_index()
    vc_df.columns = [col, 'count']
    vc_df['per'] = (vc_df['count'] / vc_df['count'].sum()) * 100
    return vc_df

In [13]:
vc_df = get_vc_df(train_df, 'brand')

In [14]:
train_df = train_df[['pid', 'product', 'bcateid', 'mcateid', 'scateid', 'dcateid']]

In [15]:
p = re.compile('[\!@#$%\^&\*\(\)-=\[\]\{\}\.,/\?~\+\'"|]')

In [16]:
def remove_special_characters(sentence, lower=True):
    sentence = p.sub(' ', sentence)
    sentence = ' '.join(sentence.split())
    if lower:
        sentence = sentence.lower()
    return sentence

In [17]:
%%time
train_df['product'] = train_df['product'].map(remove_special_characters)

CPU times: user 28.4 s, sys: 532 ms, total: 29 s
Wall time: 29 s


In [18]:
train_df.head()

Unnamed: 0,pid,product,bcateid,mcateid,scateid,dcateid
0,O4486751463,직소퍼즐 조각 바다거북의 여행 pl,1,1,2,-1
1,P3307178849,모리케이스 아이폰 s s tree farm 다이어리케이스 바보사랑 무료배송,3,3,4,-1
2,R4424255515,크리비아 기모 부 속바지 glg p,5,5,6,-1
3,F3334315393,하프클럽 잭앤질 남성 솔리드 절개라인 포인트 포켓 팬츠 pt _na,7,7,8,-1
4,N731678492,코드프리혈당시험지 매 코드프리시험지 최장유효기간,10,9,11,-1


In [31]:
with open(os.path.join(VOCAB_DIR, 'product.txt'), 'w') as f:
    f.write(train_df['product'].str.cat(sep='\n'))

In [32]:
def train_spm(txt_path, spm_path, vocab_size=32000, input_sentence_size=1000000):
    spm.SentencePieceTrainer.Train(f'--input={txt_path} --model_type=bpe ' f'--model_prefix={spm_path} --vocab_size={vocab_size} ' f'--input_sentence_size={input_sentence_size} ' f'--shuffle_input_sentence=true')

In [33]:
%%time
train_spm(txt_path=os.path.join(VOCAB_DIR, 'product.txt'), spm_path=os.path.join(VOCAB_DIR, 'spm'))

CPU times: user 3min 31s, sys: 3.7 s, total: 3min 35s
Wall time: 3min 37s


In [34]:
os.remove(os.path.join(VOCAB_DIR, 'product.txt'))

In [35]:
for dirname, _, filenames in os.walk(VOCAB_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/processed/vocab/spm.model
../input/processed/vocab/spm.vocab


In [36]:
sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(VOCAB_DIR, 'spm.model'))

True

In [37]:
train_df['tokens'] = train_df['product'].map(sp.EncodeAsPieces)

In [38]:
train_df[['product', 'tokens']].head()

Unnamed: 0,product,tokens
0,직소퍼즐 조각 바다거북의 여행 pl,"[▁직소퍼즐, ▁조각, ▁바다, 거북, 의, ▁여행, ▁pl]"
1,모리케이스 아이폰 s s tree farm 다이어리케이스 바보사랑 무료배송,"[▁모리케이스, ▁아이폰, ▁s, ▁s, ▁tree, ▁farm, ▁다이어리케이스,..."
2,크리비아 기모 부 속바지 glg p,"[▁크리비아, ▁기모, ▁부, ▁속바지, ▁gl, g, ▁p]"
3,하프클럽 잭앤질 남성 솔리드 절개라인 포인트 포켓 팬츠 pt _na,"[▁하프클럽, ▁잭앤질, ▁남성, ▁솔리드, ▁절개, 라인, ▁포인트, ▁포켓, ▁..."
4,코드프리혈당시험지 매 코드프리시험지 최장유효기간,"[▁코드, 프리, 혈, 당, 시험, 지, ▁매, ▁코드, 프리, 시험, 지, ▁최,..."
