In [90]:
from itertools import islice
import pandas as pd
import h5py
import regex as re
import sentencepiece as spm
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [280]:
dir_path = 'kakao_dataset'
data_number = '09'
file_name = 'train.chunk.{}'.format(data_number)
spm_path = 'spm_model/global_spm.model'

In [281]:
sp = spm.SentencePieceProcessor()
sp.Load(spm_path)
temp = h5py.File('{}/{}'.format(dir_path, file_name), 'r')['train']
dataset = pd.DataFrame(columns=temp.keys())
for k, v in temp.items():
    dataset[k] = v[()]
str_columns = ['brand', 'maker', 'model', 'product', 'pid', 'updttm']
for col in str_columns:
    dataset[col] = dataset[col].map(lambda x:x.decode('utf-8'))
    dataset[col] = dataset[col].map(lambda x:x if len(x) else None)


In [282]:
def clean_reivew(review):
    review = ''.join(re.compile('[가-힣.!?a-zA-Z ]').findall(review))
    review = re.sub('\n', ' ', review)
    review = re.sub(r'\!+', '!', review)
    review = re.sub(r'\?+', '?', review)
    review = re.sub(r'\.+', '.', review)
    review = re.sub(r'\([^)]*\)', '', review)    # 괄호 안 내용 삭제
    review = re.sub(r'\[[^)]*\]', '', review)    # 대괄호 안 내용 삭제
    return review 

In [283]:
def find(target, seq):
    max_window_size = len(target)
    if max_window_size <= 0:
        max_window_size = 1
        
    def window(seq, n=max_window_size):
        "Returns a sliding window (of width n) over data from the iterable"
        "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
        it = iter(seq)
        result = tuple(islice(it, n))
        if len(result) == n:
            yield result
        for elem in it:
            result = result[1:] + (elem,)
            yield result
      
    
    def tune_window_size():
        window_size = max_window_size
        for i in range(max_window_size, 0, -1):
      
            for _s in window(seq=seq, n=i):# _s : 윈도우 결과
                if target in ''.join(_s):
                    window_size = i
                    #print(i, _s, window_size)
                    break
                    
        return window_size
                
    window_size = tune_window_size()
      
        
    def get_ckpts(_seq):
        pos = 0
        ckpt = list()
          
        for _s in window(seq=_seq, n=window_size):
            if target in ''.join(_s):
                ckpt.append([pos, pos+window_size])
            pos += 1
        return ckpt
            
    ckpts = get_ckpts(seq)
    label_list = ['O']*len(seq)
    
    for ckpt in ckpts:
        label_list[ckpt[0]] = 'brand-B'
        for i in range(ckpt[0]+1, ckpt[1]):
            label_list[i] = 'brand-I'
    
  
      
    
    return label_list

In [284]:
def get_label_list(_dataset):
    dataset = _dataset
    dataset['brand'] = dataset['brand'].fillna('__없음__')
    dataset['product'] = dataset['product'].map(lambda x : clean_reivew(x))
    dataset = dataset.dropna(subset=['brand']).reset_index(drop=True)
    
    product = dataset['product'].values
    brand = dataset['brand'].values
    
    def cleaner(_list):
        _str = [re.sub('▁', '', s) for s in _list]
        _str = [s for s in _str if s]
        return _str#re.sub('▁', '', _str)
    
    dataset['product'] = dataset['product'].map(sp.encode_as_pieces)
    dataset['product'] = dataset['product'].map(lambda x: cleaner(x))
    label_list = list()
    a = [label_list.extend([find(b,p)]) for p,b in zip(product, brand)]
    
    dataset = dataset[['product' ,'brand']]
    dataset['label'] = label_list
    

    drop_idx = list()
    for i,r in tqdm(dataset.iterrows()):
        if 'brand-B' not in r['label'] and r['brand'] != '__없음__':#브랜드명이 있으면서 찾아내지 못 한 케이스 제거

            drop_idx.append(i)
    dataset = dataset.drop(drop_idx)
    
    return dataset ,drop_idx
    

In [285]:
def split_concat(df1, df2=None, train_size = 0.85):
    """
    df1 = brand명 있는 df
    df2 = brand명 없는 df
    """
    if type(df2) == pd.DataFrame:

        df1_size = df1.shape[0]
        df2_size = df2.shape[0]

        df1_train = df1.loc[:(int)(df1_size*train_size)]
        df1_test  = df1.loc[(int)(df1_size*train_size):]


        df2_train = df2.loc[:(int)(df2_size*train_size)]
        df2_test  = df2.loc[(int)(df2_size*train_size):]

        train = pd.concat([df1_train, df2_train])
        test = pd.concat([df1_test, df2_test])

        train = train.sample(len(train)).reset_index(drop=True)
        test = test.sample(len(test)).reset_index(drop=True)



        #size = dataframe.shape[0]
        #train = dataframe.loc[:(int)(size*train_size)]
        #test = dataframe.loc[(int)(size*train_size):]
        #test = test.reset_index(drop=True)

        return train, test
    
    else:
        size = df1.shape[0]
        train = df1.loc[:(int)(size*train_size)]
        test = df1.loc[(int)(size*train_size):]
        test = test.reset_index(drop=True)
        train = train.sample(len(train)).reset_index(drop=True)
        test = test.sample(len(test)).reset_index(drop=True)
        
        return train, test

In [286]:
df,drop_idx = get_label_list(dataset)#브랜드명이 있으면서 찾지 못 한 경우 제거

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [287]:
dataset = dataset.drop(drop_idx)

In [288]:
input_texts = dataset[['brand',  'product']]

In [289]:
input_texts['brand'] = input_texts['brand'].map(lambda x: ''.join(re.compile('[가-힣.!?a-zA-Z_0-9]').findall(x)))
input_texts['brand'] = input_texts['brand'].fillna('__없음__')
input_texts['product'] = input_texts['product'].map(lambda x: clean_reivew(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [290]:
drop_idx = list()
for i,r in tqdm(input_texts.iterrows()):
    if len(r['brand']) < 1:#브랜드명이 공백인 경우 제거
        drop_idx.append(i)
input_texts = input_texts.drop(drop_idx)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [291]:
df1 = input_texts[input_texts['brand'] != '__없음__']
df2 = input_texts[input_texts['brand'] == '__없음__']

In [292]:
data_len = min(df1.shape[0], df2.shape[0])

In [293]:
if len(df1) < len(df2):
    df1 = df1.reset_index(drop=True)
    df2 = df2.sample(data_len).reset_index(drop=True)
else:
    df1 = df1.sample(data_len).reset_index(drop=True)
    df2 = df2.reset_index(drop=True)

In [294]:
train, test = split_concat(df1=df1, df2=df2, train_size=0.85)

In [295]:
train.to_csv('data/train_{}.tsv'.format(data_number),sep='\t',index=False, header=None)

In [296]:
test.to_csv('data/test_{}.tsv'.format(data_number),sep='\t',index=False, header=None)

# Merge

In [297]:
train_df_list = []
test_df_list = []
for i in range(1,10):
    try:
        train_df_list.append(pd.read_csv('data/train_0{}.tsv'.format(str(i)), delimiter='\t',header=None))
        test_df_list.append(pd.read_csv('data/test_0{}.tsv'.format(str(i)),delimiter='\t', header=None))     
    except:
        print('{}번 tsv 없음'.format(i))
        continue

In [298]:
train = pd.concat(train_df_list).reset_index(drop=True)

In [299]:
test = pd.concat(test_df_list).reset_index(drop=True)

In [300]:
train.to_csv('data/train.tsv',sep='\t',index=False, header=None)

In [301]:
test.to_csv('data/test.tsv',sep='\t',index=False, header=None)

In [302]:
test

Unnamed: 0,0,1
0,프랜치캣,프랜치캣롯데백화점핑크모직코트QDAC
1,__없음__,주유구 꼬마젖병 남아 LPG 자동차스티커 주유구스티커
2,__없음__,스탠리 AL자석수평 STHT AL자석수평기
3,__없음__,레인제로유로와이퍼 스펙트라윙 mmmm
4,__없음__,르젠 겉기모 투톤사각 슬림핏 체크셔츠LNCSDN
...,...,...
569751,__없음__,다이아코트 DIACOAT BMW GT 자동차덮개바디커버P
569752,__없음__,엔키마운트 ENKTD이하브라켓거치대마운
569753,__없음__,엡손 정품드럼 S Aculaser CX용
569754,__없음__,핫딜바이일본 BZ ADIDAS ORIGINALS CAMPUS CORE BLACKRU...


In [303]:
train

Unnamed: 0,0,1
0,LG전자,LG전자 LG전자 MTDF 모니터
1,다슈,다슈 클래식 원더 스타일링 매트왁스 g
2,코베아,코베아 코베아 캠프 플러스 호스 버너 가스렌지 KGB
3,__없음__,플레이그로 목욕시간 원형볼
4,__없음__,닥터오라클 안티박 더마 밸런싱 에센스로션
...,...,...
3228583,시루원,시루원 쑥 오쟁이떡 개 g 가공식품특가
3228584,__없음__,해외Wholesale Paisley Khaki Gold Black Classic ...
3228585,__없음__,LG옵티머스G cat 꽃향내 하드 핸드폰케이스 F 핸
3228586,__없음__,M 스카치 브라이트 SL 스펀지 수세미 소 p 천연수


In [1]:
!nvidia-smi

Tue Oct 27 09:53:35 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN RTX           Off  | 00000000:02:00.0 Off |                  N/A |
| 40%   30C    P8    11W / 280W |   1183MiB / 24217MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [6]:
import pandas as pd

In [10]:
df = pd.read_csv('product_language_202009221449.csv')

In [11]:
df = df.sample(30000)

In [12]:
df.to_csv('test_sample3.txt', header=None, index=False, sep='\t')

In [46]:
df = pd.read_csv('brand_20%_batch_10_8.txt', names=['name', 'brand'])

In [47]:
df.loc[:59]

Unnamed: 0,name,brand
0,모디디 큐브 인용 쇼파 블랙,모디디
1,버블 LCD 액정필름 올레포빅 장 소니 RXM,__없음__
2,블루독베이비 WH 마우스고깔속싸보,블루독베이비
3,GD GRIP 미세조절 악력기 강약조절 손목강화 완력,G
4,DaoXiang 윈난 전통 디저트 로즈 꽃 화과자세트,DaoXiang
5,화신 P 다목적 가위,화신
6,여성 레인코트 리니지 레인코트 전기차 롱기장 성인 전신 남녀 비피모 확대,레인코트
7,심플 no 거실장 거실 서랍장 TV장 거실책장 공간박스 수납장,__없음__
8,사계절 사용가능한 펜션정원 그늘막 그네의자 흔들의자 정원의자,__없음__
9,단 국산 서빙 카트 식당 운반 웨건 식당용 업소용,__없음__


In [1]:
from predict import Predict

In [3]:
p = Predict('l')

In [4]:
p.predict()

AttributeError: 'str' object has no attribute 'model_dir'