# 1. 초기 환경 셋팅 & 데이터 전처리

## 1-1 Spacy, konlpy, mecab 초기 설정

In [None]:
# 실행 후 세션 RESET(런타임 재시작)
!apt-get update
!apt-get install g++ openjdk-8-jdk python-dev python3-dev
!pip install konlpy
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
!pip install plac
!pip install mecab-ko
!pip install mecab-ko-dic
!pip install natto-py
!python -m spacy download ko_core_news_md # pre-trained model download
!python -m spacy download ko_core_news_sm
!python -m spacy download ko_core_news_lg
!pip install spacy-transformers

In [16]:
import numpy as np
import pandas as pd
import spacy
import re
import json
import seaborn as sns
import matplotlib.pyplot as plt
# Load Packages
from __future__ import unicode_literals, print_function

import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm # loading bar
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import os, zipfile

import warnings
warnings.filterwarnings("ignore")

## 1-2 데이터 업로드 및 json 파일 전처리
json example :
{   
          "sentence": "1+1 자연퐁 2X 솔잎 리필 100ml당682원 1.6L 02-04-04.1 10,900
           동일품목 교차가능 01/04~01/17 88010511519",  
          "labels": [
              {"start": 4, "end": 13, "entity": "상품명"},  
              {"start": 36, "end": 41, "entity": "가격"}  
          ]
      }  

In [3]:
# 파일 업로드
from google.colab import files
f= files.upload()

Saving train_data.json to train_data.json
Saving detectedV4.txt to detectedV4.txt


In [4]:
# JSON 파일 불러오기
# with open("/content/ssg_data.json", "r", encoding="utf-8") as json_file:
#     train2 = json.load(json_file)
# with open("/content/naver_data.json", "r", encoding="utf-8") as json_file:
#     train3 = json.load(json_file)
# with open("/content/labeled_data.json", "r", encoding="utf-8") as json_file:
#     train4 = json.load(json_file)
with open("/content/train_data.json", "r", encoding="utf-8") as json_file:
    train = json.load(json_file)

In [5]:
# 훈련 데이터 Merge
def train_dataset(df_lst, saved_data, wow, split_y=True):
    """Append spaCy-style training tuples to ``saved_data`` in place.

    Args:
        df_lst: Iterable of parsed JSON objects; each must have a ``'data'``
            list whose items carry ``'sentence'`` and ``'labels'``.
        saved_data: List that receives one ``(sentence, {'entities': [...]})``
            tuple per item.
        wow: Entity name to keep when ``split_y`` is truthy.
        split_y: When truthy, keep only labels whose ``'entity'`` equals
            ``wow``; otherwise keep every label.

    Returns:
        None. The result is delivered through ``saved_data``.
    """
    for dataset in df_lst:
        for record in dataset['data']:
            sentence = record['sentence']
            spans = [
                (lab['start'], lab['end'], lab['entity'])
                for lab in record['labels']
                if not split_y or lab['entity'] == wow
            ]
            saved_data.append((sentence, {'entities': spans}))

In [6]:
# Merge training data: build one list per entity type
# (TRAIN_DATA1 keeps only '상품명' spans, TRAIN_DATA2 keeps only '가격' spans).
TRAIN_DATA1, TRAIN_DATA2 = [], []
df_lst = [train]
train_dataset(df_lst,TRAIN_DATA1,'상품명')
train_dataset(df_lst,TRAIN_DATA2,'가격')

# Show the first five samples of each list as a sanity check.
print(TRAIN_DATA1[:5])
print(TRAIN_DATA2[:5])

[('[이달의특가] 유기농마루의 청국장 200g x 4개 / 100% 국내산 유기농 백태 17900', {'entities': [(7, 46, '상품명')]}), ('[신년특가]1+1 마밤 체크 피치기모 라운지웨어 아동 주니어 잠옷 파자마 24800', {'entities': [(11, 40, '상품명')]}), ('일본 오사카 패키지 해외가족여행 겨울온천 맛집 특가 2박3일 699900', {'entities': [(0, 33, '상품명')]}), ('(특가) 늘푸른식품 무안 자색 빨간양파즙 120팩 29000', {'entities': [(5, 27, '상품명')]}), ('땡처리 삿포로 특가 온천 일본 땡처리여행 북해도 여행프로모션 713600', {'entities': [(0, 33, '상품명')]})]
[('[이달의특가] 유기농마루의 청국장 200g x 4개 / 100% 국내산 유기농 백태 17900', {'entities': [(47, 52, '가격')]}), ('[신년특가]1+1 마밤 체크 피치기모 라운지웨어 아동 주니어 잠옷 파자마 24800', {'entities': [(41, 46, '가격')]}), ('일본 오사카 패키지 해외가족여행 겨울온천 맛집 특가 2박3일 699900', {'entities': [(34, 40, '가격')]}), ('(특가) 늘푸른식품 무안 자색 빨간양파즙 120팩 29000', {'entities': [(28, 33, '가격')]}), ('땡처리 삿포로 특가 온천 일본 땡처리여행 북해도 여행프로모션 713600', {'entities': [(34, 40, '가격')]})]


In [7]:
# Merge training data: build a single list that keeps every labelled span
# (split_y=False disables the per-entity filter, so the '' entity name is unused).
TRAIN_DATA = []
df_lst = [train]
train_dataset(df_lst,TRAIN_DATA,'',False)
print(TRAIN_DATA[:5])

[('[이달의특가] 유기농마루의 청국장 200g x 4개 / 100% 국내산 유기농 백태 17900', {'entities': [(7, 46, '상품명'), (47, 52, '가격')]}), ('[신년특가]1+1 마밤 체크 피치기모 라운지웨어 아동 주니어 잠옷 파자마 24800', {'entities': [(11, 40, '상품명'), (41, 46, '가격')]}), ('일본 오사카 패키지 해외가족여행 겨울온천 맛집 특가 2박3일 699900', {'entities': [(0, 33, '상품명'), (34, 40, '가격')]}), ('(특가) 늘푸른식품 무안 자색 빨간양파즙 120팩 29000', {'entities': [(5, 27, '상품명'), (28, 33, '가격')]}), ('땡처리 삿포로 특가 온천 일본 땡처리여행 북해도 여행프로모션 713600', {'entities': [(0, 33, '상품명'), (34, 40, '가격')]})]


In [8]:
len(TRAIN_DATA)

2845

In [9]:
### test data
# Reading the text file
file_path = '/content/detectedV4.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Load the file into a list with one entry per non-blank line; tabs are
# replaced by spaces and newline characters are dropped.
test_data = [line.replace('\t',' ').replace('\n','') for line in lines if line.strip()]
print(test_data[:5])

['고프로 HERO10 Black 할인 30,000원 전사가 619,000  ', '샘표전품목25만원이상5천원상품권 IN 폰테나스테0크소스   (2)드와인 ] 상품권   1087원 증정 2,880  ', '시베리아차가버섯   49,000   ', '맥널티 블렌드 모카포트세트 16,000  ', '9,400원   18,000원 W 담을수록 덕덕구스  캔 이득  506원 2,500 ']


# First Model (testing / learning)

## 2-1 새로운 모델 생성 후 훈련

In [None]:
# @title
# Define our variables
model = None
output_dir=Path("/content/")
n_iter=200

In [None]:
# @title
if model is not None:
    nlp1 = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp1 = spacy.blank('ko')  # create blank Language class
    print("Created blank 'ko' model")

In [None]:
# @title
# Make sure the pipeline contains an NER component and keep a handle on it in
# `ner` so entity labels can be registered on it afterwards.
if 'ner' not in nlp1.pipe_names:
    # add_pipe() takes the registered factory name and returns the component
    # instance that lives in the pipeline. Using a separate create_pipe()
    # call would hand back a second, detached component, and labels added to
    # that one would never reach the model being trained.
    ner = nlp1.add_pipe('ner', last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp1.get_pipe('ner')

In [None]:
# @title
# 훈련 데이터 확인
TRAIN_DATA[:5]

In [None]:
# @title
# Register every entity label found in the training annotations.
for _, annotations in TRAIN_DATA:
    for span in annotations.get('entities'):
        ner.add_label(span[2])

# Disable every pipe except 'ner' while training.
other_pipes = [name for name in nlp1.pipe_names if name != 'ner']
with nlp1.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp1.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        # Collect all Example objects first, then apply them in one update.
        examples = [
            Example.from_dict(nlp1.make_doc(text), annotations)
            for text, annotations in tqdm(TRAIN_DATA)
        ]
        losses = {}
        nlp1.update(examples, drop=0.5, losses=losses)
        print(losses)

In [None]:
# @title
x = '궁중비책 프리뮨 모이스처 로션 식품 350ML { 10ml 당700원 02-02-02.2 24,500'
doc = nlp1(x)
print(doc.ents)

In [None]:
# @title
for ent in doc.ents:
    print(ent.text, ent.label_)

## 2-2 Pre-Trained Model Train

- 데이터 epoch당 200개 ... 100개 등...


In [10]:
def model_train(TRAIN_DATA, n_iter=200, batch_size=150, save_model=False, save_name='pingu_model'):
    """Continue training the NER component of ``ko_core_news_sm``.

    Args:
        TRAIN_DATA: Iterable of ``(text, {'entities': [(start, end, label), ...]})``
            tuples. It is copied internally, so the caller's list is not
            reordered by the per-epoch shuffle.
        n_iter: Number of passes over the training data.
        batch_size: Value forwarded to ``minibatch(..., size=...)``.
        save_model: When truthy, write the trained pipeline to ``save_name``.
        save_name: Output path used when ``save_model`` is truthy.

    Returns:
        ``(nlp, loss_lst)`` where ``loss_lst`` holds one
        ``(epoch_number, losses_snapshot)`` tuple per minibatch update.
    """
    # Private copy: random.shuffle below must not mutate the caller's data.
    train_data = list(TRAIN_DATA)

    # Load pre-trained model
    nlp = spacy.load("ko_core_news_sm")

    # Register the new entity labels on the existing NER component.
    ner = nlp.get_pipe("ner")
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    loss_lst = []
    # Every pipe except these is disabled during training.
    pipe_exceptions = ["ner", "attribute_ruler"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    with nlp.disable_pipes(*other_pipes):  # only train NER
        nlp.resume_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # Split the data into minibatches and update once per batch.
            for batch in minibatch(train_data, size=batch_size):
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in tqdm(batch)
                ]
                nlp.update(examples, drop=0.5, losses=losses)
                # Store a snapshot: the same `losses` dict is passed to every
                # update of this epoch, so appending it directly would make
                # all of this epoch's entries alias one object.
                loss_lst.append((itn + 1, dict(losses)))

            if itn % 10 == 0:
                print(losses)

    if save_model:  # Save the updated model
        nlp.to_disk(save_name)
    return nlp, loss_lst

In [11]:
def epoch_plot(loss_lst, iter, epoch):
    """Draw a line plot of the recorded 'ner' loss values.

    Args:
        loss_lst: Sequence of ``(epoch_number, losses_dict)`` tuples; only the
            ``'ner'`` entry of each dict is plotted, in list order.
        iter: Value shown in the plot title after "iter :".
        epoch: Value shown in the plot title after "Loss Per epoch :".
    """
    # Separate the loss dicts from the epoch numbers and pull out the NER loss.
    _, loss_dicts = zip(*loss_lst)
    ner_losses = [entry['ner'] for entry in loss_dicts]

    # One row per recorded value, numbered from 1.
    frame = pd.DataFrame({
        'Epoch': range(1, len(ner_losses) + 1),
        'Loss': ner_losses,
    })

    # Plot with seaborn and label the axes.
    sns.lineplot(x='Epoch', y='Loss', data=frame)
    plt.title(f'Loss Per epoch : {epoch} and iter : {iter}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

In [12]:
def recog_rate(model, data):
    """Return the fraction of samples in which ``model`` finds an entity.

    Args:
        model: Callable that takes a text and returns an object with an
            ``ents`` sequence.
        data: Sequence of samples. Each sample is either a plain string or a
            ``(text, annotations)`` pair; for pairs only the text is used.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``data`` is empty.
    """
    total = len(data)
    if total == 0:
        return 0.0

    recognised = 0
    for sample in data:
        # Plain strings must be passed whole: indexing them with [0] would
        # feed only the first character to the model.
        text = sample if isinstance(sample, str) else sample[0]
        if len(model(text).ents) > 0:
            recognised += 1
    return recognised / total

In [None]:
# Train one model per (iteration count, epoch value) combination, plot its
# loss curve, and report the recognition rate on the train and test data.
iter_lst = [100, 150, 300]
epoch_lst = np.linspace(0.1,1,10)
for sweep_iter in iter_lst:  # not named `iter`, to avoid shadowing the builtin
  for epoch in epoch_lst:
    print('!!!!!!!!!!!!!!! We start iter :', sweep_iter, ' and epoch :', epoch, 'model Training!!!!!!!!!!!!!!!')
    # model_train takes (TRAIN_DATA, n_iter, batch_size, save_model): the
    # training data must come first and the iteration count second.
    # NOTE(review): `epoch` (0.1 .. 1.0) is interpreted here as the fraction
    # of the training set used per minibatch - confirm this is the intent.
    sweep_batch_size = max(1, int(len(TRAIN_DATA) * epoch))
    sweep_model, nlp_loss = model_train(TRAIN_DATA, n_iter=sweep_iter, batch_size=sweep_batch_size, save_model=False)
    # Keep the trained pipeline reachable under a name built from the settings.
    globals()[f'nlp_{sweep_iter}_{epoch}'] = sweep_model
    epoch_plot(nlp_loss, sweep_iter, epoch)

    print('Train data recognition Rate :', recog_rate(sweep_model, TRAIN_DATA))
    print('\n')
    print('Test data recognition Rate :', recog_rate(sweep_model, test_data))


- Ensemble model(**상품명** + **가격**)

In [None]:
nlp_150_p, loss_lst = model_train(TRAIN_DATA1,150, 200, False)
epoch_plot(loss_lst, 150, 200)
recog_rate(nlp_150_p, test_data)

In [None]:
for t in test_data:
  doc = nlp_150_p(t)
  spacy.displacy.render(doc, style='ent', jupyter=True)

In [None]:
print(nlp_150.meta["version"])

3.6.0


In [None]:
nlp_150_s.to_disk('Pingu_model_1_19_150_price')

In [None]:
!unzip /content/Pingu_model_1_19_150_sm.zip -d /content/Pingu_model_1_19_150_sm

In [None]:
!zip -r /content/Pingu_model_1_19_150_price.zip /content/Pingu_model_1_19_150_price

## 2-3 모델링 저장 후, 불러오기

In [None]:
x = "신세계포인트적립시 2,480원 명태 160g 신세계 1800원 포인트 4,9 90918 2023.11.02"
# nlp_nerere = spacy.load('/content/Pingu_model_1_17_1000/content/Pingu_model_1_17_1000')
doc = nlp_nerere(x)
spacy.displacy.render(doc, style='ent', jupyter=True)

## 2-4 지금까지 만든 모델 Jaccard score check

In [17]:
def unzip(zip_file_path, extract_path):
    """Extract every member of the archive ``zip_file_path`` into ``extract_path``."""
    with zipfile.ZipFile(zip_file_path, mode='r') as archive:
        archive.extractall(path=extract_path)

In [20]:
# Extract every .zip archive in /content into a folder named after the
# archive without its .zip suffix.
content_dir = '/content/'
# List the same directory the archive paths are built from, so the cell does
# not depend on the current working directory.
lst = os.listdir(content_dir)
for archive_name in lst:
  if archive_name.endswith('.zip'):
    zip_file_path = content_dir + archive_name
    # Drop only the trailing suffix; a '.zip' elsewhere in the name is kept.
    extract_path = zip_file_path[:-len('.zip')]
    unzip(zip_file_path, extract_path)

In [45]:
# Collect the sub-folders of /content that hold extracted models, skipping
# folders ending in '.config' and the 'sample_data' folder.
current_directory = '/content/'
folder_lst = [
    entry
    for entry in os.listdir(current_directory)
    if os.path.isdir(os.path.join(current_directory, entry))
]
model_lst = [
    folder
    for folder in folder_lst
    if not (folder.endswith('.config') or folder == 'sample_data')
]
model_lst

['Pingu_model_1_19_150_sm',
 'Pingu_model_1_17_500',
 'Pingu_model_1_19_150_product',
 'Pingu_model_1_19_150_price']

In [46]:
# Load each saved pipeline and bind it to a module-level variable named after
# its folder, so it can later be looked up with globals()[name].
# NOTE(review): the nested '/content/<name>/content/<name>' path presumably
# mirrors how the archives were zipped from an absolute path - confirm.
for model in model_lst:
  globals()[model] = spacy.load('/content/' + model + '/content/' + model)

In [47]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of two strings over lower-cased tokens.

    Both strings are lower-cased and split on whitespace. If either one has
    no tokens the result is ``0``; otherwise it is
    ``|A & B| / (|A| + |B| - |A & B|)`` as a float.
    """
    tokens_a = set(str1.lower().split())
    tokens_b = set(str2.lower().split())
    if not tokens_a or not tokens_b:
        return 0
    shared = len(tokens_a & tokens_b)
    union_size = len(tokens_a) + len(tokens_b) - shared
    return float(shared) / union_size

In [48]:
def predict_entities(text, model, label):
    """Return the text of the first predicted entity carrying ``label``.

    Args:
        text: Input string handed to ``model``.
        model: Callable returning an object with an ``ents`` sequence whose
            items expose ``text`` and ``label_``.
        label: Entity label to look for.

    Returns:
        The slice of ``text`` at the first occurrence of the entity's text,
        or ``''`` when no entity has the requested label.
    """
    for ent in model(text).ents:
        if ent.label_ == label:
            # Locate the span by searching for the entity text in the input.
            begin = text.find(ent.text)
            return text[begin: begin + len(ent.text)]
    return ''

In [49]:
def show_me_the_jaccard(model):
    """Print the mean Jaccard score of ``model`` over ``TRAIN_DATA``.

    For each sample, the gold text of the first '가격' span and of the first
    '상품명' span (or ``''`` when absent) is compared with the corresponding
    ``predict_entities`` output. Both averages are printed; nothing is
    returned.
    """
    def gold_text(sentence, spans, wanted):
        # Text of the first annotated span with the wanted label, else ''.
        matches = [span for span in spans if span[2] == wanted]
        if len(matches) > 0:
            return sentence[matches[0][0]:matches[0][1]]
        return ''

    price_total = 0
    product_total = 0
    for text, entity in tqdm(TRAIN_DATA):
        spans = entity['entities']
        price_gold = gold_text(text, spans, '가격')
        product_gold = gold_text(text, spans, '상품명')
        price_total += jaccard(predict_entities(text, model, '가격'), price_gold)
        product_total += jaccard(predict_entities(text, model, '상품명'), product_gold)

    print(f'Average Price Jaccard Score is {price_total / len(TRAIN_DATA)}')
    print(f'Average Product Jaccard Score is {product_total / len(TRAIN_DATA)}')

In [50]:
for model in model_lst:
  print('#'*100)
  print(str(model))
  show_me_the_jaccard(globals()[model])
  print('\n')

####################################################################################################
Pingu_model_1_19_150_sm


100%|██████████| 2845/2845 [00:43<00:00, 65.33it/s]


Average Price Jaccard Score is 0.9193907439953196
Average Product Jaccard Score is 0.8308466889486221


####################################################################################################
Pingu_model_1_17_500


100%|██████████| 2845/2845 [00:49<00:00, 58.04it/s]


Average Price Jaccard Score is 0.9174282366725314
Average Product Jaccard Score is 0.8213170250955839


####################################################################################################
Pingu_model_1_19_150_product


100%|██████████| 2845/2845 [00:42<00:00, 66.27it/s]


Average Price Jaccard Score is 0.0
Average Product Jaccard Score is 0.8257629131267106


####################################################################################################
Pingu_model_1_19_150_price


100%|██████████| 2845/2845 [00:42<00:00, 66.73it/s]

Average Price Jaccard Score is 0.9479496192150034
Average Product Jaccard Score is 0.0







# 결과값 가져오기

In [879]:
def predict_entities(text, model, label):
    """Run ``model`` on ``text`` and post-process the entities with ``label``.

    Returns:
        * label '상품명': the matching entity texts joined with single spaces.
        * label '가격': an ``int`` price chosen by the rules below, or the
          string ``'No'`` when no candidate survives ``delete_price``.
        * ``''`` when no entity carries ``label`` (or for any other label).
    """
    doc = model(text)
    ent_array = []

    # Collect unique [start, end, label] triples. The offsets come from the
    # first occurrence of the entity text in `text` (str.find).
    for ent in doc.ents:
        start = text.find(ent.text)
        end = start + len(ent.text)
        new_int = [start, end, ent.label_]
        if new_int not in ent_array:
            ent_array.append([start, end, ent.label_])

    # Keep only the spans with the requested label.
    ent_array = [t for t in ent_array if t[2] == label]

    if len(ent_array) > 0 and label == '상품명':
        selected_texts = ' '.join([text[t[0]: t[1]] for t in ent_array])
    elif len(ent_array) > 0 and label == '가격':
        # Case: exactly one entity was recognised.
        if len(ent_array) == 1:
          # Split the span on whitespace and keep the token that sorts last.
          # NOTE(review): this is a string sort, so '9,400' ranks above
          # '18,000' - confirm that lexicographic order is intended.
          ent_array = [text[t[0]: t[1]].split() for t in ent_array][0]
          ent_array = [sorted(ent_array,reverse=True)[0]]
          selected_texts = [re.sub(r'[^0-9]', '', t) for t in ent_array]

        # Case: two or more entities were recognised.
        else:
          # Strip every non-digit character from each span text.
          selected_texts = [re.sub(r'[^0-9]', '', text[t[0]: t[1]]) for t in ent_array]
        # Pass each digit string through delete_price, drop the ones it
        # rejects (None), convert the rest to int and sort descending.
        selected_texts = [int(price) for price in (delete_price(price) for price in selected_texts) if price is not None]
        selected_texts = sorted(selected_texts,reverse=True)

        # With two or more candidates take the second largest; with one take
        # it; with none return the marker string 'No'.
        if len(selected_texts) >= 2:
          selected_texts = selected_texts[1]
        elif len(selected_texts) == 1:
          selected_texts = selected_texts[0]
        else: selected_texts = 'No'

        ## Special case: an 8-character result is split into two 4-character
        ## halves and the smaller half is kept (halves are compared as strings).
        ## NOTE(review): presumably two 4-digit prices fused together - confirm.
        if len(str(selected_texts)) == 8:
          if str(selected_texts)[:4] > str(selected_texts)[4:]:
            selected_texts = int(str(selected_texts)[4:])
          else: selected_texts = int(str(selected_texts)[:4])

    else:
        selected_texts = ''

    return selected_texts

In [494]:
def delete_product(sentence):
    """Strip price tokens, punctuation and promotional words from ``sentence``.

    Steps, in order:
      1. Remove standalone ``<digits>원`` tokens.
      2. Remove every character that is not an ASCII letter, a digit, a
         Hangul syllable or whitespace.
      3. Remove each phrase in the noise-word list wherever it occurs.

    Returns:
        The cleaned string; surrounding whitespace is left untouched.
    """
    sentence = re.sub(r'\b\d+원\b', '', sentence)
    sentence = re.sub(r'[^a-zA-Z0-9가-힣\s]', '', sentence)
    words_to_remove = [')', 'NEW', 'ITEM', '교차상품', '기획상품', '행사', '할인', 'SALE',
                       '동일행사품목', '교차가능', '정상가', '동일품목', '가격할인', '추천',
                       '(', 'OPEN', '특가행사', '단독', '상품', 'LAST', '₩', '단위', '가격', '구매시']
    # Remove longer phrases first. Otherwise a short word such as '행사' is
    # stripped out of '특가행사' before the full phrase is tried, leaving '특가'
    # behind. sorted() is stable, so equal-length words keep their list order.
    for word in sorted(words_to_remove, key=len, reverse=True):
        sentence = sentence.replace(word, '')
    return sentence

In [741]:
def delete_price(sentence):
    """Return the first substring of ``sentence`` matching the price pattern.

    The pattern is a run of digits starting at a word boundary that ends in
    '0' or '00' and is followed by a word boundary or a comma. Returns
    ``None`` when nothing matches.
    """
    first_match = re.search(r'\b\d+00?(?:\b|,)', sentence)
    if first_match is None:
        return None
    return first_match.group(0)

In [942]:
### Product-name extraction
from collections import Counter

# random.randint is inclusive on both ends, so the upper bound must be
# len(test_data) - 1 to stay inside the list.
text = test_data[random.randint(0,len(test_data) - 1)]
print(text)

# Split a sentence into words.
def tokenize(sentence):
    # Simple whitespace-based split.
    return sentence.split()

# Store each model's predicted product-name tokens in words0, words1, ...
for idx, model in enumerate(model_lst):
  globals()['words' + f'{idx}'] = tokenize(predict_entities(text, globals()[model],'상품명'))

# Count how often each word was predicted.
# NOTE(review): words0 is not included and exactly four models are assumed -
# confirm this matches the intended set of models.
word_counts = Counter(words1 + words2 + words3)

# Take the most frequent words (top 5) and join them.
top_words = ' '.join([word[0] for word in word_counts.most_common(5)])

top_words = delete_product(top_words)
print("추출된 문자:", top_words)

CPAM3FRCO007BK NO PRICE 689,000 ITEM 큐레이티드 마운틴 알파카 집코트 SIZE   
추출된 문자: CPAM3FRCO007BK NO PRICE  큐레이티드


In [941]:
text = test_data[random.randint(0,len(test_data)-1)]
print(text)
predict_entities(text, Pingu_model_1_19_150_price,'가격')

히말라야 허니 글로스 립밤 많이 살수록   더 싸게 050404,2     이상 개당 7동일품목 교차기능 3,900 1,950   


1950