In [None]:
# % cd "/content/drive/My Drive/py-hanspell/"

In [None]:
# ! python setup.py install

In [None]:
! pip install py-hanspell



In [None]:
import pandas as pd 
import numpy as np
import glob
from tqdm import tqdm
import re

In [None]:
from bs4 import BeautifulSoup
import requests
# from hanspell import spell_checker

In [None]:
def data_load():
    """Read the two input tables used by this notebook.

    Returns:
        tuple: ``(train_data, cr_data)`` -- the frame read from
        ``train_data_pre1_0820.csv`` and the frame read from
        ``crawl_uniquecode(0825).csv``, in that order.
    """
    data_dir = "/content/drive/My Drive/Colab Notebooks/2020bigcontest/data/"
    train_csv = data_dir + "train_data_pre1_0820.csv"
    crawl_csv = data_dir + "crawl_uniquecode(0825).csv"
    return pd.read_csv(train_csv), pd.read_csv(crawl_csv)

## 1. Preprocessing

In [None]:
train_data, cr_data = data_load()
# Lookup table: product code -> product name (a later duplicate code overwrites an earlier one)
code2name_dict = {
    code: name
    for code, name in zip(train_data['p_code'].to_list(), train_data['p_name'].to_list())
}

In [None]:
def code2name(code):
    """Return the product name stored for ``code`` in ``code2name_dict``.

    Raises:
        KeyError: if ``code`` is not a key of the lookup table.
    """
    product_name = code2name_dict[code]
    return product_name

In [None]:
# Rows of the first crawl whose 'cr' flag is not 1, with their product name attached.
# .copy() makes this an independent frame, so the column assignment below does not
# write into a slice of cr_data (which triggers SettingWithCopyWarning).
not_cr_data = cr_data[cr_data['cr'] != 1].copy()
not_cr_data['p_name'] = not_cr_data['p_code'].apply(code2name)

In [None]:
def prep1(text):
    """Clean a raw product name into a space-separated keyword string.

    Steps, in order:
      1. Remove sales terms ('무이자', '일시불', '패키지', '세트', '예약', '상담')
         and the literal markers '(무)' and '(일)'.
      2. Remove quantity expressions: digits followed by '인용', '팩' or '종'.
      3. Replace every character outside a-z, A-Z and the ㄱ-힣 range with a
         space, then collapse runs of whitespace.
      4. Pass the text through the hanspell spell checker and keep only the
         resulting words that are longer than one character.

    Args:
        text (str): Raw product name.

    Returns:
        str: Cleaned name; an empty string when no word survives.
    """
    # The module-level hanspell import is commented out in this notebook, so
    # bind spell_checker here; otherwise a fresh kernel raises NameError.
    from hanspell import spell_checker

    # Plain substrings: removed one after another, in this order.
    for term in ('무이자', '일시불', '패키지', '세트', '예약', '상담'):
        text = text.replace(term, '')

    # The parentheses must be escaped. Unescaped, '(무)' is a regex group that
    # matches every single '무' character (and likewise for '일').
    text = re.sub(r'\(무\)', '', text)
    text = re.sub(r'\(일\)', '', text)

    for unit in ('인용', '팩', '종'):
        text = re.sub(r'[0-9]+' + unit, '', text)

    # The range ends at '힣' (U+D7A3), the last Hangul syllable. Ending it at
    # '힗' (U+D797) would strip valid syllables such as '힘', '힙' and '힝'.
    text = re.sub(r'[^a-zA-Zㄱ-힣]', ' ', text)
    text = " ".join(text.split())

    # Word spacing: use the words reported by the spell checker.
    result = spell_checker.check(text)
    text = ' '.join([key for key in result.words.keys() if len(key) > 1])
    # Drop any <...> span left in the joined words.
    # NOTE(review): the class excludes ')' rather than '>' -- confirm this is intended.
    text = re.sub(r'\<[^)]*\>', '', text)

    return text

In [None]:
# Smoke test: clean one raw product name with prep1 and print the result
sample = "[무이자]안 삼성화재행복한파트너융주택화재보험(1912)융"
result = prep1(sample)
print(result)

삼성화재 행복한 파트너 주택화재보험


In [None]:
### Preprocessing
# Clean the product name of every row in not_cr_data with prep1 and keep the
# result in a new column, 'pre1_p_name'
not_cr_data['pre1_p_name'] = not_cr_data['p_name'].apply(prep1)
file_path = "/content/drive/My Drive/Colab Notebooks/2020bigcontest/data/"

### Save Result
# Written without the index so the file can be re-read as-is in section 1.5
not_cr_data.to_csv(file_path+"not_cr_data_prep_p_name.csv", index = False)

## 1.5 load preprocessed data

In [None]:
# Re-load the cleaned names saved in section 1, replacing the in-memory not_cr_data
file_path = "/content/drive/My Drive/Colab Notebooks/2020bigcontest/data/"
not_cr_data = pd.read_csv(file_path+"not_cr_data_prep_p_name.csv")

In [None]:
# Lookup table: product code -> cleaned product name.
# A name that prep1 reduced to '' is saved as an empty CSV field and read back
# as NaN (a float). Restore '' so the crawl step can build the query and call
# str.split() on every value.
notcrdata_code2name_dict = dict(zip(not_cr_data['p_code'].to_list(), not_cr_data['pre1_p_name'].fillna('').to_list()))

In [None]:
print(len(notcrdata_code2name_dict)) # number of distinct product codes to crawl (663 in the recorded run)

663


## 2. Crawl

In [None]:
def crawl(keyword, timeout=10):
    """Fetch the Naver Shopping search page for ``keyword``.

    Args:
        keyword (str): Search query.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a single stalled request blocks the whole crawl.

    Returns:
        bytes: Raw body of the HTTP response.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    url = "https://search.shopping.naver.com/search/all.nhn"
    # Let requests build the query string so the keyword is always URL-encoded
    # (spaces, '&', '#', ... cannot break or truncate the query).
    query = {"query": keyword, "cat_id": "", "frm": "NVSHATC"}
    response = requests.get(url, params=query, timeout=timeout)
    return response.content

In [None]:
def getCrawlInfo(li):
    """Extract name, link, price and up to three categories from one result item.

    Args:
        li: Parsed search-result item exposing ``find`` and ``find_all``
            (a BeautifulSoup tag).

    Returns:
        dict: Keys "name", "link", "price" (text with the commas removed),
        "cate1", "cate2", "cate3". Category levels that are not present
        are None.

    Raises:
        AttributeError / TypeError: if the title link or the price element is
            not found in ``li``.
    """
    # Product name, link and price
    title_link = li.find("a", {"class": "basicList_link__1MaTN"})
    name = title_link.text
    link = title_link['href']
    price = li.find("span", {"class": "price_num__2WUXn"}).text.replace(",", "")

    # Categories (large / medium / small). Slots stay None when a level is
    # missing; the slice bounds the loop, so no exception handling is needed.
    cate_list = [None, None, None]
    category_links = li.find_all("a", {"class": "basicList_category__wVevj"})
    for idx, category in enumerate(category_links[:3]):
        cate_list[idx] = category.text

    return {
        "name": name,
        "link": link,
        "price": price,
        "cate1": cate_list[0],
        "cate2": cate_list[1],
        "cate3": cate_list[2],
    }

In [None]:
def parse(pageString):
    """Turn a search-result page into one record describing its first item.

    Args:
        pageString: HTML of the search page, as returned by ``crawl``.

    Returns:
        dict: Keys "name", "link", "price", "cate1", "cate2", "cate3" and "cr".
        "cr" is 1 when the page contains a result list and 0 when it does not
        (all other values are then None).
    """
    soup = BeautifulSoup(pageString, "html.parser")
    result_list = soup.find("ul", {"class": "list_basis"})

    if not result_list:
        # No search results: empty record, labelled cr = 0
        empty_record = dict.fromkeys(["name", "link", "price", "cate1", "cate2", "cate3"])
        empty_record['cr'] = 0
        return empty_record

    # Search results exist: describe the first item, labelled cr = 1
    first_item = result_list.find("li", {"class": "basicList_item__2XT81"})
    record = getCrawlInfo(first_item)
    record['cr'] = 1
    return record

In [None]:
def get_crawl_data(p_code, p_name):
    """Search for ``p_name`` and return the crawl record tagged with ``p_code``.

    Args:
        p_code: Product code stored in the record under "p_code".
        p_name: Search keyword passed to ``crawl``.

    Returns:
        dict: The record produced by ``parse`` plus "p_code". If crawling or
        parsing fails, a record whose fields are None with "cr" set to -1.
    """
    try:
        crawl_info_dict = parse(crawl(p_name))
    except Exception:
        # Best effort: a failed request or parse is recorded as cr = -1 instead
        # of aborting the run. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt / SystemExit through, so a long crawl
        # can still be stopped.
        crawl_info_dict = dict.fromkeys(["name", "link", "price", "cate1", "cate2", "cate3"])
        crawl_info_dict['cr'] = -1
    crawl_info_dict['p_code'] = p_code
    return crawl_info_dict

In [None]:
def main(notcrdata_code2name_dict):
    """Crawl every product in the mapping and collect one record per product.

    When the search for the full name returns no item name, the last word of
    the name is dropped and the search repeated, until a name is found or only
    two words remain. Names of two words or fewer are not retried.

    Args:
        notcrdata_code2name_dict (dict): product code -> search keyword.

    Returns:
        list: One record (as returned by ``get_crawl_data``) per entry, in
        iteration order of the mapping.
    """
    records = []

    for idx, (p_code, p_name) in enumerate(notcrdata_code2name_dict.items()):
        print()
        record = get_crawl_data(p_code, p_name)

        if not record['name']:
            # Nothing found: shorten the query from the end, one word at a time
            words = p_name.split()
            while len(words) > 2:
                print("update name -->", p_name)
                words = words[:-1]          # drop the last word
                p_name = ' '.join(words)    # and search again with the rest
                record = get_crawl_data(p_code, p_name)
                if record['name']:
                    break

        records.append(record)
        print(idx, p_code, p_name, record)

    return records

In [None]:
# Crawl every product in the lookup table; all_dict_list holds one result dict per product code
all_dict_list = main(notcrdata_code2name_dict)


0 200974 오고 레이스 파운데이션 브라 {'name': '[BH0428] 신품 삼각컵 브라 쁘띠파운데이션삼중지내의 여성감 소녀레이스 노와이어 얇은', 'link': 'https://cr.shopping.naver.com/adcr.nhn?x=cqoyGGxvkz0%2BrBo00ISEYf%2F%2F%2Fw%3D%3DsK26f7uMyFOE07F9ati74WUbzLOd6%2FIXMzM%2B5vB8q980AUIy7FbMbxELMonfhNuSkNTbD3oLpLEzd6UcZ1%2BczdnMF%2Fi3jnK7FR0iEzjTDYl8dil8pkfMxUNjvFBIDbfT83LwThXW5CYs8EhlgqI030kbYMcZAGUn2easaa%2Fw65xEwD7Nx75LGM4GIFNRYjgdlw%2BwxAstkrgvaEwwG4L9VrKNEyPiFlrqh02lENnEgeetTfpVZzegCWciQFpkVyinJIwcCdI0jKvuDw4%2BbhR6HdrVRM4YvFWOl4sc%2BMGbjuJKmv%2BEd0%2FjUgeYal%2B68N6R4yLfRC0o%2FWYDsFhk%2BnF4ktzVCZ4rdk8hPModxsSdH4jytQhlFXhnIGMaT60yatHma8yxvR8QX4a%2F%2BF%2FEToFU2qZF5vdiy1pKokcNpmVNFhSyMR5ZdrjPc%2Fi9NlKlQX4aQCDM2pEoEAwdSyzTigeBbvnokLHxCvvht6tjlwPPFI2wFw%2FYw8qmGkG7a73WOIcaVInJHV%2BhI7hoUCjO8tRndKCq1MAUflRs9jEHv%2FTS4sB6HUavez4rDU0VG2tlgoi9WU7xb9mCTzdTd%2BwTsq8TxbloliuLlfWwg8AshFbWvseWVxg4BeJt90jRRszyOgLAWIa51KK%2BZrcADfjZ3VJPfqUSMDSBLumBiruR%2FIl%2FtAhUaIbT4wPT2RVq4kndhIVrUQ2zglkiIJxFE2l9OMb6jS6PHkn1gGf5YOIti2vC9soqFbDf4gHI5rSMJU1BWcIOg&nvMi

In [None]:
import json
file_path = "/content/drive/My Drive/Colab Notebooks/2020bigcontest/data/"
# Save the crawl records as JSON. The with-block closes (and therefore flushes)
# the file, so the next cell reads back the complete content.
with open(file_path+"result_dict_notcr_663again(0826).json", 'w') as result_file:
    json.dump(all_dict_list, result_file)

747482

In [None]:
# Read the saved JSON records back, fix the column order, and export them as CSV
crawling_df = pd.read_json(
    file_path+"result_dict_notcr_663again(0826).json"
)[['p_code','cr','name','link','price','cate1','cate2','cate3']]
crawling_df.to_csv(file_path+"crawl_uniquecode_notcr_663again(0826).csv", index = False)

In [None]:
# Read the exported CSV back in to inspect it
check = pd.read_csv(file_path+"crawl_uniquecode_notcr_663again(0826).csv")

## 3. Concat all data

In [None]:
# data_load() returns exactly two frames (train data, first crawl results);
# unpacking three names from it raises ValueError.
train_data, cr_data = data_load()
# NOTE(review): cr_data2 is assumed to be the re-crawl result exported in
# section 2 -- confirm this is the intended file.
cr_data2 = pd.read_csv(file_path+"crawl_uniquecode_notcr_663again(0826).csv")
# Keep the rows of the first crawl flagged cr == 1 and stack the re-crawl rows below them
cr_data1 = cr_data[cr_data['cr'] ==1]
all_cr_data = pd.concat([cr_data1,cr_data2])
all_cr_data = all_cr_data.reset_index(drop=True)