# Crawling


## Import Library

In [12]:
import numpy as np
import pandas as pd

import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from pprint import pprint
from tqdm import tqdm
from tqdm import tqdm_notebook

## Path

In [13]:
base_path = '/home/hyejeongeun/Todays_Chatbot/Data/Base/' # directory for the crawled category/product tables
qna_path = '/home/hyejeongeun/Todays_Chatbot/Data/QnA/' # directory for the question & answer CSV
review_path = '/home/hyejeongeun/Todays_Chatbot/Data/Review/Review/' # directory for the per-product review .npy files

## 1) 카테고리 정보 크롤링

In [14]:
def create_category():
    """Scrape the store landing page and return its category links.

    Prints a success/failure message based on the HTTP status code, then
    parses the response body regardless of that status.

    Returns:
        A list of ``[category_name, url]`` pairs. Each url is the anchor's
        href prefixed with ``https://ohou.se``, with ``popular`` replaced by
        ``review`` and everything from ``&affect`` onward removed.
    """
    store_url = 'https://ohou.se/store/'

    # Fetch the store landing page and report whether the request succeeded.
    page = requests.get(store_url)
    print("접속 성공" if page.status_code == requests.codes.ok else "접속 실패")

    # Parse the HTML and pick the anchors of the category list section.
    parsed = BeautifulSoup(page.text, 'html.parser')
    anchors = parsed.select('#store-index > section.container.store-index-section.store-index-category-list > div > div.category-list.fold > div > a')

    # One [name, absolute link] pair per anchor; the link is rewritten to
    # review ordering and stripped of its affect_* query parameters.
    return [
        [
            anchor.div.div.text,
            'https://ohou.se' + anchor['href'].replace('popular', 'review').split('&affect')[0],
        ]
        for anchor in anchors
    ]

In [None]:
# Crawl the category names and links from the store page.
cate_link = create_category()

In [None]:
# Export the crawled category links under base_path.
np.save(base_path+'cate_link', cate_link, allow_pickle=True)

### 카테고리 id 추출

In [None]:
# [category name, category id] pairs; the id is the value of the
# 'category=' query parameter in each category link.
c_num = [
    [entry[0], entry[1].split('category=')[1].split('&')[0]]
    for entry in cate_link
]

In [None]:
# Tabulate the category name / id pairs.
category_table = pd.DataFrame(c_num, columns=['category_name','category_id'])

In [None]:
# Export the category table as a cp949-encoded CSV under base_path.
category_table.to_csv(base_path+'category_table.csv', encoding='cp949', index=False)

## 2) 상품 정보 크롤링

In [None]:
def create_product_crawling(category_id):
    """Collect up to ten products from the first listing page of each category.

    For every category id, requests the category listing JSON
    (``order=review``, ``per=24``) and keeps the id and name of the first ten
    entries of its ``productions`` list.

    Args:
        category_id: Iterable of category ids to crawl.

    Returns:
        A list of ``[category_id, product_id, product_name]`` rows, at most
        ten rows per category.
    """
    product = []
    for cate_id in category_id:
        # Only the first listing page is fetched.
        for page_num in range(1, 2):
            url = 'https://ohou.se/store/category.json?v=2&category={}&order=review&page={}&per=24'.format(cate_id, page_num)
            data = requests.get(url).json()
            # Slice rather than index 0..9 so that a category with fewer than
            # ten products contributes what it has instead of raising
            # IndexError.
            for item in data['productions'][:10]:
                product.append([cate_id, item['id'], item['name']])
    return product

In [None]:
# Crawl products for every category id in category_table.
product = create_product_crawling(category_table['category_id'])

In [None]:
# Tabulate the crawled product rows.
tmp_table = pd.DataFrame(product, columns=['category_id','product_id','product_name'])

In [None]:
# Category id to product id mapping.
id_table = tmp_table[['category_id','product_id']]

In [None]:
# Product names together with their product ids.
product_table = tmp_table[['product_name','product_id']]

In [None]:
# Drop duplicate rows (a single product can appear in several categories).
product_table = product_table.drop_duplicates().reset_index(drop=True)

In [None]:
# Export both tables as cp949-encoded CSV files under base_path.
product_table.to_csv(base_path+'product_table.csv', encoding='cp949', index=False)
id_table.to_csv(base_path+'id_table.csv', encoding='cp949', index=False)

## 3) 문의글 크롤링

In [None]:
def get_question(product_table):
    """Crawl every question (and its answer, if any) for the given products.

    For each product id, pages through the questions endpoint (5 per page)
    until a page comes back with no questions.

    Args:
        product_table: Table-like object whose ``'product_id'`` entry yields
            the product ids to crawl.

    Returns:
        A list of rows ``[production id, production name, production explain,
        question entry id, is_buyer, is_secret, type, question, question_at,
        answer, answer_at]``. When a question has no answer, the last two
        fields are the string ``'None'``.
    """
    qna = []
    for prod_id in tqdm_notebook(product_table['product_id']):
        page = 1
        while True:
            url = 'https://ohou.se/production_questions/get_questions.json?product_id={}&page={}&per=5&v=2'.format(prod_id, page)
            data = requests.get(url).json()
            questions = data['questions']
            # A single falsy test covers both None and an empty list; testing
            # len() first would raise TypeError on None.
            if not questions:
                break
            for entry in questions:
                # Read the production fields into their own names so the
                # outer prod_id, which drives the next page's URL, is never
                # overwritten.
                production = entry['production']
                answer = entry['answer']
                if answer is not None:
                    ans = answer['answer']
                    ans_time = answer['answer_at']
                else:
                    # Unanswered questions keep the 'None' string placeholder.
                    ans = 'None'
                    ans_time = 'None'
                qna.append([
                    production['id'],
                    production['name'],
                    production['explain'],
                    # NOTE(review): this is the question entry's own 'id'; it
                    # is later stored in the 'user_id' column -- confirm it
                    # really identifies the user.
                    entry['id'],
                    entry['is_buyer'],
                    entry['is_secret'],
                    entry['type'],
                    entry['question'],
                    entry['question_at'],
                    ans,
                    ans_time,
                ])
            page += 1
    return qna

In [None]:
# Reload the product table from base_path, where it was exported, using the
# same cp949 encoding it was written with.
product_table = pd.read_csv(base_path+'product_table.csv', encoding='cp949')

In [None]:
# Crawl the questions of every product in product_table.
question = get_question(product_table)

In [None]:
# Tabulate the crawled question rows; the column order matches the row layout
# built by get_question.
question_table = pd.DataFrame(question, columns=['product_id','product_name','product_option',
                                                 'user_id','user_buyer',
                                                 'is_secret', 'label',
                                                 'question','question_time',
                                                 'answer','answer_time'])

In [None]:
# Export the Q&A table as CSV under qna_path. No explicit encoding is passed
# here, unlike the cp949 exports of the category/product tables.
question_table.to_csv(qna_path+'qna_table.csv', index=False)

## 4) 리뷰 크롤링

In [None]:
def get_review(prod_id):
    """Crawl all reviews of one product.

    Pages through the reviews endpoint (``order=best``) until a page comes
    back with no reviews.

    Args:
        prod_id: Product id placed in the ``production_id`` query parameter.

    Returns:
        A list of rows ``[created_at, writer_id, praise_count, production id,
        production name, production explain, review comment, card image_url]``.
    """
    review = []
    page = 1
    while True:
        url = 'https://ohou.se/production_reviews.json?production_id={}&page={}&order=best&photo_review_only='.format(prod_id, page)
        data = requests.get(url).json()
        reviews = data['reviews']
        # Stop on an empty (or missing) page of reviews.
        if not reviews:
            break
        for item in reviews:
            info = item['production_information']
            review.append([
                item['created_at'],
                item['writer_id'],
                item['praise_count'],
                info['id'],
                info['name'],
                info['explain'],
                item['review']['comment'],
                item['card']['image_url'],
            ])
        # Advance once per fetched page. This sits at the while-loop level so
        # that every page is visited exactly once.
        page += 1
    return review

In [None]:
# Crawl each product's reviews and save them to one .npy file per product
# under review_path, named review_<product id>.npy.
for prod_id in tqdm_notebook(product_table['product_id']):
    rev = get_review(prod_id)
    np.save(review_path+"review_"+str(prod_id)+".npy", rev)