<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Information" data-toc-modified-id="Information-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Information</a></span></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Preprocessing</a></span></li><li><span><a href="#성별-리뷰를-얼마나-남기는지" data-toc-modified-id="성별-리뷰를-얼마나-남기는지-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>성별 리뷰를 얼마나 남기는지</a></span></li><li><span><a href="#성별-연령-비율" data-toc-modified-id="성별-연령-비율-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>성별 연령 비율</a></span></li><li><span><a href="#성별-피부타입" data-toc-modified-id="성별-피부타입-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>성별 피부타입</a></span></li><li><span><a href="#성별-선호도-비율" data-toc-modified-id="성별-선호도-비율-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>성별 선호도 비율</a></span></li><li><span><a href="#성별-평점-비율" data-toc-modified-id="성별-평점-비율-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>성별 평점 비율</a></span></li><li><span><a href="#성별-사용-화장품의-가격-/-카테고리별-가격" data-toc-modified-id="성별-사용-화장품의-가격-/-카테고리별-가격-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>성별 사용 화장품의 가격 / 카테고리별 가격</a></span></li><li><span><a href="#성별-사용-화장품-용량" data-toc-modified-id="성별-사용-화장품-용량-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>성별 사용 화장품 용량</a></span></li><li><span><a href="#성별-리뷰를-남기는-브랜드/상품/카테고리/유형" data-toc-modified-id="성별-리뷰를-남기는-브랜드/상품/카테고리/유형-11"><span class="toc-item-num">11&nbsp;&nbsp;</span>성별 리뷰를 남기는 브랜드/상품/카테고리/유형</a></span></li></ul></div>

In [19]:
import pandas as pd
import os 

import pyecharts as pye

print(os.listdir('../dataset'))

['glowpick_products.csv', 'glowpick_reviews.csv', 'oliveyoung_product_info.csv']


# Load Data

In [117]:
products = pd.read_csv('../dataset/glowpick_products.csv')
reviews = pd.read_csv('../dataset/glowpick_reviews.csv')
print('product shape: ',products.shape)
print('reviews shape: ',reviews.shape)

product shape:  (1183, 8)
reviews shape:  (5125, 7)


# Information

In [118]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1183 entries, 0 to 1182
Data columns (total 8 columns):
brand          1183 non-null object
product        1183 non-null object
vol_price      1183 non-null object
rate           1183 non-null float64
nb_reviews     1183 non-null object
sales_rank     1183 non-null object
product_url    1183 non-null object
category       1183 non-null object
dtypes: float64(1), object(7)
memory usage: 74.0+ KB


In [119]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5125 entries, 0 to 5124
Data columns (total 7 columns):
date             5125 non-null object
user_id          5125 non-null object
sex              5125 non-null object
age_skin_type    5125 non-null object
rate             5125 non-null object
content          5124 non-null object
product_url      5125 non-null object
dtypes: object(7)
memory usage: 280.4+ KB


# Preprocessing

In [169]:
products = products.rename(columns={'rate':'star'})
glowpick = pd.merge(reviews, products, on='product_url', how='left')
print('glowpick shape: ',glowpick.shape)

glowpick shape:  (5713, 14)


In [170]:
# make unique id
glowpick['unique_id'] = np.arange(glowpick.shape[0])

In [171]:
# sex type
sex_dict = {
    'f':'Female',
    'm':'Male'
}
glowpick.sex = glowpick.sex.map(sex_dict)

In [173]:
# age and skin_type
age_skin_type_df = glowpick.age_skin_type.str.replace('·','').str.split('  ', expand=True)
age_skin_type_df.columns = ['age','skin_type']
age_skin_type_df.skin_type = age_skin_type_df.skin_type.str.strip()
age_skin_type_df.age = age_skin_type_df.age.str.replace('세','').astype(int)
glowpick = pd.concat([glowpick,age_skin_type_df], axis=1)

remove_id = glowpick[glowpick.age > 100].unique_id.tolist()
remove_id.extend(glowpick[glowpick.age < 10].unique_id.tolist())
print('이상치 수: ',len(remove_id))
glowpick = glowpick[~glowpick.unique_id.isin(remove_id)]

def age_category(x):
    for i in range(10,110,10):
        if x < i:
            return f'{i}대'

glowpick['age'] = glowpick.age.map(age_category)

# 성별 리뷰를 얼마나 남기는지

In [196]:
sex_cnt = glowpick.sex.value_counts()
pie_sex = pye.Pie('성별 리뷰 비율',title_pos='center',width=400, height=300)
pie_sex.add(' ',sex_cnt.index, sex_cnt,
            radius=[20, 60],
            is_label_show=True,
            legend_orient='vertical',
            legend_pos='left')
pie_sex

In [201]:
pie_sex.render('../images/graphs/sex_nbreviews.html')

# 성별 연령 비율

In [193]:
f_age_cnt = glowpick.loc[glowpick.sex=='Female','age'].value_counts()
m_age_cnt = glowpick.loc[glowpick.sex=='Male','age'].value_counts()

pie_age = pye.Pie('성별 연령대 비율 [좌: 여성, 우: 남성]',title_pos='center')
pie_age.add('여성',f_age_cnt.index, f_age_cnt,
            center=[30,50],
            radius=[20,50],
           is_label_show=True)
pie_age.add('남성',m_age_cnt.index, m_age_cnt,
            center=[70,50],
            radius=[20,50],
           is_label_show=True,
           legend_orient='vertical',
           legend_pos='left')
pie_age

# 성별 피부타입

In [198]:
f_skin_type_cnt = glowpick.loc[glowpick.sex=='Female','skin_type'].value_counts()
m_skin_type_cnt = glowpick.loc[glowpick.sex=='Male','skin_type'].value_counts()

pie_skin_type = pye.Pie('성별 피부타입 비율 [좌: 여성, 우: 남성]',title_pos='center')
pie_skin_type.add('여성',f_skin_type_cnt.index, f_skin_type_cnt,
            center=[30,50],
            radius=[20,50],
           is_label_show=True)
pie_skin_type.add('남성',m_skin_type_cnt.index, m_skin_type_cnt,
            center=[70,50],
            radius=[20,50],
           is_label_show=True,
           legend_orient='vertical',
           legend_pos='left')
pie_skin_type

# 성별 선호도 비율

In [248]:
f_rate_cnt = glowpick.loc[glowpick.sex=='Female','rate'].value_counts()
m_rate_cnt = glowpick.loc[glowpick.sex=='Male','rate'].value_counts()

pie_rate = pye.Pie('성별 선호도 비율 [좌: 여성, 우: 남성]',title_pos='center')
pie_rate.add('여성',f_rate_cnt.index, f_rate_cnt,
            center=[30,50],
            radius=[20,50],
           is_label_show=True)
pie_rate.add('남성',m_rate_cnt.index, m_rate_cnt,
            center=[70,50],
            radius=[20,50],
           is_label_show=True,
           legend_orient='vertical',
           legend_pos='left')
pie_rate

# 성별 평점 비율
큰 차이 없어서 안써도될듯

In [242]:
star_lst = []
star_lst.append(glowpick.loc[glowpick.sex=='Female', 'star'].tolist())
star_lst.append(glowpick.loc[glowpick.sex=='Male', 'star'].tolist())

In [247]:
boxplot_star = pye.Boxplot("성별 평점 비율", title_pos='center')
x_axis = ['Female','Male']
y_axis = boxplot_star.prepare_data(star_lst)
boxplot_star.add("", x_axis, y_axis)
boxplot_star

# 성별 사용 화장품의 가격 / 카테고리별 가격

TODO: vol_price 나눠야함

In [254]:
for s in ['ea','ml','g']:
    glowpick['vol_price'] = glowpick.vol_price.str.replace(s,' ')


In [260]:
glowpick.vol_price.str.split(' ',expand=True).iloc[:,2].unique()

array([None, '14,000원', '15,000원', '4,900원', '52,000원', '9,000원'],
      dtype=object)

In [257]:
glowpick[glowpick.vol_price=='-36,000원']

Unnamed: 0,date,user_id,sex,age_skin_type,rate,content,product_url,brand,product,vol_price,star,nb_reviews,sales_rank,category,unique_id,age,skin_type
4591,2018.11.10,superstars,Male,34세 · 복합성 ·,good,"장점 : 발림성, 지속력?",/product/85004,레오 티지아노,뚜또베네 젠틀 선 비비 로션,"-36,000원",4.0,(1),25,선케어,4591,40대,복합성


In [251]:
glowpick.vol_price.unique()

array(['180ml19,000원', '200ml54,000원', '250ml59,000원', '120ml35,000원',
       '180ml30,000원', '100ml25,000원', '125ml14,000원', '140ml14,800원',
       '120ml28,000원', '250ml36,000원', '250ml22,500원', '150ml25,000원',
       '125ml25,000원', '200ml45,000원', '150ml17,000원', '120ml24,000원',
       '130ml23,000원', '120ml60,000원', '155ml19,000원', '120ml27,000원',
       '140ml25,000원', '120ml50,000원', '135ml40,000원', '150ml11,900원',
       '130ml13,000원', '135ml18,000원', '120ml30,000원', '150ml10,500원',
       '100ml60,000원', '200ml53,000원', '170ml23,000원', '200ml28,900원',
       '200ml49,000원', '35ea15,000원', '150ml30,000원', '140ml17,000원',
       '150ml23,000원', '100ml15,000원', '100ml23,000원', '110ml45,000원',
       '170ml19,000원', '150ml18,000원', '120ml25,000원', '100ml43,000원',
       '140ml12,000원', '130ml11,800원', '50ml20,000원', '125ml15,000원',
       '170ml22,000원', '180ml13,000원', '120ml45,000원', '140ml50,000원',
       '100ml22,000원', '140ml18,000원', '180ml25,000원', '200ml16,000원',
       '

In [212]:
glowpick.groupby('sex').category.value_counts()

sex     category
Female  로션          665
        스킨          575
        에센스         534
        헤어스타일링      384
        선케어         278
        메이크업        240
        클렌징         225
        헤어/바디       221
        크림          204
        쉐이빙         127
        애프터쉐이브       99
        스크럽/필링       22
        마스크/팩        15
Male    메이크업        300
        에센스         270
        로션          265
        스킨          259
        헤어스타일링      222
        쉐이빙         171
        선케어         161
        클렌징         148
        크림          120
        헤어/바디       112
        애프터쉐이브       41
        마스크/팩        17
        스크럽/필링       14
Name: category, dtype: int64

# 성별 사용 화장품 용량

# 성별 리뷰를 남기는 브랜드/상품/카테고리/유형