## Import Libraries and Load data

In [1]:
import os
import sys
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns 
from mpl_toolkits.mplot3d import Axes3D
import math
from IPython.core.interactiveshell import InteractiveShell
from datetime import *
import statistics as stats
InteractiveShell.ast_node_interactivity = "all" 

%matplotlib inline

# pyspark 관련
import pyspark
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# SparkSession 만들기

spark = SparkSession.builder.appName('eCommerce - Consumer Behavior Analysis').getOrCreate()

In [3]:
spark

In [4]:
# 폰트 설정

from matplotlib import font_manager, rc
import platform

if platform.system() == 'Windows' :
    font_name = font_manager.FontProperties(fname = 'C:\Windows\\Fonts\\NanumBarunGothic.ttf').get_name()
    rc('font', family = font_name)
else :
    rc('font', family = 'AppleGothic')

In [5]:
#commerce_df = spark.read.option('header', 'true').csv('C:\\Users\\u7rye\\Desktop\\e-commerce\\2019-Nov.csv')
commerce_df = spark.read.option('header', 'true').csv('2019-Nov.csv')
print('Data frame type: ' + str(type(commerce_df)))

Data frame type: <class 'pyspark.sql.dataframe.DataFrame'>


In [93]:
commerce_df.cache()

DataFrame[event_time: string, event_type: string, product_id: string, category_id: string, category_code: string, brand: string, price: double, user_id: string, user_session: string]

## Overview of Dataset

In [6]:
print('Data overview')
commerce_df.printSchema()

Data overview
root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [7]:
# price double 형변환

commerce_df = commerce_df.withColumn('price', commerce_df.price.cast('double'))
print('Change Data overview')
commerce_df.printSchema()

Change Data overview
root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [8]:
print('Data frame describe:')
commerce_df.describe().toPandas()

print(f'There are total {commerce_df.count()} row, Let print first 5 data rows:')
commerce_df.show(5)

Data frame describe:


Unnamed: 0,summary,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,count,67501979,67501979,67501979.0,67501979.0,45603808,58283744,67501979.0,67501979.0,67501969
1,mean,,,12514064.889882294,2.0578976443190984e+18,,,292.4593165646144,538639745.6296759,
2,stddev,,,17257413.62984622,2.0125490328842856e+16,,,355.67449958606784,22885161.05152206,
3,min,2019-11-01 00:00:00 UTC,cart,100000000.0,2.053013552226108e+18,accessories.bag,a-case,0.0,100963605.0,0000007c-adbf-4ed7-af17-d1fef9763d67
4,max,2019-11-30 23:59:59 UTC,view,9900463.0,2.1877078610380068e+18,stationery.cartrige,zyxel,2574.07,97129396.0,fffffde2-4522-4b44-8a32-510c55739ba1


There are total 67501979 row, Let print first 5 data rows:
+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                null| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:...|  

확인 결과 'category_code', 'brand', 'user_session'에서 null값이 존재한다.

우선, 'user_id'와 'user_session'의 관계에 대해 확인해보고자 한다.

## Detect missing values and abnormal zeroes

In [13]:
# null 갯수 확인하기

commerce_df.select([functions.count(functions.when(isnan(c) | col(c).isNull(), c)).alias(c) for c in commerce_df.columns]).show()

+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|     21898171|9224078|    0|      0|          10|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



### category_code

'category_id'는 모두 존재하지만 'category_code'는 null 값이 존재한다.

In [77]:
commerce_df.select(commerce_df.category_id, commerce_df.category_code, commerce_df.product_id, commerce_df.price).distinct() \
                        .orderBy(commerce_df.category_id, commerce_df.category_code, commerce_df.product_id).show()

+-------------------+-------------+----------+-------+
|        category_id|category_code|product_id|  price|
+-------------------+-------------+----------+-------+
|2053013552259662037|         null| 100006048| 504.13|
|2053013552259662037|         null| 100006048| 504.08|
|2053013552259662037|         null| 100006048| 504.22|
|2053013552259662037|         null| 100006048| 504.11|
|2053013552259662037|         null| 100006048| 504.14|
|2053013552259662037|         null| 100006048|    0.0|
|2053013552259662037|         null| 100006048| 504.24|
|2053013552259662037|         null| 100007646|    0.0|
|2053013552259662037|         null| 100007646| 707.59|
|2053013552259662037|         null| 100014817|    0.0|
|2053013552259662037|         null| 100016854|    0.0|
|2053013552259662037|         null| 100016854| 1851.5|
|2053013552259662037|         null| 100018391| 818.78|
|2053013552259662037|         null| 100018391|    0.0|
|2053013552259662037|         null| 100018756|1364.63|
|205301355

#### 동일한 값이 있으면 대체하기

'product_id'가 같으면 'category_id'와 'price'가 모두 같은 동일 상품이다. 따라서 'category_code'가 null인 상품의 'product_id'를 찾아준 후 'category_code'를 추가해준다.

In [61]:
checked_category_code = commerce_df.select(commerce_df.category_code) \
                        .filter(commerce_df.category_id == '2053013562753811257')

checked_category_code.show()

+-------------+
|category_code|
+-------------+
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
|         null|
+-------------+
only showing top 20 rows



동일한 값은 없다.

#### product_id로 category_id, category_code 유추하기

같은 'category_id'를 가진 'product_id'들은 서로 어떠한 범위를 가지고 있을 것이다. 라는 가정 

In [66]:
checked_category_id_for_code = commerce_df.select(commerce_df.category_id, commerce_df.product_id) \
                        .distinct() \
                        .orderBy(commerce_df.category_id, commerce_df.product_id)

checked_category_id_for_code.show(truncate=False)

+-------------------+----------+
|category_id        |product_id|
+-------------------+----------+
|2053013552226107603|100001537 |
|2053013552226107603|100019954 |
|2053013552226107603|8600006   |
|2053013552226107603|8600036   |
|2053013552226107603|8600037   |
|2053013552226107603|8600043   |
|2053013552226107603|8600044   |
|2053013552226107603|8600067   |
|2053013552226107603|8600076   |
|2053013552226107603|8600087   |
|2053013552226107603|8600094   |
|2053013552226107603|8600116   |
|2053013552226107603|8600117   |
|2053013552226107603|8600118   |
|2053013552226107603|8600134   |
|2053013552226107603|8600135   |
|2053013552226107603|8600139   |
|2053013552226107603|8600140   |
|2053013552226107603|8600143   |
|2053013552226107603|8600146   |
+-------------------+----------+
only showing top 20 rows



어느정도 연관성이 보이는 것 같다!

product_id가 범위로 존재한다고 가정했을 때, 사이에 없는 product_id를 확인해본다. 

In [67]:
commerce_df.select(commerce_df.category_id).filter(commerce_df.product_id == '8600119').show()

+-----------+
|category_id|
+-----------+
+-----------+



In [68]:
commerce_df.select(commerce_df.category_id).filter(commerce_df.product_id == '8600141').show()

+-----------+
|category_id|
+-----------+
+-----------+



In [69]:
commerce_df.filter(commerce_df.product_id == '8600119').show()

+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+



연관된 'product_id'중 빈 숫자는 아예 존재하지 않는 값인 것 같다.

product_id의 숫자 범위를 확인해보자.

In [63]:
checked_category_code = commerce_df.select(commerce_df.product_id, commerce_df.category_id, commerce_df.category_code, commerce_df.price) \
                        .filter(commerce_df.category_code.isNull()) \
                        .distinct() \
                        .orderBy(commerce_df.product_id)

checked_category_code.show(truncate=False)

+----------+-------------------+-------------+-----+
|product_id|category_id        |category_code|price|
+----------+-------------------+-------------+-----+
|100000000 |2053013562393101093|null         |0.0  |
|100000000 |2053013562393101093|null         |9.63 |
|100000001 |2053013562753811257|null         |9.63 |
|100000002 |2053013562393101093|null         |9.86 |
|100000003 |2053013562393101093|null         |8.28 |
|100000008 |2053013561814287111|null         |28.42|
|100000008 |2053013561814287111|null         |0.0  |
|100000008 |2053013561814287111|null         |31.34|
|100000008 |2053013561814287111|null         |26.64|
|100000009 |2053013561814287111|null         |28.39|
|100000009 |2053013561814287111|null         |31.31|
|100000009 |2053013561814287111|null         |26.62|
|100000010 |2053013562812531517|null         |3.74 |
|100000010 |2053013562812531517|null         |4.43 |
|100000011 |2053013561814287111|null         |28.44|
|100000011 |2053013561814287111|null         |

- 같은 product_id를 가지고 있음에도 price가 존재하는 경우, 존재하지 않는 경우가 있다.
- 같은 product_id, category_code를 가진 경우에 price가 다른 경우도 있다. 할인, 이벤트 등을 고려하여 날짜와 함께 보는 것이 좋겠다.

In [73]:
gussed_category_code = commerce_df.groupBy("category_id") \
                        .agg(min("product_id").alias("min_product_id") \
                        , max("product_id").alias("max_product_id")) \
                        .orderBy('category_id')

gussed_category_code.show(truncate=False)

+-------------------+--------------+--------------+
|category_id        |min_product_id|max_product_id|
+-------------------+--------------+--------------+
|2053013552226107603|100001537     |8600236       |
|2053013552259662037|100006048     |8500589       |
|2053013552293216471|100000811     |6201398       |
|2053013552326770905|100000368     |3901205       |
|2053013552351936731|100004381     |4201635       |
|2053013552385491165|100014566     |4300490       |
|2053013552427434207|22300002      |22300023      |
|2053013552469377249|100014018     |22500381      |
|2053013552502931683|9000118       |9500152       |
|2053013552570040549|100006004     |9900463       |
|2053013552603594983|2100016       |2100105       |
|2053013552637149417|100003318     |21800314      |
|2053013552662315243|100022627     |15300399      |
|2053013552695869677|18500001      |18500081      |
|2053013552737812719|100003566     |12901549      |
|2053013552788144369|100017349     |12800877      |
|20530135528

'product_id'를 숫자형으로 변경하고 진행해보자.

In [135]:
commerce_df_int = commerce_df.withColumn('product_id', commerce_df.product_id.cast('int'))
print('Change Data overview')
commerce_df_int.printSchema()

Change Data overview
root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [136]:
commerce_df_int = commerce_df_int.withColumn('category_id', commerce_df.category_id.cast('int'))
print('Change Data overview')
commerce_df_int.printSchema()

Change Data overview
root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [104]:
gussed_category_code = commerce_df_int.groupBy(commerce_df_int.category_id).agg(
                                            min(commerce_df_int.product_id).alias('min_product_id') \
                                            , max(commerce_df_int.product_id).alias('max_product_id')) \
                        .orderBy(commerce_df_int.category_id)

gussed_category_code.show(truncate=False)

+-------------------+--------------+--------------+
|category_id        |min_product_id|max_product_id|
+-------------------+--------------+--------------+
|2053013552226107603|8600006       |100019954     |
|2053013552259662037|8500001       |100027800     |
|2053013552293216471|5600042       |100026992     |
|2053013552326770905|3900002       |100007037     |
|2053013552351936731|4200034       |100018846     |
|2053013552385491165|4300000       |100023701     |
|2053013552427434207|22300002      |22300023      |
|2053013552469377249|22500001      |100019052     |
|2053013552502931683|9000118       |9500152       |
|2053013552570040549|9900009       |100015189     |
|2053013552603594983|2100016       |2100105       |
|2053013552637149417|21800000      |100003318     |
|2053013552662315243|15300000      |100022901     |
|2053013552695869677|18500001      |18500081      |
|2053013552737812719|12900004      |100020524     |
|2053013552788144369|12800002      |100022352     |
|20530135528

In [120]:
import seaborn as sns
import matplotlib.pyplot as plt

In [86]:
# 범위 추출이 가장 그럴듯 한 category_id = 2053013552603594983를 가지고 확인해보자.

commerce_df.select(commerce_df.product_id).filter(commerce_df.category_id == '2053013552603594983').distinct().orderBy(commerce_df.product_id).show()

+----------+
|product_id|
+----------+
|   2100016|
|   2100028|
|   2100029|
|   2100035|
|   2100036|
|   2100079|
|   2100080|
|   2100093|
|   2100097|
|   2100099|
|   2100104|
|   2100105|
+----------+



In [110]:
commerce_df.select(commerce_df.product_id).filter((commerce_df.category_id == '2053013552603594983') & (commerce_df.category_code.isNull())).distinct().show()

+----------+
|product_id|
+----------+
+----------+



In [89]:
# 한번 더 product_id의 빈 값을 확인해보자.

commerce_df.filter(commerce_df.product_id == '2100032').orderBy(commerce_df.product_id).show()

+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+



값의 차이가 없다.

### price

In [11]:
# price가 0인 경우도 확인

string_columns = ['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'user_id', 'user_session']
numeric_columns = ['price']
missing_values = {}

for index, column in enumerate(commerce_df.columns) :
    if column in string_columns :
        missing_count = commerce_df.filter(col(column).eqNullSafe(None)|col(column).isNull()).count()
        missing_values.update({column:missing_count})
    if column in numeric_columns :
        missing_count = commerce_df.where(col(column).isin([0,None,np.nan])).count()
        missing_values.update({column:missing_count})
        
missing_df = pd.DataFrame.from_dict([missing_values])
missing_df

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,0,0,0,0,21898171,9218235,188088,0,10


In [28]:
# price가 0인 경우 상세 확인하기

checked_price = commerce_df.filter(
    commerce_df.price == 0
)
checked_price.show(30)

+--------------------+----------+----------+-------------------+--------------------+-----+-----+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|brand|price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+-----+-----+---------+--------------------+
|2019-11-01 00:38:...|      view|  33100000|2058719826188173878|                null| null|  0.0|546996930|969ea68f-a919-4d3...|
|2019-11-01 00:42:...|      view|  33100000|2058719826188173878|                null| null|  0.0|546996930|b1ab3863-bbf5-437...|
|2019-11-01 01:07:...|      view|  12720812|2053013553559896355|                null| null|  0.0|516269492|9bf68f2a-fd78-4b1...|
|2019-11-01 01:07:...|      view|  12720812|2053013553559896355|                null| null|  0.0|516269492|9bf68f2a-fd78-4b1...|
|2019-11-01 01:26:...|      view|  38900075|2085718636156158307|                null| null|  0.0|

앞서
- 같은 product_id를 가지고 있음에도 price가 존재하는 경우, 존재하지 않는 경우가 있다.
- 같은 product_id, category_code를 가진 경우에 price가 다른 경우도 있다. 할인, 이벤트 등을 고려하여 날짜와 함께 보는 것이 좋겠다.
라는 결과를 확인했다.

이 경우에 대해서는 'product_id', 'category_id'가 동일한 행이 존재하는지 확인해서 price를 채워주거나 만약 시기별로 할인율 등이 반영되어 가격이 다르다면 어떻게 처리할지 확인해봐야 한다.

In [78]:
commerce_df.filter(commerce_df.product_id == '33100000').show()

+--------------------+----------+----------+-------------------+-------------+-------+-----+---------+--------------------+
|          event_time|event_type|product_id|        category_id|category_code|  brand|price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+-------------+-------+-----+---------+--------------------+
|2019-11-01 00:38:...|      view|  33100000|2058719826188173878|         null|   null|  0.0|546996930|969ea68f-a919-4d3...|
|2019-11-01 00:42:...|      view|  33100000|2058719826188173878|         null|   null|  0.0|546996930|b1ab3863-bbf5-437...|
|2019-11-01 02:17:...|      view|  33100000|2058719826188173878|         null|   null|  0.0|553127751|0b1cf0bc-51d3-180...|
|2019-11-01 02:19:...|      view|  33100000|2058719826188173878|         null|   null|  0.0|553127751|0b1cf0bc-51d3-180...|
|2019-11-01 02:19:...|      view|  33100000|2058719826188173878|         null|   null|  0.0|546273062|e200d925-6aa9-4e5...|
|2019-11

'price' 심지어 'brand'도 존재한다!

In [80]:
commerce_df.filter(commerce_df.product_id == '33100001').show()

+--------------------+----------+----------+-------------------+-------------+-------+-----+---------+--------------------+
|          event_time|event_type|product_id|        category_id|category_code|  brand|price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+-------------+-------+-----+---------+--------------------+
|2019-11-01 02:19:...|      view|  33100001|2180736567012753620|         null|   null|  0.0|553127751|0b1cf0bc-51d3-180...|
|2019-11-01 03:35:...|      view|  33100001|2180736567012753620|         null|   null|  0.0|512483076|8075ada0-1669-413...|
|2019-11-01 04:16:...|      view|  33100001|2180736567012753620|         null|   null|  0.0|517119568|b08e1dab-f3ff-4ea...|
|2019-11-01 04:16:...|      view|  33100001|2180736567012753620|         null|   null|  0.0|517119568|b08e1dab-f3ff-4ea...|
|2019-11-01 04:32:...|      view|  33100001|2180736567012753620|         null|   null|  0.0|529378497|6114c198-dfec-45c...|
|2019-11

### user_session에 관한 분석

In [11]:
checked_count_user = commerce_df.select(count('user_id'), count('user_session'), \
                                            countDistinct('user_id'), countDistinct('user_session'))

checked_count_user.show()

+--------------+-------------------+-----------------------+----------------------------+
|count(user_id)|count(user_session)|count(DISTINCT user_id)|count(DISTINCT user_session)|
+--------------+-------------------+-----------------------+----------------------------+
|      67501979|           67501969|                3696117|                    13776050|
+--------------+-------------------+-----------------------+----------------------------+



In [21]:
# 'user_session'에 따른 행의 갯수를 파악하고 세부 내역 확인을 통해 관게를 파악

checked_user_relation = commerce_df.select(commerce_df.user_session) \
                        .groupBy(commerce_df.user_session) \
                        .count() \
                        .orderBy("count", ascending=False) \
                        .limit(5)

checked_user_relation.show(truncate=False)

+------------------------------------+-----+
|user_session                        |count|
+------------------------------------+-----+
|d99d91bf-40f8-4e29-9593-54b4a1826542|4128 |
|fc749a4e-c432-4dae-a0a1-04de89f1e4ea|2466 |
|b556f0c7-3a23-44f5-9f34-e713fefa9686|1963 |
|d6433d7b-3846-456a-88de-748c3fac2675|1658 |
|88206fc3-b5ea-4e3b-be68-67edfbf7009b|1373 |
+------------------------------------+-----+



In [40]:
checked_user_relation_detail = commerce_df.select(commerce_df.user_id) \
                            .distinct() \
                            .filter(commerce_df.user_session == 'd99d91bf-40f8-4e29-9593-54b4a1826542') \

checked_user_relation_detail.show(truncate=False)

+---------+
|user_id  |
+---------+
|573277455|
+---------+



'user_id'와 'user_session'이 1:N 관계이다.

그렇다면 세션이 null인 경우는 어떤 경우일지?

In [14]:
# user_session이 null인 경우 확인하기

commerce_df.filter(commerce_df.user_session.isNull()).show()

+--------------------+----------+----------+-------------------+--------------------+-------+-------+---------+------------+
|          event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|user_session|
+--------------------+----------+----------+-------------------+--------------------+-------+-------+---------+------------+
|2019-11-09 15:32:...|      cart|  19700004|2053013559104766575|                null|kabrita|  37.77|539704497|        null|
|2019-11-09 17:15:...|      cart|   1005083|2053013555631882655|electronics.smart...|  honor| 566.27|568843390|        null|
|2019-11-13 04:02:...|      cart|   4804008|2053013554658804075|electronics.audio...|bluedio|  97.81|570411102|        null|
|2019-11-13 07:18:...|      cart|   1004767|2053013555631882655|electronics.smart...|samsung| 243.51|570878749|        null|
|2019-11-23 12:53:...|      cart|   7600528|2053013552821698803|                null|tp-link|  16.73|575357602|        null|


모든 경우가 'event_type'이 'cart'인 경우에서 발생했다. 'user_session' 에 대해서는 일정 시간 이상 행동이 없는 경우 종료된다고 나와있기 때문에 아마 선행된 view와 연결된 경우라고 생각된다.

In [None]:
commerce_df.filter(commerce_df.user_id == '539704497').show()

In [16]:
# id, session 관계 확인 - 수정

checked_id_session = commerce_df.select(commerce_df.user_id, commerce_df.user_session) \
    .distinct() \
    .count()

checked_id_session.show()

+---------+-----+
|  user_id|count|
+---------+-----+
|568778435|22542|
|569335945|14810|
|512475445| 6074|
|568793129| 4453|
|567475167| 3617|
+---------+-----+



In [29]:
# id, session 관계 확인

checked_id_session = commerce_df.select(commerce_df.user_id, commerce_df.user_session) \
    .distinct() \
    .groupBy(commerce_df.user_id, commerce_df.user_session) \
    .count() \
    .orderBy("count", ascending=False) \
    .limit(5)
    
checked_id_session.show()

+---------+--------------------+-----+
|  user_id|        user_session|count|
+---------+--------------------+-----+
|520772685|816a59f3-f5ae-4cc...|    1|
|513200477|742aba02-727b-4d1...|    1|
|553802615|e09684bb-0c95-4f6...|    1|
|542346595|75c35801-ce60-44b...|    1|
|558726315|e43aa696-aefc-406...|    1|
+---------+--------------------+-----+



event type 관련 확인하기.

In [15]:
checked_event_type = commerce_df.select(commerce_df.event_type, commerce_df.user_id, commerce_df.user_session) \
    .distinct() \
    .groupBy(commerce_df.event_type) \
    .count() \
    .orderBy("count", ascending=False)
    
checked_event_type_df = checked_event_type.toPandas()
checked_event_type_df

Unnamed: 0,event_type,count
0,view,13767353
1,cart,1743354
2,purchase,773214


In [16]:
checked_event_type_not_distinct = commerce_df.select(commerce_df.event_type, commerce_df.user_id, commerce_df.user_session) \
    .groupBy(commerce_df.event_type) \
    .count() \
    .orderBy("count", ascending=False)
    
checked_event_type_not_distinct_df = checked_event_type_not_distinct.toPandas()
checked_event_type_not_distinct_df

Unnamed: 0,event_type,count
0,view,63556110
1,cart,3028930
2,purchase,916939


In [None]:
# Top 5 workout types

highest_sport_users_df = ranked_sport_users_df.limit(5).toPandas()

# Rename column name : 'count' --> Users count
highest_sport_users_df.rename(columns = {'count':'Users count'}, inplace = True)

# Caculate the total users, we will this result to compute percentage later
total_sports_users = ranked_sport_users_df.groupBy().sum().collect()[0][0]

In [125]:
# 퍼널 차트 그려보기

attr = ["A", "B", "C", "D", "E", "F"]
value = [20, 40, 60, 80, 100, 120]
funnel = Funnel("퍼널 그래프")
funnel.add(
    "퍼널",
    attr,
    value,
    is_label_show=True,
    label_pos="inside",
    label_text_color="#fff",
)
funnel.width=700
funnel.height=500
funnel

NameError: name 'Funnel' is not defined

In [124]:
!pip install pyecharts

Collecting pyecharts
  Downloading pyecharts-1.9.1-py3-none-any.whl (135 kB)
     -------------------------------------- 135.6/135.6 kB 4.0 MB/s eta 0:00:00
Collecting prettytable
  Downloading prettytable-3.4.1-py3-none-any.whl (26 kB)
Collecting simplejson
  Downloading simplejson-3.17.6-cp39-cp39-win_amd64.whl (75 kB)
     ---------------------------------------- 75.8/75.8 kB 4.1 MB/s eta 0:00:00
Installing collected packages: simplejson, prettytable, pyecharts
Successfully installed prettytable-3.4.1 pyecharts-1.9.1 simplejson-3.17.6




In [127]:
from pyecharts import Funnel

ModuleNotFoundError: No module named 'pyecharts'

In [None]:
# event_type 종류 확인

commerce_df.select('event_type').distinct().show()

kaggle data 설명에는 event_type에 'remove_from_cart'가 있었으나 해당 데이터셋에서는 확인되지 않았음.

In [None]:
# evnet_type == remove_from_cart 값 확인

df_event_type_remove_cart = commerce_df.select('*').where(commerce_df.event_type == 'remove_from_cart')
df_event_type_remove_cart.show()

### 전체 데이터셋 시각화

In [None]:
!pip install dask[dataframe]

In [None]:
import dask.dataframe as dd

In [None]:
commerce_df_dask = dd.read_csv("2019-Nov.csv")
commerce_df_dask = commerce_df_dask.compute()
commerce_df_dask.head()

In [None]:
create_report(commerce_df_dask)

In [None]:
print(commerce_df_pandas.info())

In [None]:
df = df.astype({'price':'float', 'event_time':'datetime'})

In [None]:
# evnet_type == view 값 확인

df_event_type_view = commerce_df.select('*').where(commerce_df.event_type == 'view')
df_event_type_view.show()

In [None]:
# evnet_type == purchase 값 확인

df_event_type_purchase = commerce_df.select('*').where(commerce_df.event_type == 'purchase')
df_event_type_purchase.show()

In [None]:
# evnet_type == cart 값 확인

df_event_type_cart = commerce_df.select('*').where(commerce_df.event_type == 'cart')
df_event_type_cart.show()

kaggle 데이터 설명에는 event_type에 'remove_from_cart'가 있다고 하였지만 실제로는 확인되지 않음.

In [None]:
df_event_type_purchase.count()

In [None]:
df_event_type_cart.count()

In [None]:
df_event_type_view.count()

In [None]:
# brand null값 확인

commerce_df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in commerce_df.columns)).show()

'category_id', 'category_code', 'brand'의 미싱값이 이상함. id, brand는 있는데 코드는 없거나. id, code, 

In [None]:
# category_code, brand 종류 확인

commerce_df.select("category_code","brand").distinct().show()