<font color = "#CC3D3D"><b>
# (DW Practice #3) Market Basket Analysis

- 장바구니분석(Market Basket Analysis)은 거래내역(Transaction)을 통해 고객이 구매한 상품 간의 연관 관계 또는 규칙을 찾을 때 사용하는 분석기법이다.  
  - (연관규칙의 표현) `품목 A`와 `품목 B`를 구매한 고객은 `품목 C`를 구매한다: *(품목 A) & (품목 B) => (품목 C)*
- 교차판매, 상품진열, 부정탐지, 상품 카탈로그 디자인 등에 주로 활용된다.  
<img align='left' src='https://blog.rsquaredacademy.com/img/mba_steps.png' style='width: 80%; height: auto;'>

- 장바구니분석을 하게 되면 수많은 연관규칙이 나오기 때문에 이 중에서 유용한 규칙을 선별할 수 있는 아래와 같은 평가기준이 요구된다.  
<img align='left' src='http://drive.google.com/uc?export=view&id=191LWlu63r0T3GIv-FX-x7Ds4bezBfxfU' style='width: 80%; height: auto;'>

#### 데이터 준비

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw data: customer info, product info and transaction records.
cs = pd.read_csv('L사_고객정보.csv')
gd = pd.read_csv('L사_상품정보.csv')
tr = pd.read_csv('L사_거래정보.csv')

# Merge the data.
# Zero-pad the product code to 4 characters before joining (original note:
# the code is stored as a 4-character string).
# NOTE(review): presumably tr.pd_c is already a 4-character string -- confirm.
gd['pd_c'] = gd['pd_c'].astype(str).str.zfill(4)
# tr and cs are joined on their shared column names (pandas default);
# the product table is then joined on the product code.
df = pd.merge(tr, cs).merge(gd, on='pd_c')
# Parse the transaction date with pd.to_datetime: casting to the unit-less
# dtype 'datetime64' via astype() raises TypeError on pandas >= 2.0.
df['de_dt'] = pd.to_datetime(df['de_dt'].astype(str))

In [3]:
# Transform the data: build a customer x product table that only records
# whether each customer bought each product (1) or not (0).
# Count the rows per (customer, sub-category) pair, then spread the
# sub-categories into columns; pairs that never occur get a count of 0.
purchase_counts = df.groupby(['clnt_id', 'clac_nm3']).size().unstack(fill_value=0)
# A vectorised comparison replaces DataFrame.applymap, which is deprecated
# since pandas 2.1 and calls a Python lambda once per cell.
store_data = purchase_counts.ge(1).astype(int).reset_index()
transactions = store_data.iloc[:, 1:]  # drop the customer ID column
transactions

clac_nm3,Unnamed: 1,Accessory Bags,Accident Prevention Equipment,Adhesive Tapes,Adhesives,Adult's Bed Covers and Skirts,Adult's Diapers,Adult's Disposable Briefes,Adults' Bed Fillings,Adults' Bedding Sets,...,Yoga / Pilates Clothing,Yoga Mats / Exercise Mats,Yoghurts / Chunggukjang Makers,Yogurt Drinks,Young Pumpkins,Young Radishes,Yuzus / Quinces,Zipper Poly Bags / Plastic Bags,kelp,life Vests / Safety Accessories
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11268,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Top 10 most-purchased products.
# Summing a column of the indicator table gives the number of customers
# who bought that product.
buyers_per_item = transactions.sum()
buyers_per_item.sort_values(ascending=False).head(10)

clac_nm3
Chicken Eggs          4545
Ramens                4400
General Snacks        4168
Fresh Milk            4067
Tofu                  3974
Bibim Ramens          3195
Trash Bags            3089
Spoon Type Yogurts    3062
Corn Snacks           2946
Sausages              2906
dtype: int64

#### 빈발항목집합 추출 - Apriori

In [5]:
# Running Apriori, a representative association-rule mining algorithm,
# requires the mlxtend package to be installed.
#!pip install mlxtend

In [6]:
# apriori => extracts frequent itemsets
# association_rules => generates association rules
from mlxtend.frequent_patterns import apriori, association_rules

In [7]:
# Extract only the frequent itemsets whose support is at least the minimum
# support (0.1), then display them in descending order of support.
freq_items = apriori(transactions, use_colnames=True, min_support=0.1)
freq_items.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
8,0.403283,(Chicken Eggs)
62,0.390417,(Ramens)
41,0.369831,(General Snacks)
33,0.360870,(Fresh Milk)
73,0.352618,(Tofu)
...,...,...
463,0.100177,"(Corn Snacks, Fish Cakes, General Snacks)"
180,0.100177,"(Cookies, Water)"
373,0.100089,"(Cheese, General Snacks, Fresh Milk)"
474,0.100089,"(Corn Snacks, Ramens, Spoon Type Yogurts)"


<font color='red'><p>
##### Added Codes

In [8]:
# To keep only the frequent itemsets that contain two or more items:
# store each itemset's size in a helper column, then filter on that column.
freq_items['length'] = freq_items['itemsets'].map(len)
freq_items[freq_items['length'] >= 2]

Unnamed: 0,support,itemsets,length
80,0.117303,"(Bibim Ramens, Bananas)",2
81,0.101242,"(Cheese, Bananas)",2
82,0.178971,"(Chicken Eggs, Bananas)",2
83,0.111358,"(Cookies, Bananas)",2
84,0.117657,"(Corn Snacks, Bananas)",2
...,...,...,...
562,0.107010,"(Corn Snacks, Ramens, General Snacks, Fresh Milk)",4
563,0.101952,"(Corn Snacks, Tofu, General Snacks, Fresh Milk)",4
564,0.105679,"(Corn Snacks, Tofu, Ramens, General Snacks)",4
565,0.118367,"(Tofu, Ramens, General Snacks, Fresh Milk)",4


#### 연관규칙 도출

In [9]:
# Output only the association rules whose confidence is at least the minimum
# confidence. min_threshold=0.8 is mlxtend's default, spelled out here so the
# cut-off is visible next to the filter below.
rules = association_rules(freq_items, metric='confidence', min_threshold=0.8)
rules[rules['confidence'] >= 0.8]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Tofu, Bananas)",(Chicken Eggs),0.164774,0.403283,0.132032,0.801292,1.986923,0.065581,3.002990
1,"(Corn Snacks, Bananas)",(General Snacks),0.117657,0.369831,0.100621,0.855204,2.312415,0.057108,4.352102
2,"(Tofu, Bibim Ramens)",(Chicken Eggs),0.170009,0.403283,0.139752,0.822025,2.038333,0.071190,3.352814
3,"(Cookies, Bibim Ramens)",(General Snacks),0.127240,0.369831,0.104703,0.822873,2.224995,0.057645,3.557723
4,"(Cookies, Bibim Ramens)",(Ramens),0.127240,0.390417,0.103372,0.812413,2.080885,0.053695,3.249598
...,...,...,...,...,...,...,...,...,...
107,"(Tofu, Ramens, Spoon Type Yogurts)",(Chicken Eggs),0.125555,0.403283,0.107010,0.852297,2.113396,0.056376,4.039974
108,"(Corn Snacks, Ramens, Fresh Milk)",(General Snacks),0.122360,0.369831,0.107010,0.874547,2.364717,0.061757,5.023136
109,"(Corn Snacks, Tofu, Fresh Milk)",(General Snacks),0.116859,0.369831,0.101952,0.872437,2.359014,0.058734,4.940072
110,"(Corn Snacks, Tofu, Ramens)",(General Snacks),0.121295,0.369831,0.105679,0.871251,2.355806,0.060820,4.894548


<font color='red'><p>
##### Added Codes

#### 상품 분류수준(level) 조정

In [10]:
# Check which products (sub-category level) appear in the frequent itemsets.
# Union all itemsets into one set; np.unique then returns the names as a
# sorted array.
distinct_items = set().union(*freq_items['itemsets'])
lev3_freq_items = np.unique(list(distinct_items))
lev3_freq_items

array(['Apples', 'Bacons', 'Bananas', 'Bar Ice Creams', 'Bibim Ramens',
       'Canned Tunas', 'Carrots', 'Cheese', 'Chicken Eggs',
       'Chickins for Braised Spicy Soups',
       'Chickins for Chicken Soup With Ginseng',
       'Chilled Chicken, Jokbal and Pork Feet',
       'Chilled Coffee and Tea Beverages',
       'Chilled Fruit and Vegetable Beverages', 'Chilled Noodles',
       'Ciders', 'Coffee Drinks', 'Cokes', 'Cookies', 'Corn Snacks',
       'Crab Sticks', 'Crackers', 'Cucumbers', 'Cup Noodles',
       'Domestic Beefs - Rounds', 'Domestic Beer',
       'Domestic Porks - Bellys', 'Domestic Porks - Picnics',
       'Domestic Porks - Shoulder Butts', 'Enoki Mushrooms', 'Fish Cakes',
       'Fixed-price Living Products', 'Flavored Milk', 'Fresh Milk',
       'Fried Tofu', 'Frozen Dumplings', 'Frozen Fried Foods',
       'Frozen Korean Pancakes', 'Fruit Juices', 'Functional Milk',
       'General Cereals', 'General Snacks', 'General Yogurt', 'Grapes',
       'Ham', 'Imported Bee

In [11]:
# Rare products that are not part of any frequent itemset are raised to the
# top-level category: their sub-category name (clac_nm3) is replaced by the
# top-level category name (clac_nm1).
# isin() does the membership test in one vectorised pass instead of calling
# a Python lambda for every row.
not_freq = ~df['clac_nm3'].isin(lev3_freq_items)
# Build the adjusted column in a single assignment. The chained form
# df['adj_item'][not_freq] = ... raises SettingWithCopyWarning and does not
# update df when pandas Copy-on-Write is enabled.
df['adj_item'] = df['clac_nm3'].mask(not_freq, df['clac_nm1'])
df

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,clnt_id,trans_id,trans_seq,biz_unit,pd_c,de_dt,de_tm,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm1,clac_nm2,clac_nm3,adj_item
0,21922,104999,3,A03,0182,2019-09-20,12:41,10900,1,F,50,Chilled Foods,Chilled Instant Foods,Chilled Soups,Chilled Foods
1,39423,105124,10,A03,0182,2019-09-20,17:26,21800,2,F,50,Chilled Foods,Chilled Instant Foods,Chilled Soups,Chilled Foods
2,39423,89469,2,A03,0182,2019-09-01,03:32,25800,2,F,50,Chilled Foods,Chilled Instant Foods,Chilled Soups,Chilled Foods
3,39423,88436,1,A03,0182,2019-08-30,17:15,25800,2,F,50,Chilled Foods,Chilled Instant Foods,Chilled Soups,Chilled Foods
4,18362,50872,3,A03,0182,2019-07-15,09:46,31600,4,F,40,Chilled Foods,Chilled Instant Foods,Chilled Soups,Chilled Foods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582304,53632,75835,1,A01,0258,2019-08-14,10:00,77040,2,M,50,Computers,Computer Accessories,Printer Inks / Toners,Computers
582305,1499,92574,4,A01,1303,2019-09-04,08:09,29000,1,F,50,Sport Fashion,Women's Climbing Clothing,Women's Climbing Vests,Sport Fashion
582306,58443,103525,1,A01,0263,2019-09-18,10:30,568000,1,F,30,Computers,Computers / Laptops,Tablet PCs,Computers
582307,21232,103476,1,A01,0263,2019-09-18,01:38,362500,1,F,30,Computers,Computers / Laptops,Tablet PCs,Computers


#### 장바구니분석 다시 수행

In [12]:
# Prepare the data again, this time at the adjusted product level (adj_item).
# Count the rows per (customer, adjusted item) pair, then spread the items
# into columns; pairs that never occur get a count of 0.
purchase_counts = df.groupby(['clnt_id', 'adj_item']).size().unstack(fill_value=0)
# A vectorised comparison replaces DataFrame.applymap, which is deprecated
# since pandas 2.1 and calls a Python lambda once per cell.
store_data = purchase_counts.ge(1).astype(int).reset_index()
transactions = store_data.iloc[:, 1:]  # drop the customer ID column
transactions

adj_item,Apples,Automotive Products,Baby Foods,Bacons,Ball Game / Field Sports,Bananas,Bar Ice Creams,Bedding / Handicraft,Beverages,Bibim Ramens,...,Trash Bags,Travel / Leisure Services,Underwear / Socks and Hosiery / Homewear,Vegetables,Video / Audio System Electronics,Water,Watermelons,Women's Clothing,Yogurt Drinks,Young Pumpkins
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,1,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,0,0,0,1,0,1,1,0,1,0,...,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11268,0,0,0,0,0,1,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
11269,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Extract the frequent itemsets (minimum support 0.15) and display them in
# descending order of support.
freq_items = apriori(transactions, use_colnames=True, min_support=0.15)
freq_items.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
49,0.467927,(Spices / Seasonings)
56,0.431461,(Vegetables)
29,0.424630,(Fruits)
37,0.421879,(Meats)
5,0.403247,(Chicken Eggs)
...,...,...
417,0.150120,"(Corn Snacks, Chicken Eggs, Spices / Seasonings)"
920,0.150031,"(Snack Foods, Ramens, Spices / Seasonings, Meats)"
577,0.150031,"(Tofu, Dried Sea Foods, Spices / Seasonings)"
837,0.150031,"(Tofu, Chicken Eggs, Substitute Foods, Meats)"


In [14]:
# Check how the set of frequent items changed after the level adjustment.
adj_freq_items = np.unique([item for itemset in freq_items['itemsets'] for item in itemset])

# Build both sets once, then report the differences in each direction.
items_before = set(lev3_freq_items)
items_after = set(adj_freq_items)
print('새로 나타난 상품:\n', items_after - items_before)
print('사라진 상품:\n', items_before - items_after)

새로 나타난 상품:
 {'Cooked Foods', 'Coffee / Tea', 'Sea Foods', 'Detergents / Hygiene Goods', 'Substitute Foods', 'Tenants / Restaurants', 'Snack Foods', 'Fruits', 'Spices / Seasonings', 'Cosmetics / Beauty Care', 'Frozen Foods', 'Meats', "Women's Clothing", 'Chilled Foods', 'Beverages', 'Dried Sea Foods', 'Kitchenware', 'Vegetables', 'Canned / Jarred Foods', 'Personal Care', 'Fashion Accessories', 'Underwear / Socks and Hosiery / Homewear', 'Grains'}
사라진 상품:
 {'Watermelons', 'Domestic Beefs - Rounds', 'Domestic Porks - Shoulder Butts', 'Domestic Porks - Picnics', 'Saesongi Mushrooms', 'Packged Kimchi', 'Pies', 'Bacons', 'Cokes', 'Pickled Radishes', 'Other Retort Pouches', 'Chilled Coffee and Tea Beverages', 'Paprika', 'General Yogurt', 'General Cereals', 'Radishes', 'Coffee Drinks', 'Soft Drink Mixes', 'Salads', 'Fried Tofu', 'Ciders', 'Imported Beer', 'Domestic Beer', 'Bar Ice Creams', 'Sesame Leaves', 'Korean Soju', 'Toilet Papers', 'Potatoes', 'Canned Tunas', 'Ham', 'Chickins for Braised

In [15]:
# Output the association rules. min_threshold=0.8 is mlxtend's default,
# spelled out here so the cut-off is visible next to the filter below.
rules = association_rules(freq_items, metric='confidence', min_threshold=0.8)
rules[rules['confidence'] >= 0.8]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Canned / Jarred Foods),(Spices / Seasonings),0.265016,0.467927,0.213734,0.806495,1.723550,0.089726,2.749660
1,(Cheese),(Spices / Seasonings),0.211960,0.467927,0.170260,0.803265,1.716648,0.071078,2.704518
2,(Cup Noodles),(Spices / Seasonings),0.213912,0.467927,0.171768,0.802986,1.716052,0.071673,2.700693
3,(Dried Sea Foods),(Spices / Seasonings),0.260403,0.467927,0.209298,0.803748,1.717680,0.087449,2.711173
4,(Fish Cakes),(Spices / Seasonings),0.214533,0.467927,0.179487,0.836642,1.787977,0.079102,3.257098
...,...,...,...,...,...,...,...,...,...
556,"(Fruits, Substitute Foods, Spices / Seasonings...",(Meats),0.182415,0.421879,0.152870,0.838035,1.986434,0.075913,3.569419
557,"(Meats, Tofu, Fruits, Spices / Seasonings)",(Vegetables),0.172301,0.431461,0.153757,0.892379,2.068271,0.079416,5.282786
558,"(Meats, Tofu, Fruits, Vegetables)",(Spices / Seasonings),0.173631,0.467927,0.153757,0.885539,1.892475,0.072511,4.648517
559,"(Meats, Tofu, Spices / Seasonings, Vegetables)",(Fruits),0.184988,0.424630,0.153757,0.831175,1.957412,0.075206,3.408089


<font color = "#CC3D3D"><b>
# End