# 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np

In [3]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [54]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score 

In [4]:
# 경고 메세지 안보이게 설정
import warnings
warnings.filterwarnings('ignore')

In [120]:
df = pd.read_csv('data/modeling_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,성별,연령대,거주지분류대코드,채널구분,상품대분류명,구매시간,구매금액,구매월,군집
0,여성,40대,Z07,1,과자,16,12757,09월,1
1,여성,30대,Z11,1,여성의류,16,267864,01월,3
2,여성,50대,Z17,1,대용식,15,2571,07월,0
3,여성,40대,Z17,1,과자,18,11556,04월,0
4,여성,40대,Z17,1,화장품/뷰티케어,18,91648,07월,0


In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26917 entries, 0 to 26916
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   성별        26917 non-null  object
 1   연령대       26917 non-null  object
 2   거주지분류대코드  26917 non-null  object
 3   채널구분      26917 non-null  int64 
 4   상품대분류명    26917 non-null  object
 5   구매시간      26917 non-null  int64 
 6   구매금액      26917 non-null  int64 
 7   구매월       26917 non-null  object
 8   군집        26917 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.8+ MB


# 경우 1. Onehot + Ordinal(변형)
주로 오디널인코딩 사용해서 수치화. 트리 계열 모델은 오디널인코딩에서 순위 영향을 받지 않는다고 한다.

1. 성별  
명목변수 - 원핫인코딩
2. 연령대  
순위변수(10년이 기준) - 20대: 2, 30대: 3, ..로 오디널인코딩
3. 거주지분류대코드  
명목변수 - 종류 매우 많음. 따라서 오디널인코딩
4. 상품대분류명  
명목변수 - 종류 매우 많음. 따라서 오디널인코딩
5. 구매월  
명목변수 - 1월:1, 2월:2, ..로 오디널인코딩

In [122]:
df1 = df

In [123]:
# 성별 - 원핫인코딩
before = list(np.sort(df['성별'].unique()))
print(before)
after = [i for i in range(len(before))]
print(after)
df1 = pd.get_dummies(df, columns=['성별'])

['남성', '여성']
[0, 1]


In [124]:
# 연령대 - 순서형(기존 데이터 반영)
before = list(np.sort(df['연령대'].unique()))
print(before)
after = [2, 3, 4, 5, 6, 7]
print(after)
df1['연령대'].replace(before, after, inplace=True)

['20대', '30대', '40대', '50대', '60대', '70대']
[2, 3, 4, 5, 6, 7]


In [125]:
# 거주지분류대코드 - 순서형(기존 데이터 반영)
before = list(np.sort(df['거주지분류대코드'].unique()))
print(before)
after = [i+1 for i in range(len(before))]
print(after)
df1['거주지분류대코드'].replace(before, after, inplace=True)

['Z01', 'Z02', 'Z03', 'Z04', 'Z05', 'Z06', 'Z07', 'Z08', 'Z09', 'Z10', 'Z11', 'Z12', 'Z13', 'Z14', 'Z15', 'Z16', 'Z17']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]


In [126]:
# 상품대분류명 - 순서형
before = list(np.sort(df['상품대분류명'].unique()))
print(before)
after = [i for i in range(len(before))]
print(after)
df1['상품대분류명'].replace(before, after, inplace=True)

['가구', '건강식품', '건강용품', '건해산물', '계절가전', '공구/안전용품', '과일', '과자', '구기/필드스포츠', '기타(비상품)', '기타상품', '남성의류', '냉동식품', '냉장/세탁가전', '냉장식품', '담배', '대용식', '모바일', '문구/사무용품', '병통조림', '상품권', '생활/렌탈서비스', '생활/주방가전', '서적/음반/악기', '세제/위생', '속옷/양말/홈웨어', '수산물', '스포츠패션', '시즌스포츠', '식기/조리기구', '아웃도어/레저', '양곡', '여성의류', '여행/레저서비스', '영상/음향가전', '완구', '원예/애완', '유아동의류', '유아식품', '유제품', '음료', '인테리어/조명', '자동차용품', '조리식품', '조미료', '주류', '주방잡화', '채소', '청소/세탁/욕실용품', '축산물', '출산/육아용품', '침구/수예', '커피/차', '컴퓨터', '테넌트/음식점', '패션잡화', '퍼스널케어', '헬스/피트니스', '화장품/뷰티케어']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58]


In [127]:
# 구매월 - 순서형(기존 데이터 반영)
before = list(np.sort(df['구매월'].unique()))
print(before)
after = [int(before[i][:2]) for i in range(len(before))]
print(after)
df1['구매월'].replace(before, after, inplace=True)

['01월', '02월', '03월', '04월', '05월', '06월', '07월', '08월', '09월', '10월', '11월', '12월']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [128]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26917 entries, 0 to 26916
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   연령대       26917 non-null  int64
 1   거주지분류대코드  26917 non-null  int64
 2   채널구분      26917 non-null  int64
 3   상품대분류명    26917 non-null  int64
 4   구매시간      26917 non-null  int64
 5   구매금액      26917 non-null  int64
 6   구매월       26917 non-null  int64
 7   군집        26917 non-null  int64
 8   성별_남성     26917 non-null  uint8
 9   성별_여성     26917 non-null  uint8
dtypes: int64(8), uint8(2)
memory usage: 1.7 MB


모든 변수가 수치형 변수로 바뀌었다.

## Random Forest 모델링

In [131]:
# X, Y 설정
x_cols = ['연령대', '거주지분류대코드', '채널구분', '상품대분류명', '구매시간', '구매금액', '구매월', '성별_남성', '성별_여성']
X = df1[x_cols].values
y = df1['군집'].values

# train, test 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7) # 20%를 test로, random_state으로 랜덤한 값 고정
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

(18841, 9) (8076, 9) (18841,) (8076,)


In [132]:
## Random Forest 학습
model1 = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
model1.fit(X_train, y_train)
y_pred1 = mode2.predict(X_test)

print("Train set 정확도: {:.3f}".format(model1.score(X_train, y_train)))
print("Test set 정확도: {:.3f}\n".format(model1.score(X_test, y_test)))

print(f"Accuracy: {accuracy_score(y_test, y_pred1):.3f}") 
print(f"Precision: {precision_score(y_test, y_pred1, average='macro'):.3f}")  #micro보다 성능이 좋게 나옴
print(f"Recall: {recall_score(y_test, y_pred1, average='micro'):.3f}")
print(f"F1-score: {f1_score(y_test, y_pred1, average='micro'):.3f}")

Train set 정확도: 0.856
Test set 정확도: 0.855

Accuracy: 0.855
Precision: 0.886
Recall: 0.855
F1-score: 0.855


# 경우 2. Ordinal(변형)

In [84]:
df2 = df

In [88]:
# 성별 - 원핫인코딩
before = list(np.sort(df['성별'].unique()))
print(before)
after = [i for i in range(len(before))]
print(after)
df2['성별'].replace(before, after, inplace=True)

['남성', '여성']
[0, 1]


In [85]:
# 연령대 - 순서형(기존 데이터 반영)
before = list(np.sort(df['연령대'].unique()))
print(before)
after = [2, 3, 4, 5, 6, 7]
print(after)
df2['연령대'].replace(before, after, inplace=True)

['20대', '30대', '40대', '50대', '60대', '70대']
[2, 3, 4, 5, 6, 7]


In [86]:
# 거주지분류대코드 - 순서형(기존 데이터 반영)
before = list(np.sort(df['거주지분류대코드'].unique()))
print(before)
after = [i+1 for i in range(len(before))]
print(after)
df2['거주지분류대코드'].replace(before, after, inplace=True)

['Z01', 'Z02', 'Z03', 'Z04', 'Z05', 'Z06', 'Z07', 'Z08', 'Z09', 'Z10', 'Z11', 'Z12', 'Z13', 'Z14', 'Z15', 'Z16', 'Z17']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]


In [89]:
# 상품대분류명 - 순서형
before = list(np.sort(df['상품대분류명'].unique()))
print(before)
after = [i for i in range(len(before))]
print(after)
df2['상품대분류명'].replace(before, after, inplace=True)

['가구', '건강식품', '건강용품', '건해산물', '계절가전', '공구/안전용품', '과일', '과자', '구기/필드스포츠', '기타(비상품)', '기타상품', '남성의류', '냉동식품', '냉장/세탁가전', '냉장식품', '담배', '대용식', '모바일', '문구/사무용품', '병통조림', '상품권', '생활/렌탈서비스', '생활/주방가전', '서적/음반/악기', '세제/위생', '속옷/양말/홈웨어', '수산물', '스포츠패션', '시즌스포츠', '식기/조리기구', '아웃도어/레저', '양곡', '여성의류', '여행/레저서비스', '영상/음향가전', '완구', '원예/애완', '유아동의류', '유아식품', '유제품', '음료', '인테리어/조명', '자동차용품', '조리식품', '조미료', '주류', '주방잡화', '채소', '청소/세탁/욕실용품', '축산물', '출산/육아용품', '침구/수예', '커피/차', '컴퓨터', '테넌트/음식점', '패션잡화', '퍼스널케어', '헬스/피트니스', '화장품/뷰티케어']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58]


In [92]:
# 구매월 - 순서형(기존 데이터 반영)
before = list(np.sort(df['구매월'].unique()))
print(before)
after = [int(before[i][:2]) for i in range(len(before))]
print(after)
df2['구매월'].replace(before, after, inplace=True)

['01월', '02월', '03월', '04월', '05월', '06월', '07월', '08월', '09월', '10월', '11월', '12월']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [93]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26917 entries, 0 to 26916
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   성별        26917 non-null  int64
 1   연령대       26917 non-null  int64
 2   거주지분류대코드  26917 non-null  int64
 3   채널구분      26917 non-null  int64
 4   상품대분류명    26917 non-null  int64
 5   구매시간      26917 non-null  int64
 6   구매금액      26917 non-null  int64
 7   구매월       26917 non-null  int64
 8   군집        26917 non-null  int64
dtypes: int64(9)
memory usage: 1.8 MB


## Random Forest 모델링

In [133]:
# X, Y 설정
x_cols = ['성별', '연령대', '거주지분류대코드', '채널구분', '상품대분류명', '구매시간', '구매금액', '구매월']
X = df2[x_cols].values
y = df2['군집'].values

# train, test 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7) # 20%를 test로, random_state으로 랜덤한 값 고정
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

(18841, 8) (8076, 8) (18841,) (8076,)


In [135]:
## Random Forest 학습
model2 = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

print("Train set 정확도: {:.3f}".format(model2.score(X_train, y_train)))
print("Test set 정확도: {:.3f}\n".format(model2.score(X_test, y_test)))

print(f"Accuracy: {accuracy_score(y_test, y_pred1):.3f}") 
print(f"Precision: {precision_score(y_test, y_pred2, average='macro'):.3f}")  #micro보다 성능이 좋게 나옴
print(f"Recall: {recall_score(y_test, y_pred2, average='micro'):.3f}")
print(f"F1-score: {f1_score(y_test, y_pred2, average='micro'):.3f}")

Train set 정확도: 0.867
Test set 정확도: 0.862

Accuracy: 0.855
Precision: 0.895
Recall: 0.862
F1-score: 0.862


# 경우 3. Onehot

In [105]:
df3 = df
df3 = pd.get_dummies(df, columns=['성별', '연령대', '거주지분류대코드', '채널구분', '상품대분류명', '구매월'])
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26917 entries, 0 to 26916
Columns: 101 entries, 구매시간 to 구매월_12
dtypes: int64(3), uint8(98)
memory usage: 3.1 MB


## Radom Forest 모델링

In [136]:
# X, Y 설정
X = df3.drop(['군집'], axis=1).values
y = df3['군집'].values

# train, test 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7) # 20%를 test로, random_state으로 랜덤한 값 고정
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

(18841, 100) (8076, 100) (18841,) (8076,)


In [138]:
## Random Forest 학습
model3 = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)

print("Train set 정확도: {:.3f}".format(model3.score(X_train, y_train)))
print("Test set 정확도: {:.3f}\n".format(model3.score(X_test, y_test)))

print(f"Accuracy: {accuracy_score(y_test, y_pred1):.3f}") 
print(f"Precision: {precision_score(y_test, y_pred3, average='macro'):.3f}")  #micro보다 성능이 좋게 나옴
print(f"Recall: {recall_score(y_test, y_pred3, average='micro'):.3f}")
print(f"F1-score: {f1_score(y_test, y_pred3, average='micro'):.3f}")

Train set 정확도: 0.879
Test set 정확도: 0.881

Accuracy: 0.855
Precision: 0.904
Recall: 0.881
F1-score: 0.881


# Lpay 데이터에 성능 테스트

In [142]:
lpay = pd.read_csv('data/mergedata/lpay.csv', encoding='utf-8')
lpay.head()

Unnamed: 0,고객번호,영수증번호,제휴사,채널구분,이용일자,이용시간,이용금액,성별,연령대,거주지분류대코드
0,M629656521,210803210311226,A03,1,20210803,21,10900,남성,40대,Z04
1,M216016456,210803130167542,L01,2,20210803,13,6860,여성,40대,Z04
2,M205142844,210803140275112,A02,1,20210803,14,9000,여성,60대,Z17
3,M737010483,210803040637594,A06,2,20210803,4,36740,남성,40대,Z16
4,M707775545,210803140675502,A06,2,20210803,14,138500,남성,30대,Z11


In [143]:
lpay['구매월'] = lpay['이용일자'].map(lambda x: str(x)[4:6]+'월')
lpay = lpay[['채널구분', '이용시간', '성별', '연령대', '거주지분류대코드', '구매월']]
lpay.head()

Unnamed: 0,채널구분,이용시간,성별,연령대,거주지분류대코드,구매월
0,1,21,남성,40대,Z04,08월
1,2,13,여성,40대,Z04,08월
2,1,14,여성,60대,Z17,08월
3,2,4,남성,40대,Z16,08월
4,2,14,남성,30대,Z11,08월


In [144]:
testset = pd.get_dummies(lpay, columns=['채널구분', '성별', '연령대', '거주지분류대코드', '구매월'])
testset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353184 entries, 0 to 353183
Data columns (total 40 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   이용시간          353184 non-null  int64
 1   채널구분_1        353184 non-null  uint8
 2   채널구분_2        353184 non-null  uint8
 3   성별_남성         353184 non-null  uint8
 4   성별_여성         353184 non-null  uint8
 5   연령대_20대       353184 non-null  uint8
 6   연령대_30대       353184 non-null  uint8
 7   연령대_40대       353184 non-null  uint8
 8   연령대_50대       353184 non-null  uint8
 9   연령대_60대       353184 non-null  uint8
 10  연령대_70대       353184 non-null  uint8
 11  거주지분류대코드_Z01  353184 non-null  uint8
 12  거주지분류대코드_Z02  353184 non-null  uint8
 13  거주지분류대코드_Z03  353184 non-null  uint8
 14  거주지분류대코드_Z04  353184 non-null  uint8
 15  거주지분류대코드_Z05  353184 non-null  uint8
 16  거주지분류대코드_Z06  353184 non-null  uint8
 17  거주지분류대코드_Z07  353184 non-null  uint8
 18  거주지분류대코드_Z08  353184 non-null  uint8
 19  거주

In [147]:
testset = testset.values

array([[21,  1,  0, ...,  0,  0,  0],
       [13,  0,  1, ...,  0,  0,  0],
       [14,  1,  0, ...,  0,  0,  0],
       ...,
       [20,  1,  0, ...,  0,  0,  0],
       [18,  1,  0, ...,  0,  0,  0],
       [15,  0,  1, ...,  0,  0,  0]], dtype=int64)

In [151]:
## Random Forest 학습
model1.predict(testset)

ValueError: X has 40 features, but RandomForestClassifier is expecting 9 features as input.