### Imports

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
from tensorflow import keras
import tensorflow as tf
import warnings
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import cross_val_score

from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectPercentile
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
plt.rc('font', family=fm.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()) # for Windows OS user
import datetime
import os

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import BayesianRidge
from tensorflow import keras
from scipy.stats.mstats import gmean
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")

In [3]:
import datetime
from sklearn.model_selection import StratifiedKFold, KFold
import math

In [4]:
from sklearn.decomposition import PCA

In [5]:
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer

In [6]:
train = pd.read_csv(os.path.abspath("../input")+'/X_train.csv', encoding='cp949')
test = pd.read_csv(os.path.abspath("../input")+'/X_test.csv', encoding='cp949')
y_target = pd.read_csv(os.path.abspath("../input")+'/y_train.csv').group
y_target_c = pd.read_csv(os.path.abspath("../input")+'/y_train.csv')
IDtrain = pd.DataFrame({'custid': train.custid.unique()})
IDtest = pd.DataFrame({'custid': test.custid.unique()})

- custid 고객 아이디  
- goodcd	상품코드  

**수치형_연속**  
- tot_amt	구매금액  
- dis_amt	할인금액  
- net_amt	실구매액  

**수치형_이산**  
- sales_month	판매월(13은 다음해 1월을 의미) : 5월 ~ 다음해 4월  
- sales_day	판매일  
- sales_time	판매시간
- import_flg	수입품여부(0:비수입품)
- inst_mon	할부개월  
- inst_fee	무이자할부여부(0:무이자)  

**범주형**  
- sales_dayofweek	판매요일  
- str_nm	지점명 : 총 4개, 비율은 비슷  
- brd_nm	브랜드이름  
- corner_nm	코너이름  
- pc_name	상품군이름  
- part_nm	상품관리파트이름  
- team_nm	상품관리팀이름  
- buyer_nm	바이어이름  

### 데이터 전처리

In [7]:
# 사이버 쇼핑, 점외 등 1개 밖에 없는 data를 가지고 있는 행 삭제 

train.drop(index = 578987, inplace = True)

In [8]:
# 한번에 처리하기 위해 합침
tr = pd.concat([train, test])

In [9]:
# 합친 unique id
id_data = pd.DataFrame({'custid': tr.custid.unique()})

In [10]:
# 월별 사용금액 
tr['real_amt']= tr.tot_amt / tr.inst_mon

In [11]:
# 할인율
tr['dist_rate'] = (tr['dis_amt']/tr['tot_amt'])*100

In [12]:
# 구매시간 -> 구매 시, 구매 분
tr['sales_hour'] = tr['sales_time']//100
tr['sales_min'] = tr['sales_time']%100

In [13]:
# 구매월 -> str형태로
# 구매 월일 feature
tr["sales_date"] = np.where(tr["sales_month"] >9 , 
                            (tr["sales_month"].astype(str)),("0"+tr["sales_month"].astype(str)))

tr["sales_date"] = np.where(tr["sales_day"]>9,(tr["sales_date"]+tr["sales_day"].astype(str)),
                            (tr["sales_date"]+"0"+tr["sales_day"].astype(str))
                            )

In [14]:
# 구매월 ( 12 넘는 월들 -12)
tr["sales_month"] = np.where(tr["sales_month"]>12 , tr["sales_month"]-12, tr["sales_month"])

In [15]:
# 구매시간 -> datetime 형태로
tr["time"] = pd.to_datetime(tr.sales_time, format = "%H%M")

In [16]:
# 환불금액
tr["refund"] = np.where(tr["net_amt"]<0,1,0)

---

In [17]:
# 위에꺼 합친거
tr['sales_month_ver1'] = tr['sales_month']
tr['sales_month'] = tr['sales_month'].apply(lambda x : x-12 if x > 12 else x )
tr['sales_hour'] = tr['sales_time'].apply(lambda x : x//100 )
tr['sales_sec'] = tr['sales_time'].apply(lambda x : x%100 )
tr['total_sec'] = tr['sales_time'].apply(lambda x : x//100*60 + x%100 )
tr['환불여부'] = tr['tot_amt'].apply(lambda x : 1 if x < 0 else 0 )
tr['refund'] = tr['tot_amt'].apply(lambda x : abs(x) if x < 0 else 0 )
tr['tot_amt'] = tr['tot_amt'] .apply(lambda x : 0 if x < 0 else x )
tr['real_amt'] = ( tr['tot_amt'] / tr['inst_mon'] ).apply(lambda x : math.trunc(x)) 
tr['sales_date'] =tr['sales_month_ver1'].astype(str).apply(lambda x : "0"+x if len(x) == 1  else x ) +tr['sales_day'].astype(str).apply(lambda x : "0"+x if len(x) == 1 else  x )
tr['sales_date'] = tr['sales_date'].astype(int)

In [18]:
df = pd.merge(tr, y_target_c, on = 'custid')
tr['age_group'] = df['group'].apply(lambda x : 'twenty' if (x=='F20') | (x=='M20')
                                     else 'thirty' if (x=='F30') | (x=='M30')
                                     else 'forty' if (x=='F40') | (x=='M40')
                                     else 'fifty' if (x=='F50') | (x=='M50')
                                     else 'sixty')
                                   

tr['gender_group'] = df['group'].apply(lambda x : 'female' if (x=='F20') | (x=='F30') | (x=='F40') | (x=='F50') else 'male')

In [19]:
twenty_prefer_brd = tr[tr['age_group'] == 'twenty'].brd_nm.value_counts().index[1:].to_list()
thirty_prefer_brd = tr[tr['age_group'] == 'thirty'].brd_nm.value_counts().index[1:].to_list()
forty_prefer_brd = tr[tr['age_group'] == 'forty'].brd_nm.value_counts().index[1:].to_list()
fifty_prefer_brd = tr[tr['age_group'] == 'fifty'].brd_nm.value_counts().index[1:].to_list()
sixty_prefer_brd = tr[tr['age_group'] == 'sixty'].brd_nm.value_counts().index[1:].to_list()

def prefer_brd(x, list):
    for i in range(len(list)):
        if(x == list[i]):
            return len(list)-i

tr['20_weight'] = tr['brd_nm'].apply(lambda x: prefer_brd(x, twenty_prefer_brd)).fillna(0)
tr['30_weight'] = tr['brd_nm'].apply(lambda x: prefer_brd(x, thirty_prefer_brd)).fillna(0)
tr['40_weight'] = tr['brd_nm'].apply(lambda x: prefer_brd(x, forty_prefer_brd)).fillna(0)
tr['50_weight'] = tr['brd_nm'].apply(lambda x: prefer_brd(x, fifty_prefer_brd)).fillna(0)
tr['60_weight'] = tr['brd_nm'].apply(lambda x: prefer_brd(x, sixty_prefer_brd)).fillna(0)

In [20]:
female_prefer_brd = tr[tr['gender_group'] == 'female'].brd_nm.value_counts().index[1:].to_list()
male_prefer_brd = tr[tr['gender_group'] == 'male'].brd_nm.value_counts().index[1:].to_list()

tr['f_weight'] = tr['brd_nm'].apply(lambda x: prefer_brd(x, female_prefer_brd)).fillna(0)
tr['m_weight'] = tr['brd_nm'].apply(lambda x: prefer_brd(x, male_prefer_brd)).fillna(0)

In [21]:
tr["age_weight_sum"] = tr['20_weight']+tr['30_weight']+tr['40_weight']+tr['50_weight']+tr['60_weight']
tr['gender_weight_sum'] = tr['f_weight']+tr['m_weight']

In [22]:
tr["sales_season"] = tr['sales_month'].apply(lambda x : 'SPRING' if (x>=3) and (x<=5)
                                                  else 'SUMMER' if (x>=6) and (x<=8)
                                                  else 'FALL' if (x>=9) and (x<=11)
                                                  else 'WINTER')

In [23]:
tr['tot_amt>=0'] = tr['tot_amt'].apply(lambda x : x if x>=0 else 0)
tr['dis_amt>=0'] = tr['dis_amt'].apply(lambda x : x if x>=0 else 0)
tr['net_amt>=0'] = tr['net_amt'].apply(lambda x : x if x>=0 else 0)

In [24]:
# 중복 명 처리
tr.loc[tr.corner_nm == '모피.피혁', 'corner_nm'] = '모피/피혁'
tr.loc[tr.corner_nm == '원목(주니어)', 'corner_nm'] = '원목/주니어'
tr.loc[tr.corner_nm == '우산,장갑', 'corner_nm'] = '우산/장갑'
tr.loc[tr.corner_nm == '우산장갑', 'corner_nm'] = '우산/장갑'
tr.loc[tr.corner_nm == '트.단품 ', 'corner_nm'] = '트단품'
tr.loc[tr.corner_nm == 'TV,VTR', 'corner_nm'] = 'TV/VTR'
tr.loc[tr.corner_nm == 'TV.VTR', 'corner_nm'] = 'TV/VTR'
tr.loc[tr.corner_nm == 'GBR  지원', 'corner_nm'] = 'GBR지원'
tr.loc[tr.corner_nm == '페레  지원', 'corner_nm'] = '페레지원'
tr.loc[tr.corner_nm == '라디오.카세트', 'corner_nm'] = '라디오/카세트'
tr.loc[tr.corner_nm == '스포츠용퓸', 'corner_nm'] = '스포츠용품'
tr.loc[tr.corner_nm == '카세트,전화기', 'corner_nm'] = '전화기/카세트'

In [25]:
# pc_nm 에서 겹치는 값들 처리

tr.loc[tr.pc_nm == '침구,수예', 'pc_nm'] = '침구/수예'
tr.loc[tr.pc_nm == '디자이너부띠크', 'pc_nm'] = '디자이너부띠끄'
tr.loc[tr.pc_nm == '디자이너부틱', 'pc_nm'] = '디자이너부띠끄'
tr.loc[tr.pc_nm == '니트,단품,모피', 'pc_nm'] = '니트/단품/모피'
tr.loc[tr.pc_nm == '니트/단품', 'pc_nm'] = '니트/단품/모피'
tr.loc[tr.pc_nm == '로얄부틱', 'pc_nm'] = '로얄부띠끄'
tr.loc[tr.pc_nm == '트랜디 케쥬얼', 'pc_nm'] = '트랜디캐쥬얼'

In [26]:
# buyer_nm 에서 겹치는 값들 처리

tr['buyer_nm'] = tr['buyer_nm'].str.replace('행사장.*','행사장')
tr['buyer_nm'] = tr['buyer_nm'].str.replace('피혁A','피혁')
tr['buyer_nm'] = tr['buyer_nm'].str.replace('피혁B','피혁')

In [27]:
tr

Unnamed: 0,custid,sales_month,sales_day,sales_dayofweek,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,...,50_weight,60_weight,f_weight,m_weight,age_weight_sum,gender_weight_sum,sales_season,tot_amt>=0,dis_amt>=0,net_amt>=0
0,0,6,25,일,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,...,1664.0,0,1846.0,1779.0,6886.0,3625.0,SUMMER,90000,9000,81000
1,0,6,25,일,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,...,1658.0,0,1839.0,1774.0,6863.0,3613.0,SUMMER,39000,3900,35100
2,0,8,26,토,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,...,1660.0,0,1845.0,1776.0,6879.0,3621.0,SUMMER,175000,17500,157500
3,0,8,26,토,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,...,1219.0,0,1269.0,1287.0,4758.0,2556.0,SUMMER,455000,45500,409500
4,0,9,3,일,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,...,1663.0,0,1847.0,1780.0,6892.0,3627.0,FALL,100000,10000,90000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414950,49993,1,31,수,1750,신촌점,4405551020474,톰키드,아동,아동,...,1100.0,0,1573.0,1484.0,5419.0,3057.0,WINTER,20000,0,20000
414951,49993,1,31,수,1833,신촌점,2139140008300,폴로화장품,향수,화장품,...,1615.0,0,1811.0,1756.0,6748.0,3567.0,WINTER,70000,3500,66500
414952,49994,4,14,토,1750,본점,4230120011274,스테파넬,영트랜드,영트렌디,...,1613.0,0,1800.0,1704.0,6675.0,3504.0,SPRING,39000,0,39000
414953,49994,4,14,토,1810,본점,4409271026010,써스데이아일앤드,진케주얼,진케주얼,...,1417.0,0,1613.0,1562.0,5971.0,3175.0,SPRING,34200,0,34200


### features 생성

In [28]:
features = []

- 환불여부

In [29]:
t = tr.groupby("custid")["refund"].agg([("refund_count","sum")]).reset_index()
t["refund_bool"] = np.where(t["refund_count"]>0,1,0)
features.append(t[['custid','refund_bool']])

- 환불 총액, 평균 , 최대

In [30]:
def refund_check(x):
    if x<0:
        return -x
    else:
        return 0
    
copy_df = tr.copy()
copy_df["tot_amt_refund"] = copy_df.net_amt.apply(refund_check)
t = copy_df.groupby("custid")["tot_amt_refund"].agg([("amt_refund","sum"),
                                                    ("amt_refund_mean","mean"),
                                                    ("amt_refund_max","min")]).reset_index()
features.append(t)

- 최애브랜드 사용수

In [31]:
def g(x):
    y = x.value_counts()
    y = y.iloc[0]
    return y

f = tr.groupby('custid')['brd_nm'].agg([('love_brd_count', g)]).reset_index()
f = f.fillna(0)
features.append(f)

- 연령대별 브랜드 선호도에 따른 가중치 비율

In [32]:
f = tr.groupby('custid')['20_weight', '30_weight', '40_weight', '50_weight', '60_weight', 'age_weight_sum'].sum()

f['20_weight_ratio'] = f['20_weight'] / f['age_weight_sum']
f['30_weight_ratio'] = f['30_weight'] / f['age_weight_sum']
f['40_weight_ratio'] = f['40_weight'] / f['age_weight_sum']
f['50_weight_ratio'] = f['50_weight'] / f['age_weight_sum']
f['60_weight_ratio'] = f['60_weight'] / f['age_weight_sum']

f = f.fillna(0)
features.append(f)

- 성별별 브랜드 선호도에 따른 가중치 비율

In [33]:
f = tr.groupby('custid')['f_weight','m_weight', 'gender_weight_sum'].sum()

f['f_weight_ratio'] = f['f_weight'] / f['gender_weight_sum']
f['m_weight_ratio'] = f['m_weight'] / f['gender_weight_sum']

f = f.fillna(0)
features.append(f)

- 성별별 선호브랜드

In [34]:
sex = y_target.map(lambda x : x[0])
age = y_target.map(lambda x : x[1:])

In [35]:
train_sex = pd.concat([pd.read_csv(os.path.abspath("../input")+'/y_train.csv').custid, sex], axis=1).rename(columns={'group':'sex'})
train_age = pd.concat([pd.read_csv(os.path.abspath("../input")+'/y_train.csv').custid, age], axis=1).rename(columns={'group':'age'})
train_sex.sex = train_sex.sex.astype('str')
train_age = train_age.astype('int')

In [36]:
train = pd.merge(train, train_sex, on='custid')
train = pd.merge(train, train_age, on='custid')

In [37]:
tr['구매여부'] = tr['tot_amt>=0'].apply(lambda x: 1 if x  > 0 else 0)
tr['환불여부'] = tr['dis_amt>=0'].apply(lambda x: 1 if x  > 0 else 0)

In [38]:
# 남성선호 상품 100개
best_brd_M = list(train.query('sex=="M"').brd_nm.value_counts().index[:100])

f = tr.groupby('custid')['구매여부'].agg([('구매건수', np.sum)]).reset_index()
brd_M = tr.query('brd_nm == @best_brd_M').groupby('custid')['구매여부'].agg([('남성선호브랜드_구매건수', np.sum)]).reset_index()

f = pd.merge(f, brd_M, on = 'custid', how = 'left').fillna(0)
f['남성선호브랜드_구매비율'] = f['남성선호브랜드_구매건수'] / f['구매건수']
f = f.iloc[:,[0,2,3]]
features.append(f); display(f)

Unnamed: 0,custid,남성선호브랜드_구매건수,남성선호브랜드_구매비율
0,0,9.0,0.818182
1,2,5.0,0.555556
2,3,11.0,0.392857
3,4,1.0,0.250000
4,5,20.0,0.625000
...,...,...,...
35962,49988,2.0,0.500000
35963,49990,1.0,1.000000
35964,49992,1.0,0.500000
35965,49993,3.0,0.750000


In [39]:
# 여성선호 상품 100개
best_brd_F = list(train.query('sex=="F"').brd_nm.value_counts().index[:100])

f = tr.groupby('custid')['구매여부'].agg([('구매건수', np.sum)]).reset_index()
brd_F = tr.query('brd_nm == @best_brd_F').groupby('custid')['구매여부'].agg([('여성선호브랜드_구매건수', np.sum)]).reset_index()

f = pd.merge(f, brd_F, on = 'custid', how = 'left').fillna(0)
f['여성선호브랜드_구매비율'] = f['여성선호브랜드_구매건수'] / f['구매건수']
f = f.iloc[:,[0,2,3]]
features.append(f); display(f)

Unnamed: 0,custid,여성선호브랜드_구매건수,여성선호브랜드_구매비율
0,0,9.0,0.818182
1,2,4.0,0.444444
2,3,10.0,0.357143
3,4,1.0,0.250000
4,5,18.0,0.562500
...,...,...,...
35962,49988,1.0,0.250000
35963,49990,1.0,1.000000
35964,49992,1.0,0.500000
35965,49993,3.0,0.750000


- 연령대별 브랜드 구매건수, 구매비율

In [40]:
# 20대
best_brd_20 = list(train.query('age<30').brd_nm.value_counts().index[:185])

f = tr.groupby('custid')['구매여부'].agg([('구매건수', np.sum)]).reset_index()
brd_20 = tr.query('brd_nm == @best_brd_20').groupby('custid')['구매여부'].agg([('20대브랜드_구매건수', np.sum)]).reset_index()

f = pd.merge(f, brd_20, on = 'custid', how = 'left').fillna(0)
f['20대브랜드_구매비율'] = f['20대브랜드_구매건수'] / f['구매건수']
f = f.iloc[:,[0,2,3]]
features.append(f); display(f)

Unnamed: 0,custid,20대브랜드_구매건수,20대브랜드_구매비율
0,0,9.0,0.818182
1,2,8.0,0.888889
2,3,15.0,0.535714
3,4,1.0,0.250000
4,5,18.0,0.562500
...,...,...,...
35962,49988,1.0,0.250000
35963,49990,1.0,1.000000
35964,49992,1.0,0.500000
35965,49993,3.0,0.750000


In [41]:
# 30대
best_brd_30 = list(train.query('30<=age<40').brd_nm.value_counts().index[:185])

f = tr.groupby('custid')['구매여부'].agg([('구매건수', np.sum)]).reset_index()
brd_30 = tr.query('brd_nm == @best_brd_30').groupby('custid')['구매여부'].agg([('30대브랜드_구매건수', np.sum)]).reset_index()

f = pd.merge(f, brd_30, on = 'custid', how = 'left').fillna(0)
f['30대브랜드_구매비율'] = f['30대브랜드_구매건수'] / f['구매건수']
f = f.iloc[:,[0,2,3]]
features.append(f); display(f)

Unnamed: 0,custid,30대브랜드_구매건수,30대브랜드_구매비율
0,0,9.0,0.818182
1,2,7.0,0.777778
2,3,15.0,0.535714
3,4,1.0,0.250000
4,5,22.0,0.687500
...,...,...,...
35962,49988,3.0,0.750000
35963,49990,1.0,1.000000
35964,49992,1.0,0.500000
35965,49993,4.0,1.000000


In [42]:
# 40대
best_brd_40 = list(train.query('40<=age<50').brd_nm.value_counts().index[:185])

f = tr.groupby('custid')['구매여부'].agg([('구매건수', np.sum)]).reset_index()
brd_40 = tr.query('brd_nm == @best_brd_40').groupby('custid')['구매여부'].agg([('40대브랜드_구매건수', np.sum)]).reset_index()

f = pd.merge(f, brd_40, on = 'custid', how = 'left').fillna(0)
f['40대브랜드_구매비율'] = f['40대브랜드_구매건수'] / f['구매건수']
f = f.iloc[:,[0,2,3]]
features.append(f); display(f)

Unnamed: 0,custid,40대브랜드_구매건수,40대브랜드_구매비율
0,0,9.0,0.818182
1,2,8.0,0.888889
2,3,18.0,0.642857
3,4,1.0,0.250000
4,5,22.0,0.687500
...,...,...,...
35962,49988,2.0,0.500000
35963,49990,1.0,1.000000
35964,49992,1.0,0.500000
35965,49993,3.0,0.750000


In [43]:
# 50대
best_brd_50 = list(train.query('50<=age').brd_nm.value_counts().index[:185])

f = tr.groupby('custid')['구매여부'].agg([('구매건수', np.sum)]).reset_index()
brd_50 = tr.query('brd_nm == @best_brd_50').groupby('custid')['구매여부'].agg([('50대브랜드_구매건수', np.sum)]).reset_index()

f = pd.merge(f, brd_50, on = 'custid', how = 'left').fillna(0)
f['50대브랜드_구매비율'] = f['50대브랜드_구매건수'] / f['구매건수']
f = f.iloc[:,[0,2,3]]
features.append(f); display(f)

Unnamed: 0,custid,50대브랜드_구매건수,50대브랜드_구매비율
0,0,9.0,0.818182
1,2,8.0,0.888889
2,3,13.0,0.464286
3,4,1.0,0.250000
4,5,25.0,0.781250
...,...,...,...
35962,49988,1.0,0.250000
35963,49990,1.0,1.000000
35964,49992,1.0,0.500000
35965,49993,3.0,0.750000


- 시간대별 방문횟수

In [44]:
t = pd.pivot_table(tr, index='custid', columns='sales_hour', values='tot_amt', 
                   aggfunc=np.size).fillna(0).astype(int).reset_index().add_prefix('time_c_').rename(columns={'time_c_custid':'custid'})
features.append(t)

- 시간대별 구매금액

In [45]:
t = pd.pivot_table(tr, index='custid', columns='sales_hour', values='tot_amt', 
                   aggfunc="sum").fillna(0).astype(int).reset_index().drop([0,1,8,9],axis = 1).add_prefix('time_s_').rename(columns={'time_s_custid':'custid'})
features.append(t)

- 날짜별 방문횟수

In [46]:
'''
t = pd.pivot_table(tr, index='custid', columns='sales_day', values='tot_amt', 
                   aggfunc=np.size).fillna(0).astype(int).reset_index()
'''

"\nt = pd.pivot_table(tr, index='custid', columns='sales_day', values='tot_amt', \n                   aggfunc=np.size).fillna(0).astype(int).reset_index()\n"

In [47]:
'''
f = id_data.copy()
f["month_start"] = 0
f["month_mid"] = 0
f["month_end"] = 0 
for i in [x for x in range(1,11)]:
    f["month_start"] += t[i]
    f["month_mid"] += t[i+10]
    f["month_end"] += t[i+20]
f["month_end"] += t[31]
features.append(f)
'''

'\nf = id_data.copy()\nf["month_start"] = 0\nf["month_mid"] = 0\nf["month_end"] = 0 \nfor i in [x for x in range(1,11)]:\n    f["month_start"] += t[i]\n    f["month_mid"] += t[i+10]\n    f["month_end"] += t[i+20]\nf["month_end"] += t[31]\nfeatures.append(f)\n'

- 날짜별 구매금액

In [48]:
'''
t = pd.pivot_table(tr, index='custid', columns='sales_day', values='tot_amt', 
                   aggfunc="sum").fillna(0).astype(int).reset_index()
'''                   

'\nt = pd.pivot_table(tr, index=\'custid\', columns=\'sales_day\', values=\'tot_amt\', \n                   aggfunc="sum").fillna(0).astype(int).reset_index()\n'

In [49]:
'''
f = id_data.copy()
f["month_start"] = 0
f["month_mid"] = 0
f["month_end"] = 0 
for i in [x for x in range(1,11)]:
    f["month_start"] += t[i]
    f["month_mid"] += t[i+10]
    f["month_end"] += t[i+20]
f["month_end"] += t[31]
features.append(f)
'''

'\nf = id_data.copy()\nf["month_start"] = 0\nf["month_mid"] = 0\nf["month_end"] = 0 \nfor i in [x for x in range(1,11)]:\n    f["month_start"] += t[i]\n    f["month_mid"] += t[i+10]\n    f["month_end"] += t[i+20]\nf["month_end"] += t[31]\nfeatures.append(f)\n'

- 월간별 날짜횟수

In [50]:
t = pd.pivot_table(tr, index='custid', columns='sales_month', values='tot_amt', 
                   aggfunc=np.size).fillna(0).astype(int).reset_index().add_prefix('sm_c_').rename(columns={'sm_c_custid':'custid'})
features.append(t)

- 월간 구매금액

In [51]:
t = pd.pivot_table(tr, index='custid', columns='sales_month', values='tot_amt', 
                   aggfunc="sum").fillna(0).astype(int).reset_index().add_prefix('sm_s_').rename(columns={'sm_s_custid':'custid'})
features.append(t)

In [52]:
#쇼핑시간
time_sum = tr.groupby(['sales_date','custid'])['time'].agg([('time', ['min','max'])]).reset_index()
time_sum['shopping_time'] = (time_sum['time']['max'] - time_sum['time']['min']).dt.total_seconds()
time_sum.drop(['sales_date','time'], axis=1, inplace=True,level=0)
time_sum = time_sum.groupby(['custid'])['shopping_time'].agg([('shopping_time_mean','mean')]).reset_index()
features.append(time_sum)

In [53]:
#평균할인율
f = tr.groupby('custid')['dist_rate'].agg([('dis_rate', 'mean')]).reset_index()
features.append(f)

In [54]:
#평균시간
f = tr.groupby(['custid'])['sales_time'].agg([('sales_time', 'mean')]).reset_index()
features.append(f)

In [55]:
# 남성파트
df = tr.groupby(['custid','part_nm'])['tot_amt'].agg([('tot_amt_part', 'sum')]).reset_index()
df['part_nm'] = np.where(df.part_nm.str.contains('남성'), '남성', '비남성')
df = df.pivot_table(values='tot_amt_part', index=df.custid, columns='part_nm', aggfunc='first',fill_value=0).reset_index()
df['남성part'] = (df['남성'] / (df['남성'] + df['비남성'])) * 100
df = df.fillna(0)
features.append(df)

In [56]:
# 화장품구매비율
df = tr.groupby(['custid','corner_nm'])['tot_amt'].agg([('tot_amt_corner', 'sum')]).reset_index()
df['corner_nm'] = np.where(df.corner_nm.str.contains('화장품'), '화장품', '비화장품')
df = df.pivot_table(values='tot_amt_corner', index=df.custid, columns='corner_nm', aggfunc='first',fill_value=0).reset_index()
df['화장품비율'] = (df['화장품'] / (df['화장품'] + df['비화장품'])) * 100
df = df.fillna(0)
features.append(df)

In [57]:
# 할부대비평균실구매
f = tr.groupby('custid')['real_amt'].agg([('real_amt', 'mean')]).reset_index()
features.append(f)

In [58]:
# 평균구매상품종류
df =tr.groupby(['custid','goodcd'])['tot_amt'].agg([('good_count', 'count')]).reset_index()
f = df.groupby(['custid'])['good_count'].agg([('good_count_mean', 'mean')]).reset_index()
features.append(f)

*구매금액합계*

In [59]:
# 지역
df = tr.groupby(['custid','str_nm'])['tot_amt'].agg([('tot_amt_str', 'sum')]).reset_index()
df =df.pivot_table(values='tot_amt_str', index=df.custid, columns='str_nm', aggfunc='first',fill_value=0).reset_index()
features.append(df)

In [60]:
# 팀별
df = tr.groupby(['custid','team_nm'])['tot_amt'].agg([('tot_amt_team', 'sum')]).reset_index()
df =df.pivot_table(values='tot_amt_team', index=df.custid, columns='team_nm', aggfunc='first',fill_value=0).reset_index()
features.append(df)

In [61]:
#총구매수입상품
df = tr.groupby(['custid'])['import_flg'].agg([('import_flg_sum', 'sum')]).reset_index()
features.append(df)

In [62]:
#파트
df = tr.groupby(['custid','part_nm'])['tot_amt'].agg([('tot_amt_part', 'sum')]).reset_index()
df =df.pivot_table(values='tot_amt_part', index=df.custid, columns='part_nm', aggfunc='first',fill_value=0).reset_index()
features.append(df)

In [63]:
#코너
df = tr.groupby(['custid','corner_nm'])['tot_amt'].agg([('tot_amt_corner', 'sum')]).reset_index()
df =df.pivot_table(values='tot_amt_corner', index=df.custid, columns='corner_nm', aggfunc='first',fill_value=0).reset_index()
features.append(df)


In [64]:
'''#pc
df = tr.groupby(['custid','pc_nm'])['tot_amt'].agg([('tot_amt_pc', 'sum')]).reset_index()
df =df.pivot_table(values='tot_amt_pc', index=df.custid, columns='pc_nm', aggfunc='first',fill_value=0).reset_index()
features.append(df)'''

"#pc\ndf = tr.groupby(['custid','pc_nm'])['tot_amt'].agg([('tot_amt_pc', 'sum')]).reset_index()\ndf =df.pivot_table(values='tot_amt_pc', index=df.custid, columns='pc_nm', aggfunc='first',fill_value=0).reset_index()\nfeatures.append(df)"

In [65]:
#총구매액
f = tr.groupby('custid')['tot_amt'].agg([('총구매액', 'sum')]).reset_index()
features.append(f)

In [66]:
#구매건수
f = tr.groupby('custid')['tot_amt'].agg([('구매건수', 'size')]).reset_index()
features.append(f)

In [67]:
#평균구매가격

f = tr.groupby('custid')['tot_amt'].agg([('평균구매가격', "mean")]).reset_index()
features.append(f)

In [68]:
#평균할부개월수
f = tr.groupby('custid')['inst_mon'].agg([('평균할부개월수', 'mean')]).reset_index()
f.iloc[:,1] = f.iloc[:,1].apply(round, args=(1,))
features.append(f)

In [69]:
#구매상품다양성
n = tr.corner_nm.nunique()
f = tr.groupby('custid')['brd_nm'].agg([('구매상품다양성', lambda x: len(x.unique()) / n)]).reset_index()
features.append(f)

In [70]:
#수입상품_구매비율
x = tr[tr['import_flg'] == 1].groupby('custid').size() / tr.groupby('custid').size()
f = x.reset_index().rename(columns={0: '수입상품_구매비율'}).fillna(0)
f.iloc[:,1] = (f.iloc[:,1]*100).apply(round, args=(1,))
features.append(f)

In [71]:
#일평균구매액
test2 = tr.groupby(['sales_date','custid'])['tot_amt'].agg([('day_amt', 'sum')]).reset_index()
test2 = test2.groupby(['custid'])['day_amt'].agg([('일평균구매액', 'mean')]).reset_index()
features.append(test2)

In [72]:
#내점일수 

f = tr.groupby(by = 'custid')['sales_time'].agg([('내점일수','nunique')]).reset_index()
features.append(f)

In [73]:
#일평균구매건
df = tr.groupby(['sales_date','custid'])['custid'].agg([('day_visit', 'count')]).reset_index()
f = df.groupby(['custid'])['day_visit'].agg([('일평균구매건', 'mean')]).reset_index()
features.append(f)

In [74]:
#아동상품 구매건수
f = tr[tr.tot_amt > 0].groupby('custid')['part_nm'].agg([('baby_sales', lambda x: list(x).count('아동')+list(x).count('케주얼,구두,아동')+list(x).count('아동문화')+list(x).count('아동,스포츠'))]).reset_index()
features.append(f)

In [75]:
#상위 브랜드 40개의 구매빈도
top_brd = tr["brd_nm"].value_counts()[:40].index
df = tr.groupby(['custid','brd_nm'])["custid"].agg([('brd_count', 'count')]).reset_index()
df =df.pivot_table(values='brd_count', index=df.custid, columns="brd_nm", aggfunc='count',fill_value=0).reset_index()
df = pd.concat([df["custid"] ,df[top_brd]],axis= 1)
features.append(df)

In [76]:
#상위 브랜드 40개의 구매금액
df = tr.groupby(['custid','brd_nm'])["tot_amt"].agg([('tot_amt_brd', 'sum')]).reset_index()
df =df.pivot_table(values='tot_amt_brd', index=df.custid, columns="brd_nm", aggfunc='sum',fill_value=0).reset_index()
df = pd.concat([df["custid"] ,df[top_brd]],axis= 1)
features.append(df)

In [77]:
#주말방문비율
day = {'월' : 0 , '화' : 1 , '수' : 2 , '목' : 3 , '금' : 4 , '토': 5 , '일' : 6 }

tr['sales_dayofweek_num'] = tr['sales_dayofweek'].apply(lambda x : day[x] )

f = tr.groupby('custid')['sales_dayofweek_num'].agg([
    ('주말방문비율', lambda x: np.mean(x >4))]).reset_index()

features.append(f)

In [78]:
# goodcd
# Top-50구매건수,총구매액,평균구매액,구매비율
f = id_data
best_seller = list(tr.goodcd.value_counts().index[:50])
bstn = tr.query('goodcd == @best_seller').groupby('custid').agg({'구매여부': [('베스트셀러_구매건수', np.sum)],
                                                                '환불여부': [('베스트셀러_환불건수', np.sum)],
                                                                'tot_amt>=0': [
                                                                            ('베스트셀러_총구매액_합', np.sum),
                                                                            ('베스트셀러_총구매액_평균', np.mean),
                                                                            ('베스트셀러_총구매액_표준편차', np.std),
                                                                         
                                                                            ('베스트셀러_총구매액_최대구매액', np.max),
                                                                ('베스트셀러_총구매액_변동계수',lambda x : np.std(x) / 0.0001 \
                                                                 if np.mean(x) == 0 else np.std(x) / np.mean(x))],
                    
                                                                'inst_mon':[
                                                                            ('베스트셀러_평균무이자할부개월', np.mean),
                                                                           ('베스트셀러_최소무이자할부개월', np.min),
                                                                           ('베스트셀러_최대무이자할부개월', np.max)]
})

bstn.columns = bstn.columns.droplevel(0)
f = pd.merge(f, bstn, on = 'custid', how = 'left').fillna(0)

features.append(f); display(f)

Unnamed: 0,custid,베스트셀러_구매건수,베스트셀러_환불건수,베스트셀러_총구매액_합,베스트셀러_총구매액_평균,베스트셀러_총구매액_표준편차,베스트셀러_총구매액_최대구매액,베스트셀러_총구매액_변동계수,베스트셀러_평균무이자할부개월,베스트셀러_최소무이자할부개월,베스트셀러_최대무이자할부개월
0,0,9.0,9.0,1174000.0,130444.444444,73698.899434,294000.0,0.532671,2.777778,1.0,3.0
1,2,1.0,1.0,146000.0,146000.000000,0.000000,146000.0,0.000000,1.000000,1.0,1.0
2,3,6.0,3.0,406600.0,58085.714286,72351.075287,187000.0,1.153194,2.142857,1.0,3.0
3,4,1.0,1.0,60000.0,60000.000000,0.000000,60000.0,0.000000,2.000000,2.0,2.0
4,5,15.0,10.0,1501259.0,100083.933333,58571.687204,204000.0,0.565382,1.800000,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,1.0,1.0,98000.0,98000.000000,0.000000,98000.0,0.000000,3.000000,3.0,3.0
35963,49990,1.0,1.0,213000.0,213000.000000,0.000000,213000.0,0.000000,6.000000,6.0,6.0
35964,49992,1.0,1.0,53000.0,53000.000000,0.000000,53000.0,0.000000,1.000000,1.0,1.0
35965,49993,3.0,1.0,123029.0,41009.666667,25704.849354,70000.0,0.511780,1.000000,1.0,1.0


In [79]:
#계절별 구매 건수
def season(k):
    if 3 <= k <= 5 :
        return('봄')
    elif 6 <= k <= 8 :
        return('여름')
    elif 9 <= k <= 11 :    
        return('가을')
    else :
        return('겨울')
df = tr.copy()
df["season"] = df.sales_month.apply(season)
f = pd.pivot_table(df, index='custid', columns='season', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f)

In [80]:
#계절별 구매 금액
df = tr.copy()
df["season"] = df.sales_month.apply(season)
f = pd.pivot_table(df, index='custid', columns='season', values='tot_amt', 
                   aggfunc="sum", fill_value=0).reset_index()
features.append(f)

In [81]:
#아침점심오후저녁별 구매건수
def f2(x):
    if 10 < x <= 12 :
        return('아침')
    elif 12 < x <= 14 :
        return('점심')
    elif 14 < x <= 16 :
        return('오후1')
    elif 16 < x <= 18 :
        return ('오후2')
    elif 18 < x <= 20:
        return('저녁')
    else :
        return('근무시간외')
df = tr.copy()
df["goodmea"] = df.sales_hour.apply(f2)
f = pd.pivot_table(df, index='custid', columns='goodmea', values='tot_amt', 
                   aggfunc=np.size).reset_index()
features.append(f.fillna(0))

In [82]:
#아침점심오후저녁별 구매금액
df = tr.copy()
df["goodmea"] = df.sales_hour.apply(f2)
f = pd.pivot_table(df, index='custid', columns='goodmea', values='tot_amt', 
                   aggfunc="sum").reset_index()
features.append(f.fillna(0))

In [83]:
#주방문요일
f = tr.groupby('custid')['sales_dayofweek'].agg([('주방문요일', lambda x: x.value_counts().index[0])]).reset_index()
f = pd.get_dummies(f, columns=['주방문요일'])

features.append(f)

In [84]:
#평균 쇼핑시간
f = tr.groupby('custid')['total_sec'].agg([
    ('평균쇼핑시간', lambda x: (x.max() - x.min()) / x.nunique())]).reset_index()

features.append(f)

In [85]:
'''
#일별 구매건수
f = pd.pivot_table(tr, index='custid', columns='sales_day', values='tot_amt', 
                   aggfunc=np.size).fillna(0).astype(int).reset_index().add_prefix('day_c_').rename(columns={'day_c_custid':'custid'})
features.append(f)
'''

"\n#일별 구매건수\nf = pd.pivot_table(tr, index='custid', columns='sales_day', values='tot_amt', \n                   aggfunc=np.size).fillna(0).astype(int).reset_index().add_prefix('day_c_').rename(columns={'day_c_custid':'custid'})\nfeatures.append(f)\n"

In [86]:
'''
#일별 구매금액
f = pd.pivot_table(tr, index='custid', columns='sales_day', values='tot_amt', 
                   aggfunc="sum").fillna(0).astype(int).reset_index().add_prefix('day_s_').rename(columns={'day_s_custid':'custid'})
features.append(f)
'''

'\n#일별 구매금액\nf = pd.pivot_table(tr, index=\'custid\', columns=\'sales_day\', values=\'tot_amt\', \n                   aggfunc="sum").fillna(0).astype(int).reset_index().add_prefix(\'day_s_\').rename(columns={\'day_s_custid\':\'custid\'})\nfeatures.append(f)\n'

In [87]:
#월별 구매건수
f = pd.pivot_table(tr, index='custid', columns='sales_month', values='tot_amt', 
                   aggfunc=np.size).fillna(0).astype(int).reset_index().add_prefix('month_c_').rename(columns={'month_c_custid':'custid'})
features.append(f)

In [88]:
#월별 구매금액
f = pd.pivot_table(tr, index='custid', columns='sales_month', values='tot_amt', 
                   aggfunc="sum").fillna(0).astype(int).reset_index().add_prefix('month_s_').rename(columns={'month_s_custid':'custid'})
features.append(f)

In [89]:
# fm스코어
f = tr.groupby('custid')['구매여부'].agg([('구매건수', np.sum)]).reset_index() 
f['총구매액'] = tr.groupby('custid')['tot_amt>=0'].agg([('총구매액', np.sum)]).reset_index().총구매액

f['f'] = pd.qcut(f['구매건수'], q = 5, labels = range(1, 6)).astype(int)
f['m'] = pd.qcut(f['총구매액'], q = 5, labels = range(1, 6)).astype(int)
f['fm_스코어'] =  (0.24 * f['f']) + (0.76 * f['m']) 

f = f.iloc[:,[0,5]]
features.append(f); display(f)

Unnamed: 0,custid,fm_스코어
0,0,2.76
1,2,4.28
2,3,4.00
3,4,1.76
4,5,4.76
...,...,...
35962,49988,1.76
35963,49990,1.00
35964,49992,1.00
35965,49993,1.00


In [90]:
# 최근 1달간 구매금액, 구매건수
cls = ['tot_amt>=0', 'dis_amt>=0', 'refund'] # 구매금액, 할인금액, 환불금액
for cl in cls:   
    aggs = ['sum', 'mean', 'max', 'std']
    for agg in aggs:
        f =pd.merge(id_data, pd.pivot_table(data = tr.query('sales_month == 4'),
                                                  columns = 'sales_month', index = 'custid', values = cl,
                      aggfunc = agg, fill_value = 0).reset_index(), how = 'left', on = 'custid').fillna(0)
        dic = {}
        for i in f.iloc[:,1:].columns:
                    dic[i] = f'최근한달간_{cl}_{agg}'    
        dic2 = {}
        g = id_data
        if agg == 'mean':
            me = f.iloc[: , 1:]
        if agg == 'std':
            st = f.iloc[: , 1:]
        
            
        f.rename(columns = dic, inplace = True)
        features.append(f); 
        display(f)
    g = pd.concat([g ,(st / me).fillna(0)], axis = 1)
    for t in g.iloc[:,1:].columns:
        dic2[t] = f'{t}_{cl}_변동계수'
    g.rename(columns = dic2, inplace = True)
    features.append(g)
    display(g)

Unnamed: 0,custid,최근한달간_tot_amt>=0_sum
0,0,294000.0
1,2,0.0
2,3,227000.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,168000.0
35965,49993,0.0


Unnamed: 0,custid,최근한달간_tot_amt>=0_mean
0,0,294000.000000
1,2,0.000000
2,3,56750.000000
3,4,0.000000
4,5,0.000000
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,168000.000000
35965,49993,0.000000


Unnamed: 0,custid,최근한달간_tot_amt>=0_max
0,0,294000.0
1,2,0.0
2,3,89000.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,168000.0
35965,49993,0.0


Unnamed: 0,custid,최근한달간_tot_amt>=0_std
0,0,0.000000
1,2,0.000000
2,3,41120.757127
3,4,0.000000
4,5,0.000000
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,0.000000


Unnamed: 0,custid,4_tot_amt>=0_변동계수
0,0,0.000000
1,2,0.000000
2,3,0.724595
3,4,0.000000
4,5,0.000000
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,0.000000


Unnamed: 0,custid,최근한달간_dis_amt>=0_sum
0,0,29400.0
1,2,0.0
2,3,11350.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,8400.0
35965,49993,0.0


Unnamed: 0,custid,최근한달간_dis_amt>=0_mean
0,0,29400.000000
1,2,0.000000
2,3,2837.500000
3,4,0.000000
4,5,0.000000
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,8400.000000
35965,49993,0.000000


Unnamed: 0,custid,최근한달간_dis_amt>=0_max
0,0,29400.0
1,2,0.0
2,3,4450.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,8400.0
35965,49993,0.0


Unnamed: 0,custid,최근한달간_dis_amt>=0_std
0,0,0.000000
1,2,0.000000
2,3,2056.037856
3,4,0.000000
4,5,0.000000
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,0.000000


Unnamed: 0,custid,4_dis_amt>=0_변동계수
0,0,0.000000
1,2,0.000000
2,3,0.724595
3,4,0.000000
4,5,0.000000
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,0.000000


Unnamed: 0,custid,최근한달간_refund_sum
0,0,0.0
1,2,0.0
2,3,621000.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,0.0
35965,49993,0.0


Unnamed: 0,custid,최근한달간_refund_mean
0,0,0.0
1,2,0.0
2,3,155250.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,0.0
35965,49993,0.0


Unnamed: 0,custid,최근한달간_refund_max
0,0,0.0
1,2,0.0
2,3,621000.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,0.0
35965,49993,0.0


Unnamed: 0,custid,최근한달간_refund_std
0,0,0.0
1,2,0.0
2,3,310500.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,0.0
35965,49993,0.0


Unnamed: 0,custid,4_refund_변동계수
0,0,0.0
1,2,0.0
2,3,2.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,0.0
35965,49993,0.0


In [91]:
# 새해에 구매한 구매금액,구매건수 (최근 4개월)
cls = ['tot_amt>=0', 'dis_amt>=0', 'refund'] # 구매금액, 할인금액, 환불금액
for cl in cls:   
    aggs = ['sum', 'mean', 'max', 'std']
    for agg in aggs:
        f =pd.merge(id_data, pd.pivot_table(data = tr.query('sales_month <= 4'),
                                                  columns = 'sales_month', index = 'custid', values = cl,
                      aggfunc = agg, fill_value = 0).reset_index(), how = 'left', on = 'custid').fillna(0)
       
        f[5]=f[[1,2,3,4]].sum(axis=1)

        
        f.drop(columns = [1,2,3,4], inplace=True)
        
        dic = {}
        for i in f.iloc[:,1:].columns:
            dic[i] = f'최근네달간_{cl}_{agg}'  

                
        dic2 = {}
        g = id_data
        
        if agg == 'mean':
            me = f.iloc[: , 1:]    
        if agg == 'std':
            st = f.iloc[: , 1:]
         
            
        f.rename(columns = dic, inplace = True)
        features.append(f)
        display(f)

    g = pd.concat([g ,(st / me).fillna(0)], axis = 1)

    
    for t in g.iloc[:,1:].columns:
        dic2[t] = f'{t}_{cl}_변동계수'

        
    g.rename(columns = dic2, inplace = True)
    features.append(g)
    display(g)

Unnamed: 0,custid,최근네달간_tot_amt>=0_sum
0,0,294000.0
1,2,1200000.0
2,3,1665000.0
3,4,0.0
4,5,804000.0
...,...,...
35962,49988,59000.0
35963,49990,213000.0
35964,49992,168000.0
35965,49993,143029.0


Unnamed: 0,custid,최근네달간_tot_amt>=0_mean
0,0,294000.000000
1,2,600000.000000
2,3,559916.666667
3,4,0.000000
4,5,201000.000000
...,...,...
35962,49988,59000.000000
35963,49990,213000.000000
35964,49992,168000.000000
35965,49993,35757.250000


Unnamed: 0,custid,최근네달간_tot_amt>=0_max
0,0,294000.0
1,2,1200000.0
2,3,799000.0
3,4,0.0
4,5,252000.0
...,...,...
35962,49988,59000.0
35963,49990,213000.0
35964,49992,168000.0
35965,49993,70000.0


Unnamed: 0,custid,최근네달간_tot_amt>=0_std
0,0,0.000000
1,2,848528.137424
2,3,326041.232621
3,4,0.000000
4,5,57746.572770
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,23470.074071


Unnamed: 0,custid,5_tot_amt>=0_변동계수
0,0,0.000000
1,2,1.414214
2,3,0.582303
3,4,0.000000
4,5,0.287296
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,0.656372


Unnamed: 0,custid,최근네달간_dis_amt>=0_sum
0,0,29400.0
1,2,0.0
2,3,80550.0
3,4,0.0
4,5,63800.0
...,...,...
35962,49988,2950.0
35963,49990,10650.0
35964,49992,8400.0
35965,49993,3500.0


Unnamed: 0,custid,최근네달간_dis_amt>=0_mean
0,0,29400.000000
1,2,0.000000
2,3,26645.833333
3,4,0.000000
4,5,15950.000000
...,...,...
35962,49988,2950.000000
35963,49990,10650.000000
35964,49992,8400.000000
35965,49993,875.000000


Unnamed: 0,custid,최근네달간_dis_amt>=0_max
0,0,29400.0
1,2,0.0
2,3,39950.0
3,4,0.0
4,5,25200.0
...,...,...
35962,49988,2950.0
35963,49990,10650.0
35964,49992,8400.0
35965,49993,3500.0


Unnamed: 0,custid,최근네달간_dis_amt>=0_std
0,0,0.000000
1,2,0.000000
2,3,18211.249940
3,4,0.000000
4,5,11873.078792
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,1750.000000


Unnamed: 0,custid,5_dis_amt>=0_변동계수
0,0,0.000000
1,2,0.000000
2,3,0.683456
3,4,0.000000
4,5,0.744394
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,2.000000


Unnamed: 0,custid,최근네달간_refund_sum
0,0,0.0
1,2,1416000.0
2,3,621000.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,0.0
35965,49993,0.0


Unnamed: 0,custid,최근네달간_refund_mean
0,0,0.0
1,2,708000.0
2,3,155250.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,0.0
35965,49993,0.0


Unnamed: 0,custid,최근네달간_refund_max
0,0,0.0
1,2,1416000.0
2,3,621000.0
3,4,0.0
4,5,0.0
...,...,...
35962,49988,0.0
35963,49990,0.0
35964,49992,0.0
35965,49993,0.0


Unnamed: 0,custid,최근네달간_refund_std
0,0,0.000000e+00
1,2,1.001263e+06
2,3,3.105000e+05
3,4,0.000000e+00
4,5,0.000000e+00
...,...,...
35962,49988,0.000000e+00
35963,49990,0.000000e+00
35964,49992,0.000000e+00
35965,49993,0.000000e+00


Unnamed: 0,custid,5_refund_변동계수
0,0,0.000000
1,2,1.414214
2,3,2.000000
3,4,0.000000
4,5,0.000000
...,...,...
35962,49988,0.000000
35963,49990,0.000000
35964,49992,0.000000
35965,49993,0.000000


In [92]:
# 구매추세 making 
tr['month_day_datetime'] = pd.to_datetime(tr['sales_date'], format= '%m%d')

In [93]:
# 5월 ~ 4월까지 데이터이기 때문에 ( 4월까지는 1년 추가)
# 5월 1일을 기준으로 recency계산 ( 4월까지는 1년 추가)
from dateutil.relativedelta import relativedelta
tr['month_day_datetime'] = tr['month_day_datetime'].map(lambda x : x+ relativedelta(years = 1) \
                                                              if x<pd.to_datetime('1900-05-01') else x)

In [94]:
# 마지막주부터 10주전까지 (총 10주 간) 매주 구매건수를 계산하여 리스트에 넣음
week_to = tr.month_day_datetime.max()
week_trans = []
for i in range(10):
    week_from = week_to + pd.DateOffset(weeks=-1)
    week_trans.append(tr.query('@week_from < month_day_datetime <= @week_to')
                      .groupby('custid')['tot_amt>=0']
                      .agg([(f'w{10-i}', 'count')])
                      .reset_index())
    week_to = week_from

# 리스트로부터 데이터프레임 변환    
f = id_data
for w in week_trans[::-1]:
    f = pd.merge(f, w, how='left')
f = f.fillna(0)

# 각 고객별로 10주간 구매에 대한 추세선(ax+b)을 계산하여 기울기(a)를 해당 고객의 추세패턴으로 사용 
# 기울기가 양수면 시간이 지나면서 구매가 증가한다는 뜻. 음수면 반대.
f['구매추세'] = f.apply(lambda x: np.polyfit(range(10), x[1:], 1)[0].round(2), axis=1)
features.append(f[['custid','구매추세']])
f[['custid','구매추세']]

Unnamed: 0,custid,구매추세
0,0,0.03
1,2,0.00
2,3,0.07
3,4,0.00
4,5,-0.04
...,...,...
35962,49988,0.00
35963,49990,-0.05
35964,49992,0.03
35965,49993,0.00


In [95]:
len(features)

81

#### 타겟 인코딩

In [96]:
cat_list = tr.select_dtypes(include=['object']).columns.to_list()

In [97]:
# target data, 개수가 적은 column들은 사용 x
cat_list.remove('age_group')
cat_list.remove('gender_group')
cat_list.remove('sales_season') 
cat_list.remove('team_nm')

In [98]:
y_target.value_counts().index # 많은 순서대로 7,6,5,4,3,2,1,0

Index(['F20', 'F30', 'F40', 'M30', 'F50', 'M50', 'M40', 'M20'], dtype='object')

In [99]:
train = pd.merge(tr.iloc[:train.shape[0]],y_target_c,on='custid',how='left')

# 수치형으로 (value_counts 순)
train.group = train.group.map(lambda x : 7 if x=='F20' else
                                    6 if x=='F30' else
                                    5 if x=='F40' else
                                    4 if x=='M30' else
                                    3 if x=='F50' else
                                    2 if x=='M50' else
                                    1 if x=='M40' else
                                    0)
for i in cat_list:
    target_data_tr = train[['custid',i,'group']]
    add_mean = train.groupby(i).group.mean()
    target_data_tr[f'{i}_target'] = target_data_tr[i].map(add_mean)
    test[f'{i}_target'] = test[i].map(add_mean)
    f = target_data_tr.groupby('custid')[f'{i}_target'].mean().reset_index()
    f_te = test.groupby('custid')[f'{i}_target'].mean().reset_index()
    ff = pd.concat([f,f_te])
    features.append(ff)
    print(i,"clear")

sales_dayofweek clear
str_nm clear
brd_nm clear
corner_nm clear
pc_nm clear
part_nm clear
buyer_nm clear


In [100]:
len(features)

88

In [101]:
X_train = pd.DataFrame({'custid': train.custid.unique()})
for f in features :
    X_train = pd.merge(X_train, f, how='left', on='custid')

X_test = pd.DataFrame({'custid': test.custid.unique()})
for f in features :
    X_test = pd.merge(X_test, f, how='left',on='custid')

In [102]:
X_train['평균내점구매액'] = X_train['총구매액']/X_train['내점일수']
# X_train['주중방문비율'] = (100 - X_train["주말방문비율"])
# X_train['국내상품_구매비율'] = (100 - X_train['수입상품_구매비율'])
X_train['할부구매가격'] = X_train['평균구매가격'] / X_train['평균할부개월수']
X_train['구매상품다양성'] = X_train['총구매액'] / X_train['구매상품다양성']
X_train['주말방문수'] = (X_train['주말방문비율'] * X_train['내점일수']) / 100
X_train['주말방문수'] = X_train['주말방문수'].astype('int64')
X_train['주중방문수'] = X_train['내점일수'] - X_train['주말방문수']
X_train['주중방문수'] = X_train['주중방문수'].astype('int64')
X_train['내점당편균구매건수'] = X_train['구매건수']/X_train['내점일수']
#X_train['주중구매액'] = X_train['총구매액']*(X_train['주중방문비율']/100)
#X_train['주말구매액'] = X_train['총구매액'] - X_train['주중구매액']

In [103]:
X_test['평균내점구매액'] = X_test['총구매액']/X_test['내점일수']
# X_test['주중방문비율'] = (100 - X_test['주말방문비율'])
# X_test['국내상품_구매비율'] = (100 - X_test['수입상품_구매비율'])
X_test['할부구매가격'] = X_test['평균구매가격'] / X_test['평균할부개월수']
X_test['구매상품다양성'] = X_test['총구매액'] / X_test['구매상품다양성']
X_test['주말방문수'] = (X_test['주말방문비율'] * X_test['내점일수']) / 100
X_test['주말방문수'] = X_test['주말방문수'].astype('int64')
X_test['주중방문수'] = X_test['내점일수'] - X_test['주말방문수']
X_test['주중방문수'] = X_test['주중방문수'].astype('int64')
X_test['내점당편균구매건수'] = X_test['구매건수']/X_test['내점일수']
#X_test['주중구매액'] = X_test['총구매액']*(X_test['주중방문비율']/100)
#X_test['주말구매액'] = X_test['총구매액'] - X_test['주중구매액']

In [104]:
X_train.shape, X_test.shape

((21587, 632), (14380, 632))

In [105]:
X_train_ = X_train.drop(columns='custid')
X_test_ = X_test.drop(columns='custid')

In [106]:
# json오류해결
import re

X_train_ = X_train_.rename(columns = lambda x:re.sub(',', '/', x))
X_test_ = X_test_.rename(columns = lambda x:re.sub(',', '/', x))

In [107]:
X_train_.shape, X_test_.shape

((21587, 631), (14380, 631))

In [108]:
train_col = X_train_.columns
test_col = X_test_.columns

### numeric feature save

In [109]:
'''
import datetime

time = datetime.datetime.now().strftime('%m%d_%H%M')

X_train_.to_csv(os.path.abspath("../input")+f'/num_feature_{time}.csv', index=False)
X_test_.to_csv(os.path.abspath("../input")+f'/num_feature_te_{time}.csv', index=False)
print(time)
print(f'num_feature_{time}.csv','is ready.')
'''

'\nimport datetime\n\ntime = datetime.datetime.now().strftime(\'%m%d_%H%M\')\n\nX_train_.to_csv(os.path.abspath("../input")+f\'/num_feature_{time}.csv\', index=False)\nX_test_.to_csv(os.path.abspath("../input")+f\'/num_feature_te_{time}.csv\', index=False)\nprint(time)\nprint(f\'num_feature_{time}.csv\',\'is ready.\')\n'

In [110]:
'''
X_train_ = pd.read_csv(os.path.abspath("../input")+f'/num_feature_{time}.csv')
X_test_ = pd.read_csv(os.path.abspath("../input")+f'/num_feature_te_{time}.csv')
'''

'\nX_train_ = pd.read_csv(os.path.abspath("../input")+f\'/num_feature_{time}.csv\')\nX_test_ = pd.read_csv(os.path.abspath("../input")+f\'/num_feature_te_{time}.csv\')\n'

---

### Categorical

In [111]:
IDtest = test.custid.unique()

In [112]:
level = 'corner_nm'
tr[level].nunique()

299

In [113]:
catFeatures_train_cor = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest').\
                        drop(columns=['custid']).values
catFeatures_test_cor = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

# max_features = X_train_cat.shape[1]

In [114]:
catFeatures_train_cor = pd.DataFrame(catFeatures_train_cor)
catFeatures_test_cor = pd.DataFrame(catFeatures_test_cor)

In [115]:
catFeatures_train_cor.columns = catFeatures_train_cor.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )
catFeatures_test_cor.columns = catFeatures_test_cor.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )

**<font color='CC3D3D'>[brd_nm]**

In [116]:
'''
level = 'brd_nm'
tr[level].nunique()
'''

"\nlevel = 'brd_nm'\ntr[level].nunique()\n"

In [117]:
'''
catFeatures_train_brd = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

catFeatures_test_brd = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values
# max_features = X_train.shape[1]
'''

"\ncatFeatures_train_brd = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',\n                         aggfunc='mean', fill_value=0).                          reset_index().                          query('custid not in @IDtest').                          drop(columns=['custid']).values\n\ncatFeatures_test_brd = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',\n                         aggfunc='mean', fill_value=0).                          reset_index().                          query('custid in @IDtest').                          drop(columns=['custid']).values\n# max_features = X_train.shape[1]\n"

In [118]:
'''
catFeatures_train_brd = pd.DataFrame(catFeatures_train_brd)
catFeatures_test_brd = pd.DataFrame(catFeatures_test_brd)
'''

'\ncatFeatures_train_brd = pd.DataFrame(catFeatures_train_brd)\ncatFeatures_test_brd = pd.DataFrame(catFeatures_test_brd)\n'

In [119]:
'''
catFeatures_train_brd.columns = catFeatures_train_brd.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )
catFeatures_test_brd.columns = catFeatures_test_brd.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )
'''

'\ncatFeatures_train_brd.columns = catFeatures_train_brd.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )\ncatFeatures_test_brd.columns = catFeatures_test_brd.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )\n'

**<font color='CC3D3D'>[pc_nm]**

In [120]:
level = 'pc_nm'
tr[level].nunique()

71

In [121]:
catFeatures_train_pc = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

catFeatures_test_pc = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values
# max_features = X_train.shape[1]

In [122]:
catFeatures_train_pc = pd.DataFrame(catFeatures_train_pc)
catFeatures_test_pc = pd.DataFrame(catFeatures_test_pc)

In [123]:
catFeatures_train_pc.columns = catFeatures_train_pc.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )
catFeatures_test_pc.columns = catFeatures_test_pc.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )

**<font color='CC3D3D'>[part_nm]**

In [124]:
level = 'part_nm'
tr[level].nunique()

29

In [125]:
catFeatures_train_part = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

catFeatures_test_part = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values
# max_features = X_train.shape[1]

In [126]:
catFeatures_train_part = pd.DataFrame(catFeatures_train_part)
catFeatures_test_part = pd.DataFrame(catFeatures_test_part)

In [127]:
catFeatures_train_part.columns = catFeatures_train_part.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )
catFeatures_test_part.columns = catFeatures_test_part.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )

**<font color='CC3D3D'>[team_nm]** 

In [128]:
level = 'team_nm'
tr[level].nunique()

3

In [129]:
catFeatures_train_team = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                        drop(columns=['custid']).values

catFeatures_test_team = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                        drop(columns=['custid']).values
# max_features = X_train.shape[1]

In [130]:
catFeatures_train_team = pd.DataFrame(catFeatures_train_team)
catFeatures_test_team = pd.DataFrame(catFeatures_test_team)

In [131]:
catFeatures_train_team.columns = catFeatures_train_team.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )
catFeatures_test_team.columns = catFeatures_test_team.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )

**<font color='CC3D3D'>[buyer_nm]**

In [132]:
level = 'buyer_nm'
tr[level].nunique()

28

In [133]:
catFeatures_train_buyer = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                        drop(columns=['custid']).values

catFeatures_test_buyer = pd.pivot_table(tr, index='custid', columns=level, values='구매여부',
                         aggfunc='mean', fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                        drop(columns=['custid']).values
# max_features = X_train.shape[1]

In [134]:
catFeatures_train_buyer = pd.DataFrame(catFeatures_train_buyer)
catFeatures_test_buyer = pd.DataFrame(catFeatures_test_buyer)

In [135]:
catFeatures_train_buyer.columns = catFeatures_train_buyer.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )
catFeatures_test_buyer.columns = catFeatures_test_buyer.columns.map(lambda x : level+ "_" + str(x) + "_onehot" )

# Merge Categoric Features

In [136]:
onehot_features_train = pd.concat([catFeatures_train_cor, 
                                   #catFeatures_train_brd, 
                                   catFeatures_train_pc,
                            catFeatures_train_part, catFeatures_train_team, catFeatures_train_buyer], axis=1)

In [137]:
onehot_features_test = pd.concat([catFeatures_test_cor, 
                                  #catFeatures_test_brd, 
                                  catFeatures_test_pc,
                           catFeatures_test_part, catFeatures_test_team, catFeatures_test_buyer], axis=1)

In [138]:
onehot_features_train.head()

Unnamed: 0,corner_nm_0_onehot,corner_nm_1_onehot,corner_nm_2_onehot,corner_nm_3_onehot,corner_nm_4_onehot,corner_nm_5_onehot,corner_nm_6_onehot,corner_nm_7_onehot,corner_nm_8_onehot,corner_nm_9_onehot,...,buyer_nm_18_onehot,buyer_nm_19_onehot,buyer_nm_20_onehot,buyer_nm_21_onehot,buyer_nm_22_onehot,buyer_nm_23_onehot,buyer_nm_24_onehot,buyer_nm_25_onehot,buyer_nm_26_onehot,buyer_nm_27_onehot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.666667,1.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.5,0.0,1.0,1.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [139]:
onehot_train_col = onehot_features_train.columns
onehot_test_col = onehot_features_test.columns

In [140]:
train_col = train_col.append(onehot_train_col)
test_col = test_col.append(onehot_test_col)

## Feature scaling

In [141]:
X_train_ = pd.concat([X_train_, onehot_features_train], axis = 1)
X_test_ =  pd.concat([X_test_, onehot_features_test], axis = 1)

In [142]:
X_train_.shape, X_test_.shape

((21587, 1061), (14380, 1061))

In [143]:
from sklearn.preprocessing import StandardScaler

In [144]:
scaler = StandardScaler()
X_train_ = scaler.fit_transform(X_train_)
X_test_ = scaler.transform(X_test_) 

In [145]:
X_train_ = pd.DataFrame(X_train_)
X_train_.columns = train_col
X_test_ = pd.DataFrame(X_test_)
X_test_.columns = test_col

In [146]:
X_train_.shape, X_test_.shape

((21587, 1061), (14380, 1061))

# Deployment

In [150]:
'''
onehot_features_train.to_csv(os.path.abspath("../input")+f'/onehot_features_{time}.csv', index=False)
onehot_features_test.to_csv(os.path.abspath("../input")+f'/onehot_features_te_{time}.csv', index=False)
print(f'onehot_features_train_{time}.csv','is ready.')
'''

'\nonehot_features_train.to_csv(os.path.abspath("../input")+f\'/onehot_features_{time}.csv\', index=False)\nonehot_features_test.to_csv(os.path.abspath("../input")+f\'/onehot_features_te_{time}.csv\', index=False)\nprint(f\'onehot_features_train_{time}.csv\',\'is ready.\')\n'

In [151]:
# save
import datetime

time = datetime.datetime.now().strftime('%m%d_%H%M')

X_train_.to_csv(os.path.abspath("../input")+f'/feature_1round_third_train.csv', index=False)
X_test_.to_csv(os.path.abspath("../input")+f'/feature_1round_third_test.csv', index=False)
print(f'feature_1round_third.csv','is ready.')

feature_1round_third.csv is ready.


In [152]:
# 확인
display(pd.read_csv(os.path.abspath("../input")+f'/feature_1round_third_train.csv'))
display(pd.read_csv(os.path.abspath("../input")+f'/feature_1round_third_test.csv'))

Unnamed: 0,refund_bool,amt_refund,amt_refund_mean,amt_refund_max,love_brd_count,20_weight,30_weight,40_weight,50_weight,60_weight,...,buyer_nm_18_onehot,buyer_nm_19_onehot,buyer_nm_20_onehot,buyer_nm_21_onehot,buyer_nm_22_onehot,buyer_nm_23_onehot,buyer_nm_24_onehot,buyer_nm_25_onehot,buyer_nm_26_onehot,buyer_nm_27_onehot
0,-1.070333,-0.305775,-0.379956,0.0,-0.276554,-0.436889,-0.441036,-0.441138,-0.428322,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,1.350469,-0.259034,1.494500,-1.148580,-0.102538,0.296883
1,0.934289,1.045830,5.618556,0.0,-0.349193,-0.447883,-0.448633,-0.444136,-0.439689,0.0,...,-0.03194,-0.580492,-0.011789,3.058029,1.350469,-0.259034,0.762429,0.933764,-0.102538,0.296883
2,0.934289,0.188493,0.424362,0.0,-0.349193,0.154448,0.143397,0.158382,0.121818,0.0,...,-0.03194,1.762620,-0.011789,-0.340342,0.274131,-0.259034,1.494500,0.933764,-0.102538,0.296883
3,-1.070333,-0.305775,-0.379956,0.0,-0.494471,-0.800145,-0.799398,-0.799138,-0.802019,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,1.350469,-0.259034,-0.701713,-1.148580,-0.102538,0.296883
4,-1.070333,-0.305775,-0.379956,0.0,-0.203915,0.208664,0.211429,0.224743,0.244930,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,1.350469,-0.259034,1.494500,-1.148580,-0.102538,0.296883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21582,0.934289,0.652776,0.235771,0.0,-0.131276,2.214728,2.189031,2.208515,2.225704,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,1.171080,-0.259034,-0.701713,0.933764,-0.102538,0.296883
21583,0.934289,0.080173,0.611703,0.0,-0.058637,-0.108573,-0.153027,-0.146475,-0.129772,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,-0.802207,-0.259034,-0.701713,0.586707,-0.102538,0.296883
21584,-1.070333,-0.305775,-0.379956,0.0,0.014002,-0.516002,-0.507907,-0.509844,-0.516529,0.0,...,-0.03194,1.762620,-0.011789,-0.340342,-0.802207,-0.259034,-0.701713,-1.148580,-0.102538,0.296883
21585,0.934289,-0.266694,-0.233196,0.0,-0.421832,-0.418201,-0.429693,-0.422578,-0.415549,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,-0.802207,-0.259034,1.494500,0.933764,-0.102538,0.296883


Unnamed: 0,refund_bool,amt_refund,amt_refund_mean,amt_refund_max,love_brd_count,20_weight,30_weight,40_weight,50_weight,60_weight,...,buyer_nm_18_onehot,buyer_nm_19_onehot,buyer_nm_20_onehot,buyer_nm_21_onehot,buyer_nm_22_onehot,buyer_nm_23_onehot,buyer_nm_24_onehot,buyer_nm_25_onehot,buyer_nm_26_onehot,buyer_nm_27_onehot
0,0.934289,-0.259805,-0.296838,0.0,-0.276554,-0.002100,-0.007334,-0.019610,0.031085,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,1.350469,-0.259034,-0.701713,0.933764,-0.102538,0.296883
1,0.934289,0.422703,-0.031296,0.0,0.449835,3.282844,3.259291,3.319226,3.313613,0.0,...,-0.03194,1.762620,-0.011789,-0.340342,1.350469,4.072560,1.494500,0.933764,-0.102538,-0.285352
2,0.934289,-0.158576,-0.227061,0.0,-0.131276,0.822633,0.764444,0.723453,0.726984,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,0.991690,-0.259034,-0.701713,0.933764,-0.102538,-1.191053
3,-1.070333,-0.305775,-0.379956,0.0,-0.494471,-0.892994,-0.887795,-0.886322,-0.885318,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,-0.802207,-0.259034,-0.701713,-1.148580,-0.102538,0.296883
4,-1.070333,-0.305775,-0.379956,0.0,-0.494471,-0.662899,-0.665945,-0.662980,-0.665618,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,-0.802207,-0.259034,-0.701713,-1.148580,-0.102538,0.296883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14375,-1.070333,-0.305775,-0.379956,0.0,-0.494471,-0.785990,-0.770486,-0.773438,-0.783189,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,-0.802207,-0.259034,-0.701713,-1.148580,-0.102538,0.296883
14376,-1.070333,-0.305775,-0.379956,0.0,-0.494471,-0.892882,-0.887610,-0.886185,-0.885145,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,-0.802207,-0.259034,-0.701713,-1.148580,-0.102538,0.296883
14377,-1.070333,-0.305775,-0.379956,0.0,-0.494471,-0.857830,-0.851972,-0.851028,-0.848232,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,-0.802207,-0.259034,-0.701713,0.933764,-0.102538,0.296883
14378,-1.070333,-0.305775,-0.379956,0.0,-0.421832,-0.858165,-0.845615,-0.848031,-0.854863,0.0,...,-0.03194,-0.580492,-0.011789,-0.340342,-0.802207,-0.259034,-0.701713,-1.148580,-0.102538,0.296883


---