## 모듈 import

In [1]:
#모듈 import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pycaret.regression  import *


# feature selection
from sklearn.linear_model import LinearRegression
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

#Scailing
from sklearn.preprocessing import PowerTransformer

#Label Encoding
from sklearn.preprocessing import LabelEncoder

## data 불러오기

In [2]:
# 데이터가 저장된 경로 설정
data_path = '../data/'

# 데이터 불러오기
train = pd.read_csv(data_path + 'train_df.csv', encoding='cp949')
test = pd.read_csv(data_path + 'test_df.csv', encoding='cp949')
submission = pd.read_csv(data_path + 'sample_submission.csv')

### 전처리

In [3]:
data = pd.concat([train,test]).reset_index(drop=True);data

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3.0
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3.0
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9.0
3,3,1154500002014200,5011000315087400,식품,농산물,10.0
4,4,1165000021008300,5011000177051200,식품,가공식품,3.0
...,...,...,...,...,...,...
36635,36635,5013000858004400,4725000719072200,식품,농산물,
36636,36636,5013000870018300,2826000106075300,식품,농산물,
36637,36637,5013000897086300,4311100034004300,식품,농산물,
36638,36638,5013000902065100,4145000013011200,식품,농산물,


- Make Feature

In [4]:
# 송하인 기준으로 총 몇개의 택배를 보냈는지 확인하여 이를 피처로 만들어줌 
groupby_SEND_SPG_INNB_sum = data.groupby('SEND_SPG_INNB').sum()['INVC_CONT'].reset_index()
# 이후에 피처로 만들어주기 위한 dic생성
groupby_SEND_SPG_INNB_sum_dix = {x:y for x,y in zip(groupby_SEND_SPG_INNB_sum['SEND_SPG_INNB'], groupby_SEND_SPG_INNB_sum['INVC_CONT'])}
# 피처 생성
data['총물류량'] = data['SEND_SPG_INNB'].apply(lambda x: groupby_SEND_SPG_INNB_sum_dix[x])

In [5]:
# 송하인 기준으로 총 몇명의 수하인에게 택배를 보냈는지 확인하여 이를 피처로 만들어줌 
groupby_SEND_SPG_INNB_count = data.groupby('SEND_SPG_INNB').count()['REC_SPG_INNB'].reset_index()
# 이후에 피처로 만들어주기 위한 dic생성
groupby_SEND_SPG_INNB_count_dix = {x:y for x,y in zip(groupby_SEND_SPG_INNB_count['SEND_SPG_INNB'], groupby_SEND_SPG_INNB_count['REC_SPG_INNB'])}
# 피처 생성
data['거래수하인수'] = data['SEND_SPG_INNB'].apply(lambda x: groupby_SEND_SPG_INNB_count_dix[x])

In [6]:
# 평균 한건당 몇번의 수화물을 보내는지에 대한 피처 생성
data['평균거래량'] = data['총물류량'] / data['거래수하인수']

In [7]:
# 인코딩과 스케일링을 위함
data_index = data['index']
y_data = data['INVC_CONT']

data = data.drop(columns=['index', 'INVC_CONT'])

- 결측치 채움 -> 없음

- Encoding, Scailing

In [8]:
# 범주형 변수와 수치형 변수를 분리
cat_features = data.select_dtypes(include=['object']).columns.to_list()
num_features = data.select_dtypes(exclude='object').columns.to_list() 

In [9]:
# scailing을 위해
num_features = num_features[2:]

In [10]:
#수치형변수 scailing
data[num_features] = PowerTransformer(standardize=True).fit_transform(data[num_features])

In [11]:
#위에서 떼어놨던 피처 병합
data = pd.concat([data_index, data],axis=1)
data = pd.concat([data, y_data],axis=1)

### PCA

In [14]:
# 차원축소 매소드 
from sklearn.decomposition import PCA

def dummy_to_pca(tr, column_name:str) :
    max_seq = 300 # -> 영향없는 코드
    max_d = 15
    col_count = tr.groupby(column_name)[column_name].count()
    if len(col_count) > max_seq: # -> 영향없는 코드
        tops = col_count.sort_values(ascending=False)[0:max_seq].index # -> 영향없는 코드
        f =tr.loc[tr[column_name].isin(tops)][['index', column_name]] # -> 영향없는 코드
    else:
        tops = col_count.index
        f =tr[['index', column_name]]
    f = pd.get_dummies(f, columns=[column_name])  # One-hot-encoding을 진행해줌
    
    f = f.groupby('index').mean() # index별로 평균을 내어줌 -> 영향 없는 코드
    if len(tops) < max_d: # 크게 몇차원으로 나누어줄지 계산
        max_d = len(tops) # 차원 갱신
    pca = PCA(n_components=max_d) # pca 생성
    pca.fit(f) # pca 적용 / 원핫 인코딩을 한 피처들을 위에서 정의한 차원을 축소해줌
    cumsum = np.cumsum(pca.explained_variance_ratio_) #분산의 설명량을 누적합
#     print(cumsum)
    # 적절한 차원수 찾기
    num_d = np.argmax(cumsum >= 0.99) + 1 # 분산의 설명량이 99%이상 되는 인덱스를 cumsum에서 찾아줌 거기에 1을 더한것을 차원으로 지정
    if num_d == 1:
        num_d = max_d
    
    pca = PCA(n_components=num_d) # 분산의 설명량이 99% 이상 되는 차원으로 두번째 pca를 적용함    
    result = pca.fit_transform(f)
    result = pd.DataFrame(result)
    result.columns = [column_name + '_' + str(column) for column in result.columns] #열이름 지정
    result.index = f.index # 인덱스를 맞춰줌
    return result.reset_index()

In [15]:
# PCA를 진행한 데이터 생성
pca = pd.concat([dummy_to_pca(data, 'DL_GD_LCLS_NM').iloc[:,1:], dummy_to_pca(data, 'DL_GD_MCLS_NM').iloc[:,1:]],axis=1)
data = pd.concat([data.drop(columns=['DL_GD_LCLS_NM', 'DL_GD_MCLS_NM']), pca], axis=1) 

- Data split

In [14]:
#train과 test를 분리
train = data.iloc[:32000,:]
test = data.iloc[32000:,:]
test = test.drop(columns = ['INVC_CONT']).reset_index(drop=True)

In [15]:
train

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,총물류량,거래수하인수,평균거래량,INVC_CONT,DL_GD_LCLS_NM_0,DL_GD_LCLS_NM_1,DL_GD_LCLS_NM_2,DL_GD_LCLS_NM_3,DL_GD_LCLS_NM_4,DL_GD_MCLS_NM_0,DL_GD_MCLS_NM_1,DL_GD_MCLS_NM_2,DL_GD_MCLS_NM_3,DL_GD_MCLS_NM_4,DL_GD_MCLS_NM_5,DL_GD_MCLS_NM_6,DL_GD_MCLS_NM_7,DL_GD_MCLS_NM_8,DL_GD_MCLS_NM_9,DL_GD_MCLS_NM_10,DL_GD_MCLS_NM_11,DL_GD_MCLS_NM_12,DL_GD_MCLS_NM_13,DL_GD_MCLS_NM_14
0,0,1129000014045300,5011000220046300,-0.337341,-0.287203,-0.096765,3.0,0.788362,0.287385,0.981666,-0.232391,-0.016447,0.676085,-0.024236,-0.068021,-0.009101,0.086673,-0.089317,0.018481,0.230694,-0.036317,-0.212550,-0.032277,0.036870,0.356156,0.849537,-0.172158
1,1,1135000009051200,5011000178037300,-0.612465,-0.602047,0.138106,3.0,1.166577,-0.509271,-0.106340,-0.036534,-0.003234,0.678289,-0.024906,-0.070189,-0.009453,0.090753,-0.094908,0.019915,0.262160,-0.048143,-0.329765,-0.208902,0.773892,-0.362229,-0.094668,-0.120183
2,2,1135000030093100,5011000265091400,-1.167376,-1.501972,3.177199,9.0,0.788362,0.287385,0.981666,-0.232391,-0.016447,0.670055,-0.022555,-0.062661,-0.008250,0.077075,-0.076790,0.015408,0.173145,-0.021614,-0.107114,-0.009660,0.010137,0.054860,0.029683,0.871368
3,3,1154500002014200,5011000315087400,-1.449594,-1.645199,2.507885,10.0,-0.152233,-0.009975,-0.005061,-0.002253,-0.000209,-0.317616,0.000973,0.002367,0.000261,-0.002049,0.001603,-0.000270,-0.002014,0.000154,0.000623,0.000040,-0.000040,-0.000187,-0.000088,-0.000412
4,4,1165000021008300,5011000177051200,-1.744892,-1.645199,-0.617663,3.0,-0.152233,-0.009975,-0.005061,-0.002253,-0.000209,0.739789,-0.084515,-0.374899,-0.701441,-0.452410,0.159187,-0.020342,-0.105791,0.006592,0.025245,0.001508,-0.001521,-0.006908,-0.003226,-0.013817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31995,31995,5011001060063300,2635000026053400,-0.181235,-0.238055,0.605823,6.0,-0.152233,-0.009975,-0.005061,-0.002253,-0.000209,-0.317616,0.000973,0.002367,0.000261,-0.002049,0.001603,-0.000270,-0.002014,0.000154,0.000623,0.000040,-0.000040,-0.000187,-0.000088,-0.000412
31996,31996,5011001095042400,1168000017002200,-0.607005,-0.768897,1.358154,5.0,-0.152233,-0.009975,-0.005061,-0.002253,-0.000209,0.721631,-0.050784,-0.170036,-0.032908,0.684424,0.619441,-0.046337,-0.173290,0.009631,0.036001,0.002093,-0.002107,-0.009492,-0.004408,-0.018315
31997,31997,5011001108036200,4119700008012100,-0.539758,-0.493304,-0.095494,9.0,-0.152233,-0.009975,-0.005061,-0.002253,-0.000209,-0.317616,0.000973,0.002367,0.000261,-0.002049,0.001603,-0.000270,-0.002014,0.000154,0.000623,0.000040,-0.000040,-0.000187,-0.000088,-0.000412
31998,31998,5011001115011400,1132000015085100,-1.167376,-1.196253,0.326348,3.0,-0.152233,-0.009975,-0.005061,-0.002253,-0.000209,-0.317616,0.000973,0.002367,0.000261,-0.002049,0.001603,-0.000270,-0.002014,0.000154,0.000623,0.000040,-0.000040,-0.000187,-0.000088,-0.000412


## Modeling with pycaret

- Model setting

In [16]:
train = train.iloc[:,1:]

In [17]:
reg = setup(train, 
            preprocess = False, 
            train_size = 0.8,  
            target = 'INVC_CONT', # 목표 변수
            silent = True, 
            use_gpu = True, 
            session_id = 42,
            fold_shuffle = True
            )

Unnamed: 0,Description,Value
0,session_id,42
1,Target,INVC_CONT
2,Original Data,"(32000, 26)"
3,Missing Values,False
4,Numeric Features,25
5,Categorical Features,0
6,Transformed Train Set,"(25600, 25)"
7,Transformed Test Set,"(6400, 25)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [18]:
top5 = compare_models(sort='RMSE', fold=5, n_select = 1, include = ['gbr'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,1.8285,23.0479,4.7582,0.2492,0.3599,0.3485,1.77


- Hyperparameter tuning

In [19]:
#gbr
gbr = create_model('gbr')
#hyperparameter tuning
gbr = tune_model(gbr, optimize = 'RMSE', choose_better = True, fold = 5, n_iter = 30)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.8432,16.4125,4.0512,0.3919,0.3646,0.3553
1,1.8003,18.7921,4.335,0.2734,0.359,0.3495
2,1.8239,20.2509,4.5001,0.2487,0.36,0.3533
3,1.9375,34.7936,5.8986,0.399,0.3638,0.3546
4,1.8232,20.714,4.5513,0.1074,0.3599,0.3604
Mean,1.8456,22.1926,4.6672,0.2841,0.3615,0.3546
SD,0.0479,6.4763,0.6399,0.1071,0.0023,0.0035


### Ensemble

- blend model

In [20]:
final_model = finalize_model(gbr) # 전체 데이터로 재학습

- Fit & predict

In [58]:
len(f.groupby('index').mean() == f.iloc[:,1:])

36640

In [21]:
y_train = train['INVC_CONT']
train = train.drop('INVC_CONT', axis=1)

In [22]:
model = final_model
model.fit(train,y_train)
pred = model.predict(test.iloc[:,1:])

In [23]:
submission['INVC_CONT'] = pred

### Make submission 

In [24]:
t = pd.Timestamp.now()
# fname = f'submission_{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}.csv'
fname = 'submission_검증'
submission.to_csv('../submission/'+fname, index=False)
print("'{}' is ready to submit." .format(fname))

'submission_검증' is ready to submit.


# END