## Module import 

In [1]:
#### 전처리
import pandas as pd
import numpy as np
# Scailing
from sklearn.preprocessing import PowerTransformer
#PCA
from sklearn.decomposition import PCA
#### Modeling
from pycaret.regression  import *

# 버전 확인
print('pandas : %s'%(pd.__version__))
print('numpy : %s'%(np.__version__))
print('pycaret : %s'%(pycaret.__version__))

!python --version

pandas : 1.3.2
numpy : 1.19.5
pycaret : 2.3.3
Python 3.8.8


## data 불러오기

In [2]:
# 데이터가 저장된 경로 설정
data_path = '../data/'

# 데이터 불러오기
train = pd.read_csv(data_path + 'train_df.csv', encoding='cp949')
test = pd.read_csv(data_path + 'test_df.csv', encoding='cp949')
submission = pd.read_csv(data_path + 'sample_submission.csv')

### 전처리

In [3]:
# train과 test 병합
data = pd.concat([train,test]).reset_index(drop=True);data

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3.0
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3.0
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9.0
3,3,1154500002014200,5011000315087400,식품,농산물,10.0
4,4,1165000021008300,5011000177051200,식품,가공식품,3.0
...,...,...,...,...,...,...
36635,36635,5013000858004400,4725000719072200,식품,농산물,
36636,36636,5013000870018300,2826000106075300,식품,농산물,
36637,36637,5013000897086300,4311100034004300,식품,농산물,
36638,36638,5013000902065100,4145000013011200,식품,농산물,


- 원본 데이터의 형태로 변경해줌

In [4]:
# data
data['SEND_SPG_INNB'] = data['SEND_SPG_INNB'].apply(lambda x: int(str(x)[:5])*10000000)
data['REC_SPG_INNB'] = data['REC_SPG_INNB'].apply(lambda x: int(str(x)[:5])*10000000)

In [5]:
#train과 test를 분리
train = data.iloc[:32000,:]
test = data.iloc[32000:,:]
test = test.drop(columns = ['INVC_CONT']).reset_index(drop=True)

- Make Feature

    - train

In [7]:
####train#####

# 송하인 기준으로 총 몇개의 택배를 보냈는지 확인하여 이를 피처로 만들어줌 
groupby_SEND_SPG_INNB_sum = train.groupby('SEND_SPG_INNB').sum()['INVC_CONT'].reset_index()
# 이후에 피처로 만들어주기 위한 dic생성
groupby_SEND_SPG_INNB_sum_dix_tr = {x:y for x,y in zip(groupby_SEND_SPG_INNB_sum['SEND_SPG_INNB'], groupby_SEND_SPG_INNB_sum['INVC_CONT'])}
# 피처 생성
train['총물류량'] = train['SEND_SPG_INNB'].apply(lambda x: groupby_SEND_SPG_INNB_sum_dix_tr[x])

In [8]:
####train#####
# 송하인 기준으로 총 몇명의 수하인에게 택배를 보냈는지 확인하여 이를 피처로 만들어줌 
groupby_SEND_SPG_INNB_count = train.groupby('SEND_SPG_INNB').count()['REC_SPG_INNB'].reset_index()
# 이후에 피처로 만들어주기 위한 dic생성
groupby_SEND_SPG_INNB_count_dix_tr = {x:y for x,y in zip(groupby_SEND_SPG_INNB_count['SEND_SPG_INNB'], groupby_SEND_SPG_INNB_count['REC_SPG_INNB'])}
# 피처 생성
train['거래수하인수'] = train['SEND_SPG_INNB'].apply(lambda x: groupby_SEND_SPG_INNB_count_dix_tr[x])

In [9]:
####train####
# 평균 한건당 몇번의 수화물을 보내는지에 대한 피처 생성
train['평균거래량'] = train['총물류량'] / train['거래수하인수'] # INVC_CONT과의 상관계수를 확인해보면 scailing을 하기 전 0.6정도 나옴

In [10]:
# train의 데이터를 참고하여 피처를 만들어줌
## 총물류량
dic_total = {x:y for x,y in zip(train['SEND_SPG_INNB'], train['총물류량'])}
test['총물류량'] = test['SEND_SPG_INNB'].apply(lambda x: dic_total[x] if x in dic_total else 0)
## 거래수하인수
dic_rec = {x:y for x,y in zip(train['SEND_SPG_INNB'], train['거래수하인수'])}
d_items = [i[1] for i in dic_rec.items()] # 딕셔너리의 아이템들을 확인함
d_items.sort(reverse=True)
test['거래수하인수'] = test['SEND_SPG_INNB'].apply(lambda x: dic_rec[x] if x in dic_rec else np.mean(d_items)) # train에 없는 경우 평균으로 채워줌
## 평균거래량
test['평균거래량'] = test['총물류량'] / test['거래수하인수']

In [11]:
test['거래수하인수'].describe()

count     4640.000000
mean     10281.481521
std       5136.497290
min          1.000000
25%      11341.000000
50%      11341.000000
75%      14003.000000
max      14003.000000
Name: 거래수하인수, dtype: float64

    - preprocessing

In [12]:
# 인코딩과 스케일링을 위하여 분리및 제거해줌
####train####
train_index = train['index']
y_train = train['INVC_CONT']

train = train.drop(columns=['index', 'INVC_CONT'])

####test####
test_index = test['index']

test = test.drop(columns=['index'])

- Scailing

In [13]:
# 범주형 변수와 수치형 변수를 분리
#train
train_cat_features = train.select_dtypes(include=['object']).columns.to_list()
train_num_features = train.select_dtypes(exclude='object').columns.to_list() 
#test
test_cat_features = test.select_dtypes(include=['object']).columns.to_list()
test_num_features = test.select_dtypes(exclude='object').columns.to_list() 

In [14]:
# 송하인과 수하인의 고유번호는 따로 scailing을 하지 않고 그대로 진행
# train
train_num_features = train_num_features[2:]
# test
test_num_features = test_num_features[2:]

In [15]:
#수치형변수 PowerTransformer scailing
# train
train[train_num_features] = PowerTransformer(standardize=True).fit_transform(train[train_num_features])
# test
test[test_num_features] = PowerTransformer(standardize=True).fit_transform(test[test_num_features])

In [16]:
# 위에서 떼어놨던 피처 병합
# train
train = pd.concat([train_index, train, y_train],axis=1)
# test
test = pd.concat([test_index, test],axis=1)

### One-hot encoding & PCA
- 범주형 변수들을 onehot encoding한 이후 피처들의 차원을 두번에 걸쳐 줄여줌

In [17]:
# data = pd.get_dummies(data, columns=['SEND_SPG_INNB', 'REC_SPG_INNB', 'cluster_kmeans'])

In [18]:
# 차원축소 매소드 
def dummy_to_pca(tr, column_name:str) :
    # pca에 쓰일 차원을 확인하기 위함
    max_d = len(set(tr[column_name])) # -> PCA의 차원으로 씀
    
    # PCA를 적용할 데이터프레임 생성
    f =tr[['index', column_name]] # 해당 컬럼만 가져와줌
    f = pd.get_dummies(f, columns=[column_name])  # 가져온 피처에 One-hot-encoding을 진행해줌
    f = f.iloc[:,1:].astype('float64') # 누적 분산 계산을 위해 index를 지우고 type을 바꿔줌
    
    # 최적의 d값을 찾음
    pca = PCA(n_components=max_d) 
    pca.fit(f) # pca 적용 / 원핫 인코딩을 한 피처들을 위에서 정의한 차원을 축소해줌

    # 적절한 차원의 수를 찾기 위한 과정
    cumsum = np.cumsum(pca.explained_variance_ratio_) #분산의 설명량을 누적합 / 리스트의 형태
    num_d = np.argmax(cumsum >= 0.99) + 1 # 분산의 설명량이 99%이상 되는 인덱스를 cumsum에서 찾아줌 거기에 1을 더한것을 차원으로 지정 / 차원 갱신
    if num_d == 1: # 가장 처음의 차원이 분산이 제일 큰 경우
        num_d = max_d
    
    # 적용
    pca = PCA(n_components=num_d) # 분산의 설명량이 99% 이상 되는 차원으로 두번째 pca를 적용함    
    result = pca.fit_transform(f)
    result = pd.DataFrame(result)
    result.columns = [column_name + '_' + str(column) for column in result.columns] #열이름 지정
    result.index = f.index # 인덱스를 맞춰줌
    return result.reset_index()

In [19]:
# PCA를 진행한 데이터 생성후 train와 병합
# train
pca_train = pd.concat([dummy_to_pca(train, 'DL_GD_LCLS_NM').iloc[:,1:], dummy_to_pca(train, 'DL_GD_MCLS_NM').iloc[:,1:]],axis=1)
train = pd.concat([train.drop(columns=['DL_GD_LCLS_NM', 'DL_GD_MCLS_NM']), pca_train], axis=1)
# test
pca_test = pd.concat([dummy_to_pca(test, 'DL_GD_LCLS_NM').iloc[:,1:], dummy_to_pca(test, 'DL_GD_MCLS_NM').iloc[:,1:]],axis=1)
test = pd.concat([test.drop(columns=['DL_GD_LCLS_NM', 'DL_GD_MCLS_NM']), pca_test], axis=1)

- Data split

In [20]:
# #train과 test를 분리
# train = data.iloc[:32000,:]
# test = data.iloc[32000:,:]
# test = test.drop(columns = ['INVC_CONT']).reset_index(drop=True)

## Modeling with pycaret
- pycaret을 통하여 가장 좋은 성능을 내는 모델을 찾고 사용
- 검증 score만이 아닌 public을 같이 확인하여 모델 평가

- Model setting

In [21]:
train = train.iloc[:,1:]

In [22]:
reg = setup(train, 
            preprocess = False, 
            train_size = 0.8,  
            target = 'INVC_CONT', # 목표 변수
            silent = True, 
            use_gpu = True, 
            session_id = 42,
            fold_shuffle = True,
            )

Unnamed: 0,Description,Value
0,session_id,42
1,Target,INVC_CONT
2,Original Data,"(32000, 30)"
3,Missing Values,False
4,Numeric Features,29
5,Categorical Features,0
6,Transformed Train Set,"(25600, 29)"
7,Transformed Test Set,"(6400, 29)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


- Compare Model -> 잘 나오는 모델이 무엇인지 확인하여 사용

In [23]:
top3 = compare_models(sort='RMSE', fold=5, n_select = 3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,2.1513,28.8368,5.3156,0.0794,0.411,0.4144,0.79
gbr,Gradient Boosting Regressor,2.1434,28.894,5.3283,0.072,0.4053,0.4122,1.632
catboost,CatBoost Regressor,2.1552,29.6018,5.3967,0.0458,0.4111,0.4119,3.916
br,Bayesian Ridge,2.2155,31.05,5.4813,0.0348,0.4194,0.4319,0.026
ridge,Ridge Regression,2.2176,31.0598,5.4823,0.0344,0.4198,0.4325,0.012
omp,Orthogonal Matching Pursuit,2.2591,31.6311,5.532,0.0169,0.4248,0.4426,0.012
en,Elastic Net,2.2359,31.9668,5.5597,0.0076,0.4194,0.4352,0.606
rf,Random Forest Regressor,2.2555,31.3509,5.5622,-0.0191,0.4363,0.4331,0.968
lr,Linear Regression,2.2392,32.0456,5.5671,0.0048,0.4206,0.436,0.014
lasso,Lasso Regression,2.2392,32.0456,5.5671,0.0048,0.4206,0.436,0.508


- Hyperparameter tuning

In [None]:
#gbr
gbr = create_model('gbr')
#hyperparameter tuning
gbr = tune_model(gbr, optimize = 'RMSE', choose_better = True, fold = 5, n_iter = 30)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.1246,27.0306,5.1991,-0.2584,0.4015,0.4122
1,2.1693,23.3366,4.8308,0.2819,0.4144,0.4275
2,2.0791,22.1797,4.7095,0.0243,0.4063,0.4113
3,2.1392,26.1946,5.1181,0.0962,0.4048,0.4053
4,2.164,29.4626,5.4279,0.036,0.4121,0.4092
5,2.0557,22.1795,4.7095,0.0496,0.3962,0.4091
6,2.2399,39.0367,6.2479,0.2426,0.4135,0.4117
7,2.2225,44.1932,6.6478,0.3121,0.402,0.3983
8,2.1647,33.8034,5.8141,-0.0947,0.4025,0.4136
9,2.0055,14.7567,3.8414,0.0498,0.3955,0.4148


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE


Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.6s


In [21]:
# 전체 데이터로 재학습
final_model = finalize_model(gbr)

- Fit & predict

In [22]:
# y_train = train['INVC_CONT']
train = train.drop('INVC_CONT', axis=1)

In [23]:
model = final_model
model.fit(train,y_train)
pred = model.predict(test.iloc[:,1:])

In [24]:
submission['INVC_CONT'] = pred

### Make submission 

In [25]:
t = pd.Timestamp.now()
fname = f'submission_{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}.csv'
# fname = 'submission_검증'
submission.to_csv('../submission/'+fname, index=False)
print("'{}' is ready to submit." .format(fname))

'submission_12222221.csv' is ready to submit.


In [40]:
# submission.to_csv('../submission/submission_검증.csv', index=False)
# print('submission is ready to submit.')

# END