In [1]:
%load_ext watermark
%watermark -v -p numpy,scipy,sklearn,pandas,matplotlib

Python implementation: CPython
Python version       : 3.7.12
IPython version      : 7.33.0

numpy     : 1.21.6
scipy     : 1.7.3
sklearn   : 0.23.2
pandas    : 1.3.4
matplotlib: 3.5.2



In [2]:
import numpy as np
import os
import seaborn as sns
import sklearn

# 일관된 출력을 위해 유사난수 초기화
np.random.seed(42)

# 맷플롯립 설정
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# 한글출력
matplotlib.rc('font', family='NanumBarunGothic')
plt.rcParams['axes.unicode_minus'] = False

# DATA 불러오기

In [3]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('mode.chained_assignment',  None)

PROJECT_ROOT_DIR = './..'
DATA_DIR = 'data'
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, DATA_DIR, )

def load_user_data(data_path = DATA_PATH):
    train_csv_path = os.path.join(data_path, 'train.csv')
    test_csv_path = os.path.join(data_path, 'test.csv')
    return pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

train, test = load_user_data(DATA_PATH)

# DATA 확인

In [4]:
train.head()

Unnamed: 0,id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,NumDealsPurchases,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,target
0,0,1974,Master,Together,46014.0,1,1,21-01-2013,21,10,...,8,7,0,0,0,0,0,0,0,541
1,1,1962,Graduation,Single,76624.0,0,1,24-05-2014,68,1,...,7,1,1,0,0,0,0,0,0,899
2,2,1951,Graduation,Married,75903.0,0,1,08-04-2013,50,2,...,9,3,0,0,0,0,0,0,0,901
3,3,1974,Basic,Married,18393.0,1,0,29-03-2014,2,2,...,3,8,0,0,0,0,0,0,0,50
4,4,1946,PhD,Together,64014.0,2,1,10-06-2014,56,7,...,5,7,0,0,0,1,0,0,0,444


In [5]:
train = train.drop('id', axis=1)

In [6]:
train.columns

Index(['Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'NumDealsPurchases',
       'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
       'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
       'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response', 'target'],
      dtype='object')

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108 entries, 0 to 1107
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year_Birth           1108 non-null   int64  
 1   Education            1108 non-null   object 
 2   Marital_Status       1108 non-null   object 
 3   Income               1108 non-null   float64
 4   Kidhome              1108 non-null   int64  
 5   Teenhome             1108 non-null   int64  
 6   Dt_Customer          1108 non-null   object 
 7   Recency              1108 non-null   int64  
 8   NumDealsPurchases    1108 non-null   int64  
 9   NumWebPurchases      1108 non-null   int64  
 10  NumCatalogPurchases  1108 non-null   int64  
 11  NumStorePurchases    1108 non-null   int64  
 12  NumWebVisitsMonth    1108 non-null   int64  
 13  AcceptedCmp3         1108 non-null   int64  
 14  AcceptedCmp4         1108 non-null   int64  
 15  AcceptedCmp5         1108 non-null   i

In [8]:
X = train.drop('target', axis=1)
y = train.target
X.reset_index()
y.reset_index()

Unnamed: 0,index,target
0,0,541
1,1,899
2,2,901
3,3,50
4,4,444
...,...,...
1103,1103,241
1104,1104,147
1105,1105,30
1106,1106,447


# 전처리

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class DtCustomerSplitter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        year, month, day = list(), list(), list()
        for date in X.Dt_Customer:
            da, mo, ye = date.split('-')
            year.append(int(ye))
            month.append(int(mo))
            day.append(int(da))
        X['year']  = year
        X['month'] = month
        X['day']   = day
        return X.drop('Dt_Customer', axis=1)

In [10]:
class HousingTypeClassifier(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        for i in range(X.shape[0]):
            marital_status = X['Marital_Status'].iloc[i]
            if marital_status in ['Married', 'Together']:
                X['Marital_Status'].iloc[i] = 0
            else:
                X['Marital_Status'].iloc[i] = 1
        return X

In [11]:
class KidChildToBinary(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        for i in range(X.shape[0]):
            kid_home = X['Kidhome'].iloc[i]
            child_home = X['Teenhome'].iloc[i]
            if kid_home + child_home != 0 :
                X['Teenhome'].iloc[i] = 1
            else:
                X['Teenhome'].iloc[i] = 0
        return X.drop('Kidhome', axis=1)

In [12]:
class DoNothing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler

left = ['Income',
        # 'Kidhome',
       # 'Teenhome',
        'Recency', 'NumDealsPurchases',
       'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
       'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
       'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response']
ord_cat = [['Basic','2n Cycle','Graduation','Master','PhD']]

my_pipeline = ColumnTransformer([
    # ('DtCustomerSplitter', DtCustomerSplitter(), ['Dt_Customer']),
    ('KidChildToBinary', KidChildToBinary(), ['Kidhome', 'Teenhome']),
    ('HousingTypeClassifier', HousingTypeClassifier(), ['Marital_Status']),
    ('OrdinalEncoder', OrdinalEncoder(categories=ord_cat), ['Education']),
    ('RobustScaler', RobustScaler(), ['Year_Birth']),
    ('DoNothing', DoNothing(), left)
])


In [14]:
test_X = my_pipeline.fit_transform(X)
test_X

array([[1, 0, 3.0, ..., 0.0, 0.0, 0.0],
       [1, 1, 2.0, ..., 0.0, 0.0, 0.0],
       [1, 0, 2.0, ..., 0.0, 0.0, 0.0],
       ...,
       [1, 0, 3.0, ..., 0.0, 0.0, 0.0],
       [1, 1, 2.0, ..., 0.0, 0.0, 0.0],
       [1, 1, 4.0, ..., 0.0, 0.0, 1.0]], dtype=object)

# 모델

In [15]:
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from ngboost import NGBRegressor

cat_reg = CatBoostRegressor(random_state=42, silent=True)
xgb = XGBRegressor(objective='reg:squarederror', random_state = 42)
lgbm = LGBMRegressor(objective='regression', random_state = 42)
rnd_fr = RandomForestRegressor(random_state=42)
ngbr = NGBRegressor(random_state=42, verbose=0)

models = [('xgb', xgb), ('lgbm', lgbm), ('cat_reg', cat_reg), ('rnd_fr', rnd_fr), ('ngbr', ngbr)]
voting_reg = VotingRegressor(models, n_jobs=1)

In [16]:
X_prepared = my_pipeline.fit_transform(X)
X_prepared

array([[1, 0, 3.0, ..., 0.0, 0.0, 0.0],
       [1, 1, 2.0, ..., 0.0, 0.0, 0.0],
       [1, 0, 2.0, ..., 0.0, 0.0, 0.0],
       ...,
       [1, 0, 3.0, ..., 0.0, 0.0, 0.0],
       [1, 1, 2.0, ..., 0.0, 0.0, 0.0],
       [1, 1, 4.0, ..., 0.0, 0.0, 1.0]], dtype=object)

In [17]:
def nmae(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [18]:
data = test.drop('id', axis = 1).copy()
sub_X = data
sub_X_prepared = my_pipeline.fit_transform(sub_X)

In [23]:
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True) #총 10번의 fold 진행
n = 0 #x번째 fold인지 기록

fold_target_pred = []
fold_score = []

for train_index, valid_index in skf.split(X_prepared, y): #range 기준으로 stratified k fold 진행
    n += 1

    val_pred_name = [] #validation pred model 이름 저장
    val_pred = []      #validation set pred 결과 저장
    target_pred = []   #test set pred 결과 저장

    train_X = np.array(X_prepared)
    train_Y = np.array(y)

    X_train, X_valid = train_X[train_index], train_X[valid_index]
    y_train, y_valid = train_Y[train_index], train_Y[valid_index]

    X_test = np.array(sub_X_prepared)

    ### Create Model ###

    ###모델을 생성하고 집어넣으면 됩니다.

    ### LGBMRegressor ###
    model = LGBMRegressor(random_state = 42, verbose = 0, objective='regression', force_col_wise=True)
    #추가적으로 하이퍼파라미터 튜닝 필요
    model.fit(X_train, y_train) # 모델 학습

    val_pred_name.append("LGBMRegressor")      # 모델 이름 저장
    val_pred.append(model.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model.predict(X_test)) # test set pred 결과 저장

    ### XGBRegressor ###
    model = XGBRegressor(random_state = 42) #추가적으로 하이퍼파라미터 튜닝 필요
    model.fit(X_train, y_train)

    val_pred_name.append("XGBRegressor")      # 모델 이름 저장
    val_pred.append(model.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model.predict(X_test)) # test set pred 결과 저장

    ### CatBoostRegressor ###
    model = CatBoostRegressor(random_state = 42, silent=True) #추가적으로 하이퍼파라미터 튜닝 필요
    model.fit(X_train, y_train)

    val_pred_name.append("CatBoostRegressor")      # 모델 이름 저장
    val_pred.append(model.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model.predict(X_test)) # test set pred 결과 저장

    ### NGBRegressor ###
    model = NGBRegressor(random_state = 42, verbose=0) #추가적으로 하이퍼파라미터 튜닝 필요
    model.fit(X_train, y_train)

    val_pred_name.append("NGBRegressor")      # 모델 이름 저장
    val_pred.append(model.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model.predict(X_test)) # test set pred 결과 저장

    ### RandomForestRegressor ###
    model = RandomForestRegressor(random_state=42) #추가적으로 하이퍼파라미터 튜닝 필요
    model.fit(X_train, y_train)

    val_pred_name.append("RandomForestRegressor")      # 모델 이름 저장
    val_pred.append(model.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model.predict(X_test)) # test set pred 결과 저장

    ### voting ###

    ### average validation pred ###
    preds = np.array(val_pred[0])
    for i in range(1, len(val_pred)):
        preds += val_pred[i]

    preds = preds/len(val_pred)

    ### average target pred ###
    target_preds = np.array(target_pred[0])
    for i in range(1, len(target_pred)):
        target_preds += target_pred[i]

    target_preds = target_preds/len(target_pred)

    fold_target_pred.append(target_preds) # append final target pred

    print("========== fold %d ==========" %(n))
    for i in range(len(val_pred)):
        print("%s model NMAE : %0.4f" %(val_pred_name[i], nmae(y_valid, val_pred[i].astype(int))))

    print("==============================")
    print("Average NMAE %0.4f" %(nmae(y_valid, preds.astype(int))))
    print("")

    fold_score.append(nmae(y_valid, preds.astype(int)))

total_score = fold_score[0]
for i in range(2, len(fold_score), 1):
    total_score += fold_score[i]
total_score = total_score/(len(fold_score))

print("==============================")
print("Total Average NMAE %0.4f" %(total_score)) #최종 average score 출력



LGBMRegressor model NMAE : 0.1745
XGBRegressor model NMAE : 0.1818
CatBoostRegressor model NMAE : 0.1587
NGBRegressor model NMAE : 0.1764
RandomForestRegressor model NMAE : 0.1608
Average NMAE 0.1611

LGBMRegressor model NMAE : 0.1680
XGBRegressor model NMAE : 0.1714
CatBoostRegressor model NMAE : 0.1693
NGBRegressor model NMAE : 0.1983
RandomForestRegressor model NMAE : 0.1669
Average NMAE 0.1635

LGBMRegressor model NMAE : 0.1663
XGBRegressor model NMAE : 0.1698
CatBoostRegressor model NMAE : 0.1722
NGBRegressor model NMAE : 0.1884
RandomForestRegressor model NMAE : 0.1710
Average NMAE 0.1628

LGBMRegressor model NMAE : 0.1754
XGBRegressor model NMAE : 0.1869
CatBoostRegressor model NMAE : 0.1812
NGBRegressor model NMAE : 0.2071
RandomForestRegressor model NMAE : 0.1832
Average NMAE 0.1774

LGBMRegressor model NMAE : 0.1763
XGBRegressor model NMAE : 0.1555
CatBoostRegressor model NMAE : 0.1754
NGBRegressor model NMAE : 0.2082
RandomForestRegressor model NMAE : 0.1867
Average NMAE 0.1

In [20]:
final_pred = np.array(fold_target_pred[0])
final_pred

array([ 542.33337696,  760.10900788,  813.97747201, ...,   71.74198423,
       1313.20754705,  148.12988971])

In [21]:
### average target pred ###
final_pred = np.array(fold_target_pred[0])
for i in range(1, len(fold_target_pred)):
    final_pred += fold_target_pred[i]

final_pred = final_pred/len(fold_target_pred)
final_pred = final_pred.astype(int)

In [22]:
submission = pd.read_csv('../data/sample_submission.csv')
submission['target'] = final_pred
submission.to_csv("../data/submission_8.csv", index=False)