# 0. 필요한 라이브러리 import & train, test read

In [1]:
import os
import re
from datetime import datetime, timedelta
import warnings
import numpy as np
import pandas as pd
import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

warnings.filterwarnings('ignore')

In [2]:
train = pd.read_excel("../original_data/main_data.xlsx")
test = pd.read_excel("../original_data/template.xlsx")

# 1. EDA

In [3]:
## flag: if True -> train, else -> test
# renaming
def rename_valid(data):
    data.rename(columns={"방송일시": "broadDateTime", "노출(분)": "broadTime",
                         "마더코드": "motherCode", "상품코드": "prodCode", "상품명": "prodName",
                         "상품군": "prodGroup", "판매단가": "unitPrice", "취급액": "revenue"}, inplace=True)

    return data

모델링의 편의를 위해 한글로 된 변수 이름을 영어로 바꿔주는 함수입니다.

In [4]:
# broadTime imputation
def broadTime_imp(data):
    na_broadTime_idx = list(data[data["broadTime"].isna()].index)
    max_num = 0
    last_idx = -1
    num = 1

    for i, idx in enumerate(na_broadTime_idx):
        if idx - last_idx == 1:
            num += 1
        else:
            max_num = max(max_num, num)
            num = 1

        last_idx = idx

    for idx in na_broadTime_idx:
        max_num_idx = min(max_num, idx)
        impute_val = data[idx - max_num_idx: idx]["broadTime"][~data[idx - max_num_idx: idx]["broadTime"].isna()].iloc[
            -1]
        data["broadTime"].loc[idx] = impute_val

    for i in range(len(data)):
        val = data["broadTime"].iloc[i]
        if 19 < val < 21:
            val = 20
        elif 29 < val < 31:
            val = 30
        else:
            val = round(val)
        data["broadTime"].iloc[i] = val

    data['broadTime'] = data['broadTime'].astype(int)

    return data

broadTime, 즉 방송시간이 nan인 것들을 imputation 해줍니다.

In [5]:
# drop nas
def na_dropping(data, flag=True):
    if flag:
        data = data[data["revenue"] != 50000]
        data = data[~data["revenue"].isna()]
        data = data[data["prodGroup"] != "무형"]
        data.reset_index(drop=True, inplace=True)
    else:
        data['revenue'] = 10 ** 5 # dummy
        data = data[data["prodGroup"] != "무형"]
        data.reset_index(drop=True, inplace=True)
    return data

flag=True는 트레인 데이터를 의미하고 flag=False는 테스트 테이터를 의미합니다. <br>
트레인시에는 무형인 데이터와 revenue가 5만인 데이터를 빼주고, 테스트시에는 무형인 데이터를 빼줍니다.

In [6]:
def target_transformation(data):
    data["target"] = np.divide(data["revenue"], data["unitPrice"]).astype(np.float64)
    data["target"] = np.divide(data["target"], data["broadTime"]).astype(np.float64)
    data["target"] = np.log(data["target"])

    return data

target을 revenue에서 log(분당판매량((revenue/unitPrice)/broadTime))으로 바꿔줍니다.

In [7]:
# 하나로 합치기
def first_eda(data, flag=True):
    dat = rename_valid(data)
    dat = broadTime_imp(dat)
    dat = na_dropping(dat, flag=flag)
    dat = target_transformation(dat)

    return dat

이 모든 과정을 하나로 정리한 함수입니다

In [8]:
train = first_eda(train)
test = first_eda(test, flag=False)

적용시켜줍니다

In [9]:
# add new variables
def add_newvars(data):
    # gender
    isFemale = [1 if "여성" in name or "여자" in name else 0 for name in data.prodName]
    #isMale = [1 if "남성" in name or "남자" in name else 0 for name in data.prodName]
    data["isFemale"] = isFemale
    #data["isMale"] = isMale

    # 할부
    paymentPlan = [1 if "무이자" in name or "(무)" in name else 0 for name in data.prodName]
    data["paymentPlan"] = paymentPlan

    return data

새로운 변수: 성별변수와 할부여부를 넣어줍니다.

In [10]:
#change dayofweek: 0, 1, 2시 인 경우 전날로 바꿔줌
def change_dayofWeek(data):
    #dayofWeek
    broadDateTime = pd.to_datetime(data['broadDateTime'])

    day_lst_f = []

    for i in range(broadDateTime.shape[0]):
        year = broadDateTime[i].year
        month = broadDateTime[i].month
        day = broadDateTime[i].day
        hour = broadDateTime[i].hour
        broadDayofWeek = broadDateTime[i].weekday()

        if hour== 0 or hour ==1 or hour==2 :
            today = datetime(year,month,day)
            yes = today - timedelta(days = 1)
            yes = yes.weekday()
            day_lst_f.append(yes)
        else:
            day_lst_f.append(broadDayofWeek)

    data['broadDayOfWeek'] = day_lst_f
    return data

#change time: 0시->24시, 1시->25시, 2시->26시
def change_time(data):
    day_lst = []
    broadDateTime = pd.to_datetime(data['broadDateTime'])

    for i in range(broadDateTime.shape[0]):
        year = broadDateTime[i].year
        month = broadDateTime[i].month
        day = broadDateTime[i].day
        hour = broadDateTime[i].hour
        minute = broadDateTime[i].minute
        second = broadDateTime[i].second
        today = datetime(year, month, day)
        yes = today - timedelta(days = 1)

        if hour== 0 : hour = '24'
        elif hour == 1 : hour = '25'
        elif hour == 2 : hour = '26'

        if hour in ['24', '25', '26'] :
            yesterday = str(yes.year) + '-' +str(yes.month)+ '-'+str(yes.day)
            then = str(hour) + ':' +str(minute)+ ':'+ str(second)
            day_lst.append(yesterday+' '+then)
        else:
            day_lst.append(str(broadDateTime[i]))

    #broadDateTime 바꾸고
    data['broadDateTime'] =day_lst

    #각각 파싱
    lst_Y=[]
    lst_M = []
    lst_D =[]
    lst_H = []
    lst_Mi = []
    for i in range(len(day_lst)):
        broadYear = str(data['broadDateTime'][i]).split('-')[0]
        broadMonth = str(data['broadDateTime'][i]).split('-')[1]
        broadDay = str(data['broadDateTime'][i]).split('-')[2].split(' ')[0]
        broadHour = str(data['broadDateTime'][i]).split('-')[2].split(' ')[1].split(':')[0]
        broadMin = str(data['broadDateTime'][i]).split('-')[2].split(' ')[1].split(':')[1]
        lst_Y.append(int(broadYear))
        lst_M.append(int(broadMonth))
        lst_D.append(int(broadDay))
        lst_H.append(int(broadHour))
        lst_Mi.append(int(broadMin))

    data.insert(1, "broadYear", lst_Y)
    data.insert(2, "broadMonth", lst_M)
    data.insert(3, "broadDay", lst_D)
    data.insert(4, "broadHour", lst_H)
    data.insert(5, "broadMin", lst_Mi)

    return data

0시, 1시, 2시는 새벽이므로 그 전날과 더 가깝다고 보아 날짜를 바꿔주고, 혼동의 여지가 없도록 0시를 24시로 1시를 25시로 <br> 2시를 26시로 바꿔주는 함수입니다.

In [11]:
#number of products in same time
def prodCount(data):
    Map=dict()
    data['prodCount']=0
    for x in data['broadDateTime'] :
        time = str(x)
        if time not in Map:
            Map[time] = 1
        else:
            Map[time] +=1
        data['prodCount'][data['broadDateTime']==time]=Map[time]

    return data

한 방송시간대당 몇개의 상품을 판매했는지를 나타내는 prodCount변수를 새로 만들어줍니다.

In [12]:
#holidays
def holidays(data):
    holidays= []
#일요일인 국가 지정 공휴일은 제외
    holiday_lst = ['2019-01-01','2019-02-04','2019-02-05','2019-02-06',
              '2019-03-01','2019-05-06','2019-06-06','2019-08-15',
              '2019-09-12', '2019-09-13','2019-09-14', '2019-10-03','2019-10-09',
              '2019-12-25', '2020-06-06', '2019-1-1','2019-2-4','2019-2-5','2019-2-6',
              '2019-3-1','2019-5-6','2019-6-6','2019-8-15',
              '2019-9-12', '2019-9-13','2019-9-14', '2019-10-3','2019-10-9',
              '2019-12-25', '2020-6-6']

    for i in range(data.shape[0]):
    #holiday list 속 애들 - append(1)
        if (data.broadDateTime[i].split(' ')[0] in holiday_lst) == True:
            holidays.append(1)
    #holiday list 속 애들이 아니면서 dayofWeek이 6인 것 (일요일인것)
        elif data.broadDayOfWeek[i] ==6:
            holidays.append(1)
    #쟤도 걔도 아닌것.
        else:
            holidays.append(0)

    data['isHoliday'] = holidays

    ## holiday len ##

    data["isHoliday"][data["broadDayOfWeek"] == 5] = 1
    data["isHoliday"][data["broadDayOfWeek"] == 6] = 1

    grouped = data.groupby(["broadMonth", "broadDay"]).mean().reset_index()
    grouped = grouped[["broadMonth", "broadDay", "isHoliday"]]

    holiday_len = []
    cnt = 0
    for holiday in grouped["isHoliday"]:
        if holiday == 1:
            cnt += 1
        elif holiday == 0:
            for _ in range(cnt):
                holiday_len.append(cnt)
            cnt = 0
            holiday_len.append(cnt)

    grouped["holidayLen"] = holiday_len
    data = data.merge(grouped)
    data.drop("isHoliday", inplace = True, axis = 1)

    return data

연속으로 몇 번째 공휴일인지 나타내는 변수 holidayLen을 만드는 함수입니다.

In [13]:
# add index
def add_index(data):
    data['unnamed'] = data.index

    # 비교군 2개
    li_time1 = list(map(lambda x: '0'+str(x) if len(str(x)) == 1 else x, data['broadMonth'].values.tolist()))
    li_time2 = list(map(lambda x: '0'+str(x) if len(str(x)) == 1 else x, data['broadDay'].values.tolist()))
    li_time = [str(x)+str(y) for x, y in zip(li_time1, li_time2)]

    data['time_index'] = li_time
    data = data.sort_values(['prodCode', 'time_index'])

    li_prod = data['prodCode'].values.tolist()
    li_time = data['time_index'].values.tolist()

    n=1
    index =[]
    index.append(n)

    for i in range(len(data)-1):
        if li_prod[i] != li_prod[i+1]: # 다른 상품이면 n 초기화
            n = 1
            index.append(n)
            continue
        if li_time[i] != li_time[i+1]: # 같은 상품에, 연속된거 아니면 n 초기화
            n = 1
            index.append(n)
            continue

        n += 1
        index.append(n)

    data['prod_index'] = index

    tmp = data.groupby(['prodCode', 'broadMonth', 'broadDay']).max()['prod_index'].reset_index()
    tmp = pd.merge(data, tmp, how='left', on=['prodCode', 'broadMonth', 'broadDay'])
    tmp.sort_values('unnamed', inplace=True)
    tmp.rename(columns={'prod_index_y': 'max_index',
                        'prod_index_x': 'prod_index'}, inplace=True)
    tmp.drop(['time_index', 'unnamed'], axis=1, inplace=True)

    ## start index & end index

    tmp["start_index"], tmp["end_index"] = np.zeros(tmp.shape[0]), np.zeros(tmp.shape[0])
    tmp["start_index"][tmp["prod_index"] == 1] = 1
    tmp["end_index"][tmp["prod_index"] == tmp["max_index"]] = 1

    tmp[["start_index", "end_index"]] = tmp[["start_index", "end_index"]].astype("int")

    return tmp

연속된 방송중 몇 번째 방송인지 나타내주는 변수 prod_index를 만들고, start_index는 시작 방송인지 max_index는 마지막 방송인지를 나타냅니다.

In [14]:
# 6월인지 아닌지 / test는 무조건 1
def isJune(data, flag=True):
    if flag:
        data["isJune"] = np.zeros(data.shape[0])
        data["isJune"][data["broadMonth"] == 6] = 1
    else:
        data["isJune"] = 1

    data["isJune"] = data["isJune"].astype('int')

    return data

학습을 위해 6월을 구분하는 함수를 만들었습니다.

In [15]:
# 조기품절되었는지
def stop(data):
    timeMap = dict()
    for month, day, maxIndex, code,prod, time in zip(data['broadMonth'], data['broadDay'], data["max_index"],data['prodCode'],data["prodName"],data["broadTime"]):
        name = str(month)+str(day)+str(code)+str(prod) + str(maxIndex)
        if name not in timeMap:
            timeMap[name] = time
        else:
            if timeMap[name] < time:
                timeMap[name] = time
            else:
                continue

    def make_stop(month, day, maxIndex, code, prod, time):
        name = str(month)+str(day)+str(code)+str(prod) + str(maxIndex)
        return time/timeMap[name]

    data['stop'] = data.apply(lambda row: make_stop(row['broadMonth'], row['broadDay'],row["max_index"], row['prodCode'],row["prodName"], row["broadTime"]), axis=1)
    data['stop'] = 1/data['stop']

    return data

조기품절되었는지의 여부를 나타내주는 변수 stop입니다. 즉, 원래의 방송시간보다 짧게 방송되었다는 의미입니다.

In [16]:
# 제품명에 이름이 들어갔는지
def isNamed(data):
    names = {'전지현', '팽현숙', '전철우', '강레오', '김선영', '김정문', '김정배',
             '김규흔', '이봉원', '오세득', '유귀열', '이경제', '이만기', '이보은', '임성근', '최인선',
             '김병만', '김병지', '이정섭', '이동수', '서장훈', '송도순', '효재', '천수봉', '숀리',
             '임화자', 'aab', 'aac'}

    data['isNamed'] = 0
    for i, prod_name in enumerate(data.prodName.values.tolist()):
        for name in names:
            if name in prod_name.lower():
                data.isNamed.iloc[i] = 1
                break

    data['isNamed'] = data['isNamed'].astype(int)

    return data

named 제품인지를 나타내주는 변수 isNamed를 만드는 함수입니다.

In [17]:
# 월급날 변수
def isPayday(data):
    payday={10, 11, 12, 13, 20, 21, 22, 23, 25, 26, 27, 28}
    isPayday = [1 if day in payday else 0 for day in data.broadDay]
    data['isPayday'] = isPayday

    return data

월급날 여부를 알려주는 isPayday 변수입니다.

In [18]:
# 외부시청률: 방영 전 시간대 인기많은 프로그램의 갯수
def isPre(train, broad_final):
    tmp = np.zeros((train.shape[0],), dtype=int)
    for i, row in train.iterrows():
        broad_new = broad_final[broad_final.month == row.loc['broadMonth']]
        broad_new = broad_new[broad_new.day == row.loc['broadDay']]
        broad_new = broad_new[broad_new.start_hour == row.loc['broadHour']-1]
        tmp[i] = len(broad_new)
    return tmp

# 외부시청률: 방영 시간대 인기많은 프로그램의 갯수
def isNow(train, broad_final):
    tmp = np.zeros((train.shape[0],), dtype=int)
    for i, row in train.iterrows():
        broad_new = broad_final[broad_final.month == row.loc['broadMonth']]
        broad_new = broad_new[broad_new.day == row.loc['broadDay']]
        broad_new = broad_new[broad_new.start_hour == row.loc['broadHour']]
        tmp[i] = len(broad_new)
    return tmp

# 외부시청률: 방영 후 시간대 인기많은 프로그램의 갯수
def isNext(train, broad_final):
    tmp = np.zeros((train.shape[0],), dtype=int)
    for i, row in train.iterrows():
        broad_new = broad_final[broad_final.month == row.loc['broadMonth']]
        broad_new = broad_new[broad_new.day == row.loc['broadDay']]
        broad_new = broad_new[broad_new.end_hour == row.loc['broadHour']+1]
        tmp[i] = len(broad_new)
    return tmp

# 시청률 변수 넣기
def add_rating(data, flag=True):
    if flag:
        file_names = ['broad_lst1_new.csv', 'broad_lst2_new_no_re.csv', 'broad_lst3_new.csv',
                      'broad_lst4_new.csv', 'broad_lst5_new.csv']
        file_paths = tuple(map(lambda x: os.path.join('../eda/data/rating_csv/', x), file_names))

        files = {}
        for file_name, file_path in zip(file_names, file_paths):
            try:
                df = pd.read_csv(file_path, encoding='utf-8')
                df = df[['title', 'dayOfWeek', 'start_hour', 'start_min', 'end_hour', 'end_min']]
                df.rename(columns={'dayOfweek': 'dayOfWeek'}, inplace=True)
            except:
                df = pd.read_csv(file_path, encoding='utf-8')
                df.columns = ['title', 'dayOfWeek', 'start_hour', 'start_min', 'end_hour', 'end_min']
            if file_name == 'broad_lst5_new.csv':
                df = pd.read_csv(file_path, encoding='utf-8')
                df.columns = ['title', 'start_hour', 'start_min', 'end_hour', 'end_min', 'dayOfWeek']
                df = df[['title', 'dayOfWeek', 'start_hour', 'start_min', 'end_hour', 'end_min']]

            files[file_name[:10]] = df

        broad_lst = pd.concat([*files.values()], axis=0)
        broad_final = pd.read_csv('../eda/data/rating_csv/broad_final_added.csv', index_col=0)

        broad_final_year = [str(x) for x in broad_final.year.values.tolist()]
        broad_final_month = ['0' + str(x) if len(str(x)) < 2 else str(x)
                             for x in broad_final.month.values.tolist()]
        broad_final_day = ['0' + str(x) if len(str(x)) < 2 else str(x)
                           for x in broad_final.day.values.tolist()]

        broadDateTime = list(map(lambda x, y, z: x + y + z, broad_final_year, broad_final_month,
                                 broad_final_day))
        broadDateTime = pd.to_datetime(broadDateTime, utc=False)
        broadDayOfWeek = [x.weekday() for x in broadDateTime]

        broad_final.insert(3, "dayOfWeek", broadDayOfWeek)
        broad_final = broad_final.drop(['start_hour', 'start_min', 'end_hour', 'end_min'], axis=1)
        broad_final = pd.merge(broad_final, broad_lst, on=['dayOfWeek', 'title'])

        broad_final.loc[2272, 'day'] = 14
        broad_final.loc[2278, 'day'] = 14

    else:
        file_path = os.path.join('../eda/data/rating_csv/', 'broad_lst_2020_new.csv')

        df = pd.read_csv(file_path, encoding='utf-8')
        df.columns = ['title', 'start_hour', 'start_min', 'end_hour', 'end_min', 'dayOfWeek']
        df = df[['title', 'dayOfWeek', 'start_hour', 'start_min', 'end_hour', 'end_min']]

        broad_lst = df
        broad_final = pd.read_csv('../eda/data/rating_csv/broad_final_2020.csv', index_col=0)

        broad_final_year = [str(x) for x in broad_final.year.values.tolist()]
        broad_final_month = ['0' + str(x) if len(str(x)) < 2 else str(x)
                             for x in broad_final.month.values.tolist()]
        broad_final_day = ['0' + str(x) if len(str(x)) < 2 else str(x)
                           for x in broad_final.day.values.tolist()]

        broadDateTime = list(map(lambda x, y, z: x + y + z, broad_final_year, broad_final_month,
                                 broad_final_day))
        broadDateTime = pd.to_datetime(broadDateTime, utc=False)
        broadDayOfWeek = [x.weekday() for x in broadDateTime]

        broad_final.insert(3, "dayOfWeek", broadDayOfWeek)
        broad_final = broad_final.drop(['start_hour', 'start_min', 'end_hour', 'end_min'], axis=1)
        broad_final = pd.merge(broad_final, broad_lst, on=['dayOfWeek', 'title'])

    pre = isPre(data, broad_final)
    now = isNow(data, broad_final)
    next = isNext(data, broad_final)
    data['rating'] = list(map(lambda x, y, z: max((x, y, z)), pre, now, next))
    data['rating'] = data['rating'].astype('category')

    return data

외부에서 크롤링한 시청률 데이터를 이용해 전시간대/방영시간대/후시간대에 인기 프로그램(시청률 8.5 이상) 갯수를 각각 구하고 <br> 3개중 max값을 rating변수로 이용합니다.

In [19]:
# add additional data
scaler = MinMaxScaler()
def add_exdata(data):
    data.insert(1, "no", range(0, len(data)))

    # priceIndex Variable
    pi = pd.read_excel("../eda/data/priceIndex_2015100__201901-202007.xlsx", header=0)
    pi=pi[['시점','식품','식품 이외']] # 전국 생활물가지수 사용

    broadYear=[int(i[0:4]) for i in pi['시점']]
    broadMonth=[int(i[6:8]) for i in pi['시점']]
    pi.insert(1,'broadYear',broadYear)
    pi.insert(2,'broadMonth',broadMonth)
    pi.drop('시점',axis=1,inplace=True)

    foodIndex=[108.79-108.45]
    for i in range(len(pi)-1):
        delta=pi['식품'][i+1]-pi['식품'][i]
        foodIndex.append(delta)

    nonfoodIndex=[101.49-102.12]

    for i in range(len(pi)-1):
        delta2=pi['식품 이외'][i+1]-pi['식품 이외'][i]
        nonfoodIndex.append(delta2)

    pi['foodIndex']=foodIndex
    pi['nonfoodIndex']=nonfoodIndex

    data_food=data[(data['prodGroup']=='농수축')|(data['prodGroup']=='건강기능')]
    data_food=pd.merge(data_food,pi[['broadYear','broadMonth','foodIndex']],
                   on=['broadYear','broadMonth'])
    data_food.rename(columns={'foodIndex':'priceIndex'},inplace=True)
    data_nonfood=data[~((data['prodGroup']=='농수축')|
                    (data['prodGroup']=='건강기능'))]
    data_nonfood=pd.merge(data_nonfood,pi[['broadYear','broadMonth','nonfoodIndex']],
                      on=['broadYear','broadMonth'])
    data_nonfood.rename(columns={'nonfoodIndex':'priceIndex'},inplace=True)
    data=pd.concat([data_food,data_nonfood])

    # tempNorm Variable
    temper = pd.read_csv('../eda/data/temper2.csv', header=6, encoding='cp949')

    temper["날짜"] = pd.to_datetime(temper["날짜"])

    broadYear = temper["날짜"].dt.year
    broadMonth = temper["날짜"].dt.month
    broadDay = temper["날짜"].dt.day
    temper.insert(1, "broadYear", broadYear)
    temper.insert(2, "broadMonth", broadMonth)
    temper.insert(3, "broadDay", broadDay)
    temper.drop("날짜", axis=1, inplace=True)

    data = pd.merge(data, temper[['broadYear', 'broadMonth',
                                  'broadDay', '평균기온(℃)']],
                    on=['broadYear', 'broadMonth', 'broadDay'])
    data.rename(columns={'평균기온(℃)': 'temperature'}, inplace=True)

    m = pd.DataFrame()

    for i in data['broadMonth'].unique():
        dataMonth = data[data['broadMonth'] == i]
        dataMonth['tempNorm'] = scaler.fit_transform(dataMonth[['temperature']])
        m = pd.concat([m, dataMonth])

    data = m.drop('temperature', axis=1)


    # rainAvgAll, rainAvgCap Variable(강수량)
    rain_all = pd.read_excel('../eda/data/rain_all.xlsx', header=13)
    rain_cap = pd.read_excel('../eda/data/rain_cap.xlsx', header=13)

    rain_all = rain_all[['일시', '평균일강수량(mm)']]
    rain_cap = rain_cap[['일시', '평균일강수량(mm)']]

    rain_all = rain_all[~rain_all.일시.isnull()]
    rain_cap = rain_cap[~rain_cap.일시.isnull()]

    broadYear = rain_all["일시"].dt.year
    broadMonth = rain_all["일시"].dt.month
    broadDay = rain_all["일시"].dt.day
    rain_all.insert(1, "broadYear", broadYear)
    rain_all.insert(2, "broadMonth", broadMonth)
    rain_all.insert(3, "broadDay", broadDay)
    rain_all.drop('일시', axis=1, inplace=True)

    broadYear = rain_cap["일시"].dt.year
    broadMonth = rain_cap["일시"].dt.month
    broadDay = rain_cap["일시"].dt.day
    rain_cap.insert(1, "broadYear", broadYear)
    rain_cap.insert(2, "broadMonth", broadMonth)
    rain_cap.insert(3, "broadDay", broadDay)
    rain_cap.drop('일시', axis=1, inplace=True)

    rain_all2019 = rain_all[rain_all.broadYear == 2019]
    rain_all202006 = rain_all[(rain_all.broadYear == 2020) & (rain_all.broadMonth == 6)]
    rain_cap2019 = rain_cap[rain_cap.broadYear == 2019]
    rain_cap202006 = rain_cap[(rain_cap.broadYear == 2020) & (rain_cap.broadMonth == 6)]

    scalerAll = MinMaxScaler()
    scalerAll.fit(rain_all2019[['평균일강수량(mm)']])
    scalerCap = MinMaxScaler()
    scalerCap.fit(rain_cap2019[['평균일강수량(mm)']])

    rain_all['평균일강수량(mm)'] = scalerAll.transform(rain_all[['평균일강수량(mm)']])
    rain_cap['평균일강수량(mm)'] = scalerCap.transform(rain_cap[['평균일강수량(mm)']])

    data = pd.merge(data, rain_all[['broadYear', 'broadMonth', 'broadDay',
                                    '평균일강수량(mm)']],
                    on=['broadYear', 'broadMonth', 'broadDay'])
    data.rename(columns={"평균일강수량(mm)": "rainAvgWhole"},
                inplace=True)

    data = pd.merge(data, rain_cap[['broadYear', 'broadMonth', 'broadDay',
                                    '평균일강수량(mm)']],
                    on=['broadYear', 'broadMonth', 'broadDay'])
    data.rename(columns={"평균일강수량(mm)": "rainAvgCap"},
                inplace=True)

    #dust Variable: 미세먼지

    dust1_2=pd.read_excel('../eda/data/dust/19-1미세먼지2.5.xlsx',header=4)
    dust1_10=pd.read_excel('../eda/data/dust/19-1미세먼지10.xlsx',header=4)
    dust2_2=pd.read_excel('../eda/data/dust/19-2미세먼지2.5.xlsx',header=4)
    dust2_10=pd.read_excel('../eda/data/dust/19-2미세먼지10.xlsx',header=4)
    dust3_2=pd.read_excel('../eda/data/dust/19-3미세먼지2.5.xlsx',header=4)
    dust3_10=pd.read_excel('../eda/data/dust/19-3미세먼지10.xlsx',header=4)
    dust4_2=pd.read_excel('../eda/data/dust/19-4미세먼지2.5.xlsx',header=4)
    dust4_10=pd.read_excel('../eda/data/dust/19-4미세먼지10.xlsx',header=4)
    dust5_2=pd.read_excel('../eda/data/dust/19-5미세먼지2.5.xlsx',header=4)
    dust5_10=pd.read_excel('../eda/data/dust/19-5미세먼지10.xlsx',header=4)
    dust6_2=pd.read_excel('../eda/data/dust/19-6미세먼지2.5.xlsx',header=4)
    dust6_10=pd.read_excel('../eda/data/dust/19-6미세먼지10.xlsx',header=4)
    dust7_2=pd.read_excel('../eda/data/dust/19-7미세먼지2.5.xlsx',header=4)
    dust7_10=pd.read_excel('../eda/data/dust/19-7미세먼지10.xlsx',header=4)
    dust8_2=pd.read_excel('../eda/data/dust/19-8미세먼지2.5.xlsx',header=4)
    dust8_10=pd.read_excel('../eda/data/dust/19-8미세먼지10.xlsx',header=4)
    dust9_2=pd.read_excel('../eda/data/dust/19-9미세먼지2.5.xlsx',header=4)
    dust9_10=pd.read_excel('../eda/data/dust/19-9미세먼지10.xlsx',header=4)
    dust10_2=pd.read_excel('../eda/data/dust/19-10미세먼지2.5.xlsx',header=4)
    dust10_10=pd.read_excel('../eda/data/dust/19-10미세먼지10.xlsx',header=4)
    dust11_2=pd.read_excel('../eda/data/dust/19-11미세먼지2.5.xlsx',header=4)
    dust11_10=pd.read_excel('../eda/data/dust/19-11미세먼지10.xlsx',header=4)
    dust12_2=pd.read_excel('../eda/data/dust/19-12미세먼지2.5.xlsx',header=4)
    dust12_10=pd.read_excel('../eda/data/dust/19-12미세먼지10.xlsx',header=4)
    dust206_2=pd.read_excel('../eda/data/dust/20-6미세먼지2.5.xlsx',header=4)
    dust206_10=pd.read_excel('../eda/data/dust/20-6미세먼지10.xlsx',header=4)

    lst2019=[dust1_2,
     dust2_2,
     dust3_2,
     dust4_2,
     dust5_2,
     dust6_2,
     dust7_2,
     dust8_2,
     dust9_2,
     dust10_2,
     dust11_2,
     dust12_2,
     dust1_10,
     dust2_10,
     dust3_10,
     dust4_10,
     dust5_10,
     dust6_10,
     dust7_10,
     dust8_10,
     dust9_10,
     dust10_10,
     dust11_10,
     dust12_10,
            ]

    dust2019=pd.DataFrame(columns=['broadYear','broadMonth','broadDay','dust2.5','dust10'])
    for i in range(12):
        lst2019[i]=pd.DataFrame(lst2019[i].iloc[0,2:]).reset_index().rename(
            columns={'index':'broadDay',0:'dust2.5'})
        lst2019[i]['broadMonth']=i+1
        lst2019[i+12]=pd.DataFrame(lst2019[i+12].iloc[0,2:]).reset_index().rename(
            columns={'index':'broadDay',0:'dust10'})
        lst2019[i+12]['broadMonth']=i+1
        dust2019=pd.concat([dust2019,(pd.merge(lst2019[i],lst2019[i+12],on=['broadMonth','broadDay']))])

    dust2019['broadDay']=dust2019['broadDay'].str.rstrip('일')
    dust2019['broadDay']=dust2019['broadDay'].astype('int')
    dust2019['broadYear']=2019
    dust2019.dropna(axis=0,inplace=True) # 없는 날짜가 존재하는 행 삭제

    dust202006=pd.DataFrame(columns=['broadYear','broadMonth','broadDay','dust2.5','dust10'])

    dust206_2=pd.DataFrame(dust206_2.iloc[0,2:]).reset_index().rename(
        columns={'index':'broadDay',0:'dust2.5'})
    dust206_2['broadMonth']=6
    dust206_10=pd.DataFrame(dust206_10.iloc[0,2:]).reset_index().rename(
        columns={'index':'broadDay',0:'dust10'})
    dust206_10['broadMonth']=6
    dust202006=pd.concat([dust202006,(pd.merge(dust206_2,dust206_10,on=['broadMonth','broadDay']))])

    dust202006['broadDay']=dust202006['broadDay'].str.rstrip('일')
    dust202006['broadDay']=dust202006['broadDay'].astype('int')
    dust202006['broadYear']=2020
    dust202006.dropna(axis=0,inplace=True) # 없는 날짜가 존재하는 행 삭제

    dust=pd.concat([dust2019,dust202006])

    fc2 = []
    for i in dust['dust2.5'].values:
        if i <= 15:
            fc2.append(1)
        elif 16 <= i <= 35:
            fc2.append(2)
        elif 36 <= i <= 75:
            fc2.append(3)
        else:
            fc2.append(4)

    fc10 = []
    for i in dust['dust10'].values:
        if i <= 30:
            fc10.append(1)
        elif 31 <= i <= 80:
            fc10.append(2)
        elif 81 <= i <= 150:
            fc10.append(3)
        else:
            fc10.append(4)

    dustOrd = []
    for i, j in zip(fc2, fc10):
        dustOrd.append(max([i, j]))

    # 카테고리로 변환
    dustCat = []
    for i in dustOrd:
        if i == 1:
            dustCat.append('좋음')
        elif i == 2:
            dustCat.append('보통')
        elif i == 3:
            dustCat.append('나쁨')
        else:
            dustCat.append('매우나쁨')

    dust['dustCat'] = dustCat
    dust = dust.drop(['dust2.5', 'dust10'], axis=1)

    data=pd.merge(data,dust,on=['broadYear','broadMonth','broadDay'])

    data.sort_values(by='no', inplace=True)
    data.reset_index(drop=True, inplace=True)

    del data['no']
    del data['broadYear']
    del data['broadDateTime']
    del data['broadMin']

    return data

외부변수: 생활물가지수(전월대비 증감률), 기온(달에 대해서 정규화), 강수량(1년에 대해서 정규화), 미세먼지 지수를 <br> 넣어줍니다.

In [20]:
# 방영달 -> 계절로 변화(봄, 여름, 가을, 겨울)
def broadMonth_app(lst):
    res = []
    for item in lst:
        if item in (3, 4, 5):
            res.append(0)
        elif item in (6, 7, 8):
            res.append(1)
        elif item in (9, 10, 11):
            res.append(2)
        else:
            res.append(3)

    return res

# 방영 날짜: 1-10일이면 전반, 11-20이면 중반, 21-이면 후반
def broadDay_app(lst):
    res = []
    for item in lst:
        if item in range(1, 11):
            res.append(0)
        elif item in range(11, 21):
            res.append(1)
        else:
            res.append(2)
    return res

# 적용
def change_broad_times(data):
    data['broadMonth'] = broadMonth_app(data['broadMonth'].values.tolist())
    data['broadDay'] = broadDay_app(data['broadDay'].values.tolist())

    return data

방영달은 계절로 바꾸고, 방영날짜는 초/중/후반으로 바꿔줍니다.

In [21]:
DIM = 64

names = {'전지현', '팽현숙', '전철우', '강레오', '김선영', '김정문', '김정배',
         '김규흔', '이봉원', '오세득', '유귀열', '이경제', '이만기', '이보은', '임성근', '최인선',
         '김병만', '김병지', '이정섭', '이동수', '서장훈', '송도순', '효재', '천수봉', '숀리',
         '임화자', 'aab', 'aac'}

# 제품명 정규화
def regex_str(string: str):
    string = string.lower()
    string = re.sub('[^\w\s]', ' ', string)
    string = re.sub('\d+', ' ', string)
    string = string.replace('여자', '여성').replace('남자', '남성')
    string = string.replace('\(무\)', '무이자').replace('무\)', '무이자')
    string = string.replace('\(일\)', '일시불').replace('일\)', '일시불')
    string = re.sub('밥솥', ' 밥솥 ', string)
    string = re.sub('침대', ' 침대 ', string)
    string = re.sub('에어컨', ' 에어컨 ', string)
    string = re.sub('노트북', ' 노트북 ', string)
    string = re.sub('티셔츠', ' 티셔츠 ', string)
    string = re.sub('스타일러', ' 스타일러 ', string)
    string = re.sub('손질', ' 손질 ', string)
    string = string.replace("s/s","ss").replace("ss", "시즌")
    string = string.replace("f/w", "시즌").replace("썸머", "시즌")
    string = string.replace("lg", " lg ")
    string = string.replace("울트라hd", " uhd ")
    string = string.replace("tv", " tv ")
    string = string.replace("김치", " 김치 ")
    string = string.replace("기초세트", " 기초세트 ")
    for name in names:
        string = string.replace(name, ' ')
    string = ' '.join([x for x in string.split() if len(x) > 1])
    return string

# corpus 만들기
def make_test_corpus():
    global test
    test_prodName_corpus = set(' '.join(set(test['prodName'].apply(regex_str).values)).split())
    return test_prodName_corpus


test_prodName_corpus = make_test_corpus()


def regex_str_train(string: str):
    string = string.lower()
    string = re.sub('[^\w\s]', ' ', string)
    string = re.sub('\d+', ' ', string)
    string = string.replace('여자', '여성').replace('남자', '남성')
    string = string.replace('\(무\)', '무이자').replace('무\)', '무이자')
    string = string.replace('\(일\)', '일시불').replace('일\)', '일시불')
    string = re.sub('밥솥', ' 밥솥 ', string)
    string = re.sub('침대', ' 침대 ', string)
    string = re.sub('에어컨', ' 에어컨 ', string)
    string = re.sub('노트북', ' 노트북 ', string)
    string = re.sub('티셔츠', ' 티셔츠 ', string)
    string = re.sub('스타일러', ' 스타일러 ', string)
    string = re.sub('손질', ' 손질 ', string)
    string = string.replace("s/s","ss").replace("ss", "시즌")
    string = string.replace("f/w", "시즌").replace("썸머", "시즌")
    string = string.replace("lg", " lg ")
    string = string.replace("울트라hd", " uhd ")
    string = string.replace("tv", " tv ")
    string = string.replace("김치", " 김치 ")
    string = string.replace("기초세트", " 기초세트 ")
    for name in names:
        string = string.replace(name, ' ')
    string = ' '.join([x for x in string.split() if len(x) > 1 and x in test_prodName_corpus])
    return string


stopwords = {'gs', 'by', 'x', 'vbc', '봉', 'gabl', 'l', 'j',
             'tq', 'a', 'un', 'the', 'dv', 'kna', 'nu', 'jk', '매',
             'aab의', 'ih', 'ev', 'bna', '종', 'ia', 'kwa', 'kg',
             'gablcrp', 'crp', 'in', 'hnc', '인용', 'g', 'af', 'um',
             'fg', 'arc', 'nt', 'fq', 'b', 'uk', 'm', 'qs', '일시불', '무이자',
             'wwj', 'pat', 'aae', 'fxkr', 'qv', 'fs', 'knb', '남성', '여성'} | names

# tfidf vectorization
def make_tfidf(data, flag=True):
    if flag:
        vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=150)
        corpus = list(data['prodName'].apply(regex_str_train).values)
        corpus.extend(list(test['prodName'].apply(regex_str).values))

        vectorizer.fit(corpus)
        train_tfidf = vectorizer.transform(data['prodName'].apply(regex_str_train)).toarray()

        svd = TruncatedSVD(n_components=DIM, n_iter=7, random_state=42)
        train_tfidf = svd.fit_transform(train_tfidf)
        train_tfidf = pd.DataFrame(train_tfidf, columns=['Feature'+str(i) for i in range(DIM)])

        data = pd.concat([data, train_tfidf], axis=1)

        joblib.dump(vectorizer, 'vectorizer.pkl')
    else:
        vectorizer = joblib.load('vectorizer.pkl')
        test_tfidf = vectorizer.transform(data['prodName'].apply(regex_str)).toarray()

        svd = TruncatedSVD(n_components=DIM, n_iter=7, random_state=42)
        test_tfidf = svd.fit_transform(test_tfidf)
        test_tfidf = pd.DataFrame(test_tfidf, columns=['Feature' + str(i) for i in range(DIM)])

        data = pd.concat([data, test_tfidf], axis=1)
        
    return data

테스트, 트레인 셋을 모두 이용하여 Tfidf Vectorizer를 만들어준 후 정규화된 제품명을 Tf-idf 벡터로 만듭니다. <br>
그대로 쓰면 차원이 너무 커지기 때문에 SVD를 이용해서 64차원으로 만들어줍니다.

In [22]:
def subgroup(data):
    subgroup = pd.read_csv("../eda/data/merged_subGroup.csv")
    data = data.merge(subgroup, how = 'left')
    return data

NS홈쇼핑에서 가져온 소분류 카테고리를 데이터에 합쳐주는 함수입니다.

In [23]:
# 상품군별로 스케일링
def priceScaler(data, flag=True):
    if flag:
        data['unitPriceOrigin'] = data['unitPrice']

        for group in data['prodGroup'].unique():
            group_idx = data[data['prodGroup'] == group].index

            scaler = StandardScaler()
            scaler.fit(data.loc[group_idx, 'unitPrice'].values.reshape(-1, 1))

            data.loc[group_idx, 'unitPrice'] = scaler.transform(
                data.loc[group_idx, 'unitPrice'].values.reshape(-1, 1))

            file_name = '../eda/data/scaler/scaler_{}.pkl'.format(group)
            joblib.dump(scaler, file_name)

    else:
        data['unitPriceOrigin'] = data['unitPrice']

        for group in data['prodGroup'].unique():
            group_idx = data[data['prodGroup'] == group].index

            file_name = '../eda/data/scaler/scaler_{}.pkl'.format(group)
            scaler = joblib.load(file_name)

            data.loc[group_idx, 'unitPrice'] = scaler.transform(
                data.loc[group_idx, 'unitPrice'].values.reshape(-1, 1))
            
    return data

상품군별로 판매가격을 정규화해서 unitPrice를 바꿉니다.

## 1.1 train preprocessing

In [24]:
data = train
data = add_newvars(data)
data = change_dayofWeek(data)
data = prodCount(data)
data = change_time(data)
data = holidays(data)
data = add_index(data)
data = isJune(data)
data = stop(data)
data = isNamed(data)
data = isPayday(data)
data = add_rating(data)
data = add_exdata(data)
data = change_broad_times(data)
data = make_tfidf(data)
data = subgroup(data)
data = priceScaler(data)

for col in ('stop', 'priceIndex', 'tempNorm', 'rainAvgWhole', 'rainAvgCap', 'target') + \
            tuple(['Feature' + str(i) for i in range(DIM)]):
    data[col] = list(map(lambda x: round(x, 4), data[col].values.tolist()))

print("Train data successfully preprocessed!")
train = data

Train data successfully preprocessed!


지금까지 만든 EDA 과정을 트레인 데이터에 적용해주고, computational overflow를 막기 위해 실수형 변수들을 <br>
소숫점 4자리까지 반올림해줍니다.

## 1.2 test preprocessing

In [25]:
data = test
data = first_eda(data, flag=False)
data = add_newvars(data)
data = change_dayofWeek(data)
data = prodCount(data)
data = change_time(data)
data = holidays(data)
data = add_index(data)
data = isJune(data, flag=False)
data = stop(data)
data = isNamed(data)
data = isPayday(data)
data = add_rating(data, flag=False)
data = add_exdata(data)
data = change_broad_times(data)
data = make_tfidf(data, flag=False)
data = subgroup(data)
data = priceScaler(data, flag=False)

for col in ('stop', 'priceIndex', 'tempNorm', 'rainAvgWhole', 'rainAvgCap', 'target') + \
            tuple(['Feature' + str(i) for i in range(DIM)]):
    data[col] = list(map(lambda x: round(x, 4), data[col].values.tolist()))

data["target"] = np.nan
data["revenue"] = np.nan

print("Test data successfully preprocessed!")
test = data

Test data successfully preprocessed!


지금까지 만든 EDA 과정을 테스트 데이터에 적용해주고, computational overflow를 막기 위해 실수형 변수들을 <br>
소숫점 4자리까지 반올림해줍니다.

In [26]:
#####
##### 나중에 지울거 ######

train.to_csv("train.csv")
test.to_csv("test.csv")

# 2. Modeling
## 2.1 제품군 embedding 구하기

In [27]:
def preprocess(data):
    X = data.loc[:, 'Feature0':'Feature63']

    prod = data['prodGroup'].unique()
    prod = {val: i for i, val in enumerate(prod)}
    func = lambda x: prod[x]
    data['prodGroup'] = data['prodGroup'].apply(func).astype(int)
    prod = data['subGroup'].unique()
    prod = {val: i for i, val in enumerate(prod)}
    func = lambda x: prod[x]
    data['subGroup'] = data['subGroup'].apply(func).astype(int)

    X1 = data['prodGroup']
    X2 = data['subGroup']
    X = pd.concat([X, X1, X2], axis=1)
    y = data['target']
    return X, y

제품군(prodGroup)과 소분류 카테고리(subGroup - NS홈쇼핑에서 일일이 검색한 카테고리입니다.)를 정수값으로 변경합니다. <br> 이는 임베딩을 위해 필요한 과정입니다.

In [28]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [29]:
BATCH_SIZE = 1024
EMBEDDING_DIM = DIM
EPOCH = 300

# reproducibility
torch.manual_seed(3)
device = "cuda" if torch.cuda.is_available() else "cpu"

embedding dimension은 Tf-idf에서 정한 SVD의 차원인 64와 동일하게 정해줍니다. <br>
왜냐하면 우리의 최종적인 임베딩은 prodGroup 임베딩, 소분류 임베딩, Tf-idf 임베딩을 Linear Projection한 것이기 때문입니다.

In [30]:
class TrainData(Dataset):
    def __init__(self, X_train, y_train):
        # embedding1
        self.x1 = torch.LongTensor(X_train.loc[:, X_train.columns == 'prodGroup'].values)
        # embedding2
        self.x2 = torch.LongTensor(X_train.loc[:, X_train.columns == 'subGroup'].values)
        # embedding3
        self.x3 = torch.FloatTensor(X_train.loc[:, "Feature0":"Feature63"].values)
        self.y = torch.Tensor(y_train.values)

    def __getitem__(self, s):
        return self.x1[s], self.x2[s], self.x3[s], self.y[s]

    def __len__(self):
        return len(self.y)


class TestData(TrainData):
    '''Train 상속'''

In [31]:
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.emb1 = nn.Embedding(num_embeddings=11, embedding_dim=EMBEDDING_DIM)
        self.emb2 = nn.Embedding(num_embeddings=44, embedding_dim=EMBEDDING_DIM)
        self.enc1 = nn.Linear(EMBEDDING_DIM * 3, EMBEDDING_DIM, bias=False)
        self.fc1_0 = nn.Linear(EMBEDDING_DIM, 1)
        self._init_weights()

    def _init_weights(self):
        for layer in [self.emb1.weight, self.emb2.weight, self.enc1.weight]:
            torch.nn.init.kaiming_uniform_(layer)
        for layer in [self.fc1_0.weight]:
            torch.nn.init.kaiming_normal_(layer)
        for layer in [self.fc1_0.bias]:
            torch.nn.init.zeros_(layer)

    def forward(self, x1, x2, x3):
        x1_emb = self.emb1(x1)
        x1_emb = x1_emb.view(-1, x1_emb.shape[-1])
        x2_emb = self.emb2(x2)
        x2_emb = x2_emb.view(-1, x2_emb.shape[-1])
        x_embs = self.enc1(torch.cat([x1_emb, x2_emb, x3], dim=1))
        return self.fc1_0(x_embs), x_embs

In [32]:
X_train, y_train = preprocess(train)

train_dataset = TrainData(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

net = NeuralNet().to(device)
optimizer = optim.Adam(net.parameters())
criterion = nn.MSELoss().to(device)

losses = []

print("Start training...")
for epoch in range(1, EPOCH + 1):
    net.train()
    for x1, x2, x3, y in train_loader:
        x1, x2, x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)
        y_pred, _ = net(x1, x2, x3)
        y_pred = y_pred.view_as(y)
        loss = criterion(y_pred, y)
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 50 == 0:
        print(f"Epoch: {epoch}, Train Loss: {np.mean(losses)}")

torch.save(net.state_dict(), './nn_embs.pth')

Start training...
Epoch: 50, Train Loss: 0.8201319829055241
Epoch: 100, Train Loss: 0.7937009323835373
Epoch: 150, Train Loss: 0.7847264514082954
Epoch: 200, Train Loss: 0.7801392961740494
Epoch: 250, Train Loss: 0.7772836347443717
Epoch: 300, Train Loss: 0.7753065173739478


In [33]:
net.eval()
tmp = TrainData(X_train, y_train)

with torch.no_grad():
    _, train_embs = net(tmp.x1.to(device), tmp.x2.to(device), tmp.x3.to(device))
    df = pd.DataFrame(train_embs.detach().cpu().numpy(), columns=['Feature'+str(i) for i in range(64)])
    for col in df.columns:
        df[col] = np.round(df[col], 4)
    df.to_csv("../modeling/data/train_embs.csv", index=False)

    # test embs
X_test, y_test = preprocess(test)
tmp = TestData(X_test, y_test)

with torch.no_grad():
    _, test_embs = net(tmp.x1.to(device), tmp.x2.to(device), tmp.x3.to(device))
    df = pd.DataFrame(test_embs.detach().cpu().numpy(), columns=['Feature' + str(i) for i in range(64)])
    for col in df.columns:
        df[col] = np.round(df[col], 4)
    df.to_csv("../modeling/data/test_embs.csv", index=False)

print('All done.')

All done.


임베딩을 모두 훈련시킨후, 저장까지해줍니다.

## 2.2 각 모델에 피팅하기

### 2.2.1 데이터 정리

In [2]:
###Lable_encoder###
from sklearn.preprocessing import LabelEncoder

def label_encoder(data):
    encoder = LabelEncoder()
    data['dustCat'] = encoder.fit_transform(data['dustCat'])
    data['subGroup'] = encoder.fit_transform(data['subGroup'])
    data['prodGroup'] = encoder.fit_transform(data['prodGroup'])

    data[['dustCat', 'subGroup', 'prodGroup']] = data[['dustCat', 'subGroup', 'prodGroup']].astype('category')
    return data

dustCat의 원소가 한글이라 get_dummies를 인식못해서 따로 라벨을 붙혀줍니다.

In [3]:
def make_dataset_cate_xgb(data, final):
    try:
        data.drop("Unnamed: 0", axis=1, inplace=True)
    except:
        pass

    print("Original Shape : ", data.shape)

    if final:
        data.drop(['motherCode', 'prodCode', 'prodName', 'prodGroup', 'subGroup', 'isJune'], axis=1, inplace=True)
    else:
        data.drop(['motherCode', 'prodCode', 'prodName', 'prodGroup', 'subGroup'], axis=1, inplace=True)

    broad_cat = ['broadMonth', 'broadDay', 'broadHour', 'broadDayOfWeek', 'holidayLen',
                 'prod_index', 'max_index', 'dustCat']
    for cat in broad_cat:
        data[cat] = data[cat].astype('category')
        
    return data

In [4]:
def make_dataset_cate_rf_lgbm(data, final):
    try:
        data.drop("Unnamed: 0", axis=1, inplace=True)
    except:
        pass

    print("Original Shape : ", data.shape)

    if final:
        data.drop(['motherCode', 'prodCode', 'prodName', 'isJune'], axis=1, inplace=True)
    else:
        data.drop(['motherCode', 'prodCode', 'prodName'], axis=1, inplace=True)

    broad_cat = ['broadMonth', 'broadDay', 'broadHour', 'broadDayOfWeek', 'holidayLen',
                 'prod_index', 'max_index', 'dustCat', 'prodGroup', 'subGroup']
    for cat in broad_cat:
        data[cat] = data[cat].astype('category')
        
    data = label_encoder(data)
    
    return data

- get_dummies 함수를 써서 카테고리 변수를 one-hot-encoding 하기 위해 변수들을 category 타입으로 지정해줍니다.
- 단, start_index, end_index, isFemale, paymentPlan 변수는 0,1 로만 구성되어 있는 카테고리 변수이기 때문에 따로 카테고리 타입으로 지정해주지 않았습니다.
- final(validation이 아닌 최종 train 및 test 예측일 때)이면 isJune 변수도 드랍합니다.
- random forest 모델과 lgbm 모델은 prodGroup과 subGroup을 포함하고, xgboost는 두 변수를 빼고 사용합니다.

In [5]:
## metric
def MAPE(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

메인 메트릭인 MAPE를 정의해줍니다.

In [6]:
def xgb_dataclean(train, train_emb, isFinal, test = 0, test_emb = 0):
    train_origin = train.copy()
    train_xgb = make_dataset_cate_xgb(train, final = isFinal)
    train_xgb.iloc[:, -65:-1] = train_emb
    
    if isFinal:
        test_origin = test.copy()
        test_xgb = make_dataset_cate_xgb(test, final = isFinal)
        test.iloc[:, -65:-1] = test_emb
        
        return train_xgb, train_origin, test_xgb, test_origin
        
    else:
        return train_xgb, train_origin

In [7]:
def rf_lgb_dataclean(train, train_emb, isFinal, test = 0, test_emb = 0):
    train_origin = train.copy()
    train_rf_lgb = make_dataset_cate_rf_lgbm(train, final = isFinal)
    train_rf_lgb.iloc[:, -66:-2] = train_emb 
    
    if isFinal:
        test_origin = test.copy()
        test_rf_lgb = make_dataset_cate_rf_lgbm(test, final = isFinal)
        test_rf_lgb.iloc[:, -66:-2] = test_emb
        
        return train_rf_lgb, train_origin, test_rf_lgb, test_origin
        
    else:
        return train_rf_lgb, train_origin

### 2.2.2 모델 - XGBoost, RandomForest, LGBM
- XGBoost, LGBM : Bayesian Optimization 사용
- RandomForest : Random Search 사용

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization
import lightgbm as lgb 
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

In [9]:
# xgboost가 hyperparameter tuning을 할 때 사용할 parameter의 범위
xgb_pbounds = {'max_depth': (5, 10),
           'learning_rate': (0.001, 0.2),
           'n_estimators': (100, 600),
           'gamma': (1., 0.001),
           'min_child_weight': (2, 10),
           'max_delta_step': (0, 0.1),
           'subsample': (0.5, 1),
           'colsample_bytree': (0.5, 0.99)
           }

In [10]:
# Random Forest가 hyperparameter tuning을 할 때 사용할 parameter의 범위
max_depth = [int(x) for x in np.linspace(10, 110, num = 10)]
max_depth.append(None)

rf_pbounds = {'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 1500, num = 8)],
               'max_features': ['sqrt','auto'],
               'max_depth': max_depth,
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

In [11]:
# lgbm이 hyperparameter tuning을 할 때 사용할 parameter의 범위
lgbm_pbounds = {'num_leaves': (16, 300),
          'feature_fraction': (0.1, 0.9),
          'bagging_fraction': (0.8, 1),
           'max_depth': (5,30),
          'min_split_gain': (0.001, 0.1),
          'min_child_weight': (30,50),
          'learning_rate': (0.001, 0.2),  
          'n_estimators': (16, 300),      
          'subsample': (0, 1),          
          }

### 2.2.2.1 Validation Model

In [12]:
from bayes_opt import BayesianOptimization
from xgboost import XGBRegressor

In [42]:
def train_valid_xgb(train):
    X=train.drop(['target'], axis=1)
    y=train['target']

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,
                                                      random_state=3, stratify= X.isJune)

    X_train.drop("isJune", axis = 1 ,inplace = True)
    X_test.drop("isJune", axis = 1, inplace = True)
    
    X_train=pd.get_dummies(X_train,drop_first=True)
    X_test=pd.get_dummies(X_test,drop_first=True)

    X__train=X_train.drop(['revenue','unitPriceOrigin'],axis=1)
    X__test=X_test.drop(['revenue','unitPriceOrigin'],axis=1)
    
    return X__train, X__test, y_train, y_test, X_test

In [14]:
def train_valid_rf_lgbm(train):
    X = train.drop('target',axis=1)
    y = train['target']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=3, stratify=X.isJune)
    
    X__train = X_train.drop(['isJune','revenue', 'unitPriceOrigin'],axis=1)
    X__test = X_test.drop(['isJune','revenue', 'unitPriceOrigin'], axis=1)
    
    return X__train, X__test, y_train, y_test, X_test

### XGBOOST

In [44]:
def xgb_valid(train, param_search):
    
    print("---------------[XGBoost Model]---------------")
    print("Train test set split!\n")
    
    X__train, X__test, y_train, y_test, X_test = train_valid_xgb(train)

    np.random.seed(3)
    
    def XGB_cv(max_depth, learning_rate, n_estimators, gamma,
               min_child_weight, max_delta_step, subsample,
               colsample_bytree, nthread=-1):
        model = XGBRegressor(max_depth=int(max_depth),
                             learning_rate=learning_rate,
                             n_estimators=int(n_estimators),
                             nthread=nthread,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree)
        MAE = cross_val_score(model, X__train, y_train, scoring='neg_mean_absolute_error', cv = 5).mean()
        return MAE
    
    print("Column : \n", X__train.columns)
    print("\nTrain X shape : ", X__train.shape)
    print("Train Y sahpe : ", y_train.shape)
    print("Test X shape : ", X__test.shape)
    print("\nBayes Optimization start!")
    
    if param_search: # Bayes Optimization으로 hyperparameter tuning 진행
        xgboostBO = BayesianOptimization(f=XGB_cv, pbounds=xgb_pbounds, verbose=2, random_state=3)
        xgboostBO.maximize(init_points=5, n_iter=45)
        xgboostBO = xgboostBO.max
    
    else: # hyperparameter tuning으로 찾아진 hyperparameter 값 바로 사용
        xgboostBO = {'target': -0.31514963452668143, 'params': {'colsample_bytree': 0.561134482078139, 'gamma': 0.001,
                                                            'learning_rate': 0.18329806043912528, 'max_delta_step': 0.06863918133671187,
                                                            'max_depth': 9.980132921667362, 'min_child_weight': 9.928157427363239,
                                                            'n_estimators': 521.4135613718304, 'subsample': 0.9468597761420319}}
        
    print("\nBEST params : ", xgboostBO)
    fit_xgb = XGBRegressor(max_depth=int(xgboostBO['params']['max_depth']),
                   learning_rate=xgboostBO['params']['learning_rate'],
                   n_estimators=int(xgboostBO['params']['n_estimators']),
                   gamma=xgboostBO['params']['gamma'],
                   min_child_weight=xgboostBO['params']['min_child_weight'],
                   max_delta_step=xgboostBO['params']['max_delta_step'],
                   subsample=xgboostBO['params']['subsample'],
                   colsample_bytree=xgboostBO['params']['colsample_bytree'])
        
    xgb_model = fit_xgb.fit(X__train, y_train)

    y_test2 = X_test['revenue']
    y_pred = fit_xgb.predict(X__test)
    final_pred = np.exp(y_pred) * X_test['broadTime'].astype('float') * X_test['unitPriceOrigin']

    mape = MAPE(y_test2, final_pred)
    print('MAPE:', mape)
    
    return fit_xgb

### Random Forest

In [43]:
def rf_valid(train, param_search):
    X__train, X__test, y_train, y_test, X_test = train_valid_rf_lgbm(train)
    
    print("---------------[Random Forest Model]---------------")
    print("Train test set split!\n")

    np.random.seed(3)
    rf = RandomForestRegressor(random_state=3)
    cv = KFold(5, shuffle=True, random_state=3)
    
    print("Column : \n", X__train.columns)
    print("\nTrain X shape : ", X__train.shape)
    print("Train Y sahpe : ", y_train.shape)
    print("Test X shape : ", X__test.shape)
    print("\nRandom Search start!")
    
    if param_search: # Random Search로 hyperparameter tuning 진행
        rfgrid_1 = RandomizedSearchCV(rf, rf_pbounds, cv = cv, scoring= 'neg_mean_squared_error',
                                  n_jobs = 7, return_train_score=True)
        
        bestParams = rfgrid_1.best_params_
   
    else: # hyperparameter tuning으로 찾아진 hyperparameter 값 바로 사용
        bestParams = {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 
                     'max_features': 'auto', 'max_depth': 76, 'bootstrap': True}
        
    print("\nBEST params : ", bestParams)
        
    fit_rf = RandomForestRegressor(n_estimators=bestParams['n_estimators'],
                                   max_depth=bestParams['max_depth'],
                                   min_samples_split=bestParams['min_samples_split'],
                                   min_samples_leaf=bestParams['min_samples_leaf'],
                                   bootstrap=bestParams['bootstrap'],
                                   max_features=bestParams['max_features'], random_state=3)
        
    rf_model = fit_rf.fit(X__train, y_train)

    y_test2 = X_test['revenue']
    y_pred = fit_rf.predict(X__test)
    final_pred = np.exp(y_pred) * X_test['broadTime'].astype('float') * X_test['unitPriceOrigin']

    mape = MAPE(y_test2, final_pred)
    print('MAPE:', mape)
    
    return fit_rf

### LGBM

In [74]:
def lgbm_valid(train, param_search):
    
    print("---------------[Light GBM Model]---------------")
    print("Train test set split!\n")
    
    X__train, X__test, y_train, y_test, X_test = train_valid_rf_lgbm(train)

    np.random.seed(3)
    
    def lgb_cv(num_leaves, feature_fraction, bagging_fraction, 
           max_depth, min_split_gain, min_child_weight,
           learning_rate, n_estimators, subsample):
        model = lgb.LGBMRegressor(
            num_leaves = int(round(num_leaves)),
            feature_fraction = max(min(feature_fraction, 1), 0),
            bagging_fraction = max(min(bagging_fraction, 1), 0),
            max_depth = int(round(max_depth)),
            min_split_gain = min_split_gain,
            min_child_weight = min_child_weight,
            learning_rate = learning_rate, 
            n_estimators = int(round(n_estimators)),
            subsample = np.clip(subsample, 0, 1))

        MAE = cross_val_score(
            model, X__train.values, y_train.values, 
            scoring='neg_mean_absolute_error', cv=5).mean()
        return MAE
    
    print("Column : \n", X__train.columns)
    print("\nTrain X shape : ", X__train.shape)
    print("Train Y sahpe : ", y_train.shape)
    print("Test X shape : ", X__test.shape)
    print("\nBayes Optimization start!")
    
    if param_search: # Bayes Optimization으로 hyperparameter tuning 진행
        lgbBO = BayesianOptimization(f=lgb_cv, pbounds=lgbm_pbounds, verbose=2,random_state=3)
        lgbBO.maximize(init_points=5, n_iter = 45)
        bestParams = lgbBO.max['params']

    else: # hyperparameter tuning으로 찾아진 hyperparameter 값 바로 사용
        bestParams = {'bagging_fraction': 0.8119808750206314, 'feature_fraction': 0.8799983013520603, 
                      'learning_rate': 0.13556614571375009, 'max_depth': 18.806832590144996, 'min_child_weight': 39.25971967342293, 
                      'min_split_gain': 0.008664144839057038, 'n_estimators': 250.6692179822859, 
                      'num_leaves': 105.13032646164234, 'subsample': 0.861280669085226}
        
    print("BEST params : ", bestParams)  
    fit_lgbm = lgb.LGBMRegressor(num_leaves = int(round(bestParams['num_leaves'])),
                                           feature_fraction = bestParams['feature_fraction'],
                                           bagging_fraction = bestParams['bagging_fraction'],
                                           max_depth = int(round(bestParams['max_depth'])),
                                           min_split_gain = bestParams['min_split_gain'],
                                           min_child_weight = bestParams['min_child_weight'],
                                           learning_rate = bestParams['learning_rate'],
                                           n_estimators = int(round(bestParams['n_estimators'])),
                                           subsample = bestParams['subsample'],
                                           random_state=3)
    model = fit_lgbm.fit(X__train, y_train)

    y_test2 = X_test['revenue']
    y_pred = fit_lgbm.predict(X__test)
    final_pred = np.exp(y_pred) * X_test['broadTime'].astype('float') * X_test['unitPriceOrigin']

    mape = MAPE(y_test2, final_pred)
    print('MAPE:', mape)
    
    return fit_lgbm

In [76]:
train_emb = pd.read_csv("../modeling/data/train_embs.csv")
train = pd.read_csv("train.csv")
test_emb = pd.read_csv("../modeling/data/test_embs.csv")
test = pd.read_csv("test.csv")

In [77]:
train_valid_xgb_data, train_valid_rf_lgb_data = train.copy(), train.copy()
train_final_xgb_data, train_final_rf_lgb_data = train.copy(), train.copy()
test_final_xgb_data, test_final_rf_lgb_data = test.copy(), test.copy()

In [47]:
# xgb
train_xgb, train_origin_xgb = xgb_dataclean(train_valid_xgb_data, train_emb, isFinal = False)
xgb_valid_model = xgb_valid(train_xgb, param_search = False)

Original Shape :  (35379, 96)
---------------[XGBoost Model]---------------
Train test set split!

Column : 
 Index(['broadTime', 'unitPrice', 'isFemale', 'paymentPlan', 'prodCount',
       'start_index', 'end_index', 'stop', 'isNamed', 'isPayday',
       ...
       'max_index_5', 'max_index_6', 'max_index_7', 'max_index_8',
       'max_index_9', 'max_index_10', 'max_index_12', 'dustCat_매우나쁨',
       'dustCat_보통', 'dustCat_좋음'],
      dtype='object', length=139)

Train X shape :  (26534, 139)
Train Y sahpe :  (26534,)
Test X shape :  (8845, 139)

Bayes Optimization start!

BEST params :  {'target': -0.31514963452668143, 'params': {'colsample_bytree': 0.561134482078139, 'gamma': 0.001, 'learning_rate': 0.18329806043912528, 'max_delta_step': 0.06863918133671187, 'max_depth': 9.980132921667362, 'min_child_weight': 9.928157427363239, 'n_estimators': 521.4135613718304, 'subsample': 0.9468597761420319}}
MAPE: 35.0760124090276


In [70]:
# rf & lgbm
train_rf, train_origin_rf = rf_lgb_dataclean(train_valid_rf_lgb_data, train_emb, isFinal = False)
train_lgbm, train_origin_lgbm = train_rf.copy(), train_origin_rf.copy()

Original Shape :  (35379, 96)


In [49]:
# rf
rf_valid_model = rf_valid(train_rf, param_search = False)

---------------[Random Forest Model]---------------
Train test set split!

Column : 
 Index(['broadMonth', 'broadDay', 'broadHour', 'broadTime', 'prodGroup',
       'unitPrice', 'isFemale', 'paymentPlan', 'broadDayOfWeek', 'prodCount',
       'holidayLen', 'prod_index', 'max_index', 'start_index', 'end_index',
       'stop', 'isNamed', 'isPayday', 'rating', 'priceIndex', 'tempNorm',
       'rainAvgWhole', 'rainAvgCap', 'dustCat', 'Feature0', 'Feature1',
       'Feature2', 'Feature3', 'Feature4', 'Feature5', 'Feature6', 'Feature7',
       'Feature8', 'Feature9', 'Feature10', 'Feature11', 'Feature12',
       'Feature13', 'Feature14', 'Feature15', 'Feature16', 'Feature17',
       'Feature18', 'Feature19', 'Feature20', 'Feature21', 'Feature22',
       'Feature23', 'Feature24', 'Feature25', 'Feature26', 'Feature27',
       'Feature28', 'Feature29', 'Feature30', 'Feature31', 'Feature32',
       'Feature33', 'Feature34', 'Feature35', 'Feature36', 'Feature37',
       'Feature38', 'Feature39', 

In [73]:
# lgbm
lgbm_valid_model = lgbm_valid(train_lgbm, param_search = False)

---------------[Light GBM Model]---------------
Train test set split!

Column : 
 Index(['broadMonth', 'broadDay', 'broadHour', 'broadTime', 'prodGroup',
       'unitPrice', 'isFemale', 'paymentPlan', 'broadDayOfWeek', 'prodCount',
       'holidayLen', 'prod_index', 'max_index', 'start_index', 'end_index',
       'stop', 'isNamed', 'isPayday', 'rating', 'priceIndex', 'tempNorm',
       'rainAvgWhole', 'rainAvgCap', 'dustCat', 'Feature0', 'Feature1',
       'Feature2', 'Feature3', 'Feature4', 'Feature5', 'Feature6', 'Feature7',
       'Feature8', 'Feature9', 'Feature10', 'Feature11', 'Feature12',
       'Feature13', 'Feature14', 'Feature15', 'Feature16', 'Feature17',
       'Feature18', 'Feature19', 'Feature20', 'Feature21', 'Feature22',
       'Feature23', 'Feature24', 'Feature25', 'Feature26', 'Feature27',
       'Feature28', 'Feature29', 'Feature30', 'Feature31', 'Feature32',
       'Feature33', 'Feature34', 'Feature35', 'Feature36', 'Feature37',
       'Feature38', 'Feature39', 'Fea

### 2.2.2.2 Final Model

In [78]:
def train_test_final(train, test, isXgb):
    if isXgb:
        train_X = pd.get_dummies(train.drop(['target', 'revenue', 'unitPriceOrigin'], axis = 1))
        test_X = pd.get_dummies(test.drop(['target', 'revenue', 'unitPriceOrigin'], axis = 1))

    else:
        train_X = train.drop(['target', 'unitPriceOrigin', 'revenue'], axis=1)
        test_X = test.drop(['target','unitPriceOrigin','revenue'], axis=1)

    train_Y = train['target']

    test_column = list(test_X.columns)
    for column in train_X.columns:
        if column not in test_column:
            test_X[column] = 0

    test_X = test_X[train_X.columns]
    
    return train_X, train_Y, test_X

### XGBOOST

In [79]:
def final_xgb(train, test, param_search):
    
    print("---------------[XGBoost Model]---------------")
    print("Train test set clean!\n")
    
    train_X, train_Y, test_X = train_test_final(train, test, isXgb = True)

    np.random.seed(3)

    print("Column : \n", train_X.columns)
    print("Train X shape : ", train_X.shape)
    print("Train Y sahpe : ", train_Y.shape)
    print("Test X shape : ", test_X.shape)
    print("XGBoost Bayes Optimization Start!")
    
    if param_search:
        def XGB_cv(max_depth, learning_rate, n_estimators, gamma
                   , min_child_weight, max_delta_step, subsample
                   , colsample_bytree, nthread=-1):
            model = XGBRegressor(max_depth=int(max_depth),
                                 learning_rate=learning_rate,
                                 n_estimators=int(n_estimators),
                                 nthread=nthread,
                                 gamma=gamma,
                                 min_child_weight=min_child_weight,
                                 max_delta_step=max_delta_step,
                                 subsample=subsample,
                                 colsample_bytree=colsample_bytree)
            MAE = cross_val_score(model, train_X, train_Y, scoring='neg_mean_absolute_error', cv=5).mean()
            return MAE

        xgboostBO = BayesianOptimization(f=XGB_cv, pbounds=pbounds, verbose=2, random_state=3)
        xgboostBO.maximize(init_points=5, n_iter=45)
        xgboostBO = xgboost.max
        
    else:
        xgboostBO = {'target': -0.4231938004771939, 'params': {'colsample_bytree': 0.5922112179210803, 'gamma': 0.001,
                                                               'learning_rate': 0.1674272151263528, 'max_delta_step': 0.0708801582700138,
                                                               'max_depth': 8.975177509311667, 'min_child_weight': 9.110058896157721,
                                                               'n_estimators': 588.5277619331036, 'subsample': 0.9857800068264405}}

    print("BEST params : ", xgboostBO)

    fit_xgb = XGBRegressor(max_depth=int(xgboostBO['params']['max_depth']),
                           learning_rate=xgboostBO['params']['learning_rate'],
                           n_estimators=int(xgboostBO['params']['n_estimators']),
                           gamma=xgboostBO['params']['gamma'],
                           min_child_weight=xgboostBO['params']['min_child_weight'],
                           max_delta_step=xgboostBO['params']['max_delta_step'],
                           subsample=xgboostBO['params']['subsample'],
                           colsample_bytree=xgboostBO['params']['colsample_bytree'])

    model = fit_xgb.fit(train_X, train_Y)

    train_pred = fit_xgb.predict(train_X)
    test_pred = fit_xgb.predict(test_X)

    final_train_pred = np.exp(train_pred) * train['broadTime'].astype('float') * train['unitPriceOrigin']
    final_test_pred = np.exp(test_pred) * test['broadTime'].astype('float') * test['unitPriceOrigin']

    final_train_pred = pd.DataFrame(final_train_pred)
    final_test_pred = pd.DataFrame(final_test_pred)

    final_train_pred.to_csv("final_train_pred_xgb.csv")
    final_test_pred.to_csv("final_test_pred_xgb.csv")

    return final_train_pred, final_test_pred, fit_xgb

### Random Forest

In [80]:
def final_rf(train, test, param_search):
    
    print("---------------[Random Forest Model]---------------")
    print("Train test set clean!\n")
    
    train_X, train_Y, test_X = train_test_final(train, test, isXgb = False)

    np.random.seed(3)
    rf = RandomForestRegressor(random_state=3)
    cv = KFold(5, shuffle=True, random_state=3)
    
    print("Column : \n", train_X.columns)
    print("Train X shape : ", train_X.shape)
    print("Train Y sahpe : ", train_Y.shape)
    print("Test X shape : ", test_X.shape)
    print("\nRandom Search start!")
    
    if param_search:
        rfgrid_1 = RandomizedSearchCV(rf, rf_pbounds, cv = cv, scoring= 'neg_mean_squared_error',
                                  n_jobs = 7, return_train_score=True)
        
        bestParams = rfgrid_1.best_params_
   
    else: 
        bestParams = {'n_estimators': 100, 'min_samples_split': 5,
                      'min_samples_leaf': 2, 'max_features': 'auto',
                      'max_depth': 76, 'bootstrap': True}
        
    print("\nBEST params : ", bestParams)
        
    fit_rf = RandomForestRegressor(n_estimators=bestParams['n_estimators'],
                                   max_depth=bestParams['max_depth'],
                                   min_samples_split=bestParams['min_samples_split'],
                                   min_samples_leaf=bestParams['min_samples_leaf'],
                                   bootstrap=bestParams['bootstrap'],
                                   max_features=bestParams['max_features'], random_state=3)
        
    np.random.seed(3)
    
    model = fit_rf.fit(train_X,train_Y)

    train_pred = fit_rf.predict(train_X)
    test_pred = fit_rf.predict(test_X)
    
    final_train_pred = np.exp(train_pred) * train['broadTime'].astype('float') * train['unitPriceOrigin']
    final_test_pred = np.exp(test_pred) * test['broadTime'].astype('float') * test['unitPriceOrigin']

    final_train_pred = pd.DataFrame(final_train_pred)
    final_test_pred = pd.DataFrame(final_test_pred)

    final_train_pred = pd.DataFrame(final_train_pred)
    final_test_pred = pd.DataFrame(final_test_pred)

    final_train_pred.to_csv("final_train_pred_rf.csv")
    final_test_pred.to_csv("final_test_pred_rf.csv")

    return final_train_pred, final_test_pred, fit_rf

### Light GBM

In [105]:
def final_lgbm(train, test, param_search):
    
    print("---------------[Light GBM Model]---------------")
    print("Train test set clean!\n")
    
    train_X, train_Y, test_X = train_test_final(train, test, isXgb = False)

    np.random.seed(3)

    print("Column : \n", train_X.columns)
    print("Train X shape : ", train_X.shape)
    print("Train Y sahpe : ", train_Y.shape)
    print("Test X shape : ", test_X.shape)
    print("XGBoost Bayes Optimization Start!")
    
    if param_search:
        def lgb_cv(num_leaves, feature_fraction, bagging_fraction, 
                   max_depth, min_split_gain, min_child_weight,
                   learning_rate, n_estimators, subsample):
            model = lgb.LGBMRegressor(
            num_leaves = int(round(num_leaves)),
            feature_fraction = max(min(feature_fraction, 1), 0),
            bagging_fraction = max(min(bagging_fraction, 1), 0),
            max_depth = int(round(max_depth)),
            min_split_gain = min_split_gain,
            min_child_weight = min_child_weight,
            learning_rate = learning_rate, 
            n_estimators = int(round(n_estimators)),
            subsample = np.clip(subsample, 0, 1))

            MAE = cross_val_score(model, X__train.values, y_train.values, 
                                  scoring='neg_mean_absolute_error', cv=5).mean()
        return MAE
    
        lgbBO = BayesianOptimization(f=lgb_cv, pbounds=lgbm_pbounds, verbose=2,random_state=3)
        lgbBO.maximize(init_points=5, n_iter = 45)
        bestParmas = lgbBO.max['params']
        
    else:
        bestParams = {'bagging_fraction': 0.9731589936660383, 'feature_fraction': 0.5757297248941633, 
                      'learning_rate': 0.06982196726597009, 'max_depth': 22.345555275557583, 'min_child_weight': 43.32592767723173,
                      'min_split_gain': 0.044098125637682964, 'n_estimators': 172.83567350969471, 'num_leaves': 220.3220932675174,
                      'subsample': 0.0270706243388118}

    print("BEST params : ", bestParams)

    fit_lgbm = lgb.LGBMRegressor(num_leaves = int(round(bestParams['num_leaves'])),
                                 feature_fraction = bestParams['feature_fraction'],
                                 bagging_fraction = bestParams['bagging_fraction'],
                                 max_depth = int(round(bestParams['max_depth'])),
                                 min_split_gain = bestParams['min_split_gain'],
                                 min_child_weight = bestParams['min_child_weight'],
                                 learning_rate = bestParams['learning_rate'], 
                                 n_estimators = int(round(bestParams['n_estimators'])), 
                                 subsample = bestParams['subsample'], 
                                 random_state=3)

    np.random.seed(3)
    model = fit_lgbm.fit(train_X,train_Y)

    train_pred = fit_lgbm.predict(train_X)
    test_pred = fit_lgbm.predict(test_X)
    final_train_pred = np.exp(train_pred) * train['broadTime'].astype('float') * train['unitPriceOrigin']
    final_test_pred = np.exp(test_pred) * test['broadTime'].astype('float') * test['unitPriceOrigin']

    final_train_pred = pd.DataFrame(final_train_pred)
    final_test_pred = pd.DataFrame(final_test_pred)

    final_train_pred = pd.DataFrame(final_train_pred)
    final_test_pred = pd.DataFrame(final_test_pred)

    final_train_pred.to_csv("final_train_pred_lgbm.csv")
    final_test_pred.to_csv("final_test_pred_lgbm.csv")

    return final_train_pred, final_test_pred, fit_lgbm

### CV for ensemble weight

In [106]:
def cv_for_ensemble(model, data, data_origin, isXgb):
    X = data.drop(['target'], axis=1)
    y = data['target']
    kf = KFold(n_splits=5, random_state=3, shuffle=True)
    i = 0
    
    print("MODEL : \n", model)

    for train_idx, test_idx in kf.split(X):
        
        i += 1

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        if isXgb:
            X_train = pd.get_dummies(X_train, drop_first=True)
            X_test = pd.get_dummies(X_test, drop_first=True)

        X__train = X_train.drop(['revenue', 'unitPriceOrigin'], axis=1)
        X__test = X_test.drop(['revenue', 'unitPriceOrigin'], axis=1)

        cv_model = model.fit(X__train, y_train)

        y_pred = model.predict(X__test)
        cv_final_pred = np.exp(y_pred) * X_test['broadTime'].astype('float') * X_test['unitPriceOrigin']
        X_test['pred'] = cv_final_pred
        X_test['index'] = test_idx

        if i==1 :
            cv_merged = X_test
        else:
            cv_merged = pd.concat([cv_merged, X_test])

        
    print("X test shape : ", cv_merged.shape)
    cv_merged = cv_merged.sort_values(by = ['index'])
    cv_merged['prodGroup'] = data_origin['prodGroup']

    mapes = []
    for group in list(cv_merged["prodGroup"].unique()):
        train_group = cv_merged[cv_merged["prodGroup"] == group]
        new_mape = MAPE(train_group['revenue'], train_group['pred'])
        mapes.append((group, new_mape))

    for mape in mapes:
        print(mape)
        
    return mapes

In [82]:
# xgb
train_xgb, train_origin_xgb, test_xgb, test_origin_xgb = xgb_dataclean(train_final_xgb_data, train_emb, isFinal = True, test = test_final_xgb_data, test_emb = test_emb)
xgb_train_pred, xgb_test_pred, xgb_final_model = final_xgb(train_xgb, test_xgb, param_search = False)

Original Shape :  (35379, 96)
Original Shape :  (2716, 96)
---------------[XGBoost Model]---------------
Train test set clean!

Column : 
 Index(['broadTime', 'unitPrice', 'isFemale', 'paymentPlan', 'prodCount',
       'start_index', 'end_index', 'stop', 'isNamed', 'isPayday',
       ...
       'max_index_6', 'max_index_7', 'max_index_8', 'max_index_9',
       'max_index_10', 'max_index_12', 'dustCat_나쁨', 'dustCat_매우나쁨',
       'dustCat_보통', 'dustCat_좋음'],
      dtype='object', length=147)
Train X shape :  (35379, 147)
Train Y sahpe :  (35379,)
Test X shape :  (2716, 147)
XGBoost Bayes Optimization Start!
BEST params :  {'target': -0.4231938004771939, 'params': {'colsample_bytree': 0.5922112179210803, 'gamma': 0.001, 'learning_rate': 0.1674272151263528, 'max_delta_step': 0.0708801582700138, 'max_depth': 8.975177509311667, 'min_child_weight': 9.110058896157721, 'n_estimators': 588.5277619331036, 'subsample': 0.9857800068264405}}


In [83]:
# rf & lgbm
train_rf, train_origin_rf, test_rf, test_origin_rf = rf_lgb_dataclean(train_final_rf_lgb_data, train_emb, isFinal = True, test = test_final_rf_lgb_data, test_emb = test_emb)
train_lgbm, train_origin_lgbm, test_lgbm, test_origin_lgbm = train_rf.copy(), train_origin_rf.copy(), test_rf.copy(), test_origin_rf.copy()

Original Shape :  (35379, 96)
Original Shape :  (2716, 96)


In [84]:
# rf
rf_train_pred, rf_test_pred, rf_final_model = final_rf(train_rf, test_rf, param_search = False)

---------------[Random Forest Model]---------------
Train test set clean!

Column : 
 Index(['broadMonth', 'broadDay', 'broadHour', 'broadTime', 'prodGroup',
       'unitPrice', 'isFemale', 'paymentPlan', 'broadDayOfWeek', 'prodCount',
       'holidayLen', 'prod_index', 'max_index', 'start_index', 'end_index',
       'stop', 'isNamed', 'isPayday', 'rating', 'priceIndex', 'tempNorm',
       'rainAvgWhole', 'rainAvgCap', 'dustCat', 'Feature0', 'Feature1',
       'Feature2', 'Feature3', 'Feature4', 'Feature5', 'Feature6', 'Feature7',
       'Feature8', 'Feature9', 'Feature10', 'Feature11', 'Feature12',
       'Feature13', 'Feature14', 'Feature15', 'Feature16', 'Feature17',
       'Feature18', 'Feature19', 'Feature20', 'Feature21', 'Feature22',
       'Feature23', 'Feature24', 'Feature25', 'Feature26', 'Feature27',
       'Feature28', 'Feature29', 'Feature30', 'Feature31', 'Feature32',
       'Feature33', 'Feature34', 'Feature35', 'Feature36', 'Feature37',
       'Feature38', 'Feature39', 

In [85]:
# lgbm
lgbm_train_pred, lgbm_test_pred, lgbm_final_model = final_lgbm(train_lgbm, test_lgbm, param_search = False)

---------------[Light GBM Model]---------------
Train test set clean!

Column : 
 Index(['broadMonth', 'broadDay', 'broadHour', 'broadTime', 'prodGroup',
       'unitPrice', 'isFemale', 'paymentPlan', 'broadDayOfWeek', 'prodCount',
       'holidayLen', 'prod_index', 'max_index', 'start_index', 'end_index',
       'stop', 'isNamed', 'isPayday', 'rating', 'priceIndex', 'tempNorm',
       'rainAvgWhole', 'rainAvgCap', 'dustCat', 'Feature0', 'Feature1',
       'Feature2', 'Feature3', 'Feature4', 'Feature5', 'Feature6', 'Feature7',
       'Feature8', 'Feature9', 'Feature10', 'Feature11', 'Feature12',
       'Feature13', 'Feature14', 'Feature15', 'Feature16', 'Feature17',
       'Feature18', 'Feature19', 'Feature20', 'Feature21', 'Feature22',
       'Feature23', 'Feature24', 'Feature25', 'Feature26', 'Feature27',
       'Feature28', 'Feature29', 'Feature30', 'Feature31', 'Feature32',
       'Feature33', 'Feature34', 'Feature35', 'Feature36', 'Feature37',
       'Feature38', 'Feature39', 'Fea

In [None]:
xgb_cv_mapes = cv_for_ensemble(xgb_final_model, train_xgb, train_origin_xgb, isXgb = True)
rf_cv_mapes = cv_for_ensemble(rf_final_model, train_rf, train_origin_rf, isXgb = False)
lgbm_cv_mapes = cv_for_ensemble(lgbm_final_model, train_lgbm, train_origin_lgbm, isXgb = False)

In [190]:
def weight_sum(xgb_mapes, rf_mapes, lgbm_mapes, data_origin, xgb_train_pred, rf_train_pred, lgbm_train_pred):
    pred = xgb_train_pred
    pred.rename(columns = {0 : 'xgb'}, inplace = True)
    pred['rf'] = rf_train_pred
    pred['lgbm'] = lgbm_train_pred
    pred['final'] = 0
    pred['prodGroup'] = data_origin['prodGroup']
    xgb_inversed, rf_inversed, lgbm_inversed = [], [], []
    
    for i in range(len(xgb_cv_mapes)):
        inverse = (1/(xgb_cv_mapes[i][1])) + (1/(rf_cv_mapes[i][1])) + (1/(lgbm_cv_mapes[i][1]))
        xgb_inversed.append((1/(xgb_cv_mapes[i][1]))/inverse)
        rf_inversed.append((1/(rf_cv_mapes[i][1]))/inverse)
        lgbm_inversed.append((1/(lgbm_cv_mapes[i][1]))/inverse)
        
    print("\nXGB Weights :\n", xgb_inversed)
    print("\nRF Weights :\n", rf_inversed)
    print("\nLGBM Weights :\n", lgbm_inversed)
        
    for i in range(len(xgb_cv_mapes)):
        grp_name = xgb_cv_mapes[i][0]               
        pred['final'][pred['prodGroup'] == grp_name] = ((pred['xgb'][pred['prodGroup'] == grp_name] * xgb_inversed[i])
                                                         + (pred['rf'][pred['prodGroup'] == grp_name] * rf_inversed[i])
                                                         + (pred['lgbm'][pred['prodGroup'] == grp_name] * xgb_inversed[i]))
        
    return pred

In [182]:
weighted = weight_sum(xgb_cv_mapes, rf_cv_mapes, lgbm_cv_mapes, train, xgb_train_pred, rf_train_pred, lgbm_train_pred)

In [184]:
MAPE(weighted['final'], train['revenue'])

20.08341855893393

In [None]:
weighted['final'].to_csv("train_final.csv")

In [191]:
test_weighted = weight_sum(xgb_cv_mapes, rf_cv_mapes, lgbm_cv_mapes, train, xgb_test_pred, rf_test_pred, lgbm_test_pred)


XGB Weights :
 [0.3405913905325114, 0.3397247621145796, 0.3401355604937333, 0.34717060958621326, 0.3435774887754052, 0.3430060753747281, 0.35564655006254176, 0.34997335475924474, 0.34470889560249723, 0.3376579523310813, 0.35290873190789884]

RF Weights :
 [0.29841829453775626, 0.3105462107446997, 0.31143975130109547, 0.298065554850706, 0.3048148171050208, 0.3163881592949487, 0.2995096612932377, 0.2912489984574073, 0.30827970155119044, 0.319454491175293, 0.3021973053681798]

LGBM Weights :
 [0.3609903149297324, 0.34972902714072074, 0.3484246882051713, 0.35476383556308055, 0.3516076941195739, 0.3406057653303233, 0.3448437886442207, 0.35877764678334795, 0.34701140284631227, 0.34288755649362573, 0.3448939627239213]


In [192]:
test_weighted

Unnamed: 0,xgb,rf,lgbm,final,prodGroup
0,9.311134e+06,1.052223e+07,6.976666e+06,8.687511e+06,의류
1,1.559501e+07,1.952674e+07,1.417514e+07,1.596659e+07,의류
2,3.480942e+07,3.053960e+07,2.516273e+07,2.953957e+07,의류
3,1.565779e+07,1.710998e+07,1.558356e+07,1.574647e+07,의류
4,2.576550e+07,2.983447e+07,2.728156e+07,2.697052e+07,의류
...,...,...,...,...,...
2711,6.205482e+06,9.761382e+06,9.769978e+06,8.473904e+06,주방
2712,3.818743e+06,1.168152e+07,8.262836e+06,7.747466e+06,주방
2713,8.105626e+06,1.195793e+07,9.084447e+06,9.571129e+06,주방
2714,1.697284e+07,1.149075e+07,8.863133e+06,1.238857e+07,건강기능


In [1]:
final_pred_y = test_weighted['final']

NameError: name 'test_weighted' is not defined

In [197]:
final_pred_y.to_csv("submission.csv")