In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
pip install bayesian-optimization

## 데이터 로드

In [None]:
import gc
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder

sns.set(style="darkgrid")
warnings.filterwarnings(action='ignore') # 경고 무시

# 경로
data_path = '/kaggle/input/competitive-data-science-predict-future-sales/'

sales_train = pd.read_csv(data_path + 'sales_train.csv')
shops = pd.read_csv(data_path + 'shops.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

## 다운캐스팅
출처 : [https://www.kaggle.com/code/jungbyunggil/top-3-5-lightgbm-with-feature-engineering]

In [None]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` in place to the smallest safe dtype.

    * ``bool`` columns become ``int8``.
    * Integer columns, and float columns that only hold whole numbers, are
      downcast with ``pd.to_numeric(..., downcast='integer')``.
    * Remaining float columns are downcast with ``downcast='float'``.
    * Every non-numeric column (object, datetime, category, ...) is left
      untouched. Previously only ``object`` was skipped, so e.g. a datetime
      column reached the ``round()``/``to_numeric`` branch and was either
      converted to integers or raised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress; it is modified in place.
    verbose : bool, default True
        Print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif not pd.api.types.is_numeric_dtype(df[col].dtype):
            # object / datetime / category ...: nothing to downcast
            continue
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose and start_mem > 0:
        print('{:.1f}% compressed'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Compress every raw table; downcast() rewrites the columns of the frame it
# receives, so the return value does not need to be kept.
all_df = [sales_train, shops, items, item_categories, test]
for raw_table in all_df:
    downcast(raw_table)

## 작성 시 사용할 수 있는 유틸리티 기능들
출처 : [https://www.kaggle.com/code/abubakar624/first-place-solution-kaggle-predict-future-sales]

In [None]:
def reduce_mem_usage(df, silent=True, allow_categorical=True, float_dtype="float32"):
    """Downcast the columns of a DataFrame (or a single Series) to save memory.

    Numeric data is converted to the smallest integer dtype that holds it, or
    to ``float_dtype`` when it is not integral. Sparse and datetime data are
    returned unchanged. Other non-numeric data is returned unchanged when
    ``allow_categorical`` is True; otherwise it is factorized into integer
    codes and those codes are downcast.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to shrink. A DataFrame is modified in place.
    silent : bool, default True
        When False, print memory usage before and after.
    allow_categorical : bool, default True
        Keep non-numeric columns as they are instead of factorizing them.
    float_dtype : str, default "float32"
        Target dtype for columns that stay floating point.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The reduced data.
    """
    def _downcast_numeric(series, allow_categorical=allow_categorical):
        """Return ``series`` in the smallest int dtype or in ``float_dtype``."""
        # isinstance check replaces the deprecated pd.api.types.is_sparse
        if isinstance(series.dtype, pd.SparseDtype):
            return series
        if not pd.api.types.is_numeric_dtype(series.dtype):
            if pd.api.types.is_datetime64_any_dtype(series.dtype) or allow_categorical:
                return series
            codes, _ = series.factorize()
            return _downcast_numeric(pd.Series(data=codes, index=series.index))
        series = pd.to_numeric(series, downcast="integer")
        if pd.api.types.is_float_dtype(series.dtype):
            series = series.astype(float_dtype)
        return series

    if silent is False:
        start_mem = np.sum(df.memory_usage()) / 1024 ** 2
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    if df.ndim == 1:
        df = _downcast_numeric(df)
    else:
        for col in df.columns:
            # Plain column assignment replaces the column together with its
            # dtype. `df.loc[:, col] = ...` writes into the existing column on
            # pandas >= 2.0 and therefore keeps the old, wider dtype.
            df[col] = _downcast_numeric(df[col])
    if silent is False:
        end_mem = np.sum(df.memory_usage()) / 1024 ** 2
        print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df


def shrink_mem_new_cols(matrix, oldcols=None, allow_categorical=False):
    """Downcast only the columns of ``matrix`` that were added since the last call.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Feature matrix; the new columns are replaced in place.
    oldcols : pandas.Index or list-like, optional
        Columns already optimized by an earlier call. ``None`` means every
        column is treated as new.
    allow_categorical : bool, default False
        Forwarded to ``reduce_mem_usage``.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        The matrix and its current column index, to pass as ``oldcols`` next time.
    """
    if oldcols is not None:
        newcols = matrix.columns.difference(oldcols)
    else:
        newcols = matrix.columns
    reduced = reduce_mem_usage(matrix.loc[:, newcols].copy(), allow_categorical=allow_categorical)
    for col in newcols:
        # Column-wise assignment so the reduced dtype is adopted;
        # `matrix.loc[:, newcols] = ...` keeps the original dtypes on pandas >= 2.0.
        matrix[col] = reduced[col]
    oldcols = matrix.columns  # This is used to track which columns have already been downcast
    return matrix, oldcols


def list_if_not(s, dtype=str):
    """Wrap ``s`` in a one-element list unless it already is a list.

    ``s`` must be exactly of type ``dtype`` or ``list``, otherwise ``TypeError``
    is raised. A value equal to the empty string is returned unwrapped.
    """
    kind = type(s)
    if kind not in (dtype, list):
        raise TypeError
    if kind is list:
        return s
    if s != "":
        return [s]
    return s

## sales_train 의 이상치 제거

In [None]:
# Box plot of daily item counts; the x axis is limited to [-100, 3000].
plt.figure(figsize=(10,4))
plt.xlim(-100, 3000)
# Shared marker style for the outlier points of both plots
flierprops = dict(marker='o', markerfacecolor='purple', markersize=6,
                  linestyle='none', markeredgecolor='black')
sns.boxplot(x=sales_train.item_cnt_day, flierprops=flierprops)

# Box plot of item prices; the x axis runs from the minimum price to 110% of the maximum.
plt.figure(figsize=(10,4))
plt.xlim(sales_train.item_price.min(), sales_train.item_price.max()*1.1)
sns.boxplot(x=sales_train.item_price, flierprops=flierprops)

In [None]:
# Keep only rows with a price strictly between 0 and 100000
# and a daily count strictly between 0 and 1000.
valid_price = (sales_train['item_price'] > 0) & (sales_train['item_price'] < 100000)
valid_count = (sales_train['item_cnt_day'] > 0) & (sales_train['item_cnt_day'] < 1000)
sales_train = sales_train[valid_price & valid_count]

In [None]:
# Re-label shop ids that are treated as the same shop (old id -> id to keep).
# The 37 -> 36 entry was added by a teammate with the note: "Shop36 added
# separately because it only has one month of data".
duplicate_shop_ids = {0: 57, 1: 58, 10: 11, 39: 40, 37: 36}

# Apply the same re-labelling to the training sales and to the test set.
for frame in (sales_train, test):
    for old_id, kept_id in duplicate_shop_ids.items():
        frame.loc[frame['shop_id'] == old_id, 'shop_id'] = kept_id

In [None]:
# Keep only the sales of shops that also appear in the test set.
# (The original note called this "removing duplicates"; the code filters by shop_id.)
unique_test_shop_id = test['shop_id'].unique()
sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]

## 진호님 작성 부분

In [None]:
# Attach shop, item and category names to every sale, then drop the id columns
# so the EDA table only carries readable names.
df_eda = (
    sales_train
    .merge(shops, how='left', on='shop_id')
    .merge(items, how='left', on='item_id')
    .merge(item_categories, how='left', on='item_category_id')
    .drop(columns=['item_id', 'item_category_id', 'shop_id'])
)
df_eda.head()

In [None]:
df_eda['date'] = pd.to_datetime(df_eda['date'])
df_eda.info()

In [None]:
# Number of distinct shop names (the previous code printed the array of names
# here instead of their count)
print(df_eda['shop_name'].nunique())

# The distinct shop names, one per line
print(*df_eda['shop_name'].unique(), sep='\n')

In [None]:
# Derive a city label from every shop name
ONLINE_PREFIXES = ('Интернет-магазин', 'Цифровой')


def _shop_city(shop_name):
    """Return the city part of a shop name.

    The city is the first space-separated word. A leading '!' marker is
    stripped, and the two online-store prefixes both map to 'Интернет'.
    """
    first_word = shop_name.split(' ')[0]
    if first_word[0] == '!':
        return shop_name.split('!')[1].split(' ')[0]
    if first_word in ONLINE_PREFIXES:
        return 'Интернет'
    return first_word


cities = [_shop_city(shop_name) for shop_name in df_eda['shop_name']]

In [None]:
df_eda['city'] = cities
df_eda.head(3)

In [None]:
# 고유 도시이름 리스트
df_eda['city'].unique()

In [None]:
# Top 10 cities by units sold and by number of sale records
units_by_city = (
    df_eda.groupby('city')['item_cnt_day']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
records_by_city = df_eda['city'].value_counts().head(10)

top_10_selling_city = pd.DataFrame({
    'city': units_by_city.index,
    'city_selling_count': units_by_city.values,
})
top_10_selling_freq_city = pd.DataFrame({
    'city': records_by_city.index,
    'city_selling_freq_count': records_by_city.values,
})


In [None]:
print(top_10_selling_city.head())
print(top_10_selling_freq_city.head())

In [None]:
# Express each top-10 city as a share of the top-10 total
sell_sum = top_10_selling_city['city_selling_count'].sum()
freq_sum = top_10_selling_freq_city['city_selling_freq_count'].sum()

# Share columns: city value divided by the sum over the ten listed cities
top_10_selling_city['city_selling_percentage'] = (
    top_10_selling_city['city_selling_count'] / sell_sum
)
top_10_selling_freq_city['city_selling_freq_percentage'] = (
    top_10_selling_freq_city['city_selling_freq_count'] / freq_sum
)

gc.collect()

print(top_10_selling_city)
print(top_10_selling_freq_city)

In [None]:
# One donut chart per ranking. The second chart used to reuse the first
# chart's title although it shows the number of sale records, not units sold.
pie_specs = [
    (top_10_selling_city, 'city_selling_percentage',
     'Top cities by amount of sales.'),
    (top_10_selling_freq_city, 'city_selling_freq_percentage',
     'Top cities by number of sale records.'),
]
for ranking, share_col, title in pie_specs:
    plt.figure(figsize=(10,10))
    # One offset per slice, sized from the data instead of a fixed list of ten
    explode = [0] * len(ranking)
    plt.pie(ranking[share_col], labels=list(ranking.city),
            explode=explode, autopct='%.0f%%')
    # A white circle in the centre turns the pie into a donut
    my_circle=plt.Circle( (0,0), 0.7, color='white')
    p=plt.gcf()
    p.gca().add_artist(my_circle)
    plt.title(title, fontdict={'size':14})

plt.show()

In [None]:
#모스크바가 유의미한 판매량을 가진 도시이기 때문에
#모스크바에서 제일 많이 팔린 판매량을 feature로 만들자.
#all_data에 innerjoin으로 city열을 추가한 다음 groupby로 모스크바 판매량을 나열한 리스트를 feature로

In [None]:
# Bar plot of the number of sale records per shop name.
# countplot counts rows of df_eda, so the title says "sales records"
# (and no longer carries the "Amout" typo).
plt.figure(figsize=(10, 10))
sns.countplot(y=df_eda['shop_name'])
plt.ylabel('Shop name')
plt.title('Amount of sales records in different shops')
plt.show()

In [None]:
# Ten item names with the most sale records, shown as a horizontal bar chart
item_counts = df_eda['item_name'].value_counts().head(10)
top_10_selling_item = pd.DataFrame({
    'item_name': item_counts.index,
    'count': item_counts.values,
})

plt.figure(figsize=(13, 5))
sns.barplot(data=top_10_selling_item, x='count', y='item_name')
plt.xlabel('Count')
plt.ylabel('Item name')
plt.title('Top 10 selling items.')
plt.show()

In [None]:
# Find and plot the 10 categories with the most sale records.
# The label column keeps the name 'item_name' because a later cell reads
# top_10_selling_category['item_name']; only the axis label is corrected.
top_10_selling_category = df_eda['item_category_name'].value_counts().head(10)
top_10_selling_category = top_10_selling_category.to_frame().reset_index()
top_10_selling_category.columns = ['item_name', 'count']
plt.figure(figsize=(13, 4))
sns.barplot(data=top_10_selling_category, x='count', y='item_name')
plt.title('Top 10 selling categories.')
plt.ylabel('Category name')
plt.xlabel('Count')
plt.show()

In [None]:
# Function to plot the trends from the list of columns.
def plot_trend(columns):
    """Plot monthly units sold for each item category name in ``columns``.

    For every category, rows of the global ``df_eda`` are summed per
    ``date_block_num`` and drawn as a line, together with a dashed red line at
    the mean of those monthly sums. One figure is shown per category.
    """
    for category_name in columns:
        in_category = df_eda['item_category_name'] == category_name
        monthly_sales = (
            df_eda[in_category]
            .groupby('date_block_num')['item_cnt_day']
            .sum()
            .to_frame()
            .reset_index()
        )

        plt.figure(figsize=(20,4))
        plt.axhline(y=monthly_sales['item_cnt_day'].mean(), color='red', linewidth=1,
                    linestyle='dashed', label='Average sales for all time.')
        sns.lineplot(data=monthly_sales, x='date_block_num', y='item_cnt_day',
                     label='Trend line.')
        plt.title(f'Trend of buying {category_name}.', fontsize=14)
        plt.legend(loc='upper right')
        plt.xlabel('Number of Month (starting from Jan 2013 to October 2015)')
        plt.ylabel('Items sold')
        plt.show()
        print('\n')

In [None]:
# Calling the function from previous cell
plot_trend(list(top_10_selling_category['item_name']))

## item

### cleaning - item_name

In [None]:
import re
def name_correction(x):
    """Normalize an item name.

    Lower-cases the text, keeps only what precedes the first '[' and the first
    '(', replaces every run of characters outside A-Z, a-z, 0-9, А-Я, а-я with
    a single space and trims the ends. Note that the ranges А-Я/а-я do not
    include the letter ё, so it is treated as a separator.
    """
    head = x.lower().split('[', 1)[0].split('(', 1)[0]
    cleaned = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', head)
    return cleaned.replace('  ', ' ').strip()

In [None]:
items.head()

In [None]:
# Split the raw item name at the first "[" and at the first "(".
# `.str[i]` yields NaN when a name has no such bracket. Unpacking the `.str`
# accessor and passing `n` positionally no longer work on pandas >= 2.0.
bracket_parts = items.item_name.str.split("[", n=1)
paren_parts = items.item_name.str.split("(", n=1)
items["name"] = paren_parts.str[0]
items["item_cat_1"] = bracket_parts.str[1]
items["item_cat_2"] = paren_parts.str[1]

# Replace special characters with a space and lower-case the text.
# regex=True is required: since pandas 2.0 str.replace treats the pattern
# as a literal string by default.
items["item_cat_1"] = items.item_cat_1.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
items["item_cat_2"] = items.item_cat_2.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()

# Missing values become the string '0'
items = items.fillna('0')

items["item_name"] = items["item_name"].apply(lambda x: name_correction(x))

# Drop the last character of item_cat_1 (the space left by the closing bracket)
# unless the value is the '0' placeholder
items.item_cat_1 = items.item_cat_1.apply( lambda x: x[:-1] if x !="0" else "0")

### Clean item type

In [None]:
# Platform-like "type": the first word of item_cat_1, or its first 8 characters
# when that first word is "xbox".
items["type"] = items.item_cat_1.apply(lambda x: x[0:8] if x.split(" ")[0] == "xbox" else x.split(" ")[0] )
# Collapse the Xbox 360 spellings into one label
items.loc[(items.type == "x360") | (items.type == "xbox360") | (items.type == "xbox 360") ,"type"] = "xbox 360"
# An empty type is labelled "mac"
items.loc[ items.type == "", "type"] = "mac"
# Remove spaces inside the type labels
items.type = items.type.apply( lambda x: x.replace(" ", "") )
# NOTE(review): the literals below look like they include Cyrillic look-alike letters
# for "pc" / "ps3" so that both spellings end up with the Latin label; confirm before editing.
items.loc[ (items.type == 'pc' )| (items.type == 'pс') | (items.type == "pc"), "type" ] = "pc"
items.loc[ items.type == 'рs3' , "type"] = "ps3"

In [None]:
# Count items per cleaned "type" and collect the types with fewer than 40 items.
group_sum = items.groupby(["type"]).agg({"item_id": "count"})
group_sum = group_sum.reset_index()
drop_cols = set(group_sum.loc[group_sum["item_id"] < 40, "type"])

# Rebuild item_cat_1 from the cleaned type, mapping rare types to "other".
# The rare-type list holds *type* labels; checking the raw item_cat_1 strings
# against it only matched when the whole string happened to equal a type label,
# and the cleaned type was then dropped without ever being used.
items.item_cat_1 = items.type.apply( lambda x: "other" if (x in drop_cols) else x )
items = items.drop(["type"], axis = 1)

del drop_cols

In [None]:
# Month (date_block_num) in which each item was sold for the first time.
# The lookup is keyed by item_id and mapped explicitly; assigning the grouped
# Series directly only lines up when the row index of `items` equals item_id.
first_sale_block = sales_train.groupby('item_id')['date_block_num'].min()
items['first_sale_date'] = items['item_id'].map(first_sale_block)
items['first_sale_date'] = items['first_sale_date'].fillna(34) # never sold in train -> 34

In [None]:
# Integer-encode both bracket-derived text features, then drop the name columns
for text_col in ("item_cat_1", "item_cat_2"):
    items[text_col] = LabelEncoder().fit_transform(items[text_col])

items = items.drop(columns=["item_name", "name"])
items.head()

## shop

In [None]:
shops.head()

In [None]:
shops.loc[ shops.shop_name == 'Сергиев Посад ТЦ "7Я"',"shop_name" ] = 'СергиевПосад ТЦ "7Я"'

In [None]:
# First word of the shop name is the city, second word is the shop category
name_tokens = shops.shop_name.str.split(" ")
shops["city"] = name_tokens.map( lambda x: x[0] )
shops["shops_category"] = name_tokens.map( lambda x: x[1] )
shops.loc[shops.city == "!Якутск", "city"] = "Якутск"
# Use the same city labels as the EDA table (df_eda['city']) and the later
# city_mapping: there the online shops are 'Интернет' and Sergiev Posad is
# 'Сергиев'. Without this the merge on 'city' and the mapping miss these shops.
shops["city"] = shops["city"].replace({
    "СергиевПосад": "Сергиев",
    "Интернет-магазин": "Интернет",
    "Цифровой": "Интернет",
})

In [None]:
top_10_selling_city.head()

In [None]:
# Keep only 'city' and the share column, then attach the share to every shop.
# Shops whose city is not in the top-10 table get NaN from the left merge.
# NOTE: this cell is not re-runnable as is; a second run raises KeyError
# because 'city_selling_count' has already been dropped.
top_10_selling_city=top_10_selling_city.drop('city_selling_count', axis=1)
# top_10_selling merge
shops = shops.merge(top_10_selling_city, on='city', how='left')

In [None]:
# Added by a teammate: every city outside the top-10 table becomes 'others'.
# The result has to be assigned back; the previous bare expression only
# displayed the relabelled Series and left shops['city'] unchanged.
city10 = top_10_selling_city['city']
shops["city"] = shops["city"].apply(lambda x: 'others' if x not in city10.values else x)

In [None]:
# Hard-coded integer code per city label; labels missing from the dict map to NaN.
# NOTE(review): presumably these ten cities are the top-10 found in the EDA above;
# verify the list if the data or the filtering changes.
city_mapping = {'others':0, 'Чехов':1, 'Сергиев':2, 'Самара':3, 'Тюмень':4, 'Уфа':5, 'Интернет':6, 'СПб':7, 'Воронеж':8, 'Якутск':9, 'Москва':10}
shops['city'] = shops['city'].map(city_mapping)

In [None]:
shops = shops.fillna(0)
shops.head()

In [None]:
# Keep shop categories that occur in at least 5 shops; everything else is "other"
category_sizes = shops.shops_category.value_counts()
frequent_categories = set(category_sizes[category_sizes >= 5].index)
shops.shops_category = shops.shops_category.apply(
    lambda label: label if label in frequent_categories else "other"
)

del category_sizes, frequent_categories

In [None]:
shops.shops_category.value_counts()

In [None]:
shops['shops_category'] = LabelEncoder().fit_transform(shops['shops_category'])

In [None]:
shops.head()

In [None]:
# shop_name 제거
shops = shops.drop(['shop_name'], axis=1)

shops.head()

## item_categories

In [None]:
item_categories.head()

In [None]:
item_categories['type'] = item_categories['item_category_name'].apply(lambda x: x.split()[0])  

In [None]:
item_categories['type'].value_counts()

In [None]:
# Keep category types that cover at least 5 categories; the rest become "etc"
type_sizes = item_categories.type.value_counts()
frequent_types = set(type_sizes[type_sizes >= 5].index)
item_categories.type = item_categories.type.apply(
    lambda label: label if label in frequent_types else "etc"
)

del type_sizes, frequent_types

In [None]:
item_categories['type'].value_counts()

In [None]:
item_categories.head()

In [None]:
# Split the category name on "-" (the helper column 'split' is dropped in a later cell).
item_categories["split"] = item_categories.item_category_name.apply(lambda x: x.split("-"))
# Subtype: the trimmed second part when there is one, otherwise the trimmed whole name.
item_categories["subtype"] = item_categories.split.apply(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())

In [None]:
# Integer-encode the two derived text columns
for text_col in ('type', 'subtype'):
    item_categories[text_col] = LabelEncoder().fit_transform(item_categories[text_col])

# Drop the raw name and the helper 'split' column
item_categories = item_categories.drop(columns=['item_category_name', 'split'])

In [None]:
item_categories.head()

In [None]:
gc.collect()

## all_data

In [None]:
sales_train.head()

In [None]:
sales_train["revenue"] = sales_train["item_cnt_day"] * sales_train["item_price"]
sales_train.head()

In [None]:
sales_train.info()

In [None]:
# itertools.product builds every combination of the given lists
from itertools import product

idx_features = ['date_block_num', 'shop_id', 'item_id'] # base key columns

# For each month: every (shop, item) pair among the shops and items that had
# at least one sale in that month.
monthly_grids = []
for block_num in sales_train['date_block_num'].unique():
    block_sales = sales_train[sales_train['date_block_num'] == block_num]
    block_shops = block_sales['shop_id'].unique()
    block_items = block_sales['item_id'].unique()
    monthly_grids.append(np.array(list(product([block_num], block_shops, block_items))))

train = pd.DataFrame(np.vstack(monthly_grids), columns=idx_features)
print(train.head())

In [None]:
# Aggregate the daily sales to one row per (month, shop, item):
# total units, mean revenue per sale record and mean price.
# NOTE(review): 'revenue' is averaged but stored as 'revenue_month'; confirm
# whether a monthly sum was intended.
group = sales_train.groupby(idx_features).agg({'item_cnt_day': 'sum',
                                               'revenue' : 'mean',
                                               'item_price': 'mean'})
group = group.reset_index()
group = group.rename(columns={'item_cnt_day': 'item_cnt_month', 'revenue' : 'revenue_month', 'item_price': 'item_price_mean'})

# Left merge: grid rows without any sale keep NaN in the new columns
train = train.merge(group, on=idx_features, how='left')

train.head()

In [None]:
# Number of sale records per (month, shop, item); quantities are not considered
group = (
    sales_train
    .groupby(idx_features)['item_cnt_day']
    .count()
    .rename('item_sell_count')
    .reset_index()
)
train = train.merge(group, on=idx_features, how='left')

# Free the temporary table
del group
gc.collect()

train.head()

In [None]:
# Give the test rows date_block_num 34 so they share the train layout,
# then stack them under the training grid.
test['date_block_num'] = 34

# `keys` is not passed any more: it labels the concatenated objects (two here,
# not the three key columns) and has no effect together with ignore_index=True.
all_data = pd.concat([train, test.drop('ID', axis=1)],
                     ignore_index=True)
# Fill missing values with 0
all_data = all_data.fillna(0)

all_data.head()

In [None]:
# Attach the shop, item and item-category features, each by its id column
for lookup, key_col in ((shops, 'shop_id'),
                        (items, 'item_id'),
                        (item_categories, 'item_category_id')):
    all_data = all_data.merge(lookup, on=key_col, how='left')

# Shrink dtypes after the merges
all_data = downcast(all_data)

# The lookup tables are no longer needed
del shops, items, item_categories, lookup, key_col
gc.collect();

In [None]:
all_data.head()

## visualization

In [None]:
import matplotlib as mpl
%matplotlib inline
mpl.rc('font', size=13)
figure, ax = plt.subplots() 
figure.set_size_inches(11, 5)
# Total monthly item sales: sum of item_cnt_month per date_block_num
group_month_sum = all_data.groupby('date_block_num').agg({'item_cnt_month': 'sum'})
group_month_sum = group_month_sum.reset_index()
sns.barplot(x='date_block_num', y='item_cnt_month', data=group_month_sum)
ax.set(title='Distribution of monthly item counts by date block number',
       xlabel='Date block number', 
       ylabel='Monthly item counts');

In [None]:
figure, ax= plt.subplots() 
figure.set_size_inches(11, 5)
# Total item sales by item_category_id (sum of item_cnt_month over all rows)
group_cat_sum = all_data.groupby('item_category_id').agg({'item_cnt_month': 'sum'})
group_cat_sum = group_cat_sum.reset_index()
# Plot only the categories whose total exceeds 10,000
group_cat_sum = group_cat_sum[group_cat_sum['item_cnt_month'] > 10000]
sns.barplot(x='item_category_id', y='item_cnt_month', data=group_cat_sum)
ax.set(title='Distribution of total item counts by item category id',
       xlabel='Item category ID', 
       ylabel='Total item counts')
ax.tick_params(axis='x', labelrotation=90) # Rotate the x tick labels

In [None]:
figure, ax= plt.subplots() 
figure.set_size_inches(11, 5)
# Total item sales by shop_id (sum of item_cnt_month over all rows)
group_shop_sum = all_data.groupby('shop_id').agg({'item_cnt_month': 'sum'})
group_shop_sum = group_shop_sum.reset_index()
# Plot only the shops whose total exceeds 10,000
group_shop_sum = group_shop_sum[group_shop_sum['item_cnt_month'] > 10000]
sns.barplot(x='shop_id', y='item_cnt_month', data=group_shop_sum)
ax.set(title='Distribution of total item counts by shop id',
       xlabel='Shop ID', 
       ylabel='Total item counts')
ax.tick_params(axis='x', labelrotation=90) # Rotate the x tick labels

## 정리

In [None]:
def resumetable(df):
    """Return a per-column summary of ``df`` and print its shape.

    The result is indexed by column name and holds the dtype, the number of
    nulls, the number of unique values and the values of the first three rows.
    Rows are taken by position, so the frame does not need a 0..n index; when
    ``df`` has fewer than three rows the missing sample columns are NaN.
    """
    print(f'Data Shape: {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns=['Dtypes'])
    summary['Null'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values
    sample_labels = ['First_values', 'Second_values', 'Third_values']
    for position, label in enumerate(sample_labels):
        # iloc (positional) instead of loc[0..2], which needs those index labels
        summary[label] = df.iloc[position].values if position < len(df) else np.nan

    return summary

In [None]:
resumetable(all_data)

### idx_feature를 조합해 이에 대한 상품 판매량의 평균 feature 생성

In [None]:
def add_mean_features(df, mean_features, idx_features):
    """Add the mean of ``item_cnt_month`` per ``idx_features`` group as a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``item_cnt_month`` and every column in ``idx_features``.
    mean_features : list
        Names of mean features created so far; the new name is appended to it.
    idx_features : list of str
        Grouping columns: ``'date_block_num'`` followed by one or two others.

    Returns
    -------
    (pandas.DataFrame, list)
        The downcast frame with the new ``<keys>_mean_sales`` column, and the
        updated ``mean_features`` list.
    """
    n_keys = len(idx_features)
    assert idx_features[0] == 'date_block_num' and n_keys in [2, 3]

    # e.g. 'item_id_mean_sales' or 'item_id_city_mean_sales'
    feature_name = '_'.join(idx_features[1:]) + '_mean_sales'

    # Mean monthly sales of each group, one row per group
    group_means = (
        df.groupby(idx_features)
        .agg({'item_cnt_month': 'mean'})
        .reset_index()
        .rename(columns={'item_cnt_month': feature_name})
    )

    # Broadcast the group mean back to every row, then shrink dtypes
    df = downcast(df.merge(group_means, on=idx_features, how='left'), False)
    mean_features.append(feature_name)

    del group_means
    gc.collect()

    return df, mean_features

In [None]:
# Mean-sales features whose grouping includes 'item_id':
# per (month, item) and per (month, item, city)
item_mean_features = []

for grouping in (['date_block_num', 'item_id'],
                 ['date_block_num', 'item_id', 'city']):
    all_data, item_mean_features = add_mean_features(df=all_data,
                                                     mean_features=item_mean_features,
                                                     idx_features=grouping)

In [None]:
item_mean_features

In [None]:
# List of derived features containing 'shop_id' in the grouping base features
shop_mean_features = []

# Create monthly average sales derived features grouped by ['date_block_num', 'shop_id', 'item_category_id']
all_data, shop_mean_features = add_mean_features(df=all_data, 
                                                 mean_features=shop_mean_features,
                                                 idx_features=['date_block_num', 'shop_id', 'item_category_id'])

In [None]:
shop_mean_features

### lag feature 생성 함수

In [None]:
def add_lag_features(df, lag_features_to_clip, idx_features, 
                     lag_feature, nlags=3, clip=False):
    """Add ``<lag_feature>_lag1`` .. ``_lag<nlags>`` columns to ``df``.

    A copy of the key columns plus ``lag_feature`` has its ``date_block_num``
    increased by one on every pass and is merged back on ``idx_features``, so
    pass ``i`` lines up the value from ``i`` months earlier. Rows without a
    match get 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``idx_features`` and ``lag_feature``.
    lag_features_to_clip : list
        Collector for lag column names that should be clipped later;
        extended only when ``clip`` is True.
    idx_features : list of str
        Merge keys; must include ``'date_block_num'``.
    lag_feature : str
        Column to lag.
    nlags : int, default 3
        Number of lag columns to create.
    clip : bool, default False
        Register the new columns in ``lag_features_to_clip``.

    Returns
    -------
    (pandas.DataFrame, list)
        The downcast frame with the lag columns, and ``lag_features_to_clip``.
    """
    # Only the keys and the value to lag are needed for the shifted copy
    shifted = df[idx_features + [lag_feature]].copy()

    for lag in range(1, nlags + 1):
        lag_col = f'{lag_feature}_lag{lag}'
        shifted.columns = idx_features + [lag_col]
        # One more month forward on every pass (cumulative shift)
        shifted['date_block_num'] += 1
        df = df.merge(shifted.drop_duplicates(), on=idx_features, how='left')
        # No matching earlier month -> 0
        df[lag_col] = df[lag_col].fillna(0)
        if clip:
            lag_features_to_clip.append(lag_col)

    df = downcast(df, False)

    del shifted
    gc.collect()

    return df, lag_features_to_clip

In [None]:
lag_features_to_clip = [] # list of lag features to be clipped to between 0 to 20 
idx_features = ['date_block_num', 'shop_id', 'item_id'] # base features

# Create 3 month lag features of item_cnt_month based on idx_features
all_data, lag_features_to_clip = add_lag_features(df=all_data, 
                                                  lag_features_to_clip=lag_features_to_clip,
                                                  idx_features=idx_features,
                                                  lag_feature='item_cnt_month', 
                                                  nlags=3,
                                                  clip=True)

In [None]:
all_data.head().T

In [None]:
# Three lag columns each for the sale-record count, the mean price and the revenue
for feature_to_lag in ('item_sell_count', 'item_price_mean', 'revenue_month'):
    all_data, lag_features_to_clip = add_lag_features(df=all_data,
                                                      lag_features_to_clip=lag_features_to_clip,
                                                      idx_features=idx_features,
                                                      lag_feature=feature_to_lag,
                                                      nlags=3)

In [None]:
X_test_temp = all_data[all_data['date_block_num'] == 34]
X_test_temp[item_mean_features].sum()

In [None]:
# Create lag features by item_mean_features element based on dx_features
for item_mean_feature in item_mean_features:
    all_data, lag_features_to_clip = add_lag_features(df=all_data, 
                                                      lag_features_to_clip=lag_features_to_clip, 
                                                      idx_features=idx_features, 
                                                      lag_feature=item_mean_feature, 
                                                      nlags=3)
# Remove features in item_mean_features
all_data = all_data.drop(item_mean_features, axis=1)

In [None]:
# Create lag features by shop_mean_features element based on ['date_block_num', 'shop_id', 'item_category_id']
for shop_mean_feature in shop_mean_features:
    all_data, lag_features_to_clip = add_lag_features(df=all_data,
                                                      lag_features_to_clip=lag_features_to_clip, 
                                                      idx_features=['date_block_num', 'shop_id', 'item_category_id'], 
                                                      lag_feature=shop_mean_feature, 
                                                      nlags=3)
# Remove features in shop_mean_features
all_data = all_data.drop(shop_mean_features, axis=1)

In [None]:
# Relative price change of each lagged month versus a per-item reference price.
# The current-month price cannot be the baseline: it is 0 for every row without
# a sale in that month (including all test rows, which were filled with 0), so
# dividing by it gives inf/NaN and exposes whether the target month had a sale.
# NOTE(review): the reference is the item's mean price over all months with
# sales; restrict it to earlier months if a strictly causal feature is required.
observed_prices = all_data.loc[all_data["item_price_mean"] > 0]
item_ref_price = all_data["item_id"].map(
    observed_prices.groupby("item_id")["item_price_mean"].mean()
)

for i in [1,2,3]:
    lag_col = "item_price_mean_lag" + str(i)
    # A lagged price of 0 means "no sale in that month" (missing lags were
    # filled with 0), not a real price, so it is treated as missing.
    lag_price = all_data[lag_col].where(all_data[lag_col] > 0)
    all_data["delta_price_lag" + str(i)] = (lag_price - item_ref_price) / item_ref_price

# Mean over the available lags; rows without any lagged price get 0
all_data['delta_price_lag_mean'] = all_data[['delta_price_lag1',
                                         'delta_price_lag2', 
                                         'delta_price_lag3']].mean(axis=1).fillna(0)

del observed_prices, item_ref_price, lag_price
all_data.head()

In [None]:
all_data['item_cnt_month_lag_mean'] = all_data[['item_cnt_month_lag1',
                                         'item_cnt_month_lag2', 
                                         'item_cnt_month_lag3']].mean(axis=1)

In [None]:
# Clip the target, its lag mean and the registered lag columns to [0, 20]
columns_to_clip = lag_features_to_clip + ['item_cnt_month', 'item_cnt_month_lag_mean']
all_data[columns_to_clip] = all_data[columns_to_clip].clip(0, 20)

In [None]:
# Month-over-month change between consecutive sales lags
gradient_specs = (
    ('lag_grad1', 'item_cnt_month_lag1', 'item_cnt_month_lag2'),
    ('lag_grad2', 'item_cnt_month_lag2', 'item_cnt_month_lag3'),
)
for grad_col, newer_lag, older_lag in gradient_specs:
    lag_difference = all_data[newer_lag] - all_data[older_lag]
    # Infinite or missing differences are stored as 0
    all_data[grad_col] = lag_difference.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Columns that describe the *current* month are only known once the month's
# sales are known, so they must not reach the model. 'revenue_month' was
# missing from this list: it is built from the target month's sales and is 0
# for every test row.
features_to_drop = ['item_price_mean', 'item_sell_count', 'revenue_month']

# The raw price lags, per-lag price deltas and per-lag sales counts are dropped
# as well; their aggregates (lag means and gradients) are kept.
for i in [1,2,3]:
    features_to_drop.append("item_price_mean_lag" + str(i) )
    features_to_drop.append("delta_price_lag" + str(i) )
    features_to_drop.append("item_cnt_month_lag" + str(i) )

all_data.drop(features_to_drop, axis = 1, inplace = True)
all_data.head()

In [None]:
all_data['brand_new'] = all_data['first_sale_date'] == all_data['date_block_num']

In [None]:
all_data['duration_after_first_sale'] = all_data['date_block_num'] - all_data['first_sale_date']
all_data = all_data.drop('first_sale_date', axis=1)

In [None]:
all_data['month'] = all_data['date_block_num']%12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
all_data["days"] = all_data["month"].map(days)

In [None]:
all_data.info()

In [None]:
all_data.head(10)

In [None]:
# Drop the helper 'days' column, then shrink dtypes again and show the result
all_data = all_data.drop(['days'], axis=1)
all_data = downcast(all_data, False) # Data downcasting
all_data.info()

In [None]:
# all_data = all_data.drop(['delta_price_lag1','delta_price_lag2','delta_price_lag3','days'],axis=1)

In [None]:
# Discard the first three months (date_block_num 0-2)
all_data = all_data[all_data['date_block_num'] >= 3]

In [None]:
# Time-based split: months < 33 train, month 33 validation, month 34 test
is_train = all_data['date_block_num'] < 33
is_valid = all_data['date_block_num'] == 33
is_test = all_data['date_block_num'] == 34

# Features: every column except the target
X_train = all_data[is_train].drop(columns=['item_cnt_month'])
X_valid = all_data[is_valid].drop(columns=['item_cnt_month'])
X_test = all_data[is_test].drop(columns=['item_cnt_month'])

# Targets
Y_train = all_data[is_train]['item_cnt_month']
Y_valid = all_data[is_valid]['item_cnt_month']

# The combined table and the masks are no longer needed
del all_data, is_train, is_valid, is_test
gc.collect();

BayesianOptimization parameter 수정

In [None]:
# from bayes_opt import BayesianOptimization
# from sklearn.model_selection import StratifiedKFold
# import lightgbm as lgb
# from sklearn.metrics import mean_squared_error

In [None]:
# predictors = X_train.columns.tolist()

# bayesian_tr_index, bayesian_val_index = list(StratifiedKFold(2, random_state=12, shuffle=True).split(X_train, Y_train))[0]

In [None]:
# def lgb_black_box(
#     num_leaves,  # int
#     min_data_in_leaf,  # int
#     learning_rate,
#     min_sum_hessian_in_leaf,    # int  
#     feature_fraction,
#     lambda_l1,
#     lambda_l2,
#     min_gain_to_split,
#     max_depth):
    
#     # lgb need some inputs as int but BayesianOptimization library send continuous values values. so we change type.

#     num_leaves = int(num_leaves)
#     min_data_in_leaf = int(min_data_in_leaf)
#     max_depth = int(max_depth)
    
#     # all this hyperparameter values are just for test. our goal in this kernel is how to use bayesian optimization
#     # you can see lgb documentation for more info about hyperparameters
#     params = {
#         'num_leaves': num_leaves,
#         'max_bin': 63,
#         'min_data_in_leaf': min_data_in_leaf,
#         'learning_rate': learning_rate,
#         'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
#         'bagging_fraction': 1.0,
#         'bagging_freq': 5,
#         'feature_fraction': feature_fraction,
#         'lambda_l1': lambda_l1,
#         'lambda_l2': lambda_l2,
#         'min_gain_to_split': min_gain_to_split,
#         'max_depth': max_depth,
#         'save_binary': True, 
#         'seed': 1337,
#         'feature_fraction_seed': 1337,
#         'bagging_seed': 1337,
#         'drop_seed': 1337,
#         'data_random_seed': 1337,
#         'objective': 'regression',
#         'boosting_type': 'gbdt',
#         'verbose': 1,
#         'metric': 'rmse',
#         'is_unbalance': True,
#         'boost_from_average': False, 
#     }
    
#     train_data = lgb.Dataset(X_train.iloc[bayesian_tr_index].values,
#                             label = Y_train[bayesian_tr_index],
#                             feature_name=predictors,
#                             free_raw_data = False)
    
    
#     validation_data = lgb.Dataset(X_train.iloc[bayesian_val_index].values,
#                                  label= Y_train[bayesian_val_index],
#                                  feature_name=predictors,
#                                  free_raw_data=False)
    
#     num_round = 1000
#     clf = lgb.train(params, train_data, num_round, valid_sets = [validation_data], verbose_eval=250,
#                  early_stopping_rounds = 50)
    
#     predictions = clf.predict(X_train.iloc[bayesian_val_index].values,
#                               num_iteration = clf.best_iteration)
    
# #      we need to compute a regression score. roc_auc_score is a classification score. we can't use it
# #     score = metrics.roc_auc_score(y_valid_train[bayesian_val_index], predictions)
#     mse = mean_squared_error(Y_train[bayesian_val_index], predictions)
#     rmse = np.sqrt(mse)
# #     BayesianOptimization maximizes the objective, so we return the negated RMSE (a higher value means a better model)
#     return -rmse

In [None]:
# LGB_bound = {
#     "num_leaves" : (5, 20),
#     "min_data_in_leaf" : (5, 20),
#     "learning_rate" : (0.01, 0.3),
#     "min_sum_hessian_in_leaf" : (0.00001, 0.01),
#     "feature_fraction" : (0.05, 0.5),
#     "lambda_l1" : (0, 5.0),
#     "lambda_l2" : (0, 5.0),
#     'min_gain_to_split': (0, 1.0),
#     'max_depth':(3,15)
# }

In [None]:
# optimizer = BayesianOptimization(
#     f=lgb_black_box,
#     pbounds = LGB_bound,
#     random_state = 13
# )
# print(optimizer.space.keys)

In [None]:
# init_points = 3
# n_iter = 3

# optimizer.maximize(init_points = init_points, n_iter = n_iter)

In [None]:
# optimizer.max["params"]

In [None]:
# # here i say hey optimizer! search for this new parameter to see if they are really better or not.
# # probe = کاوش
# #  tmp code
# #  feature fraction = 0.3064, l1=  2.659,  l2 =   0.3892, learning =  0.1054,
# # max_depth = 14.76, min_da  19.7,   min_ga = 0.6548,   min_su = 0.000626, num_lea 19.06

# optimizer.probe(
#     params = {
#         'feature_fraction': 0.3064, 
#             'lambda_l1': 2.659, 
#             'lambda_l2': 0.3892, 
#             'learning_rate': 0.1054, 
#             'max_depth': 14.76, 
#             'min_data_in_leaf': 19.7, 
#             'min_gain_to_split': 0.6548, 
#             'min_sum_hessian_in_leaf': 0.000626, 
#             'num_leaves': 19.06
#     },
#     lazy = False
# )

In [None]:
# optimizer.max["params"]

In [None]:
# optimized_lgb_params = {
#         'num_leaves': int(optimizer.max["params"]["num_leaves"]),
#         'max_bin': 63,
#         'min_data_in_leaf': int(optimizer.max["params"]["min_data_in_leaf"]),
#         'learning_rate': optimizer.max["params"]["learning_rate"],
#         'min_sum_hessian_in_leaf': optimizer.max["params"]["min_sum_hessian_in_leaf"],
#         'bagging_fraction': 1.0,
#         'bagging_freq': 5,
#         'feature_fraction': optimizer.max["params"]["feature_fraction"],
#         'lambda_l1': optimizer.max["params"]["lambda_l1"],
#         'lambda_l2': optimizer.max["params"]["lambda_l2"],
#         'min_gain_to_split': optimizer.max["params"]["min_gain_to_split"],
#         'max_depth': int(optimizer.max["params"]["max_depth"]),
#         'save_binary': True, 
#         'seed': 1337,
#         'feature_fraction_seed': 1337,
#         'bagging_seed': 1337,
#         'drop_seed': 1337,
#         'data_random_seed': 1337,
#         'objective': 'regression',
#         'boosting_type': 'gbdt',
#         'verbose': 1,
#         'metric': 'rmse',
#         'is_unbalance': True,
#         'boost_from_average': False, 
#     }

In [None]:
# n_folds = 5
# random_seed=6
# dtrain = lgb.Dataset(X_train, Y_train)
# dvalid = lgb.Dataset(X_valid, Y_valid)

# def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
#     params = {'application':'binary',
#               'num_iterations': 500 , 
#               'learning_rate':0.05, 
#               'early_stopping_round':100,
#               "objective" : "binary",
#               "num_threads" : 20 ,
#              }
#     params["num_leaves"] = int(round(num_leaves))
#     params['feature_fraction'] = max(min(feature_fraction, 1), 0)
#     params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
#     params['max_depth'] = int(round(max_depth))
#     params['lambda_l1'] = max(lambda_l1, 0)
#     params['lambda_l2'] = max(lambda_l2, 0)
#     params['min_split_gain'] = min_split_gain
#     params['min_child_weight'] = min_child_weight
#     cv_result = lgb.cv(params, dtrain,
#                        nfold=n_folds, seed=random_seed, 
#                        stratified=True, verbose_eval =200, 
#                        metrics=["None"],
#                       )
#     return max(cv_result['f1-mean'])
# lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
#                                         'feature_fraction': (0.1, 0.9),
#                                         'bagging_fraction': (0.8, 1),
#                                         'max_depth': (5, 8.99),
#                                         'lambda_l1': (0, 5),
#                                         'lambda_l2': (0, 3),
#                                         'min_split_gain': (0.001, 0.1),
#                                         'min_child_weight': (5, 50)}, random_state=0)
# lgb_model = lgbBO.fit(X=X_train, Y=Y_train, eval_set = (X_valid, Y_valid))
# init_round=5
# opt_round = 10
# lgb_model.maximize(init_points=init_round, n_iter=opt_round)
# params = lgb_model.res['max']['max_params']
# params

----------------------------
BayesianOptimization 파라미터 최적값

아래 최종 학습 셀의 `params` 대신 `optimized_lgb_params`를 넣으면 됨 (위 BayesianOptimization 셀의 주석을 해제하고 실행한 경우에만 사용 가능)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide pandas display settings:
#   - floats are rendered with three decimals
#   - up to 100 rows / 100 columns are shown before truncation
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import seaborn as sns
import gc
import pickle
import time
from itertools import product
import optuna
import lightgbm as lgb
from lightgbm import plot_importance
import sklearn.metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
print('Done')

In [None]:
lgb.__version__

In [None]:
# LightGBM datasets used by the Optuna objective.
# The validation set names the training set as its `reference`, which is how
# the LightGBM docs say a validation Dataset should be declared.
lgb_train = lgb.Dataset(data=X_train, label=Y_train)
lgb_valid = lgb.Dataset(data=X_valid, label=Y_valid, reference=lgb_train)


In [None]:
def rmsle(y, y_pred):
    """Return the root of the mean squared error between ``y`` and ``y_pred``.

    Note: despite the name, no logarithm is applied anywhere -- this is plain
    RMSE. The name is kept so existing callers keep working.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def plot_features(booster, figsize):
    """Plot feature importances of ``booster`` on a fresh figure.

    A new single-axes matplotlib figure of size ``figsize`` is created and
    handed to LightGBM's ``plot_importance``; its return value is returned
    unchanged.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(ax=axis, booster=booster)

In [None]:
def objective(trial):
    """Optuna objective: train LightGBM with sampled parameters, return validation RMSE.

    Trains on the module-level ``lgb_train`` dataset, monitors ``lgb_valid``
    for early stopping, and scores the fitted model on ``X_valid`` / ``Y_valid``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters below.

    Returns:
        The RMSE of the model's predictions on the validation set
        (lower is better).
    """
    param = {
        # Fixed settings
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        # The same Dataset objects are reused by every trial while
        # min_child_samples changes; the LightGBM docs advise disabling
        # feature pre-filtering in that situation.
        "feature_pre_filter": False,
        # Search space
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    # Early stopping and per-iteration logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from lgb.train() in LightGBM 4.0, while the callbacks work on both
    # 3.3+ and 4.x.
    model = lgb.train(
        param,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=15),
            lgb.log_evaluation(period=1),
        ],
    )

    y_pred = model.predict(X_valid)
    # `rmsle` computes plain RMSE (see its definition); this is the value the
    # study minimizes.
    valid_rmse = rmsle(Y_valid, y_pred)

    return valid_rmse

In [None]:
# Search LightGBM hyper-parameters with Optuna, minimizing the validation RMSE
# returned by `objective`.
# The sampler is seeded so the sequence of suggested parameters is repeatable
# across "Restart & Run All"; TPESampler is also what Optuna uses by default
# for a single-objective study. The seed matches the 'random_state' used for
# the final model.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=10))
study.optimize(objective, n_trials=5)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)


In [None]:
# Keep the sampled hyper-parameters of the study's best trial.
# `study.best_params` is Optuna's shortcut for `study.best_trial.params`.
best_params = study.best_params
print(f'Best trial parameters\n{best_params}')

In [None]:
# The study only stores the *sampled* values; add back the fixed settings used
# inside `objective` so `best_params` is a complete LightGBM parameter dict.
fixed_params = dict(
    objective="regression",
    metric="rmse",
    verbosity=-1,
    boosting_type="gbdt",
)
best_params.update(fixed_params)
best_params


In [None]:
import lightgbm as lgb

# LightGBM hyper-parameters for the final model.
# 'objective' is not set, so LightGBM falls back to its default objective
# ('regression' per the LightGBM parameter docs).
params = {'metric': 'rmse',
          'num_leaves': 256,
          'learning_rate': 0.005,
          'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'bagging_freq': 5,
          'force_col_wise' : True,
          'random_state': 10}

# Columns LightGBM should treat as categorical instead of numeric
cat_features = ['shop_id', 'item_id', 'city', 'shops_category', 'item_category_id', 'item_cat_1', 'item_cat_2', 'type', 'subtype']

# Train / validation datasets.
# The categorical columns are declared on the training Dataset (passing
# `categorical_feature` to lgb.train() is deprecated in recent LightGBM), and
# the validation Dataset references the training one so both share the same
# feature definitions.
dtrain = lgb.Dataset(X_train, Y_train, categorical_feature=cat_features)
dvalid = lgb.Dataset(X_valid, Y_valid, reference=dtrain)

# Train the LightGBM model.
# Early stopping (200 rounds without improvement on the validation set) and
# logging (every 100 iterations) are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgb.train() in LightGBM 4.0, while the callbacks work on both 3.3+ and 4.x.
lgb_model = lgb.train(params=params,
                      train_set=dtrain,
                      num_boost_round=2000,
                      valid_sets=[dtrain, dvalid],
                      callbacks=[lgb.early_stopping(stopping_rounds=200),
                                 lgb.log_evaluation(period=100)])

In [None]:
# Predict the test month and clip the predictions to the range [0, 20].
# NOTE(review): assumes the rows of X_test are in the same order as the rows of
# `submission` -- confirm against how the test rows were appended to all_data.
raw_preds = lgb_model.predict(X_test)
preds = np.clip(raw_preds, 0, 20)

# Store the predictions in the submission frame and write it to disk
submission['item_cnt_month'] = preds
submission.to_csv('LGBM_submission_final_final.csv', index=False)

In [None]:
# preds_lst = []
# competitionDir = '/kaggle/input/competitive-data-science-predict-future-sales'
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         if (dirname != competitionDir) & ('.csv' in filename):
#             df = pd.read_csv(os.path.join(dirname, filename))
#             if len(df) == len(submission):
#                 try:
#                     preds_lst.append(df['item_cnt_month'])
#                 except Exception:
#                     pass
# preds_lst.append(preds)

# submission['item_cnt_month'] = np.array(preds_lst).mean(axis=0).transpose()
# submission.to_csv("submission.csv", index=False)

In [None]:
from lightgbm import plot_importance
# Feature-importance chart of the final model on a 10x10 figure, drawn with the
# notebook's `plot_features` helper (it creates the figure and calls
# LightGBM's plot_importance on it).
plot_features(lgb_model, (10, 10))

In [None]:
#최종편집 - 230118 14:20
#최종편집 - 230118 15:00