# 메모리 절약을 위한 데이터 형변환

데이터 타입 변환은 5장 [향후 판매량 예측 경진대회 데이터](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data)를 활용해 수행했습니다.

In [1]:
# appendix/downcasting.ipynb

import pandas as pd

# Root directory of the competition files inside the Kaggle kernel.
data_path = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Read the five competition CSV files, in this order, straight into their
# own DataFrames.
sales_train, shops, items, item_categories, test = (
    pd.read_csv(data_path + file_name)
    for file_name in (
        'sales_train.csv',
        'shops.csv',
        'items.csv',
        'item_categories.csv',
        'test.csv',
    )
)

In [2]:
# Enrich the daily sales records with shop, item and item-category
# attributes. Left joins keep every sales row even without a match.
train = (
    sales_train
    .merge(shops, on='shop_id', how='left')
    .merge(items, on='item_id', how='left')
    .merge(item_categories, on='item_category_id', how='left')
)

In [3]:
# Show the dtype of every column in the merged frame.
train.dtypes

date                   object
date_block_num          int64
shop_id                 int64
item_id                 int64
item_price            float64
item_cnt_day          float64
shop_name              object
item_name              object
item_category_id        int64
item_category_name     object
dtype: object

In [4]:
# Per-column memory in bytes. `deep` is left at its default, so object
# columns are measured by their references only, not by the Python strings
# they point to.
train.memory_usage()

Index                 23486792
date                  23486792
date_block_num        23486792
shop_id               23486792
item_id               23486792
item_price            23486792
item_cnt_day          23486792
shop_name             23486792
item_name             23486792
item_category_id      23486792
item_category_name    23486792
dtype: int64

In [5]:
# Total (shallow) footprint of the frame, converted from bytes to MiB.
total_bytes = train.memory_usage().sum()
start_mem = total_bytes / 1024**2
start_mem

246.3862533569336

In [6]:
# Shrink every numeric column of `train` to the smallest dtype that can hold
# its values. Only bool, integer and float columns are converted; any other
# dtype (object, string, category, datetime, ...) is left untouched so it is
# never pushed through round()/pd.to_numeric by accident.
for col in train.columns:
    col_dtype = train[col].dtype
    if col_dtype.name == 'bool':
        train[col] = train[col].astype('int8')
    elif pd.api.types.is_integer_dtype(col_dtype):
        train[col] = pd.to_numeric(train[col], downcast='integer')
    elif pd.api.types.is_float_dtype(col_dtype):
        # A float column holding only whole numbers can be stored as an
        # integer. NaN never equals itself, so a column with missing values
        # fails this check and stays a float.
        if (train[col].round() == train[col]).all():
            train[col] = pd.to_numeric(train[col], downcast='integer')
        else:
            train[col] = pd.to_numeric(train[col], downcast='float')

In [7]:
# Show the column dtypes again to see which ones were narrowed.
train.dtypes

date                   object
date_block_num           int8
shop_id                  int8
item_id                 int16
item_price            float32
item_cnt_day            int16
shop_name              object
item_name              object
item_category_id         int8
item_category_name     object
dtype: object

In [8]:
# Per-column memory in bytes (shallow: object columns are still measured by
# their references only).
train.memory_usage()

Index                 23486792
date                  23486792
date_block_num         2935849
shop_id                2935849
item_id                5871698
item_price            11743396
item_cnt_day           5871698
shop_name             23486792
item_name             23486792
item_category_id       2935849
item_category_name    23486792
dtype: int64

In [9]:
# Total (shallow) footprint of the frame, converted from bytes to MiB.
total_bytes = train.memory_usage().sum()
end_mem = total_bytes / 1024**2
end_mem

142.7920331954956

In [10]:
# Share of the original footprint that was saved, as a percentage.
saved_pct = 100 * (start_mem - end_mem) / start_mem
print("{:.1f}% 압축됨".format(saved_pct))

42.0% 압축됨


In [11]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and the same object is returned.

    Conversion rules, applied column by column:

    * ``bool`` columns become ``int8``.
    * Integer columns, and float columns whose values are all whole
      numbers, go through ``pd.to_numeric(..., downcast='integer')``.
    * All other float columns go through
      ``pd.to_numeric(..., downcast='float')``.
    * Every other dtype (object, string, category, datetime, ...) is left
      untouched.

    Args:
        df: DataFrame to compress; its columns are reassigned in place.
        verbose: When true, print the percentage of memory saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Shallow measurement: object columns count only their references.
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        if dtype.name == 'bool':
            df[col] = series.astype('int8')
        elif pd.api.types.is_integer_dtype(dtype):
            df[col] = pd.to_numeric(series, downcast='integer')
        elif pd.api.types.is_float_dtype(dtype):
            # A float column holding only whole numbers can be stored as an
            # integer. NaN never equals itself, so a column with missing
            # values fails this check and stays a float.
            if (series.round() == series).all():
                df[col] = pd.to_numeric(series, downcast='integer')
            else:
                df[col] = pd.to_numeric(series, downcast='float')
        # Anything else is non-numeric: converting it would either raise or
        # silently change the meaning of the data, so leave it alone.

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid reporting "nan%" when there was nothing to measure.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('{:.1f}% 압축됨'.format(saved_pct))

    return df