In [4]:
import pandas as pd
import numpy as np


import os

from sqlalchemy import create_engine, Integer, Float, String, Date
from sqlalchemy.types import TypeEngine

In [6]:
# Load the raw CSV from the samsung data folder, then print its column
# dtypes and memory usage as a baseline before any dtype optimization.
sscard_df = pd.read_csv("../data/DB/samsung/DIVE_FINAL_F.csv")

sscard_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17237620 entries, 0 to 17237619
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   CRI_YM         int64  
 1   MRC_ADR        object 
 2   GENDER         int64  
 3   AGE_GR         object 
 4   JOB_GR         object 
 5   CATEGORY_L_NM  object 
 6   CATEGORY_M_NM  object 
 7   MAIN_CSM_AREA  object 
 8   SUM_CNT        float64
 9   SUM_AMT        float64
dtypes: float64(2), int64(2), object(6)
memory usage: 1.3+ GB


In [7]:
# Preview the first rows of the frame (last expression, so it is rendered as the cell output).
sscard_df.head()

Unnamed: 0,CRI_YM,MRC_ADR,GENDER,AGE_GR,JOB_GR,CATEGORY_L_NM,CATEGORY_M_NM,MAIN_CSM_AREA,SUM_CNT,SUM_AMT
0,202201,경상남도창원시,1,C,02.회사원,생활,가례,경상북도남부,3.0,6106000.0
1,202201,충청남도남부,2,E,05.기타,생활,가례,충청남도남부,3.0,115000.0
2,202201,울산광역시남구,2,E,04.자영업자,생활,가례,울산광역시남구,3.0,99000.0
3,202201,대구광역시기타,2,C,05.기타,생활,가례,인천광역시기타,3.0,43000.0
4,202201,울산광역시남구,2,B,04.자영업자,생활,가례,인천광역시기타,3.0,1365000.0


In [8]:
def type_update_df(table_name: str, df: pd.DataFrame) -> pd.DataFrame:
    """Apply the table-specific dtype conversions to a DataFrame.

    For a known table, the year-month column (integers such as ``202201``)
    is parsed with the ``%Y%m`` format into datetimes, and one code column is
    converted to ``category``:

    * ``"lotte_mart"`` / ``"lotte_cs"``: ``stdt`` -> datetime, ``bcode1`` -> category
    * ``"samsung"``: ``CRI_YM`` -> datetime, ``GENDER`` -> category

    The input frame is never modified: the conversions are applied to a
    shallow copy, so untouched columns share memory with ``df`` and only the
    two converted columns are newly allocated.

    Args:
        table_name: Identifier of the source table.
        df: Frame holding the columns listed above for ``table_name``.

    Returns:
        A new DataFrame with the converted columns, or ``df`` itself
        (unchanged) when ``table_name`` is not one of the known tables.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    if table_name in ("lotte_mart", "lotte_cs"):
        date_col, category_col = "stdt", "bcode1"
    elif table_name == "samsung":
        date_col, category_col = "CRI_YM", "GENDER"
    else:
        # Unknown table: nothing to convert, hand the frame back as-is.
        return df

    # Shallow copy: replacing whole columns below rebinds them on the copy
    # only, so the caller's frame keeps its original dtypes.
    converted = df.copy(deep=False)
    converted[date_col] = pd.to_datetime(converted[date_col], format="%Y%m")
    converted[category_col] = converted[category_col].astype("category")

    return converted


def optimize_df(table_name: str, df: pd.DataFrame) -> pd.DataFrame:
    """Reduce a raw DataFrame's memory footprint before loading it into the database.

    Steps, in order:

    1. Drop every row that contains a missing value.
    2. Apply the table-specific conversions from ``type_update_df``.
    3. Convert the remaining ``object`` columns to ``category``.
    4. Pass the remaining ``int64`` / ``float64`` columns through
       ``pd.to_numeric(..., downcast="integer")``. Columns whose values fit a
       smaller integer dtype exactly are narrowed; per pandas' ``to_numeric``
       semantics, columns that cannot be downcast are left unchanged.

    Args:
        table_name: Table identifier forwarded to ``type_update_df``.
        df: Raw input frame. ``dropna`` returns a new frame, so the object
            passed in is not modified by this function.

    Returns:
        The optimized DataFrame.
    """
    df = df.dropna()

    df = type_update_df(table_name, df)

    # String column optimization
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].astype("category")

    # Numeric column optimization
    # While searching for the smallest dtype, pandas trial-casts float columns
    # to narrow integer types; out-of-range values then make NumPy emit an
    # "invalid value encountered in cast" RuntimeWarning. pandas compares the
    # cast result against the original before accepting it, so the warning is
    # noise here and is silenced only for this loop.
    with np.errstate(invalid="ignore"):
        for col in df.select_dtypes(include=["int64", "float64"]).columns:
            df[col] = pd.to_numeric(df[col], downcast="integer")

    return df

In [9]:
# Replace the raw frame with its optimized version (rows with missing values dropped, dtypes narrowed).
sscard_df = optimize_df("samsung", sscard_df)

  new_result = trans(result).astype(dtype)


In [10]:
# Print dtypes and memory usage again to compare against the pre-optimization baseline.
sscard_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17237620 entries, 0 to 17237619
Data columns (total 10 columns):
 #   Column         Dtype         
---  ------         -----         
 0   CRI_YM         datetime64[ns]
 1   MRC_ADR        category      
 2   GENDER         category      
 3   AGE_GR         category      
 4   JOB_GR         category      
 5   CATEGORY_L_NM  category      
 6   CATEGORY_M_NM  category      
 7   MAIN_CSM_AREA  category      
 8   SUM_CNT        int32         
 9   SUM_AMT        int64         
dtypes: category(7), datetime64[ns](1), int32(1), int64(1)
memory usage: 443.9 MB


In [16]:
# Load the raw 002_ltmb_k7_data CSV from the lotte data folder and print its
# column dtypes and memory usage as a baseline.
lotte_cs_df = pd.read_csv("../data/DB/lotte/002_ltmb_k7_data.csv")
lotte_cs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4998619 entries, 0 to 4998618
Data columns (total 11 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   stdt       int64 
 1   channel    object
 2   ch_region  object
 3   bcode1     int64 
 4   bname1     object
 5   count      int64 
 6   money      int64 
 7   ppl        int64 
 8   gender     object
 9   age        int64 
 10  region     object
dtypes: int64(6), object(5)
memory usage: 419.5+ MB


In [17]:
# Optimize using the "lotte_cs" conversions, then print dtypes and memory usage for comparison.
lotte_cs_df = optimize_df("lotte_cs", lotte_cs_df)
lotte_cs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4998619 entries, 0 to 4998618
Data columns (total 11 columns):
 #   Column     Dtype         
---  ------     -----         
 0   stdt       datetime64[ns]
 1   channel    category      
 2   ch_region  category      
 3   bcode1     category      
 4   bname1     category      
 5   count      int32         
 6   money      int32         
 7   ppl        int16         
 8   gender     category      
 9   age        int8          
 10  region     category      
dtypes: category(6), datetime64[ns](1), int16(1), int32(2), int8(1)
memory usage: 119.2 MB


In [22]:
# Load the raw 003_ltmb_mart_data CSV from the lotte data folder and print its
# column dtypes and memory usage as a baseline.
# NOTE(review): the name ends in `_cf` while the other frames use `_df`
# (sscard_df, lotte_cs_df) — likely a typo; if renamed, update every later
# reference in the notebook at the same time.
lotte_mart_cf = pd.read_csv("../data/DB/lotte/003_ltmb_mart_data.csv")
lotte_mart_cf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10171784 entries, 0 to 10171783
Data columns (total 11 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   stdt       int64 
 1   channel    object
 2   ch_region  object
 3   bcode1     int64 
 4   bname1     object
 5   count      int64 
 6   money      int64 
 7   ppl        int64 
 8   gender     object
 9   age        int64 
 10  region     object
dtypes: int64(6), object(5)
memory usage: 853.7+ MB


In [23]:
# Optimize using the "lotte_mart" conversions, then print dtypes and memory usage for comparison.
lotte_mart_cf = optimize_df("lotte_mart", lotte_mart_cf)
lotte_mart_cf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10171784 entries, 0 to 10171783
Data columns (total 11 columns):
 #   Column     Dtype         
---  ------     -----         
 0   stdt       datetime64[ns]
 1   channel    category      
 2   ch_region  category      
 3   bcode1     category      
 4   bname1     category      
 5   count      int32         
 6   money      int32         
 7   ppl        int16         
 8   gender     category      
 9   age        int8          
 10  region     category      
dtypes: category(6), datetime64[ns](1), int16(1), int32(2), int8(1)
memory usage: 242.5 MB
