# 라이브러리 설치

In [None]:
pip install autogluon



In [None]:
# Fix broken Korean glyphs in Colab (restart the runtime after installing).
# Install the fonts-nanum package non-interactively (-y).
!sudo apt-get install -y fonts-nanum
# Force a rebuild of the fontconfig cache (-f) with verbose output (-v).
!sudo fc-cache -fv
# Delete matplotlib's cache directory — presumably so its font list is rebuilt
# the next time matplotlib is imported.
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 18 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 1s (18.4 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package fonts-nanum.
(Reading database ... 120874 files and direc

# 라이브러리 로딩 및 초기화

In [None]:
import pandas as pd
import numpy as np
import random
import os
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
# Use the NanumBarunGothic font family for matplotlib text (needed for Korean labels).
plt.rc('font', family='NanumBarunGothic')
# Suppress every warning raised from this point on.
warnings.filterwarnings(action='ignore')

In [None]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both the standard-library ``random`` module and NumPy's global
    random state with the same value.

    Args:
        seed: Seed value applied to every source.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(96)  # fix the seed

# 구글 드라이브 연동

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


# 데이터 로딩 및 요약

In [None]:
# Read the train and test CSV files from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/데이콘/추석 선물/train.csv",
        "/content/drive/MyDrive/데이콘/추석 선물/test.csv",
    )
)

In [None]:
# Print column names, non-null counts, dtypes and memory usage of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5872 entries, 0 to 5871
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             5872 non-null   object
 1   추석까지 남은 기간(주)  5872 non-null   int64 
 2   쇼핑몰 구분         5872 non-null   object
 3   가격(원)          5872 non-null   int64 
 4   프로모션 여부        5872 non-null   int64 
 5   도시 유형          5872 non-null   object
 6   지역 유형          5872 non-null   object
 7   쇼핑몰 유형         5872 non-null   object
 8   선물 유형          5872 non-null   object
 9   수요량            5872 non-null   int64 
dtypes: int64(4), object(6)
memory usage: 458.9+ KB


In [None]:
# Print column names, non-null counts, dtypes and memory usage of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3915 entries, 0 to 3914
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             3915 non-null   object
 1   추석까지 남은 기간(주)  3915 non-null   int64 
 2   쇼핑몰 구분         3915 non-null   object
 3   가격(원)          3915 non-null   int64 
 4   프로모션 여부        3915 non-null   int64 
 5   도시 유형          3915 non-null   object
 6   지역 유형          3915 non-null   object
 7   쇼핑몰 유형         3915 non-null   object
 8   선물 유형          3915 non-null   object
dtypes: int64(3), object(6)
memory usage: 275.4+ KB


In [None]:
# Remove the ID column from both frames before modelling.
train = train.drop(columns="ID")
test = test.drop(columns="ID")

In [None]:
# Display the first five rows of the training frame.
# NOTE(review): the saved output under this cell still lists an ID column even
# though the previous cell drops it — the cells were presumably executed out of
# order; confirm with Restart & Run All.
train.head()

Unnamed: 0,ID,추석까지 남은 기간(주),쇼핑몰 구분,가격(원),프로모션 여부,도시 유형,지역 유형,쇼핑몰 유형,선물 유형,수요량
0,TRAIN_0000,1,쇼핑몰 15,212000,0,도시 6,지역 1,쇼핑몰 유형 2,명절혼합과일선물세트,28
1,TRAIN_0001,2,쇼핑몰 72,113000,0,도시 21,지역 1,쇼핑몰 유형 1,발효홍삼선물세트,27
2,TRAIN_0002,0,쇼핑몰 15,67000,0,도시 6,지역 1,쇼핑몰 유형 2,실속스팸선물세트,769
3,TRAIN_0003,1,쇼핑몰 13,206000,0,도시 12,지역 3,쇼핑몰 유형 1,자연산프리미엄버섯선물세트,27
4,TRAIN_0004,1,쇼핑몰 65,140000,0,도시 16,지역 2,쇼핑몰 유형 2,자연산새우선물세트,337


# DecisionTree(전처리 X) - RMSE : 242.47507

In [None]:
from sklearn.preprocessing import LabelEncoder
ordinal_features = ['쇼핑몰 구분', '도시 유형', '지역 유형', '쇼핑몰 유형', '선물 유형']

for feature in ordinal_features:
    # Learn the label -> integer mapping from the training column only.
    le = LabelEncoder().fit(train[feature])
    train[feature] = le.transform(train[feature])

    # The test data may contain values that never appeared in the train data.
    # So the test column is not transformed directly: its unique values are
    # checked first, and any label unknown to the encoder is appended to the
    # encoder's classes before the test column is transformed.
    # The encoder is never fitted on test data, to avoid data leakage.
    unseen = [label for label in np.unique(test[feature]) if label not in le.classes_]
    for label in unseen:
        le.classes_ = np.append(le.classes_, label)
    test[feature] = le.transform(test[feature])

In [None]:
# Imported here because this is the first cell that uses DecisionTreeRegressor;
# without it a fresh "Restart & Run All" fails with a NameError on this cell.
from sklearn.tree import DecisionTreeRegressor

# Split the training frame into the feature matrix and the target column (수요량).
x = train.drop("수요량", axis = 1)
y = train["수요량"]

# Fit a decision tree with a fixed random_state and predict on the test frame.
model = DecisionTreeRegressor(random_state = 42)
model.fit(x, y)
pred = model.predict(test)

In [None]:
# Load the sample submission, overwrite its 수요량 column with the predictions,
# and save the result as 1.csv without the index.
sub = pd.read_csv("/content/drive/MyDrive/데이콘/추석 선물/sample_submission.csv").assign(**{"수요량": pred})
sub.to_csv("1.csv", index = False)

# 전처리 + DecisionTree - RMSE : 216.99309


In [None]:
from sklearn.preprocessing import MinMaxScaler
# 1. Outlier handling
# Rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed from `train`, first for
# 수요량 and then for 가격(원). The second pass computes its quartiles on the
# rows that survived the first pass.
for column in ['수요량', '가격(원)']:
    Q1 = train[column].quantile(0.25)
    Q3 = train[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (train[column] < lower_bound) | (train[column] > upper_bound)
    train = train[~outliers]

# 2. Categorical encoding (one-hot)
categorical_columns = ['쇼핑몰 구분', '도시 유형', '지역 유형', '쇼핑몰 유형', '선물 유형']
data_encoded = pd.get_dummies(train, columns=categorical_columns)
test_encoded = pd.get_dummies(test, columns=categorical_columns)

# The dummy columns are derived independently from each frame, so a category
# that occurs in only one of them (e.g. after the outlier rows were removed
# from train) yields mismatched columns and breaks prediction. Align the test
# frame to the training feature columns: missing dummies are filled with 0 and
# dummies unknown to the training data are dropped.
feature_columns = data_encoded.columns.drop('수요량')
test_encoded = test_encoded.reindex(columns=feature_columns, fill_value=0)

# 3. Feature scaling
# The scaler is fitted on the training prices only and then applied to test.
scaler = MinMaxScaler()
data_encoded['가격(원)'] = scaler.fit_transform(data_encoded['가격(원)'].values.reshape(-1, 1))
test_encoded['가격(원)'] = scaler.transform(test_encoded['가격(원)'].values.reshape(-1, 1))

data_encoded.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Target column (수요량) and the remaining columns as features.
y = data_encoded["수요량"]
x = data_encoded.drop(columns=["수요량"])

# Fit a decision tree with a fixed random_state, then predict on the encoded test frame.
model = DecisionTreeRegressor(random_state = 42).fit(x, y)
pred = model.predict(test_encoded)

In [None]:
# Load the sample submission, overwrite its 수요량 column with the predictions,
# and save the result as 2.csv without the index.
sub = pd.read_csv("/content/drive/MyDrive/데이콘/추석 선물/sample_submission.csv").assign(**{"수요량": pred})
sub.to_csv("2.csv", index = False)

# 모델 초기화 및 훈련(Autogluon) - RMSE : 100.29132

In [None]:
# Reload the raw CSV files before dropping ID.
# By this point the ID column has already been dropped from `train`/`test`
# (a second drop raises KeyError), their categorical columns have been
# label-encoded in place, and outlier rows have been removed from `train`.
# Re-reading the files gives this section the unmodified data it expects.
train = pd.read_csv("/content/drive/MyDrive/데이콘/추석 선물/train.csv").drop(columns="ID")
test = pd.read_csv("/content/drive/MyDrive/데이콘/추석 선물/test.csv").drop(columns="ID")

In [None]:
# Hyperparameter-search settings, passed to TabularPredictor.fit below as
# hyperparameter_tune_kwargs: random searcher, 7 trials, local scheduler.
tune_kwargs = dict(searcher='random', num_trials=7, scheduler='local')

In [None]:
from autogluon.tabular import TabularPredictor, TabularDataset

# Regression predictor for the 수요량 column, evaluated with RMSE.
pred = TabularPredictor(
    label="수요량",
    eval_metric="rmse",
    problem_type="regression",
)

# Train with the "best_quality" preset, 20 bagging folds, the search settings
# from tune_kwargs, and refit_full enabled. time_limit=43200 is 12 hours if
# the value is interpreted as seconds.
# NOTE: from here on `pred` holds the fitted predictor, not an array of predictions.
pred = pred.fit(
    train_data=train,
    presets=["best_quality"],
    hyperparameter_tune_kwargs=tune_kwargs,
    num_bag_folds=20,
    refit_full=True,
    time_limit=43200,
)

# 예측 및 저장

In [None]:
# Predict on the test frame with the fitted predictor (`pred` is the
# TabularPredictor returned by fit, not an array of predictions).
predict = pred.predict(test)

In [None]:
# Load the sample submission file that serves as the output template.
sub = pd.read_csv("/content/drive/MyDrive/데이콘/추석 선물/sample_submission.csv")

In [None]:
# Overwrite the 수요량 column of the submission with the predictions.
sub["수요량"] = predict

In [None]:
# Save the submission as 3.csv in the working directory, without the index column.
sub.to_csv("3.csv", index = False)

In [None]:
from google.colab import files
# Download the saved 3.csv from the Colab runtime via the browser.
files.download('3.csv')