In [79]:
# =========================================
# 1️⃣ 라이브러리 불러오기
# =========================================
import pandas as pd
import numpy as np
from itertools import product
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# =========================================
# 2️⃣ CSV 파일 읽기 (input 폴더 기준)
# =========================================
sales = pd.read_csv('input/sales_train.csv')
items = pd.read_csv('input/items.csv')
shops = pd.read_csv('input/shops.csv')
cats  = pd.read_csv('input/item_categories.csv')
test  = pd.read_csv('input/test.csv')

# =========================================
# 3️⃣ 외부 데이터 처리
# =========================================
# 외부값 제거
sales = sales[sales.item_price < 100000]
sales = sales[sales.item_cnt_day < 1001]

# 가격이 음수인 경우 중앙값 대체
median = sales[(sales.shop_id==32) & (sales.item_id==2973) & (sales.date_block_num==4) & (sales.item_price>0)].item_price.median()
sales.loc[sales.item_price<0, 'item_price'] = median

# =========================================
# 4️⃣ 아이템 + 카테고리 병합 후 GMM 클러스터링
# =========================================
items_ext = pd.merge(items, cats, on='item_category_id', how='left')

# 평균 가격
avg_price = sales.groupby('item_id')['item_price'].mean().reset_index()
avg_price.columns = ['item_id','average_price']
items_ext = pd.merge(items_ext, avg_price, on='item_id', how='left')

# GMM 클러스터링
features = items_ext[['average_price']].fillna(0)
gmm = GaussianMixture(n_components=5, random_state=42)
clusters = gmm.fit_predict(features)
items_ext['cluster'] = clusters

# =========================================
# 5️⃣ 월별 매트릭스 생성
# =========================================
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales_month = sales[sales.date_block_num==i]
    matrix.append(np.array(list(product([i], sales_month.shop_id.unique(), sales_month.item_id.unique())), dtype='int16'))

matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)

# 월별 판매 합계
sales['revenue'] = sales['item_price'] * sales['item_cnt_day']
group = sales.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day':'sum'}).reset_index()
group.columns = ['date_block_num','shop_id','item_id','item_cnt_month']
matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0,20)

# 테스트 데이터 추가
test['date_block_num'] = 34
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)
matrix = pd.concat([matrix, test[['date_block_num','shop_id','item_id']]], ignore_index=True, sort=False)
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0)

# =========================================
# 6️⃣ GMM 클러스터 정보 합치기
# =========================================
matrix = pd.merge(matrix, items_ext[['item_id','cluster']], on='item_id', how='left')
matrix['cluster'] = matrix['cluster'].astype(np.int8)

# =========================================
# 7️⃣ 학습/검증/테스트 분리
# =========================================
X_train = matrix[matrix.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = matrix[matrix.date_block_num < 33]['item_cnt_month']
X_valid = matrix[matrix.date_block_num == 33].drop(['item_cnt_month'], axis=1)
Y_valid = matrix[matrix.date_block_num == 33]['item_cnt_month']
X_test  = matrix[matrix.date_block_num == 34].drop(['item_cnt_month'], axis=1)

# =========================================
# 8️⃣ NaN 처리 + 스케일링 + PCA
# =========================================
imputer = SimpleImputer(strategy='constant', fill_value=0)
X_train_imputed = imputer.fit_transform(X_train)
X_valid_imputed = imputer.transform(X_valid)
X_test_imputed  = imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_valid_scaled = scaler.transform(X_valid_imputed)
X_test_scaled  = scaler.transform(X_test_imputed)

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_valid_pca = pca.transform(X_valid_scaled)
X_test_pca  = pca.transform(X_test_scaled)

print("PCA 후 차원 수:", X_train_pca.shape[1])

# =========================================
# 9️⃣ XGBRegressor 학습 및 평가
# =========================================
xgb = XGBRegressor(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.03,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eval_metric="rmse",
    early_stopping_rounds=10,
    random_state=42
)

xgb.fit(X_train_pca, Y_train, eval_set=[(X_valid_pca, Y_valid)], verbose=10)
Y_pred = xgb.predict(X_valid_pca)
rmse = np.sqrt(mean_squared_error(Y_valid, Y_pred))
print("검증 RMSE:", rmse)

# =========================================
# 10️⃣ 테스트 예측 및 CSV 출력
# =========================================
Y_test_pred = xgb.predict(X_test_pca)
submission = pd.DataFrame({'ID': test['ID'], 'item_cnt_month': Y_test_pred.clip(0,20)})
submission.to_csv('structured_data/submission.csv', index=False)

PCA 후 차원 수: 4
[0]	validation_0-rmse:1.13654
[10]	validation_0-rmse:1.13441
[20]	validation_0-rmse:1.13274
[30]	validation_0-rmse:1.13180
[40]	validation_0-rmse:1.13102
[50]	validation_0-rmse:1.13051
[60]	validation_0-rmse:1.13020
[70]	validation_0-rmse:1.13007
[80]	validation_0-rmse:1.13005
[90]	validation_0-rmse:1.12985
[100]	validation_0-rmse:1.12989
검증 RMSE: 1.1298044619872714


OSError: Cannot save file into a non-existent directory: 'structured_data'