In [20]:
import os
# Show which datasets are attached under the Kaggle input directory.
input_entries = os.listdir("/kaggle/input")
print(input_entries)

['competitive-data-science-predict-future-sales']


In [21]:
# Folder that holds the competition CSV files; list it to see what is available.
data_path = "/kaggle/input/competitive-data-science-predict-future-sales/"
competition_files = os.listdir(data_path)
print(competition_files)

['items.csv', 'sample_submission.csv', 'item_categories.csv', 'sales_train.csv', 'shops.csv', 'test.csv']


In [22]:
import numpy as np
import pandas as pd

# Folder with the competition CSVs. File names are appended by plain string
# concatenation, so the trailing slash is required.
data_path = '/kaggle/input/competitive-data-science-predict-future-sales/'


def _read_competition_csv(file_name):
    """Load one CSV from the competition folder into a DataFrame."""
    return pd.read_csv(data_path + file_name)


items = _read_competition_csv('items.csv')
shops = _read_competition_csv('shops.csv')
cats = _read_competition_csv('item_categories.csv')
train = _read_competition_csv('sales_train.csv')
# The test rows are indexed by their 'ID' column.
test = _read_competition_csv('test.csv').set_index('ID')

In [23]:
from itertools import product
from sklearn.preprocessing import LabelEncoder

# NOTE: this cell rebinds `shops` and `cats` to reduced frames and drops a
# column of `items` in place, so it cannot be re-run on the same kernel without
# first reloading the CSVs.

# Outlier handling: keep only rows below the price and daily-count cut-offs.
train = train[train.item_price < 100000]
train = train[train.item_cnt_day < 1001]
# Every negative price is overwritten with the median positive price seen for
# shop 32 / item 2973 / block 4 (presumably the only affected row -- TODO confirm).
median = train[
    (train.shop_id == 32) & (train.item_id == 2973) &
    (train.date_block_num == 4) & (train.item_price > 0)
].item_price.median()
train.loc[train.item_price < 0, 'item_price'] = median

# shop_id corrections: ids 0, 1 and 10 are remapped onto 57, 58 and 11 in both
# train and test (presumably duplicate entries of the same shops -- verify
# against shops.csv).
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

# shop / category preprocessing
# Rename this shop so that its two-word city becomes a single token; the city is
# taken below as the first space-separated token of the shop name.
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад TЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
# Drop the leading '!' so this city matches the plain spelling.
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
# Integer-encode the city and keep only the id plus the encoded city.
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

# Category names are split on '-': the first part is the type, the second part
# the subtype (falling back to the first part when there is no '-').
cats['split'] = cats['item_category_name'].str.split('-')
cats['type'] = cats['split'].map(lambda x: x[0].strip())
cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
cats['subtype'] = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id','type_code','subtype_code']]

# The item name is not used as a feature; dropped in place.
items.drop(['item_name'], axis=1, inplace=True)

# Monthly sales grid: for each of the 34 blocks, every (shop, item) combination
# of the shops and items that have at least one sale row in that block.
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
# Downcast the key columns to reduce memory.
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)

# `revenue` is not referenced by any later cell visible in this notebook.
train['revenue'] = train['item_price'] *  train['item_cnt_day']
# Sum the daily counts per (block, shop, item) to get the monthly target.
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

# Grid combinations without any sale get 0; the target is clipped to [0, 20].
matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month'].fillna(0).clip(0,20)).astype(np.float16)

# Append the test rows as block 34, using the same dtypes as the grid keys.
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# ignore_index=True discards the 'ID' index of `test`: rows of `matrix` carry
# fresh positional labels from here on.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False)
# The appended test rows have no item_cnt_month; it is filled with 0.
matrix.fillna(0, inplace=True)

# Join the shop, item and category attributes, then downcast them to int8.
matrix = pd.merge(matrix, shops, on=['shop_id'], how='left')
matrix = pd.merge(matrix, items, on=['item_id'], how='left')
matrix = pd.merge(matrix, cats, on=['item_category_id'], how='left')
matrix['city_code'] = matrix['city_code'].astype(np.int8)
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['type_code'] = matrix['type_code'].astype(np.int8)
matrix['subtype_code'] = matrix['subtype_code'].astype(np.int8)

# Lag features
def lag_feature(df, lags, col):
    """Return `df` with lagged copies of `col` appended as new columns.

    For every lag `k` in `lags`, a column named `<col>_lag_<k>` is added. For a
    row at block `t` it holds the value `col` had for the same
    (shop_id, item_id) at block `t - k`, or NaN when no such row exists.

    Parameters
    ----------
    df : DataFrame with 'date_block_num', 'shop_id', 'item_id' and `col`.
    lags : iterable of int offsets, in blocks.
    col : name of the column to lag.

    Returns
    -------
    A new DataFrame; the frame passed in is not modified. All lags are taken
    from the values `col` had in the frame as passed in.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        lagged = base.rename(columns={col: f'{col}_lag_{lag}'})
        # Moving the source rows forward by `lag` blocks lets a plain equality
        # join pick up the value from `lag` blocks earlier.
        lagged['date_block_num'] += lag
        df = df.merge(lagged, on=keys, how='left')
    return df

# Add the target from 1, 2, 3, 6 and 12 blocks earlier as lag columns.
matrix = lag_feature(df=matrix, lags=[1, 2, 3, 6, 12], col='item_cnt_month')

In [24]:
# Split by block: blocks before 33 are used for training, block 33 for
# validation, and block 34 holds the appended test rows.
is_train_block = matrix.date_block_num < 33
is_valid_block = matrix.date_block_num == 33
is_test_block = matrix.date_block_num == 34

X_train = matrix[is_train_block].drop(columns=['item_cnt_month'])
Y_train = matrix.loc[is_train_block, 'item_cnt_month']

X_valid = matrix[is_valid_block].drop(columns=['item_cnt_month'])
Y_valid = matrix.loc[is_valid_block, 'item_cnt_month']

X_test = matrix[is_test_block].drop(columns=['item_cnt_month'])

Question 1

In [29]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# ============= Task 1: regression model =============
# Gradient-boosted regressor. Training stops once the RMSE on the evaluation
# set has not improved for 10 consecutive rounds.
model = XGBRegressor(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.03,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eval_metric="rmse",
    early_stopping_rounds=10,
    random_state=42
)

# Fit, monitoring the validation block for early stopping.
model.fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)], verbose=False)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error is deprecated and removed in
# newer scikit-learn releases.
Y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(Y_valid, Y_pred))
print("[課題1] Validation RMSE:", rmse)

# Predict the test rows and write the submission CSV.
# The ID column must come from `test.index` (the 'ID' column of test.csv).
# X_test.index only holds row positions inside `matrix`, because `matrix` was
# rebuilt with ignore_index=True when the test rows were appended.
assert np.array_equal(X_test['item_id'].to_numpy(), test['item_id'].to_numpy()), \
    "X_test rows are not in test.csv order"
submission = model.predict(X_test)
pd.DataFrame({'ID': test.index, 'item_cnt_month': submission}).to_csv('part1_resubmit1.csv', index=False)


[課題1] Validation RMSE: 0.9281663


Question 2

In [28]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# ============= Task 2: classification model =============
def categorize_sales(x):
    """Map a monthly sales count to one of four demand labels.

    Returns 'no_sales' when `x` equals 0, 'low' when `x` <= 5, 'medium' when
    `x` <= 15 and 'high' otherwise.

    Gotchas: a negative value is labelled 'low' (it passes the `<= 5` check),
    and NaN is labelled 'high' because every comparison with NaN is false.
    """
    if x == 0:
        return 'no_sales'
    # Upper bounds are inclusive and checked in ascending order.
    for upper_bound, label in ((5, 'low'), (15, 'medium')):
        if x <= upper_bound:
            return label
    return 'high'

# Convert the numeric targets into the four demand labels.
Y_train_cls = Y_train.map(categorize_sales)
Y_valid_cls = Y_valid.map(categorize_sales)

# Encode the string labels as integers; the encoder is fitted on the training
# labels only and then reused for the validation labels.
le = LabelEncoder()
le.fit(Y_train_cls)
Y_train_enc = le.transform(Y_train_cls)
Y_valid_enc = le.transform(Y_valid_cls)

# XGBoost classifier with library defaults apart from the seed and early stopping.
clf = XGBClassifier(random_state=42, early_stopping_rounds=10)
clf.fit(
    X_train,
    Y_train_enc,
    eval_set=[(X_valid, Y_valid_enc)],
    verbose=False,
)

# Predict the validation block and map the integer classes back to labels.
Y_pred_enc = clf.predict(X_valid)
Y_pred_cls = le.inverse_transform(Y_pred_enc)

print("[課題2] 分類モデル 評価結果")
report_text = classification_report(Y_valid_cls, Y_pred_cls)
print(report_text)
confusion = confusion_matrix(Y_valid_cls, Y_pred_cls)
print(confusion)

[課題2] 分類モデル 評価結果
              precision    recall  f1-score   support

        high       0.47      0.29      0.36       358
         low       0.63      0.21      0.32     29869
      medium       0.45      0.14      0.21      1244
    no_sales       0.89      0.98      0.94    206701

    accuracy                           0.88    238172
   macro avg       0.61      0.41      0.46    238172
weighted avg       0.86      0.88      0.85    238172

[[   104     29     26    199]
 [    35   6299    145  23390]
 [    46    527    173    498]
 [    36   3104     43 203518]]


Question 3

In [31]:
# ============= 課題3: PCA (寄与率を変えて確認) =============
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

# The lag features contain many NaN values, so they are removed before PCA
# (as instructed by the assignment).
drop_cols = ['item_cnt_month_lag_1', 'item_cnt_month_lag_2',
             'item_cnt_month_lag_3', 'item_cnt_month_lag_6',
             'item_cnt_month_lag_12']

X_train_prep = X_train.drop(columns=drop_cols, errors='ignore')
X_valid_prep = X_valid.drop(columns=drop_cols, errors='ignore')
X_test_prep  = X_test.drop(columns=drop_cols,  errors='ignore')

# Standardise; the scaler is fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_prep)
X_valid_scaled = scaler.transform(X_valid_prep)
X_test_scaled  = scaler.transform(X_test_prep)

# Try several explained-variance ratios and keep the PCA + model pair with the
# lowest validation RMSE.
best = {"rmse": np.inf, "rate": None, "pca": None, "model": None, "n_comp": None}
for rate in [0.90, 0.95, 0.99]:
    # A float n_components keeps as many components as needed to reach that
    # share of explained variance.
    pca = PCA(n_components=rate)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_valid_pca = pca.transform(X_valid_scaled)

    model_pca = XGBRegressor(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        early_stopping_rounds=10,
        random_state=42
    )
    model_pca.fit(X_train_pca, Y_train, eval_set=[(X_valid_pca, Y_valid)], verbose=False)

    pred_pca = model_pca.predict(X_valid_pca)
    # Explicit square root: the `squared=False` keyword of mean_squared_error
    # is deprecated and removed in newer scikit-learn releases.
    rmse_pca = np.sqrt(mean_squared_error(Y_valid, pred_pca))
    print(f"[課題3] PCA 寄与率={rate}, 主成分数={X_train_pca.shape[1]}, RMSE={rmse_pca}")

    # Strict '<' keeps the first (smallest) ratio when several tie.
    if rmse_pca < best["rmse"]:
        best["rmse"]   = rmse_pca
        best["rate"]   = rate
        best["pca"]    = pca
        best["model"]  = model_pca
        best["n_comp"] = X_train_pca.shape[1]

print(f"[課題3] 最良モデル: 寄与率={best['rate']}, 主成分数={best['n_comp']}, RMSE={best['rmse']}")

# Transform the test set with the best PCA, predict, and write the CSV.
# The ID column must come from `test.index` (the 'ID' column of test.csv).
# X_test.index only holds row positions inside `matrix`, because `matrix` was
# rebuilt with ignore_index=True when the test rows were appended.
assert np.array_equal(X_test['item_id'].to_numpy(), test['item_id'].to_numpy()), \
    "X_test rows are not in test.csv order"
X_test_pca = best["pca"].transform(X_test_scaled)
submission_pca = best["model"].predict(X_test_pca)
pd.DataFrame({'ID': test.index, 'item_cnt_month': submission_pca}).to_csv('part1_resubmit3.csv', index=False)
print("Saved part1_resubmit3.csv")

[課題3] PCA 寄与率=0.9, 主成分数=5, RMSE=1.1036396026611328
[課題3] PCA 寄与率=0.95, 主成分数=5, RMSE=1.1036396026611328
[課題3] PCA 寄与率=0.99, 主成分数=5, RMSE=1.1036396026611328
[課題3] 最良モデル: 寄与率=0.9, 主成分数=5, RMSE=1.1036396026611328
Saved part1_resubmit3.csv


Question 4

In [33]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ============= Task 4: clustering (choosing the cluster count with a GMM) =============

# Extended item table: items + category codes + mean price per item.
items_extended = pd.merge(items, cats, on='item_category_id', how='left')
# NOTE(review): the mean is taken over every row of `train`, which appears to
# include the validation block (33), so the cluster feature is derived partly
# from validation-period prices -- confirm whether that is acceptable.
average_price = train.groupby('item_id')['item_price'].mean().reset_index()
average_price.columns = ['item_id', 'average_price']
items_extended = pd.merge(items_extended, average_price, on='item_id', how='left')

# Single clustering feature: the mean price.
features = items_extended[['average_price']]

# Standardise after dropping items that have no price in `train`.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features.dropna())

# ---------------- K-means ----------------
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features_scaled)
silhouette_avg = silhouette_score(features_scaled, clusters)
print("[課題4] K-means 결과")
print(f"  - silhouette score: {silhouette_avg:.4f}")

# ---------------- GMM ----------------
# Fit mixtures with 1..10 components and record BIC / AIC for each.
bics, aics = [], []
for n in range(1, 11):
    gmm = GaussianMixture(n_components=n, random_state=42)
    gmm.fit(features_scaled)
    bics.append(gmm.bic(features_scaled))
    aics.append(gmm.aic(features_scaled))

# +1 converts the zero-based list position back into a component count.
# NOTE(review): if the selected count equals 10 (the upper end of the range
# searched), the criterion may still be decreasing; consider a wider range.
best_n_bic = np.argmin(bics) + 1
best_n_aic = np.argmin(aics) + 1
print("[課題4] GMM 結果")
print(f"  - 最適クラスタ数 (BIC): {best_n_bic}")
print(f"  - 最適クラスタ数 (AIC): {best_n_aic}")

# Refit a GMM with the BIC-selected number of components.
best_gmm = GaussianMixture(n_components=best_n_bic, random_state=42)
clusters_gmm = best_gmm.fit_predict(features_scaled)

# Keep the same rows that were clustered (price present). The explicit copy
# makes the column assignment below write to an independent frame instead of a
# slice of the merged table.
items_extended = items_extended.dropna(subset=['average_price']).copy()
items_extended['cluster'] = clusters_gmm

print("[課題4] GMM クラスタごとの平均値 (average_price)")
print(items_extended.groupby('cluster')['average_price'].mean())

# ---------------- Accuracy improvement: regression with the cluster feature ----------------

# item_id -> cluster lookup table.
item_cluster_map = items_extended[['item_id', 'cluster']].drop_duplicates()

# Attach the cluster to the train / validation / test features.
X_train_cluster = pd.merge(X_train, item_cluster_map, on='item_id', how='left')
X_valid_cluster = pd.merge(X_valid, item_cluster_map, on='item_id', how='left')
X_test_cluster  = pd.merge(X_test,  item_cluster_map, on='item_id', how='left')

# Items without a cluster (no price available) get the sentinel value -1.
X_train_cluster['cluster'] = X_train_cluster['cluster'].fillna(-1).astype(int)
X_valid_cluster['cluster'] = X_valid_cluster['cluster'].fillna(-1).astype(int)
X_test_cluster['cluster']  = X_test_cluster['cluster'].fillna(-1).astype(int)

# Train the regression model on the extended features.
model_cluster = XGBRegressor(
    n_estimators=800,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    early_stopping_rounds=10,
    random_state=42
)
model_cluster.fit(X_train_cluster, Y_train, eval_set=[(X_valid_cluster, Y_valid)], verbose=False)

# Validation RMSE. Explicit square root: the `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn releases.
pred_valid_cluster = model_cluster.predict(X_valid_cluster)
rmse_cluster = np.sqrt(mean_squared_error(Y_valid, pred_valid_cluster))
print("[課題4] Validation RMSE (클러스터 feature 포함):", rmse_cluster)

# ---------------- Save CSV ----------------
# The ID column must come from `test.index` (the 'ID' column of test.csv).
# X_test.index only holds row positions inside `matrix`, because `matrix` was
# rebuilt with ignore_index=True when the test rows were appended.
assert np.array_equal(X_test_cluster['item_id'].to_numpy(), test['item_id'].to_numpy()), \
    "X_test_cluster rows are not in test.csv order"
submission_cluster = model_cluster.predict(X_test_cluster)
pd.DataFrame({'ID': test.index, 'item_cnt_month': submission_cluster}).to_csv('part1_resubmit4.csv', index=False)
print("Saved part1_resubmit4.csv")

[課題4] K-means 결과
  - silhouette score: 0.7274
[課題4] GMM 結果
  - 最適クラスタ数 (BIC): 10
  - 最適クラスタ数 (AIC): 10
[課題4] GMM クラスタごとの平均値 (average_price)
cluster
0     1558.589708
1     8522.775558
2      225.084317
3    29496.453323
4     4742.223836
5     2753.435610
6    20199.338675
7    44734.300000
8      761.589483
9    12822.564946
Name: average_price, dtype: float64
[課題4] Validation RMSE (클러스터 feature 포함): 0.9371017
Saved part1_resubmit4.csv


Download CSV


In [38]:
import os
import shutil
import tempfile

import IPython.display as disp

archive_name = "part1_results.zip"

# Remove an archive left over from a previous run so it is not packed into the
# new one.
if os.path.exists(archive_name):
    os.remove(archive_name)

# Bundle everything in /kaggle/working into one zip. The archive is built in a
# scratch directory and moved into place afterwards: creating it inside the
# directory that is being archived would make the zip include itself.
with tempfile.TemporaryDirectory() as scratch_dir:
    built_archive = shutil.make_archive(
        os.path.join(scratch_dir, "part1_results"), "zip", "/kaggle/working"
    )
    shutil.move(built_archive, archive_name)

# Show a download link (kept as the last expression so the notebook renders it).
disp.FileLink(archive_name)