In [7]:
pip install -U scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [8]:
import sklearn
print(sklearn.__version__)

1.3.1


In [4]:
# 라이브러리 및 데이터 불러오기
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
#--------------------------------------------
# EDA
#--------------------------------------------
# train.shape, test.shape
# train.head()
# train.info()
# train.describe()
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.displot(train['Item_Outlet_Sales'])
# plt.show()
# train.isnull().sum()
# test.isnull().sum()
#--------------------------------------------
# 데이터 전처리
#--------------------------------------------
cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
target = train.pop('Item_Outlet_Sales')

#
df = pd.concat([train, test])
# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cols:
    df[col] = le.fit_transform(df[col])
    
train = df.iloc[:len(train)].copy()
test = df.iloc[len(train):].copy()
train.shape, test.shape

train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].min())
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])

test['Item_Weight'] = test['Item_Weight'].fillna(train['Item_Weight'].min())
test['Outlet_Size'] = test['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])

print(train.shape, test.shape)
train.drop('Item_Identifier', axis=1, inplace=True)
test.drop('Item_Identifier', axis=1, inplace=True)
print(train.shape, test.shape)
#--------------------------------------------
# 검증 데이터 나누기
#--------------------------------------------
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

#--------------------------------------------
# 머신러닝 학습 및 평가
#--------------------------------------------
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error

# LightGBM
import lightgbm as lgb
model = lgb.LGBMRegressor(random_state=0, verbose=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

result = mean_squared_error(y_val, y_pred)
print('MSE:', result)

result = mean_absolute_error(y_val, y_pred)
print('MAE:', result)

result = r2_score(y_val, y_pred)
print('R2:', result)

result = root_mean_squared_error(y_val, y_pred)
print('RMSE:', result)


(6818, 11) (1705, 11)
(6818, 10) (1705, 10)


ImportError: cannot import name 'root_mean_squared_error' from 'sklearn.metrics' (/opt/conda/lib/python3.11/site-packages/sklearn/metrics/__init__.py)

(6818, 11) (1705, 11)
(6818, 10) (1705, 10)


ImportError: cannot import name 'root_mean_squared_error' from 'sklearn.metrics' (/opt/conda/lib/python3.11/site-packages/sklearn/metrics/__init__.py)

In [None]:
#--------------------------------------------
# 머신러닝 학습 및 평가 (cross_val_score를 사용한 K-Fold 교차 검증 - NumPy 대신 statistics 사용)
#--------------------------------------------
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error, make_scorer
# import numpy as np  # NumPy를 사용하지 않음
import statistics # 표준 라이브러리 statistics 모듈 사용
import lightgbm as lgb
import warnings

# LightGBM에서 verbose=-1 설정 시 경고가 발생할 수 있어 무시합니다.
warnings.filterwarnings('ignore', category=UserWarning)


# K-Fold 설정 (예: 5-Fold)
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# LightGBM 모델 생성
model = lgb.LGBMRegressor(random_state=0, verbose=-1)

# cross_val_score를 사용하여 각 지표의 평균 점수를 계산합니다.
# scikit-learn은 점수를 최대화하는 방식으로 동작하므로, 
# MSE와 MAE에는 'neg_' 접두사를 붙여 음수로 변환한 후, 결과를 다시 양수로 바꿉니다.

# 1. MSE (Negated Mean Squared Error)
# cross_val_score는 결과를 numpy 배열로 반환하지만, 이후에 리스트로 변환하여 처리합니다.
mse_scores_neg = cross_val_score(
    model, 
    train, 
    target, 
    cv=kf, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1
).tolist() # NumPy 배열을 Python 리스트로 변환

# 2. RMSE (MSE를 루트 씌움)
# 각 MSE 점수에 대해 루트를 씌우고, 결과를 리스트로 저장합니다.
rmse_scores = [val**0.5 for val in [-score for score in mse_scores_neg]]

# 3. MAE (Negated Mean Absolute Error)
mae_scores_neg = cross_val_score(
    model, 
    train, 
    target, 
    cv=kf, 
    scoring='neg_mean_absolute_error', 
    n_jobs=-1
).tolist() # NumPy 배열을 Python 리스트로 변환

# 4. R2 (R-squared)
r2_scores = cross_val_score(
    model, 
    train, 
    target, 
    cv=kf, 
    scoring='r2', 
    n_jobs=-1
).tolist() # NumPy 배열을 Python 리스트로 변환

# 최종 평균 성능 지표 출력
print("="*50)
print(f"Final Average Cross-Validation Results ({n_splits} Folds) using cross_val_score:")
print("="*50)

# MSE (음수 결과를 다시 양수로 변환하고 statistics 사용)
mse_scores = [-score for score in mse_scores_neg]
avg_mse = statistics.mean(mse_scores)
std_mse = statistics.stdev(mse_scores)
print(f'Average MSE: {avg_mse:.4f} (Std: {std_mse:.4f})')

# RMSE
avg_rmse = statistics.mean(rmse_scores)
std_rmse = statistics.stdev(rmse_scores)
print(f'Average RMSE: {avg_rmse:.4f} (Std: {std_rmse:.4f})')

# MAE (음수 결과를 다시 양수로 변환하고 statistics 사용)
mae_scores = [-score for score in mae_scores_neg]
avg_mae = statistics.mean(mae_scores)
std_mae = statistics.stdev(mae_scores)
print(f'Average MAE: {avg_mae:.4f} (Std: {std_mae:.4f})')

# R2
avg_r2 = statistics.mean(r2_scores)
std_r2 = statistics.stdev(r2_scores)
print(f'Average R2: {avg_r2:.4f} (Std: {std_r2:.4f})')
print("="*50)