## 데이터 확인

In [1]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# 파일 읽기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

### 데이터 전처리 

#### 계절성 확인 및 이상치 보간

In [3]:
from statsmodels.tsa.seasonal import seasonal_decompose

# 전처리
train = train.set_index('datetime')  # datetime 인덱스로 설정

# 분해
result = seasonal_decompose(train['activePower'], model='additive', period=24)
resid = result.resid

# 이상치: 잔차가 크면 이상치
threshold = 3 * np.nanstd(resid)
train['outlier'] = np.abs(resid) > threshold

In [4]:
outlier_count = train['outlier'].sum()
print(f"이상치 개수: {outlier_count}")

이상치 개수: 4490


In [5]:
train.loc[train['outlier'], 'activePower'] = np.nan
train['activePower'] = train['activePower'].interpolate(method='linear')

In [6]:
train = train.reset_index() 
test = test.reset_index() 
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

#### 노이즈 제거

In [7]:
# 추세 유지
from scipy.signal import savgol_filter

train['activePower_n'] = savgol_filter(train['activePower'], window_length=101, polyorder=2)
test['activePower_n'] = savgol_filter(test['activePower'], window_length=101, polyorder=2)

## 데이터 분석

In [8]:
def create_time_features(df):
    df['minute'] = df['datetime'].dt.minute
    df['day'] = df['datetime'].dt.day
    df['year'] = df['datetime'].dt.year 
    df['hour'] = df['datetime'].dt.hour
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['month'] = df['datetime'].dt.month
    return df

In [9]:
train = create_time_features(train)
test = create_time_features(test)

In [10]:
# 평균 전압/전류
train['voltage_mean'] = train[['voltageR', 'voltageS', 'voltageT']].mean(axis=1)
test['voltage_mean'] = test[['voltageR', 'voltageS', 'voltageT']].mean(axis=1)

train['current_mean'] = train[['currentR', 'currentS', 'currentT']].mean(axis=1)
test['current_mean'] = test[['currentR', 'currentS', 'currentT']].mean(axis=1)

train['current_std'] = train[['currentR', 'currentS', 'currentT']].std(axis=1)
test['current_std'] = test[['currentR', 'currentS', 'currentT']].std(axis=1)

In [None]:
features_cols = [
    'voltageR', 'voltageS',
    'voltageRS', 'voltageST', 'voltageTR',
    'currentR', 'currentS', 'currentT',
    'activePower_n',
    'powerFactorR', 'powerFactorS', 'powerFactorT',
    'reactivePowerLagging',
    'accumActiveEnergy','minute','day',
    'hour', 'dayofweek', 'month',
    'voltage_mean', 'current_mean','current_std'
]

target_col = 'activePower_n'

In [12]:
X_train = train[features_cols]
y_train = train[target_col]
X_test = test[features_cols]
y_test = test[target_col]

In [13]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

In [14]:
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.03,
    'max_depth': 6,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'seed': 42
}

# 🚀 모델 학습 
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    early_stopping_rounds=100,
    verbose_eval=100
)

[0]	train-rmse:107.23161	valid-rmse:106.88885
[100]	train-rmse:10.74899	valid-rmse:10.70184
[200]	train-rmse:3.05399	valid-rmse:3.01776
[300]	train-rmse:2.92458	valid-rmse:2.89301
[400]	train-rmse:2.91450	valid-rmse:2.89473
[409]	train-rmse:2.91312	valid-rmse:2.89484


### RMSE, MAE, SMAPE

In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 기존 RMSE
y_pred = model.predict(dvalid)
rmse = mean_squared_error(y_test, y_pred)

# MAE
mae = mean_absolute_error(y_test, y_pred)

# SMAPE 계산 함수
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))

smape_score = smape(y_test.values, y_pred)

# 출력
print(f"✅ XGBoost RMSE:  {rmse:.4f}")
print(f"✅ XGBoost MAE:   {mae:.4f}")
print(f"✅ XGBoost SMAPE: {smape_score:.4f}%")

✅ XGBoost RMSE:  8.3800
✅ XGBoost MAE:   0.7395
✅ XGBoost SMAPE: 0.0247%


# scale_factor 계산

## 4월 예측

In [16]:
# 1. 4월 데이터 생성
hours = pd.date_range(start="2025-04-01 00:00:00", periods=672, freq='H')
april_fake = pd.DataFrame({'datetime': hours})
april_fake['minute'] = april_fake['datetime'].dt.minute
april_fake['day'] = april_fake['datetime'].dt.day
april_fake['year'] = april_fake['datetime'].dt.year
april_fake['hour'] = april_fake['datetime'].dt.hour
april_fake['dayofweek'] = april_fake['datetime'].dt.dayofweek
april_fake['month'] = april_fake['datetime'].dt.month

np.random.seed(42)
april_fake['voltage_mean'] = april_fake['hour'].apply(lambda h: np.random.normal(220, 2) if 8 <= h <= 20 else np.random.normal(215, 2))
april_fake['current_mean'] = np.random.normal(15.5, 2, size=672)
april_fake['reactivePowerLagging'] = np.random.normal(320, 50, size=672)
april_fake['powerFactorR'] = np.random.normal(95, 2, size=672)
april_fake['powerFactorS'] = np.random.normal(94, 2, size=672)
april_fake['powerFactorT'] = np.random.normal(93, 2, size=672)
april_fake = april_fake.sort_values('datetime').reset_index(drop=True)
april_fake['accumActiveEnergy'] = 0
april_fake['currentR'] = np.random.normal(15.5, 2, size=672)
april_fake['currentS'] = np.random.normal(15.5, 2, size=672)
april_fake['currentT'] = np.random.normal(15.5, 2, size=672)
april_fake['current_std'] = april_fake[['currentR', 'currentS', 'currentT']].std(axis=1)

# 2. 훈련 시 사용한 features_cols 정의 (정확히 훈련 시 썼던 순서대로!)
features_cols = [
    'voltageR', 'voltageS', 
    'voltageRS', 'voltageST', 'voltageTR',
    'currentR', 'currentS', 'currentT',
    'activePower_n',
    'powerFactorR', 'powerFactorS', 'powerFactorT',
    'reactivePowerLagging',
    'accumActiveEnergy','minute','day',
    'hour', 'dayofweek', 'month',
    'voltage_mean', 'current_mean','current_std'
]

# 3. 누락된 컬럼을 0으로 채움
for col in features_cols:
    if col not in april_fake.columns:
        april_fake[col] = 0


# 4. 예측
dtest_april = xgb.DMatrix(april_fake[features_cols], enable_categorical=True)
april_fake['hourly_pow_pred'] = model.predict(dtest_april)

# 5. 실제 4월 데이터 평균값 계산
test['datetime'] = pd.to_datetime(test['timestamp'], unit='ms')
test_april = test[(test['datetime'] >= '2025-04-01') & (test['datetime'] < '2025-05-01')].copy()
test_april['kWh'] = test_april['activePower'] * (5 / 3600)
test_april['hour'] = test_april['datetime'].dt.floor('H')
true_hourly = test_april.groupby('hour')['kWh'].sum().reset_index(name='true_hourly_pow')

# 6. 스케일 팩터 계산
true_avg = true_hourly['true_hourly_pow'].mean()
pred_avg = april_fake['hourly_pow_pred'].mean()
scale_factor = true_avg / pred_avg

print(f"✅ True April avg: {true_avg:.4f}")
print(f"✅ Pred April avg: {pred_avg:.4f}")
print(f"🚀 스케일 팩터 계산됨: {scale_factor:.4f}")

  hours = pd.date_range(start="2025-04-01 00:00:00", periods=672, freq='H')
  test_april['hour'] = test_april['datetime'].dt.floor('H')


✅ True April avg: 39072.0612
✅ Pred April avg: 2692.0840
🚀 스케일 팩터 계산됨: 14.5137


## scaled 적용 예측

In [17]:
# 1. 미래 시간 생성 (5월 한 달)
future_df = pd.DataFrame({
    'datetime': pd.date_range("2025-05-01 00:00:00", "2025-05-28 23:00:00", freq='1H')
})

# 시간 기반 파생 변수
future_df['minute'] = future_df['datetime'].dt.minute
future_df['day'] = future_df['datetime'].dt.day
future_df['year'] = future_df['datetime'].dt.year
future_df['hour'] = future_df['datetime'].dt.hour
future_df['dayofweek'] = future_df['datetime'].dt.dayofweek
future_df['month'] = future_df['datetime'].dt.month

# 랜덤/규칙 기반 피처 생성
np.random.seed(42)
future_df['voltage_mean'] = future_df['hour'].apply(lambda h: np.random.normal(220, 2) if 8 <= h <= 20 else np.random.normal(215, 2))
future_df['current_mean'] = np.random.normal(15.5, 2, size=len(future_df))
future_df['reactivePowerLagging'] = np.random.normal(320, 50, size=len(future_df))
future_df['powerFactorR'] = np.random.normal(95, 2, size=len(future_df))
future_df['powerFactorS'] = np.random.normal(94, 2, size=len(future_df))
future_df['powerFactorT'] = np.random.normal(93, 2, size=len(future_df))
future_df['accumActiveEnergy'] = 0

# 센서 기반 가상 데이터
future_df['voltageR'] = np.random.normal(220, 2, size=len(future_df))
future_df['voltageS'] = np.random.normal(220, 2, size=len(future_df))
future_df['voltageRS'] = np.random.normal(380, 5, size=len(future_df))
future_df['voltageST'] = np.random.normal(380, 5, size=len(future_df))
future_df['voltageTR'] = np.random.normal(380, 5, size=len(future_df))
future_df['currentR'] = np.random.normal(15.5, 2, size=len(future_df))
future_df['currentS'] = np.random.normal(15.5, 2, size=len(future_df))
future_df['currentT'] = np.random.normal(15.5, 2, size=len(future_df))
future_df['activePower_n'] = np.random.normal(15.5, 2, size=len(future_df))
future_df['current_std'] = np.random.normal(15.5, 2, size=len(future_df))
april_fake = april_fake.sort_values('datetime').reset_index(drop=True)

future_df['hourly_pow_pred'] = np.random.normal(15, 3, size=len(future_df))  # 임시 값

# 8. 누락된 피처 보정
for col in features_cols:
    if col not in future_df.columns:
        future_df[col] = 0

# . 예측 수행
dtest_future = xgb.DMatrix(future_df[features_cols])
future_df['hourly_pow_pred'] = model.predict(dtest_future)

# 예측값을 적용
future_df['hourly_pow'] = future_df['hourly_pow_pred'] * scale_factor

# 9. 최종 집계 및 계산
agg_pow = future_df['hourly_pow'].sum()
may_bill = agg_pow * 180        # 요금 계산 예시 (원)
may_carbon = agg_pow * 0.424    # 탄소배출 계산 예시 (kg CO2)

submission = pd.DataFrame({
    'id': pd.date_range(start='2025-05-01 00:00:00', periods=672, freq='1H').strftime('%Y-%m-%d %H:%M:%S'),
    'hourly_pow': future_df['hourly_pow'],
    'agg_pow': [agg_pow] * 672,
    'may_bill': [may_bill] * 672,
    'may_carbon': [may_carbon] * 672
})
submission.to_csv("submission8.csv", index=False, encoding="utf-8-sig")

  'datetime': pd.date_range("2025-05-01 00:00:00", "2025-05-28 23:00:00", freq='1H')
  'id': pd.date_range(start='2025-05-01 00:00:00', periods=672, freq='1H').strftime('%Y-%m-%d %H:%M:%S'),


In [21]:
future_df['date'] = future_df['datetime'].dt.date

# 일별 전력량 예측 (kWh 단위)
daily_pow = future_df.groupby('date')['hourly_pow_pred'].sum().reset_index()
daily_pow.columns = ['date', 'daily_kWh_pred']

In [22]:
daily_pow

Unnamed: 0,date,daily_kWh_pred
0,2025-05-01,64617.359375
1,2025-05-02,64614.40625
2,2025-05-03,64612.125
3,2025-05-04,64647.078125
4,2025-05-05,64618.46875
5,2025-05-06,64618.332031
6,2025-05-07,64616.78125
7,2025-05-08,64616.804688
8,2025-05-09,64617.761719
9,2025-05-10,64617.40625


In [23]:
future_df['week'] = future_df['datetime'].dt.isocalendar().week

# 주별 전력 예측 (kWh)
weekly_pow = future_df.groupby('week')['hourly_pow_pred'].sum().reset_index()
weekly_pow.columns = ['week', 'weekly_kWh_pred']

In [24]:
weekly_pow

Unnamed: 0,week,weekly_kWh_pred
0,18,258490.96875
1,19,452346.8125
2,20,452315.03125
3,21,452324.78125
4,22,193833.34375


In [None]:
unit_price = 180  
total_kWh = future_df['hourly_pow_pred'].sum()
total_bill = total_kWh * unit_price

print(f"💡 5월 총 예측 전력사용량: {total_kWh:.2f} kWh")
print(f"💰 5월 예측 전기요금: {total_bill:,.0f} 원")

💡 5월 총 예측 전력사용량: 1809311.00 kWh
💰 5월 예측 전기요금: 325,675,968 원


In [None]:
carbon_factor = 0.424
total_carbon = total_kWh * carbon_factor

print(f"🌿 5월 탄소배출량 예측: {total_carbon:,.2f} kg CO₂")

🌿 5월 탄소배출량 예측: 767,147.88 kg CO₂
