In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

train_path = "/content/drive/MyDrive/train_dataset.csv"
station_path = "/content/drive/MyDrive/station_info.csv"

train_df = pd.read_csv(train_path)
station_df = pd.read_csv(station_path)

train_df.info(), station_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13132 entries, 0 to 13131
Columns: 342 entries, id to target
dtypes: float64(338), int64(2), object(2)
memory usage: 34.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   지점                9 non-null      int64  
 1   시작일               9 non-null      object 
 2   종료일               1 non-null      object 
 3   지점명               9 non-null      object 
 4   위도                9 non-null      float64
 5   경도                9 non-null      float64
 6   노장해발고도(m)         9 non-null      float64
 7   기압계(관측장비지상높이(m))  9 non-null      float64
 8   기온계(관측장비지상높이(m))  9 non-null      float64
 9   풍속계(관측장비지상높이(m))  9 non-null      float64
 10  강우계(관측장비지상높이(m))  9 non-null      float64
dtypes: float64(7), int64(1), object(3)
memory usage: 924.0+ bytes


(None, None)

In [22]:
import numpy as np
import pandas as pd

def dew_to_temp(dew, rh):
    a = 17.62
    b = 243.12
    gamma_inv = (a * dew) / (b + dew) - np.log(rh / 100.0)
    temp = (b * gamma_inv) / (a - gamma_inv)
    return temp

def preprocess_data(train_df, station_df):
    df = train_df.copy()

    # 1. -9999를 NaN으로 변환
    df.replace(-9999, np.nan, inplace=True)

    # 2. 날짜 분리 (month, day)
    df[['month', 'day']] = df['date'].str.split('-', expand=True).astype(int)
    df.drop(columns=['date'], inplace=True)

    # 3. 결측치 처리
    zero_key = ['sunshine_duration', 'snow_depth', 'precipitation', 'visibility', 'wind_speed', 'cloud_cover']

    for col in df.columns:
        if any(key in col for key in zero_key):
            # 수정: inplace 사용하지 않고 명시적으로 할당
            df[col] = df[col].fillna(0)
        if "min_cloud_height" in col:
            df[col] = df[col].fillna(90)

    # NaN 값이 있는 피처 추출
    bf_nan_columns_list = df.columns[df.isnull().any()].tolist()
    # 각 결측치 컬럼별로 월별 평균으로 결측치 대체
    for col in bf_nan_columns_list:
        df[col] = df.groupby(['month'])[col].transform(lambda x: x.fillna(x.mean()))

    # 결과 확인
    af_nan_columns_list = df.columns[df.isnull().any()].tolist()
    print("여기", len(af_nan_columns_list))
    print(af_nan_columns_list)
    print("---------")

    # mag_temp 컬럼을 한 번에 생성하여 concat (fragmentation 방지)
    mag_temp_cols = {}
    for i in range(24):
        mag_temp_cols[f'mag_temp_{i}'] = dew_to_temp(df[f'dew_point_{i}'], df[f'humidity_{i}'])
    mag_temp_df = pd.DataFrame(mag_temp_cols, index=df.index)
    df = pd.concat([df, mag_temp_df], axis=1)
    df = df.copy()  # fragmentation 해소

    # 4. 관측소 정보 병합
    station_df_renamed = station_df.rename(columns={
        '지점': 'station',
        '위도': 'latitude',
        '경도': 'longitude',
        '노장해발고도(m)': 'altitude'
    })[['station', 'latitude', 'longitude', 'altitude']]

    station_df_renamed = station_df_renamed.drop_duplicates(subset='station')
    df = df.merge(station_df_renamed, on='station', how='left')

    # 6. 불필요 컬럼 제거
    for col in ['station_name', 'station', 'id']:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)

    return df

processed_df = preprocess_data(train_df, station_df)
processed_df.info()

여기 0
[]
---------
[]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13132 entries, 0 to 13131
Columns: 367 entries, cloud_cover_0 to altitude
dtypes: float64(365), int64(2)
memory usage: 36.8 MB


In [23]:
!pip install xgboost



In [24]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# 전처리된 데이터 불러온 후 float32로 변환
df = processed_df

X = df.drop(columns=['target'])
y = df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# TRial 13s' hyperparams

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from xgboost import XGBRegressor

def run_kfold_xgb(X, y, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"🌱 Fold {fold + 1}")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(
            n_estimators=665,
            learning_rate=0.07468890154873875,
            max_depth=7,
            random_state=42,
            min_child_weight=10,
            tree_method='hist'
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        mse = mean_squared_error(y_val, preds)
        rmse = np.sqrt(mse)
        print(f"  📏 RMSE: {rmse:.4f}")
        rmses.append(rmse)

    print(f"\n✅ 평균 RMSE: {np.mean(rmses):.4f} ± {np.std(rmses):.4f}")
    return rmses

run_kfold_xgb(X, y)

🌱 Fold 1
  📏 RMSE: 1.2849
🌱 Fold 2
  📏 RMSE: 1.2815
🌱 Fold 3
  📏 RMSE: 1.3052
🌱 Fold 4
  📏 RMSE: 1.2543
🌱 Fold 5
  📏 RMSE: 1.2841
🌱 Fold 6
  📏 RMSE: 1.2531
🌱 Fold 7
  📏 RMSE: 1.2560
🌱 Fold 8
  📏 RMSE: 1.3000
🌱 Fold 9
  📏 RMSE: 1.1823
🌱 Fold 10
  📏 RMSE: 1.2730

✅ 평균 RMSE: 1.2674 ± 0.0333


[np.float64(1.2849271067613954),
 np.float64(1.2815316845989155),
 np.float64(1.3051628788250091),
 np.float64(1.254318094612868),
 np.float64(1.2840823119329354),
 np.float64(1.2531122372877936),
 np.float64(1.256003413606879),
 np.float64(1.3000461631889217),
 np.float64(1.1822558814987325),
 np.float64(1.2729796555901374)]

In [26]:
pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [27]:
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from xgboost import XGBRegressor

def objective(trial, X, y):
    # 하이퍼파라미터 탐색 공간 정의
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'tree_method': 'hist',
        'random_state': 42
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        # 조기 중단(pruning)을 위한 callback
        pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f"validation_{fold}-rmse")

        # 데이터 분할
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # 모델 학습
        model = XGBRegressor(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="rmse",
            verbose=False,
            callbacks=[pruning_callback]
        )

        # 검증 성능 계산
        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        rmses.append(rmse)

    return np.mean(rmses)  # K-Fold 평균 RMSE 반환

# Optuna 최적화 실행
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
)
study.optimize(lambda trial: objective(trial, X, y), n_trials=100)

# 결과 출력
print(f"🎯 Best trial:")
print(f"  RMSE: {study.best_value:.4f}")
print("🔧 Best hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# 최종 모델 학습 (전체 데이터 사용)
best_model = XGBRegressor(**study.best_params, tree_method='hist', random_state=42)
best_model.fit(X, y)

# (선택사항) 튜닝된 하이퍼파라미터로 K-Fold 재검증
print("\n🚀 튜닝된 파라미터 검증:")
run_kfold_xgb(X, y, model=XGBRegressor(**study.best_params))


[I 2025-05-10 02:13:06,896] A new study created in memory with name: no-name-f2b1860b-09cd-45e6-90e9-01ba5cf770fe
[W 2025-05-10 02:13:06,901] Trial 0 failed with parameters: {'n_estimators': 874, 'learning_rate': 0.2536999076681772, 'max_depth': 8, 'min_child_weight': 12, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'gamma': 0.05808361216819946, 'reg_alpha': 0.8661761457749352, 'reg_lambda': 0.6011150117432088} because of the following error: ModuleNotFoundError('\nCould not find `optuna-integration` for `xgboost`.\nPlease run `pip install optuna-integration[xgboost]`.').
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/integration/xgboost.py", line 5, in <module>
    from optuna_integration.xgboost import XGBoostPruningCallback
ModuleNotFoundError: No module named 'optuna_integration'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3

ModuleNotFoundError: 
Could not find `optuna-integration` for `xgboost`.
Please run `pip install optuna-integration[xgboost]`.

In [None]:
final_model = XGBRegressor(
    n_estimators=665,
    learning_rate=0.07468890154873875,
    max_depth=8,
    random_state=42,
    min_child_weight=10,
    tree_method='hist'
)

# 💥 전부 다 학습에 씀
final_model.fit(X, y)

In [None]:
y_pred = final_model.predict(X)

mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
print(f'✅ 검증 RMSE: {rmse:.4f} °C')

In [None]:
test_path = '/content/drive/MyDrive/test_dataset.csv'
test_df = pd.read_csv(test_path)

In [None]:
test = preprocess_data(test_df, station_df)
test.info()

In [None]:
prediction = final_model.predict(test)
submission_df = pd.DataFrame({'target':prediction})
submission_df.to_csv('asdfsadnnf.csv',index=True)

In [None]:
import shap
import matplotlib.pyplot as plt

# 🎯 모델에 대한 SHAP Explainer 생성
explainer = shap.Explainer(final_model)

# 🔍 validation 데이터에 대한 SHAP 값 계산
shap_values = explainer(X_val)

# 🐝 변수 중요도 시각화 (Beeswarm plot)
shap.plots.beeswarm(shap_values)

In [None]:
shap.summary_plot(shap_values.values, X_val, plot_type="bar")

In [None]:
shap.plots.bar(shap_values, max_display=shap_values.shape[1])

In [None]:
import numpy as np
import pandas as pd

# SHAP 값 배열 추출 (Explanation 객체 → numpy array)
shap_array = shap_values.values  # 👈 이거 추가!

# 1. 변수별 평균 SHAP값 계산 (절댓값 기준)
mean_abs_shap = np.abs(shap_array).mean(axis=0)

# 2. 중요도 데이터프레임 만들기
importance_df = pd.DataFrame({
    'feature': X.columns,
    'mean_abs_shap': mean_abs_shap
}).sort_values(by='mean_abs_shap', ascending=False)

# 3. 누적합 기준으로 95% 이상 되는 feature 추리기
importance_df['cumsum'] = importance_df['mean_abs_shap'].cumsum()
importance_df['cumsum_pct'] = importance_df['cumsum'] / importance_df['mean_abs_shap'].sum()

# 4. 상위 95% 중요 변수만 필터링
top_features = importance_df[importance_df['cumsum_pct'] <= 0.99]['feature'].tolist()

# 🎉 최종 중요 변수만 남긴 X (선택사항)
X_top = X[top_features]

print(f"📊 상위 95% 중요 변수 개수: {len(top_features)}개")

In [None]:
final_model = XGBRegressor(
    n_estimators=665,
    learning_rate=0.07468890154873875,
    max_depth=8,
    random_state=29,
    min_child_weight=10,
    tree_method='hist'
)

# 💥 전부 다 학습에 씀
final_model.fit(X_top, y)

In [None]:
y_pred = final_model.predict(X_top)

rmse = mean_squared_error(y, y_pred, squared=False)
print(f'✅ 검증 RMSE: {rmse:.4f} °C')

In [None]:
test_top = test[top_features]

In [None]:
prediction = final_model.predict(test_top)
submission_df = pd.DataFrame({'target':prediction})
submission_df.to_csv('yesShap_8.csv',index=True)