In [46]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# 데이터 불러오기 및 변환 (생략 가능)
# Update the file path to the correct location or ensure the file exists
df1 = pd.read_csv(r"D:\back_office-main\hoyeon\현대.csv")
id_vars = ['국가명', '연도', '기후대', 'GDP', '차종', '차량 구분']
month_cols = [f"{i}월" for i in range(1, 13)]
df_long = pd.melt(df1, id_vars=id_vars, value_vars=month_cols, 
                  var_name='월', value_name='수출량')
df_long['월'] = df_long['월'].str.replace('월', '').astype(int)
df_long['날짜'] = pd.to_datetime(df_long['연도'].astype(str) + '-' + df_long['월'].astype(str) + '-01')
df_long = df_long.sort_values(by=['국가명', '날짜'])

# 시차/타겟 생성
df_long['전월_수출량'] = df_long.groupby('국가명')['수출량'].shift(1)
df_long['다음달_수출량'] = df_long.groupby('국가명')['수출량'].shift(-1)
df_model = df_long.dropna(subset=['전월_수출량', '다음달_수출량']).copy()

# 특성 정의
features = ['수출량', '전월_수출량', '연도', '월', 'GDP', '국가명', '기후대', '차종', '차량 구분']
target = '다음달_수출량'
categorical_features = ['국가명', '기후대', '차종', '차량 구분']

# 결과 저장용
results = []

# 전체 날짜 목록
unique_dates = sorted(df_model['날짜'].unique())

# 최소 12개월 학습 후 시작
for i in range(12, len(unique_dates) - 1):
    train_end = unique_dates[i]
    test_month = unique_dates[i + 1]
    
    # 학습 데이터: 현재 시점까지
    train_data = df_model[df_model['날짜'] <= train_end]
    test_data = df_model[df_model['날짜'] == test_month]
    
    # 특성 & 타겟
    X_train = train_data[features]
    y_train = train_data[target]
    X_test = test_data[features]
    y_test = test_data[target]

    # 인코딩
    X_train = pd.get_dummies(X_train, columns=categorical_features)
    X_test = pd.get_dummies(X_test, columns=categorical_features)

    # 열 정렬 맞추기
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # 스케일링
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 모델 학습 및 예측
    model = LGBMRegressor()
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # 성능 측정
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # 결과 저장
    results.append({
        '기준월': test_month,
        'MSE': mse,
        'R2': r2
    })

# 결과 DataFrame
results_df = pd.DataFrame(results)
print(results_df)

# 평균 성능
print("\n✅ 전체 평균 성능")
print("평균 MSE:", results_df['MSE'].mean())
print("평균 R²:", results_df['R2'].mean())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 45871, number of used features: 92
[LightGBM] [Info] Start training from score 945.542303




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 47111, number of used features: 92
[LightGBM] [Info] Start training from score 944.480015




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008829 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 48351, number of used features: 92
[LightGBM] [Info] Start training from score 943.099853




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 49591, number of used features: 92
[LightGBM] [Info] Start training from score 942.108850




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 50831, number of used features: 92
[LightGBM] [Info] Start training from score 941.080659




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 52071, number of used features: 92
[LightGBM] [Info] Start training from score 944.844328




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 53311, number of used features: 92
[LightGBM] [Info] Start training from score 948.205323




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 54551, number of used features: 92
[LightGBM] [Info] Start training from score 951.788217




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008150 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 55791, number of used features: 92
[LightGBM] [Info] Start training from score 954.770572




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 57031, number of used features: 92
[LightGBM] [Info] Start training from score 953.550139




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 58271, number of used features: 92
[LightGBM] [Info] Start training from score 952.372467
          기준월           MSE        R2
0  2024-02-01  43615.825183  0.852173
1  2024-03-01  45359.577812  0.840013
2  2024-04-01  45025.351584  0.844793
3  2024-05-01  43326.240788  0.850988
4  2024-06-01  47363.615320  0.841030
5  2024-07-01  47378.854495  0.839000
6  2024-08-01  45024.081353  0.840830
7  2024-09-01  49558.855106  0.834018
8  2024-10-01  44431.385972  0.852108
9  2024-11-01  43255.666191  0.852586
10 2024-12-01  41316.761814  0.850522

✅ 전체 평균 성능
평균 MSE: 45059.65596535369
평균 R²: 0.8452782406488012




In [47]:
import joblib

# 모델 저장
joblib.dump(model, "h_lgbm_model.pkl")

# 스케일러 저장
joblib.dump(scaler, "h_scaler.pkl")
# 모델 저장
joblib.dump(X_train.columns.tolist(), "h_model_columns.pkl")




['h_model_columns.pkl']

In [2]:
import pandas as pd

df1 = pd.read_csv(r"D:\back_office-main\hoyeon\현대_기아.csv")

In [3]:
df1["차량 브랜드"].unique()

array(['Hyundai'], dtype=object)

In [41]:
# Filter out rows where the "차량 브랜드" column is 'kia'
df1 = df1[df1["차량 브랜드"] != 'Kia']

In [5]:
df1 = df1.drop(columns=["차량 브랜드"])

In [44]:
df1['기후대'].unique()

array(['온대', '한랭', '열대', '건조'], dtype=object)

In [7]:
import os

# Ensure the directory exists
output_dir = r"D:\back_office-main\hoyeon"
os.makedirs(output_dir, exist_ok=True)

# Save the file
df1.to_csv(os.path.join(output_dir, "현대_기아.csv"))

In [19]:
df1['국가명'].unique()

array(['US', 'Canada', 'Mexico', 'EU+EFTA', 'E.Europe/CIS',
       'Latin America', 'Middle East/Africa', 'Asia / Pacific', 'China',
       'India'], dtype=object)

In [18]:
# Load the dataframe
df = pd.read_csv(r"D:\back_office-main\hoyeon\기아.csv")

# Get unique values of the '차종' column
df['국가명'].unique()

array(['US', 'Canada', 'Mexico', 'EU+EFTA', 'E.Europe/CIS',
       'Latin America', 'Middle East/Africa', 'Asia / Pacific', 'China',
       'India'], dtype=object)

In [34]:
reverse_map = {
    'US': 'US',
    'Canada': 'Canada',
    'Mexico': 'Mexico',
    'EU+EFTA': 'EU+EFTA',
    'E.Europe/CIS': 'E.Europe/CIS',
    'Latin America': 'Latin America',
    'Middle East/Africa': 'Middle East/Africa',
    'Asia / Pacific': 'Asia / Pacific',
    'China': 'Asia / Pacific',
    'India': 'Asia / Pacific'
}

# 매핑 적용
df1['국가명'] = df['국가명'].map(reverse_map)

In [39]:
df1.to_csv(r"D:\back_office-main\hoyeon\현대.csv")

In [42]:
df1['기후대'].unique()

array(['온대', '한랭', '열대', '건조'], dtype=object)