In [15]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sb

In [69]:
# Load the preprocessed Kia overseas-sales data.
df = pd.read_csv("../../data/processed/기아_해외판매_전처리.CSV")
# Display the loaded frame.
df

Unnamed: 0,차량 구분,국가명,연도,월,수출량,차종,거래 유형,판매량,공장명(국가),공장 코드,...,Canada,Mexico,Europe,Eastern Europe,Latin America,Middle East,Africa,Asia Pacific,India,China
0,일반 차량,Asia / Pacific,2023,1,0.0,Bongo,국내,3580.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
1,일반 차량,Asia / Pacific,2023,1,0.0,Bongo,수출,3014.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
2,특수 차량,Asia / Pacific,2023,1,0.0,Bongo,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
3,특수 차량,Asia / Pacific,2023,1,0.0,Bongo,수출,141.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
4,일반 차량,Asia / Pacific,2023,2,0.0,Bongo,국내,8977.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4987,Special Vehicle,Latin America,2025,12,0.0,K3,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
4988,Special Vehicle,Middle East/Africa,2025,12,0.0,K3,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
4989,Special Vehicle,Asia / Pacific,2025,12,0.0,K3,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
4990,Special Vehicle,India,2025,12,0.0,K3,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533


In [70]:
# List the column names of the loaded frame.
df.columns

Index(['차량 구분', '국가명', '연도', '월', '수출량', '차종', '거래 유형', '판매량', '공장명(국가)',
       '공장 코드', '공장판매량', 'Korea', 'U.S.A', 'Canada', 'Mexico', 'Europe',
       'Eastern Europe', 'Latin America', 'Middle East', 'Africa',
       'Asia Pacific', 'India', 'China'],
      dtype='object')

##  고객생애가치 (CLTV: Customer Lifetime Value) 계산

고객생애가치(Customer Lifetime Value, CLTV)는 **한 명의 고객이 기업과의 거래를 통해 얼마나 많은 가치를 제공할 수 있는지를 측정하는 지표**입니다.  
이는 마케팅 전략, 고객 세분화, 타겟 마케팅, CRM 등에 활용됩니다.

---

### 주요 항목 설명

| 항목 | 설명 |
|------|------|
| **총구매금액 (Total Purchase Amount)** | 고객이 지금까지 구매한 전체 금액 |
| **구매빈도 (Purchase Frequency)** | 일정 기간 동안 얼마나 자주 구매했는지 |
| **고객유지기간 (Customer Retention Period)** | 고객이 얼마나 오랫동안 거래하고 있는지 |
| **CLTV_Calculated** | 위 3개 컬럼을 곱해서 산출한 고객 생애 가치 |

---

###  CLTV 계산 수식
$$
\text{CLTV} = \text{총구매금액} \times \text{구매빈도} \times \text{고객유지기간}
$$

각 요소가 클수록 고객의 생애가치는 증가하게 됩니다.  
CLTV는 고객 세그먼트를 나누거나, 마케팅 ROI를 극대화하기 위한 중요한 기준으로 사용됩니다.

---

### 활용 방안
- 이 CLTV 컬럼을 Y값으로 활용하여 AI 모델 학습에 사용할 수 있습니다.
- 고객 세분화와 타겟 마케팅 전략 수립에 유용합니다.


In [72]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [73]:
# Split the column names by dtype: int64/float64 columns are treated as numeric,
# object columns as categorical.
numeric_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(df.select_dtypes(include=["object"]).columns)


In [50]:
# Display the numeric column names.
# NOTE(review): the recorded output lists columns such as '총판매량', 'CLTV' and 'RFM점수'
# that do not appear in the df.columns output of the freshly loaded CSV, and this cell's
# execution count (50) is lower than that of the loading cell (69). The output appears to
# come from an earlier kernel state - confirm the notebook survives Restart & Run All.
numeric_cols

['연도',
 '월',
 '수출량',
 '판매량',
 '공장판매량',
 'Korea',
 'U.S.A',
 'Canada',
 'Mexico',
 'Europe',
 'Eastern Europe',
 'Latin America',
 'Middle East',
 'Africa',
 'Asia Pacific',
 'India',
 'China',
 '총판매량',
 '차량단가',
 '차량가격',
 '평균구매금액',
 '구매빈도',
 '고객유지기간',
 'CLTV',
 'CLTV_log',
 '최근구매연도',
 '구매횟수',
 '총구매금액',
 'R',
 'F',
 'M',
 'RFM점수']

In [51]:
# Display the categorical (object-dtype) column names.
# NOTE(review): the recorded output includes '고객등급', which is not among the columns
# printed for the loaded CSV - likely stale kernel state; confirm.
categorical_cols

['차량 구분', '국가명', '차종', '거래 유형', '공장명(국가)', '공장 코드', '고객등급']

In [52]:
# Name of the regression target column.
target = 'CLTV'

In [53]:
# Label-encode every categorical column in place and keep the fitted encoder of each
# column in `label_encoders`.
# Note: the columns of `df` are overwritten, so re-running this cell would re-fit the
# encoders on the already-encoded integer codes.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le

In [54]:
# Missing-value handling: replace NaNs in every numeric column with that column's mean.
# NOTE(review): per the recorded `numeric_cols` output this list also contains the target
# 'CLTV', so missing labels would be mean-filled as well - confirm this is intended.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df[numeric_cols])
df[numeric_cols] = imputed_values

In [55]:
# Standardize all numeric columns into a copy (`df_scaled`); `df` itself keeps the
# unscaled values.
# NOTE(review): the scaler is fitted on the full data set before the train/test split,
# and, per the recorded `numeric_cols` output, the target column is standardized too.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df_scaled = df.copy()
df_scaled[numeric_cols] = scaled_values

In [56]:
# Clustering (KMeans, n_clusters=4).
# The resulting `Cluster` label is used as a model feature further down, so the
# regression target and its log transform must not be part of the clustering input;
# otherwise the cluster id encodes the label itself (target leakage).
# NOTE(review): KMeans is still fitted on all rows before the train/test split, and the
# remaining inputs still contain the factors CLTV is computed from.
leak_cols = {target, 'CLTV_log'}
cluster_cols = [col for col in numeric_cols if col not in leak_cols]
kmeans = KMeans(n_clusters=4, random_state=23, n_init=10)
df_scaled['Cluster'] = kmeans.fit_predict(df_scaled[cluster_cols])

In [57]:
# Feature and target columns for the regression model.
# NOTE(review): target leakage - this notebook defines CLTV as
# 총구매금액 x 구매빈도 x 고객유지기간, and all three factors are listed in `features`.
# In the recorded outputs of X and y, the 평균구매금액 / 총구매금액 values of the displayed
# rows are even identical to the (standardized) target value. The near-zero error
# reported further down is therefore expected and says nothing about predictive skill.
features = ['평균구매금액', '구매빈도', '고객유지기간', '구매횟수', '총구매금액', 'R', 'F', 'M', 'Cluster']
target = 'CLTV'

In [58]:
# Keep only rows where every feature and the target are present, then split the frame
# into the design matrix X and the target vector y.
required_cols = features + [target]
df_model = df_scaled.dropna(subset=required_cols)
X, y = df_model[features], df_model[target]

In [59]:
# Display the feature matrix.
X

Unnamed: 0,평균구매금액,구매빈도,고객유지기간,구매횟수,총구매금액,R,F,M,Cluster
0,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,3
1,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,3
2,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,0
3,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,0
4,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,3
...,...,...,...,...,...,...,...,...,...
4987,-0.268635,4.547474e-13,0.0,0.678632,-0.268635,0.0,0.739235,0.0,1
4988,-0.268635,4.547474e-13,0.0,0.678632,-0.268635,0.0,0.739235,0.0,1
4989,-0.268635,4.547474e-13,0.0,0.678632,-0.268635,0.0,0.739235,0.0,1
4990,-0.268635,4.547474e-13,0.0,0.678632,-0.268635,0.0,0.739235,0.0,1


In [60]:
# Display the target vector (values are standardized because the target column was
# among the scaled numeric columns).
y

0      -0.268635
1      -0.268635
2      -0.268635
3      -0.268635
4      -0.268635
          ...   
4987   -0.268635
4988   -0.268635
4989   -0.268635
4990   -0.268635
4991   -0.268635
Name: CLTV, Length: 4992, dtype: float64

In [61]:
# Train/test split: hold out 20% of the rows, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)


In [62]:
# Model training: random forest with 100 trees and a fixed seed.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
model

In [63]:
# Predict on the held-out rows and report the root-mean-squared error.
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print("✅ RMSE:", rmse)

✅ RMSE: 7.468536659882067e-15


In [68]:
# Compute the test-set metrics (MSE, MAE, R^2) and derive RMSE from the MSE.
mse, mae, r2 = (
    mean_squared_error(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
rmse = np.sqrt(mse)

# Report each metric rounded to four decimals.
print(
    f"✅ RMSE: {rmse:.4f}",
    f"✅ MAE: {mae:.4f}",
    f"✅ R² Score: {r2:.4f}",
    sep="\n",
)


✅ RMSE: 0.0000
✅ MAE: 0.0000
✅ R² Score: 1.0000


In [103]:
# 1. RandomForestRegressor

In [74]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [75]:
# Load the preprocessed Kia overseas-sales data again.
# This rebinds `df`, discarding the in-place encoding/imputation applied to it earlier.
df = pd.read_csv("../../data/processed/기아_해외판매_전처리.CSV")
# Display the loaded frame.
df

Unnamed: 0,차량 구분,국가명,연도,월,수출량,차종,거래 유형,판매량,공장명(국가),공장 코드,...,Canada,Mexico,Europe,Eastern Europe,Latin America,Middle East,Africa,Asia Pacific,India,China
0,일반 차량,Asia / Pacific,2023,1,0.0,Bongo,국내,3580.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
1,일반 차량,Asia / Pacific,2023,1,0.0,Bongo,수출,3014.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
2,특수 차량,Asia / Pacific,2023,1,0.0,Bongo,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
3,특수 차량,Asia / Pacific,2023,1,0.0,Bongo,수출,141.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
4,일반 차량,Asia / Pacific,2023,2,0.0,Bongo,국내,8977.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4987,Special Vehicle,Latin America,2025,12,0.0,K3,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
4988,Special Vehicle,Middle East/Africa,2025,12,0.0,K3,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
4989,Special Vehicle,Asia / Pacific,2025,12,0.0,K3,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533
4990,Special Vehicle,India,2025,12,0.0,K3,국내,0.0,China Plants,0,...,90.346314,103.258182,574.922355,84.178609,146.816008,176.756377,52.502593,171.622211,255.140779,86.836533


In [77]:
# 1. Label-encode the listed categorical columns in place, keeping one fitted encoder
#    per column in `label_encoders`.
categorical_cols = ['차량 구분', '국가명', '차종', '거래 유형', '공장명(국가)']
label_encoders = {}

for col in categorical_cols:
    # setdefault on the (initially empty) dict creates and stores this column's encoder
    le = label_encoders.setdefault(col, LabelEncoder())
    df[col] = le.fit_transform(df[col])


In [78]:
# Display the loop variable left over from the encoding loop, i.e. the encoder fitted
# last (for '공장명(국가)', the final entry of categorical_cols).
le

In [79]:
# 2. Build a customer ID and a total purchase amount per row.
UNIT_PRICE = 30000  # arbitrary assumed unit price

# The customer ID is "<국가명>_<차량 구분>_<차종>"; these columns hold label-encoded
# integers at this point, so the IDs look like "0_0_51".
df['고객ID'] = df['국가명'].astype(str).str.cat(
    [df['차량 구분'].astype(str), df['차종'].astype(str)], sep="_"
)
# Purchase amount = exported units times the assumed unit price.
df['총구매금액'] = UNIT_PRICE * df['수출량']

In [80]:
# Display the per-row purchase amount.
df['총구매금액']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
4987    0.0
4988    0.0
4989    0.0
4990    0.0
4991    0.0
Name: 총구매금액, Length: 4992, dtype: float64

In [81]:
# 3. Per-customer aggregates, all grouped by 고객ID.
by_customer = df.groupby('고객ID')

# Purchase frequency = number of rows per customer (every row counts, including rows
# whose 수출량 is 0).
purchase_freq = by_customer.size().reset_index(name='구매빈도')

# Retention period = number of calendar years spanned, first and last year inclusive.
period_df = by_customer['연도'].agg(['min', 'max']).reset_index()
period_df['고객유지기간'] = period_df['max'].sub(period_df['min']).add(1)

# Total purchase amount per customer.
purchase_total = by_customer['총구매금액'].sum().reset_index()

In [82]:
# Display the total purchase amount per customer.
purchase_total

Unnamed: 0,고객ID,총구매금액
0,0_0_51,1.064040e+09
1,0_1_51,1.878000e+07
2,0_2_51,9.120000e+07
3,0_3_51,1.967100e+08
4,0_4_51,7.287900e+08
...,...,...
167,9_3_51,0.000000e+00
168,9_4_51,2.124600e+08
169,9_5_51,1.089198e+10
170,9_6_51,0.000000e+00


In [83]:
# 4. CLTV: join the three per-customer metrics on 고객ID and multiply them.
cltv_df = (
    purchase_total
    .merge(purchase_freq, on='고객ID')
    .merge(period_df[['고객ID', '고객유지기간']], on='고객ID')
)
cltv_df['CLTV'] = (
    cltv_df['총구매금액']
    .mul(cltv_df['구매빈도'])
    .mul(cltv_df['고객유지기간'])
)
# log1p keeps customers with CLTV == 0 at 0 instead of -inf.
cltv_df['CLTV_log'] = np.log1p(cltv_df['CLTV'])

In [85]:
# Display the log-transformed CLTV per customer.
cltv_df['CLTV_log']

0      23.270245
1      20.619504
2      22.199766
3      23.779372
4      25.089027
         ...    
167     0.000000
168    23.856396
169    27.793424
170     0.000000
171    27.091754
Name: CLTV_log, Length: 172, dtype: float64

In [86]:
# 5. Clustering, step 1: standardize the three CLTV components.
cluster_features = ['총구매금액', '구매빈도', '고객유지기간']
scaler = StandardScaler().fit(cltv_df[cluster_features])
scaled_data = scaler.transform(cltv_df[cluster_features])


In [87]:
# Display the standardized clustering inputs.
scaled_data 

array([[ 0.37638647, -1.26048125, -1.35708033],
       [-0.26041668, -0.37194529, -0.21845683],
       [-0.21629628, -0.37194529, -0.21845683],
       [-0.15201648,  0.51659068,  0.92016667],
       [ 0.17214231,  0.51659068,  0.92016667],
       [ 1.39031482,  0.51659068,  0.92016667],
       [-0.26480313,  0.51659068,  0.92016667],
       [ 3.03221858, -0.37194529, -0.21845683],
       [-0.271858  ,  1.40512664,  0.92016667],
       [-0.271858  ,  1.40512664,  0.92016667],
       [-0.271858  , -0.37194529,  0.92016667],
       [-0.271858  , -0.37194529,  0.92016667],
       [-0.271858  , -0.37194529,  0.92016667],
       [-0.271858  , -0.37194529,  0.92016667],
       [-0.271858  ,  1.40512664,  0.92016667],
       [-0.271858  , -1.26048125, -1.35708033],
       [-0.271858  , -0.37194529,  0.92016667],
       [-0.271858  , -0.37194529,  0.92016667],
       [-0.271858  , -0.37194529,  0.92016667],
       [-0.271858  , -0.37194529,  0.92016667],
       [-0.271858  , -1.26048125, -1.357

In [88]:
# Clustering, step 2: fit a 4-cluster KMeans on the standardized components and store
# each customer's cluster label.
kmeans = KMeans(n_clusters=4, random_state=23, n_init=10).fit(scaled_data)
cltv_df['Cluster'] = kmeans.labels_

In [89]:
# Display the cluster label assigned to each customer.
cltv_df['Cluster']

0      3
1      0
2      0
3      2
4      2
      ..
167    2
168    2
169    1
170    2
171    1
Name: Cluster, Length: 172, dtype: int32

In [98]:
# 6. Feature and target columns for the prediction model (predicting CLTV_log).
# NOTE(review): target leakage - CLTV_log is log1p(총구매금액 * 구매빈도 * 고객유지기간),
# i.e. a deterministic function of the first three features, and `Cluster` is derived
# from those same three columns. Scores obtained with this setup measure how well a
# model reproduces the formula, not how well it predicts customer value.
features = ['총구매금액', '구매빈도', '고객유지기간', 'Cluster']
target = 'CLTV_log'


In [99]:
# Drop customers with a missing feature or target, then separate inputs from target.
required_cols = features + [target]
model_df = cltv_df.dropna(subset=required_cols)
X, y = model_df[features], model_df[target]

In [95]:
# Display the feature matrix.
X 

Unnamed: 0,총구매금액,구매빈도,고객유지기간,Cluster
0,1.064040e+09,12,1,3
1,1.878000e+07,24,2,0
2,9.120000e+07,24,2,0
3,1.967100e+08,36,3,2
4,7.287900e+08,36,3,2
...,...,...,...,...
167,0.000000e+00,36,3,2
168,2.124600e+08,36,3,2
169,1.089198e+10,36,3,1
170,0.000000e+00,36,3,2


In [97]:
# Display the target vector (CLTV_log).
y

0      23.270245
1      20.619504
2      22.199766
3      23.779372
4      25.089027
         ...    
167     0.000000
168    23.856396
169    27.793424
170     0.000000
171    27.091754
Name: CLTV_log, Length: 172, dtype: float64

In [100]:
# Hold out 20% of the customers for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)

# Random forest with 100 trees and a fixed seed, fitted on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
model



In [101]:
# 7. Predict on the test split and compute RMSE, MAE and R^2.
y_pred = model.predict(X_test)
rmse, mae, r2 = (
    np.sqrt(mean_squared_error(y_test, y_pred)),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)


In [115]:
# Report the random-forest metrics, one per line, rounded to four decimals.
print(
    "RandomForestRegressor 예측 결과",
    f"RMSE: {rmse:.4f}",
    f"MAE : {mae:.4f}",
    f"R²  : {r2:.4f}",
    sep="\n",
)

RandomForestRegressor 예측 결과
RMSE: 0.2300
MAE : 0.0868
R²  : 0.9994


In [104]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [None]:
# Feature and target columns for the prediction model (predicting CLTV_log).
# This repeats the earlier definition with identical values; the cell has no execution
# count, i.e. it was not run in the recorded session.
features = ['총구매금액', '구매빈도', '고객유지기간', 'Cluster']
target = 'CLTV_log'


In [105]:
# 2. Linear Regression: fit on the training split (raw, unscaled features) and predict
#    the test split.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)


In [106]:

# Evaluation: RMSE, MAE and R^2 of the linear model on the test split.
rmse_lr, mae_lr, r2_lr = (
    np.sqrt(mean_squared_error(y_test, y_pred_lr)),
    mean_absolute_error(y_test, y_pred_lr),
    r2_score(y_test, y_pred_lr),
)

In [107]:
# Report the linear-regression metrics, one per line, rounded to four decimals.
print(
    " Linear Regression 성능",
    f"RMSE: {rmse_lr:.4f}",
    f"MAE : {mae_lr:.4f}",
    f"R²  : {r2_lr:.4f}",
    sep="\n",
)

 Linear Regression 성능
RMSE: 8.8921
MAE : 7.6779
R²  : 0.1234


In [108]:
# 3. Decision Tree: fit with a fixed seed on the training split and predict the test split.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

In [109]:
# Evaluation: RMSE, MAE and R^2 of the decision tree on the test split.
rmse_dt, mae_dt, r2_dt = (
    np.sqrt(mean_squared_error(y_test, y_pred_dt)),
    mean_absolute_error(y_test, y_pred_dt),
    r2_score(y_test, y_pred_dt),
)


In [110]:

# Report the decision-tree metrics, one per line, rounded to four decimals.
# (The recorded output shows a "✅" prefix that this header string does not contain,
# so the output predates the current cell text.)
print(
    "\n Decision Tree 성능",
    f"RMSE: {rmse_dt:.4f}",
    f"MAE : {mae_dt:.4f}",
    f"R²  : {r2_dt:.4f}",
    sep="\n",
)


✅ Decision Tree 성능
RMSE: 0.2390
MAE : 0.0787
R²  : 0.9994


In [111]:
# 4. XGBoost Regressor: 100 boosting rounds, fixed seed, logging silenced; fit on the
#    training split and predict the test split.
xgb_model = XGBRegressor(n_estimators=100, random_state=42, verbosity=0).fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

In [112]:
# Evaluation: RMSE, MAE and R^2 of the XGBoost model on the test split.
rmse_xgb, mae_xgb, r2_xgb = (
    np.sqrt(mean_squared_error(y_test, y_pred_xgb)),
    mean_absolute_error(y_test, y_pred_xgb),
    r2_score(y_test, y_pred_xgb),
)

In [113]:
# Report the XGBoost metrics, one per line, rounded to four decimals.
print(
    "\n✅ XGBoost 성능",
    f"RMSE: {rmse_xgb:.4f}",
    f"MAE : {mae_xgb:.4f}",
    f"R²  : {r2_xgb:.4f}",
    sep="\n",
)


✅ XGBoost 성능
RMSE: 0.2268
MAE : 0.0881
R²  : 0.9994


In [114]:
# Polynomial regression baseline: expand the features to degree-2 terms and fit an
# ordinary linear regression on them, then report R^2 on the test split.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# NOTE(review): the pipeline has no scaling step and 총구매금액 is on the order of 1e9-1e10
# in the displayed rows, so its squared term reaches ~1e20; the least-squares fit may be
# poorly conditioned - consider standardizing before the expansion and re-checking R^2.
poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_model.fit(X_train, y_train)
y_pred_poly = poly_model.predict(X_test)

r2_poly = r2_score(y_test, y_pred_poly)
print(f"Polynomial Regression R²: {r2_poly:.4f}")


Polynomial Regression R²: 0.4482


## 🔍 Linear Regression vs Tree 기반 모델 비교 분석

---

### 📊 모델별 성능 요약

| 모델명 | RMSE | MAE | R² Score | 해석 |
|--------|------|-----|-----------|------|
| **Linear Regression** | 8.8921 | 7.6779 | 0.1234 | ❌ 비선형 구조를 반영하지 못해 설명력 매우 낮음 |
| **Polynomial Regression (2차)** | - | - | 0.4482 | ⬆️ 일부 비선형 구조 반영 성공 |
| **Decision Tree** | 0.2390 | 0.0787 | 0.9994 | ✅ 비선형 + 조건 분기 학습 가능 |
| **Random Forest** | 0.2300 | 0.0868 | 0.9994 | ✅ 강력한 예측 성능, 변수 중요도 해석 가능 |
| **XGBoost** | 0.2268 | 0.0881 | 0.9994 | ✅ 트리 기반 중 최고 성능, 실무 최적화에 강함 |

---

### ⚠️ Linear Regression 성능 저하 원인

| 원인 | 설명 |
|------|------|
| ❌ **선형 모델의 한계** | `CLTV_log`는 `log1p(총구매금액 × 구매빈도 × 고객유지기간)`, 즉 변수 곱의 로그 구조 → 원본 스케일 피처에 대한 선형 회귀는 적합하지 않음 (피처를 로그 변환하면 덧셈 구조에 가까워짐) |
| ❌ **상호작용 반영 불가** | 변수 간 제곱, 곱 등을 반영하지 못함 |
| ❌ **복잡한 조건 구조 학습 불가** | 트리 기반처럼 분기 규칙을 학습할 수 없음 |

---

### ✅ 대안 모델 제안

| 모델 | 장점 |
|------|------|
| **DecisionTree / RandomForest / XGBoost** | 분기 규칙 기반 → 비선형 구조 및 변수 상호작용 학습에 강함 |
| **Polynomial Regression** | 고차항 특성 추가 → 선형회귀보다 나은 성능 가능 |
| **Feature Engineering** | 범주형 변수(`국가명`, `차종`, `공장명`) 추가 시 설명력 향상 기대 |

---

### 🧪 Polynomial Regression 실험 결과

```python
Polynomial Regression R²: 0.4482
```

| 모델 유형 | R² 점수 | 해석 |
|-----------|---------|------|
| **Linear Regression** | 0.1234 | ❌ 단순 선형으로는 설명 부족 |
| **Polynomial Regression (2차)** | 0.4482 | ⬆️ 곡선 구조 반영해 성능 일부 향상 |
| **Tree 기반 모델들** | 0.99+ | ✅ 조건 분기 및 비선형 관계 완전 학습 |

---

### 🎯 핵심 요약

- **선형 회귀 모델**은 곱셈, 조건 분기 구조에 적합하지 않음
- **Polynomial Regression**은 제한된 비선형 구조를 반영해 성능 개선 가능
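- **Tree 기반 모델 (RandomForest, XGBoost 등)** 은 실무에 최적화된 고성능 모델
- 단, 타깃 `CLTV_log`는 입력 피처(총구매금액·구매빈도·고객유지기간)의 곱에 `log1p`를 적용해 만든 값이므로, 트리 모델의 R² 0.99+는 이미 알고 있는 계산식을 재현한 결과이며 새로운 데이터에 대한 예측력으로 해석해서는 안 됨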

---

### ✅ 추천 전략 정리

| 전략 | 목적 |
|------|------|
| ✅ 트리 기반 모델 중심 분석 | 실제 예측 정확도 + 설명력 모두 확보 |
| ✅ 선형 회귀는 빠른 초기 분석용 | 간단한 시각화나 베이스라인 모델링에 적합 |
| ✅ Polynomial 회귀는 중간 대안 | 선형보다 성능이 높고 구현 쉬움 |
| ✅ 범주형 특성 추가 | 국가명, 차종, 공장명 등 인코딩 시 설명력 향상 가능 |

---
