In [15]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sb

In [46]:
# 기아 해외판매 데이터 불러오기
df = pd.read_csv("../../data/processed/기아_해외판매_전처리.CSV")
df

Unnamed: 0,차량 구분,국가명,연도,월,수출량,차종,거래 유형,판매량,공장명(국가),공장 코드,...,CLTV,CLTV_log,최근구매연도,구매횟수,총구매금액,R,F,M,RFM점수,고객등급
0,일반 차량,Asia / Pacific,2023,1,0.0,bongo,국내,3580.0,China Plants,0,...,3.728866e+14,33.552295,2025,2,6.275096e+10,1,1,1,3,저위험
1,일반 차량,Asia / Pacific,2023,1,0.0,bongo,수출,3014.0,China Plants,0,...,3.728866e+14,33.552295,2025,2,6.275096e+10,1,1,1,3,저위험
2,특수 차량,Asia / Pacific,2023,1,0.0,bongo,국내,0.0,China Plants,0,...,3.728866e+14,33.552295,2025,2,6.275096e+10,1,1,1,3,저위험
3,특수 차량,Asia / Pacific,2023,1,0.0,bongo,수출,141.0,China Plants,0,...,3.728866e+14,33.552295,2025,2,6.275096e+10,1,1,1,3,저위험
4,일반 차량,Asia / Pacific,2023,2,0.0,bongo,국내,8977.0,China Plants,0,...,3.728866e+14,33.552295,2025,2,6.275096e+10,1,1,1,3,저위험
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4987,Special Vehicle,Latin America,2025,12,0.0,k3,국내,0.0,China Plants,0,...,3.728866e+14,33.552295,2025,3,6.275096e+10,1,2,1,4,저위험
4988,Special Vehicle,Middle East/Africa,2025,12,0.0,k3,국내,0.0,China Plants,0,...,3.728866e+14,33.552295,2025,3,6.275096e+10,1,2,1,4,저위험
4989,Special Vehicle,Asia / Pacific,2025,12,0.0,k3,국내,0.0,China Plants,0,...,3.728866e+14,33.552295,2025,3,6.275096e+10,1,2,1,4,저위험
4990,Special Vehicle,India,2025,12,0.0,k3,국내,0.0,China Plants,0,...,3.728866e+14,33.552295,2025,3,6.275096e+10,1,2,1,4,저위험


In [47]:
df.columns

Index(['차량 구분', '국가명', '연도', '월', '수출량', '차종', '거래 유형', '판매량', '공장명(국가)',
       '공장 코드', '공장판매량', 'Korea', 'U.S.A', 'Canada', 'Mexico', 'Europe',
       'Eastern Europe', 'Latin America', 'Middle East', 'Africa',
       'Asia Pacific', 'India', 'China', '총판매량', '차량단가', '차량가격', '평균구매금액',
       '구매빈도', '고객유지기간', 'CLTV', 'CLTV_log', '최근구매연도', '구매횟수', '총구매금액', 'R',
       'F', 'M', 'RFM점수', '고객등급'],
      dtype='object')

##  고객생애가치 (CLTV: Customer Lifetime Value) 계산

고객생애가치(Customer Lifetime Value, CLTV)는 **한 명의 고객이 기업과의 거래를 통해 얼마나 많은 가치를 제공할 수 있는지를 측정하는 지표**입니다.  
이는 마케팅 전략, 고객 세분화, 타겟 마케팅, CRM 등에 활용됩니다.

---

### 주요 항목 설명

| 항목 | 설명 |
|------|------|
| **총구매금액 (Total Purchase Amount)** | 고객이 지금까지 구매한 전체 금액 |
| **구매빈도 (Purchase Frequency)** | 일정 기간 동안 얼마나 자주 구매했는지 |
| **고객유지기간 (Customer Retention Period)** | 고객이 얼마나 오랫동안 거래하고 있는지 |
| **CLTV_Calculated** | 위 3개 컬럼을 곱해서 산출한 고객 생애 가치 |

---

###  CLTV 계산 수식
\[
\text{CLTV} = \text{총구매금액} \times \text{구매빈도} \times \text{고객유지기간}
\]

각 요소가 클수록 고객의 생애가치는 증가하게 됩니다.  
CLTV는 고객 세그먼트를 나누거나, 마케팅 ROI를 극대화하기 위한 중요한 기준으로 사용됩니다.

---

### 활용 방안
- 이 CLTV 컬럼을 Y값으로 활용하여 AI 모델 학습에 사용할 수 있습니다.
- 고객 세분화와 타겟 마케팅 전략 수립에 유용합니다.


In [48]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [49]:
# 수치형 / 범주형 컬럼 분리
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()


In [50]:
numeric_cols

['연도',
 '월',
 '수출량',
 '판매량',
 '공장판매량',
 'Korea',
 'U.S.A',
 'Canada',
 'Mexico',
 'Europe',
 'Eastern Europe',
 'Latin America',
 'Middle East',
 'Africa',
 'Asia Pacific',
 'India',
 'China',
 '총판매량',
 '차량단가',
 '차량가격',
 '평균구매금액',
 '구매빈도',
 '고객유지기간',
 'CLTV',
 'CLTV_log',
 '최근구매연도',
 '구매횟수',
 '총구매금액',
 'R',
 'F',
 'M',
 'RFM점수']

In [51]:
categorical_cols

['차량 구분', '국가명', '차종', '거래 유형', '공장명(국가)', '공장 코드', '고객등급']

In [52]:
target = 'CLTV'

In [53]:
# 범주형 변수 인코딩 (Label Encoding)
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [54]:
# 결측치 처리 (수치형 평균 대체)
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [55]:
# 스케일링 (StandardScaler 사용)
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [56]:
# 클러스터링 (KMeans, n_clusters=4)
kmeans = KMeans(n_clusters=4, random_state=23, n_init=10)
df_scaled['Cluster'] = kmeans.fit_predict(df_scaled[numeric_cols])

In [57]:
# 모델 학습을 위한 Feature 및 Target 설정
features = ['평균구매금액', '구매빈도', '고객유지기간', '구매횟수', '총구매금액', 'R', 'F', 'M', 'Cluster']
target = 'CLTV'

In [58]:
# 결측치 제거
df_model = df_scaled.dropna(subset=features + [target])
X = df_model[features]
y = df_model[target]

In [59]:
X

Unnamed: 0,평균구매금액,구매빈도,고객유지기간,구매횟수,총구매금액,R,F,M,Cluster
0,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,3
1,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,3
2,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,0
3,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,0
4,-0.268635,4.547474e-13,0.0,-0.672138,-0.268635,0.0,-1.352750,0.0,3
...,...,...,...,...,...,...,...,...,...
4987,-0.268635,4.547474e-13,0.0,0.678632,-0.268635,0.0,0.739235,0.0,1
4988,-0.268635,4.547474e-13,0.0,0.678632,-0.268635,0.0,0.739235,0.0,1
4989,-0.268635,4.547474e-13,0.0,0.678632,-0.268635,0.0,0.739235,0.0,1
4990,-0.268635,4.547474e-13,0.0,0.678632,-0.268635,0.0,0.739235,0.0,1


In [60]:
y

0      -0.268635
1      -0.268635
2      -0.268635
3      -0.268635
4      -0.268635
          ...   
4987   -0.268635
4988   -0.268635
4989   -0.268635
4990   -0.268635
4991   -0.268635
Name: CLTV, Length: 4992, dtype: float64

In [61]:
# 학습/테스트 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)


In [62]:
# 모델 학습 (RandomForestRegressor)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [63]:
# 예측 및 평가
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("✅ RMSE:", rmse)

✅ RMSE: 7.468536659882067e-15


In [67]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [68]:
# 성능 지표 계산
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 출력
print(f"✅ RMSE: {rmse:.4f}")
print(f"✅ MAE: {mae:.4f}")
print(f"✅ R² Score: {r2:.4f}")


✅ RMSE: 0.0000
✅ MAE: 0.0000
✅ R² Score: 1.0000
