In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# features/ csv -> dataframe
feat_edu_raw = pd.read_csv("./birth_csv/feat_edu.csv")
df_edu = pd.DataFrame(feat_edu_raw)
feat_hou_raw = pd.read_csv("./birth_csv/feat_hou.csv")
df_hou = pd.DataFrame(feat_hou_raw)
feat_mar_raw = pd.read_csv("./birth_csv/feat_mar.csv")
df_mar = pd.DataFrame(feat_mar_raw)
feat_preg_raw = pd.read_csv("./birth_csv/feat_preg.csv")
df_preg = pd.DataFrame(feat_preg_raw)
feat_sal_raw = pd.read_csv("./birth_csv/feat_sal.csv")
df_sal = pd.DataFrame(feat_sal_raw)

# target/ csv -> dataframe
target_birth_raw = pd.read_csv("./birth_csv/target_birth.csv")
df_birth = pd.DataFrame(target_birth_raw)

# 'ADD_UP'이 포함된 데이터 제거
df_birth = df_birth[~df_birth['AREA'].str.contains('ADD_UP', na=False)]

# 고유한 지역 리스트 추출
areas = df_birth['AREA'].unique()

# feature dict / target dict
features_dict = {}
target_dict = {}

In [2]:
# 각 지역별 feature와 target 데이터 생성
for area in areas:
    area_target = df_birth[df_birth['AREA'] == area].drop(columns='AREA').transpose()
    area_target.columns = ['TARGET']

    # 각 feature 데이터 생성
    area_feat_edu = df_edu[df_edu['AREA'] == area].drop(columns='AREA').transpose()
    area_feat_edu.columns = ['EDU']
    
    area_feat_hou = df_hou[df_hou['AREA'] == area].drop(columns='AREA').transpose()
    area_feat_hou.columns = ['HOU']
    
    area_feat_mar = df_mar[df_mar['AREA'] == area].drop(columns='AREA').transpose()
    area_feat_mar.columns = ['MAR']
    
    area_feat_preg = df_preg[df_preg['AREA'] == area].drop(columns='AREA').transpose()
    area_feat_preg.columns = ['PREG']
    
    area_feat_sal = df_sal[df_sal['AREA'] == area].drop(columns='AREA').transpose()
    area_feat_sal.columns = ['SAL']

    # 지역별 feature 데이터 결합
    area_features = pd.concat([area_feat_edu, area_feat_hou, area_feat_mar, area_feat_preg, area_feat_sal], axis=1)

    # 'ADD_UP'이 포함된 features 데이터 제거
    area_features = area_features[~area_features.index.str.contains('ADD_UP', na=False)]

    # dictionary에 feature와 target 저장
    features_dict[area] = area_features
    target_dict[area] = area_target


In [3]:
# 각 지역에 대해 학습 및 평가 진행
for area in features_dict.keys():
    print(f"\n{area} 데이터에 대한 학습 시작")

    # 해당 지역의 feature와 target을 가져오기
    X_area = features_dict[area]
    y_area = target_dict[area]

    # 데이터 스케일링
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_area)

    # 다항 특성 추가 (degree=2, 예시로 2차 다항식 추가)
    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(X_scaled)

    # train_test_split 80% : 20%
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y_area, test_size=0.2, random_state=42)

    # Lasso 회귀 모델 + GridSearchCV
    lasso_model = Lasso()
    param_grid = {'alpha': np.logspace(-6, 6, 13)}  # alpha 하이퍼파라미터 튜닝
    grid_search = GridSearchCV(lasso_model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # 최적 모델
    best_lasso_model = grid_search.best_estimator_

    # 예측
    y_train_pred = best_lasso_model.predict(X_train)
    y_test_pred = best_lasso_model.predict(X_test)

    # MSE (Mean Squared Error) for Test Data
    mse = mean_squared_error(y_test, y_test_pred)
    print(f"{area} - 평균 제곱 오차 (MSE) for Test Data: {mse}")

    # R² for Test Data
    r2_test = r2_score(y_test, y_test_pred)
    print(f"{area} - 결정 계수 (R²) for Test Data: {r2_test}")

    # MAE (Mean Absolute Error) for Test Data
    mae = mean_absolute_error(y_test, y_test_pred)
    print(f"{area} - 평균 절대 오차 (MAE) for Test Data: {mae}")

    # R² for Train Data
    r2_train = r2_score(y_train, y_train_pred)
    print(f"{area} - 결정 계수 (R²) for Train Data: {r2_train}")
    print("-" * 50)  # 구분선


SEOUL 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


SEOUL - 평균 제곱 오차 (MSE) for Test Data: 7339124.723944077
SEOUL - 결정 계수 (R²) for Test Data: 0.986787128949341
SEOUL - 평균 절대 오차 (MAE) for Test Data: 2611.72021771075
SEOUL - 결정 계수 (R²) for Train Data: 0.9700775061081148
--------------------------------------------------

BUSAN 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


BUSAN - 평균 제곱 오차 (MSE) for Test Data: 1597921.2147361026
BUSAN - 결정 계수 (R²) for Test Data: 0.9559157386705571
BUSAN - 평균 절대 오차 (MAE) for Test Data: 1066.496470594942
BUSAN - 결정 계수 (R²) for Train Data: 0.9155021416446765
--------------------------------------------------

DAEGU 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


DAEGU - 평균 제곱 오차 (MSE) for Test Data: 571967.7701227134
DAEGU - 결정 계수 (R²) for Test Data: 0.9765161133701923
DAEGU - 평균 절대 오차 (MAE) for Test Data: 557.955578234546
DAEGU - 결정 계수 (R²) for Train Data: 0.9773677592053723
--------------------------------------------------

INCHEON 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


INCHEON - 평균 제곱 오차 (MSE) for Test Data: 1009736.9115844903
INCHEON - 결정 계수 (R²) for Test Data: 0.9644913647678828
INCHEON - 평균 절대 오차 (MAE) for Test Data: 820.9563743932913
INCHEON - 결정 계수 (R²) for Train Data: 0.9806889568632874
--------------------------------------------------

GWANGJU 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GWANGJU - 평균 제곱 오차 (MSE) for Test Data: 571158.0893088278
GWANGJU - 결정 계수 (R²) for Test Data: 0.9370943884634595
GWANGJU - 평균 절대 오차 (MAE) for Test Data: 640.3918752968862
GWANGJU - 결정 계수 (R²) for Train Data: 0.9657352794492504
--------------------------------------------------

DAEJEON 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


DAEJEON - 평균 제곱 오차 (MSE) for Test Data: 1936800.8778804354
DAEJEON - 결정 계수 (R²) for Test Data: 0.8241976488551648
DAEJEON - 평균 절대 오차 (MAE) for Test Data: 1259.3730393483988
DAEJEON - 결정 계수 (R²) for Train Data: 0.9965589047826966
--------------------------------------------------

ULSAN 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


ULSAN - 평균 제곱 오차 (MSE) for Test Data: 2767006.3771860073
ULSAN - 결정 계수 (R²) for Test Data: 0.6466924982414979
ULSAN - 평균 절대 오차 (MAE) for Test Data: 1525.250011353815
ULSAN - 결정 계수 (R²) for Train Data: 0.9999706717928403
--------------------------------------------------

SEJONG 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


SEJONG - 평균 제곱 오차 (MSE) for Test Data: 100511.32931750515
SEJONG - 결정 계수 (R²) for Test Data: 0.9060811291657649
SEJONG - 평균 절대 오차 (MAE) for Test Data: 281.54500160467137
SEJONG - 결정 계수 (R²) for Train Data: 0.987105104457839
--------------------------------------------------

GYEONGGI 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GYEONGGI - 평균 제곱 오차 (MSE) for Test Data: 11913065.883545581
GYEONGGI - 결정 계수 (R²) for Test Data: 0.9731695728620341
GYEONGGI - 평균 절대 오차 (MAE) for Test Data: 2624.6139591237297
GYEONGGI - 결정 계수 (R²) for Train Data: 0.9648493881994172
--------------------------------------------------

GANGWON 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

GANGWON - 평균 제곱 오차 (MSE) for Test Data: 377236.8658279956
GANGWON - 결정 계수 (R²) for Test Data: 0.9377173605059655
GANGWON - 평균 절대 오차 (MAE) for Test Data: 542.1645718062491
GANGWON - 결정 계수 (R²) for Train Data: 0.9909721494573777
--------------------------------------------------

CHUNGBUK 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

CHUNGBUK - 평균 제곱 오차 (MSE) for Test Data: 689469.1608946164
CHUNGBUK - 결정 계수 (R²) for Test Data: 0.9318475923265713
CHUNGBUK - 평균 절대 오차 (MAE) for Test Data: 803.2467026010538
CHUNGBUK - 결정 계수 (R²) for Train Data: 0.9748241762706589
--------------------------------------------------

CHUNGNAM 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CHUNGNAM - 평균 제곱 오차 (MSE) for Test Data: 719242.2485483072
CHUNGNAM - 결정 계수 (R²) for Test Data: 0.9622716687577695
CHUNGNAM - 평균 절대 오차 (MAE) for Test Data: 753.0275987740997
CHUNGNAM - 결정 계수 (R²) for Train Data: 0.9849416070810005
--------------------------------------------------

JEONBUK 데이터에 대한 학습 시작
JEONBUK - 평균 제곱 오차 (MSE) for Test Data: 284184.49217943975
JEONBUK - 결정 계수 (R²) for Test Data: 0.9831527465201623
JEONBUK - 평균 절대 오차 (MAE) for Test Data: 366.444919257066
JEONBUK - 결정 계수 (R²) for Train Data: 0.9843057195865076
--------------------------------------------------

JEONNAM 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

JEONNAM - 평균 제곱 오차 (MSE) for Test Data: 2001771.745086188
JEONNAM - 결정 계수 (R²) for Test Data: 0.8754292853328433
JEONNAM - 평균 절대 오차 (MAE) for Test Data: 1143.2027774207145
JEONNAM - 결정 계수 (R²) for Train Data: 0.9985156900098143
--------------------------------------------------

GYEONGBUK 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GYEONGBUK - 평균 제곱 오차 (MSE) for Test Data: 659448.9430141514
GYEONGBUK - 결정 계수 (R²) for Test Data: 0.9797025034448464
GYEONGBUK - 평균 절대 오차 (MAE) for Test Data: 639.055524485922
GYEONGBUK - 결정 계수 (R²) for Train Data: 0.9808462877092923
--------------------------------------------------

GYEONGNAM 데이터에 대한 학습 시작


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GYEONGNAM - 평균 제곱 오차 (MSE) for Test Data: 4363539.826969499
GYEONGNAM - 결정 계수 (R²) for Test Data: 0.9336615006388016
GYEONGNAM - 평균 절대 오차 (MAE) for Test Data: 1903.7614182556886
GYEONGNAM - 결정 계수 (R²) for Train Data: 0.9581426309861728
--------------------------------------------------

JEJU 데이터에 대한 학습 시작
JEJU - 평균 제곱 오차 (MSE) for Test Data: 173348.29883638266
JEJU - 결정 계수 (R²) for Test Data: 0.8357836492656894
JEJU - 평균 절대 오차 (MAE) for Test Data: 332.1500609199062
JEJU - 결정 계수 (R²) for Train Data: 0.9446572781831615
--------------------------------------------------


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
