In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Feature tables: read each predictor CSV first, then wrap each result in its own DataFrame.
feat_cancer_raw = pd.read_csv("feat_cancer.csv")
feat_old_raw = pd.read_csv("feat_old.csv")
feat_cyc_raw = pd.read_csv("feat_cyc.csv")
feat_ment_raw = pd.read_csv("feat_ment.csv")

df_cancer = pd.DataFrame(feat_cancer_raw)
df_old = pd.DataFrame(feat_old_raw)
df_cyc = pd.DataFrame(feat_cyc_raw)
df_ment = pd.DataFrame(feat_ment_raw)

# Target table. Rows whose AREA contains 'ADD_UP' are dropped
# (presumably aggregate rows — confirm against the data source).
# na=False makes rows with a missing AREA count as non-matching, so they are kept.
target_death_raw = pd.read_csv("target_death.csv")
df_death = pd.DataFrame(target_death_raw).loc[
    lambda frame: ~frame['AREA'].str.contains('ADD_UP', na=False)
]

# Distinct AREA values that remain in the target table
areas = df_death['AREA'].unique()

# Per-area containers (area -> DataFrame), populated by the per-area loop in a later cell
features_dict, target_dict = {}, {}

In [None]:
# Build the per-area feature and target frames.
def _area_column(table, area, column_name):
    """Return the row of `table` for `area` as a one-column DataFrame.

    The row whose 'AREA' equals `area` is selected, the 'AREA' column is
    dropped, and the remainder is transposed so that the table's other column
    labels become the index and the single resulting column is named
    `column_name`.

    Raises:
        ValueError: if `table` does not contain exactly one row for `area`.
            Without this check the rename below fails with an opaque pandas
            "Length mismatch" ValueError that does not say which area or
            table is at fault.
    """
    rows = table[table['AREA'] == area]
    if len(rows) != 1:
        raise ValueError(
            f"expected exactly one row with AREA == {area!r} to build column "
            f"{column_name!r}, found {len(rows)}"
        )
    column = rows.drop(columns='AREA').transpose()
    column.columns = [column_name]
    return column


for area in areas:
    area_target = _area_column(df_death, area, 'TARGET')

    # One single-column frame per feature table
    area_feat_cancer = _area_column(df_cancer, area, 'CANCER')
    area_feat_old = _area_column(df_old, area, 'OLD')
    area_feat_cyc = _area_column(df_cyc, area, 'CYC')
    area_feat_ment = _area_column(df_ment, area, 'MENT')

    # Combine the features side by side. concat aligns on the index, so labels
    # missing from one table show up as NaN in that table's column.
    area_features = pd.concat(
        [area_feat_cancer, area_feat_old, area_feat_cyc, area_feat_ment], axis=1
    )

    # Store the features and the target under the area key
    features_dict[area] = area_features
    target_dict[area] = area_target

In [None]:
# Per-area model: standardise -> degree-2 polynomial features -> Ridge,
# with the Ridge alpha chosen by 5-fold grid search.
for area, X_area in features_dict.items():
    print(f"\n{area} 데이터에 대한 학습 시작")

    # Target frame for the same area
    y_area = target_dict[area]

    # Split 80% / 20% BEFORE fitting any preprocessing. Fitting the scaler on
    # all rows first would let the held-out rows influence the scaling
    # statistics and make the test metrics optimistic.
    X_train, X_test, y_train, y_test = train_test_split(
        X_area, y_area, test_size=0.2, random_state=42
    )

    # Preprocessing lives inside the pipeline so that GridSearchCV refits it on
    # the training part of every fold, keeping the validation folds unseen too.
    scaler = StandardScaler()
    poly = PolynomialFeatures(degree=2)
    ridge_model = Ridge()
    pipeline = Pipeline([
        ('scaler', scaler),
        ('poly', poly),
        ('ridge', ridge_model),
    ])

    # Tune the Ridge regularisation strength over 1e-6 ... 1e6 (13 log-spaced values).
    # NOTE(review): the default scoring is R², which is undefined for a validation
    # fold with fewer than two samples — confirm each area has enough rows for cv=5.
    param_grid = {'ridge__alpha': np.logspace(-6, 6, 13)}
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Best pipeline, refit by GridSearchCV on the whole training split
    best_ridge_model = grid_search.best_estimator_

    # Predictions for both splits
    y_train_pred = best_ridge_model.predict(X_train)
    y_test_pred = best_ridge_model.predict(X_test)

    # Mean squared error on the test split
    mse = mean_squared_error(y_test, y_test_pred)
    print(f"{area} - 평균 제곱 오차 (MSE) for Test Data: {mse}")

    # R² on the test split
    r2_test = r2_score(y_test, y_test_pred)
    print(f"{area} - 결정 계수 (R²) for Test Data: {r2_test}")

    # Mean absolute error on the test split
    mae = mean_absolute_error(y_test, y_test_pred)
    print(f"{area} - 평균 절대 오차 (MAE) for Test Data: {mae}")

    # R² on the training split, for comparison against the test R²
    r2_train = r2_score(y_train, y_train_pred)
    print(f"{area} - 결정 계수 (R²) for Train Data: {r2_train}")
    print("-" * 50)  # separator line