# 交差検証
- どの方法でモデル作るのがいいか調べる
  - 多項式（1 - 10次）
  - ガウス型基底関数
  - 平滑化スプラインモデル
  - GAM（generalized additive model, 一般化加法モデル）
  - リッジ回帰

- 固定Pと全ての非固定Pの値の組み合わせについて回帰モデルを作る
  - そのそれぞれのモデルについてk分割交差検証
  - 平均を取る
- それぞれの回帰モデルごとの平均全体の平均を元に優秀なモデル生成方法を見つける
  - とりあえずスコア高ければいいよな？


In [None]:
# Names of the datasets processed by this notebook.
DATASET_NAMES = [
    "les_miserables",
    "1138_bus",
    "USpowerGrid",
]

# Parameter names, kept in alphabetical order.
PARAMS_NAMES = ["number_of_pivots", "number_of_iterations", "eps"]
PARAMS_NAMES.sort()


In [None]:
# Third Party Library
import pandas as pd

# First Party Library
from config.paths import get_project_root_path


def generate_data_df_dict(dataset_names):
    """Load the pickled grid-experiment table of every requested dataset.

    Args:
        dataset_names: Iterable of dataset names. For each name the file
            ``<name>-without-pos.pkl`` is read from
            ``<project root>/data/experiments/regression_analysis/grid``.

    Returns:
        A dict mapping each dataset name to the object returned by
        ``pandas.read_pickle`` for its file.
    """
    grid_dir = get_project_root_path()
    for part in ("data", "experiments", "regression_analysis", "grid"):
        grid_dir = grid_dir.joinpath(part)

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project.
    return {
        name: pd.read_pickle(grid_dir.joinpath(f"{name}-without-pos.pkl"))
        for name in dataset_names
    }


def generate_params_candidates():
    """Build the candidate values of every parameter.

    Each parameter gets 20 candidates: its step size multiplied by the
    integers 1 through 20.

    Returns:
        A dict keyed by "number_of_pivots", "number_of_iterations" and
        "eps" (in that order), each mapping to its list of 20 candidates.
    """
    step_by_param = {
        "number_of_pivots": 5,
        "number_of_iterations": 10,
        "eps": 0.05,
    }
    multipliers = range(1, 21)

    # The "eps" candidates are the raw float products (e.g. 3 * 0.05);
    # no rounding is applied.
    return {
        name: [k * step for k in multipliers]
        for name, step in step_by_param.items()
    }


# Load one pickled table per dataset, keyed by dataset name.
data_df_dict = generate_data_df_dict(DATASET_NAMES)
# Candidate values (20 per parameter) keyed by parameter name.
params_candidates = generate_params_candidates()


In [None]:
# Identifiers of the candidate models: "n=1" .. "n=10", followed by
# "gaussian", "spline" and "ridge".
model_ids = [f"n={degree}" for degree in range(1, 11)]
model_ids += ["gaussian", "spline", "ridge"]

# result[model_id][dataset_name]["scores"] starts out as its own empty list
# for every (model, dataset) pair.
result = {
    model_id: {dataset_name: {"scores": []} for dataset_name in DATASET_NAMES}
    for model_id in model_ids
}


In [1]:
from itertools import combinations, product
import pickle
from config.paths import get_project_root_path
from config.quality_metrics import ALL_QM_NAMES
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

# Every quality-metric column of every dataset as an (n, 1) column vector;
# these full columns are what the scaler is fitted on below.
data_y_dict = {
    dataset_name: {
        qm_name: np.array(data_df_dict[dataset_name][qm_name]).reshape(-1, 1)
        for qm_name in ALL_QM_NAMES
    }
    for dataset_name in DATASET_NAMES
}

sscaler = StandardScaler()
degs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# k-fold cross-validation settings. KFold only accepts a random_state when
# shuffle=True, so shuffling is enabled to make the seed effective.
n_splits = 5
rand_state = 0
kf = KFold(n_splits=n_splits, shuffle=True, random_state=rand_state)
# Need at least two samples per test fold, otherwise R^2 is undefined.
min_cv_samples = 2 * n_splits

# NOTE: mean CV scores are appended to `result` (built in an earlier cell);
# re-run that cell before re-running this one to avoid duplicated entries.
for params_name in PARAMS_NAMES:
    # Hold the parameters other than params_name fixed and enumerate every
    # combination of their candidate values.
    params_t = [name for name in PARAMS_NAMES if name != params_name]
    comb = list(
        product(params_candidates[params_t[0]], params_candidates[params_t[1]])
    )
    for qm_name in ALL_QM_NAMES:
        for dataset_name in DATASET_NAMES:
            dataset_df = data_df_dict[dataset_name]
            # The scaler depends only on (dataset, metric), so fit it once
            # here instead of once per combination.
            sscaler.fit(data_y_dict[dataset_name][qm_name])

            for c in comb:
                # Keep only the rows where the fixed parameters equal c.
                df = dataset_df.query(
                    " & ".join(
                        f"{name} == {value}"
                        for value, name in zip(c, params_t)
                    )
                )
                x = np.array(df[params_name]).reshape(-1, 1)
                y = np.array(df[qm_name]).reshape(-1, 1)
                yss = sscaler.transform(y)

                # Prediction grid: identical for every polynomial degree.
                ux = np.unique(x).reshape(-1, 1)
                can_cross_validate = len(x) >= min_cv_samples

                for deg in degs:
                    regr = Pipeline(
                        [
                            ("poly", PolynomialFeatures(degree=deg)),
                            ("linear", LinearRegression()),
                        ]
                    )

                    if can_cross_validate:
                        # Mean held-out R^2 over the folds for this model.
                        fold_scores = []
                        for train_idx, test_idx in kf.split(x):
                            regr.fit(x[train_idx], yss[train_idx])
                            fold_scores.append(
                                regr.score(x[test_idx], yss[test_idx])
                            )
                        result[f"n={deg}"][dataset_name]["scores"].append(
                            float(np.mean(fold_scores))
                        )

                    # Refit on all rows so regr / p_poly describe the full
                    # data for this combination, as before.
                    regr.fit(x, yss)
                    p_poly = regr.predict(ux)


NameError: name 'DATASET_NAMES' is not defined