- 各パラメータごとに非線形回帰を行う
- このとき、対象パラメータが20通り存在するので、それぞれについて回帰線を出す
  - 対象ではないパラメータについては、全ての組み合わせ（対象パラメータを持つ全データ）を利用する
- 複数のグラフの上記回帰線を重ねてみる


In [1]:
DATASET_NAMES = ["les_miserables", "1138_bus", "USpowerGrid"]
PARAMS_NAMES = ["number_of_pivots", "number_of_iterations", "eps"]
COLOR_MAP = {
    DATASET_NAMES[0]: 'red',
    DATASET_NAMES[1]: "green",
    DATASET_NAMES[2]: 'blue'
}

In [2]:
# Third Party Library
import pandas as pd

# First Party Library
from config.paths import get_project_root_path


def generate_data_df_dict(dataset_names):
    EXPERIENT_DATA_DIR = (
        get_project_root_path()
        .joinpath("data")
        .joinpath("experiments")
        .joinpath("regression_analysis")
    )

    data_df_dict = {}
    for dataset_name in dataset_names:
        data_path = EXPERIENT_DATA_DIR.joinpath("grid").joinpath(
            f"{dataset_name}-without-pos.pkl"
        )
        data_df_dict[dataset_name] = pd.read_pickle(data_path)

    return data_df_dict


def generate_params_candidates():
    params_steps = {
        "number_of_pivots": 5,
        "number_of_iterations": 10,
        "eps": 0.05,
    }

    params_candidates = {}
    params_names = ["number_of_pivots", "number_of_iterations", "eps"]
    for params_name in params_names:
        params_candidates[params_name] = [
            v * params_steps[params_name] for v in list(range(1, 20 + 1))
        ]

    return params_candidates


data_df_dict = generate_data_df_dict(DATASET_NAMES)
params_candidates = generate_params_candidates()


In [3]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from itertools import product
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
from config.paths import get_project_root_path
from config.quality_metrics import ALL_QM_NAMES

# sscaler = MinMaxScaler()
sscaler = StandardScaler()

deg = 5

# sscaler_map = {}
# for params_name in PARAMS_NAMES:
#     # params_name以外のパラメータの組み合わせ作成
#     params_t = list(filter(lambda x: x != params_name, PARAMS_NAMES))
#     comb = list(
#         product(params_candidates[params_t[0]], params_candidates[params_t[1]])
#     )
#     for qm_name in ALL_QM_NAMES:
#         fig, axis = plt.subplots(1, 1, figsize=(8, 8))
#         fig.subplots_adjust(left=0.04, right=0.98, bottom=0.05, top=0.95)
#         legends = {}
#         for dataset_name in DATASET_NAMES:

data_y_dict = {}
for dataset_name in DATASET_NAMES:
    data_y_dict[dataset_name] = {}
    for qm_name in ALL_QM_NAMES:
        data_y_dict[dataset_name][qm_name] = np.array(data_df_dict[dataset_name][qm_name]).reshape(-1, 1)

for params_name in PARAMS_NAMES:
    # params_name以外のパラメータの組み合わせ作成
    params_t = list(filter(lambda x: x != params_name, PARAMS_NAMES))
    comb = list(
        product(params_candidates[params_t[0]], params_candidates[params_t[1]])
    )
    for qm_name in ALL_QM_NAMES:
        fig, axis = plt.subplots(1, 1, figsize=(8, 8))
        fig.subplots_adjust(left=0.04, right=0.98, bottom=0.05, top=0.95)
        legends = {}
        for dataset_name in DATASET_NAMES:
            for c in comb:
                df = data_df_dict[dataset_name]
                df = df.query(
                    " & ".join(
                        [
                            f"{item[1]} == {item[0]}"
                            for item in zip(c, params_t)
                        ]
                    )
                )
                x = df[params_name]
                y = df[qm_name]
                x = np.array(x).reshape(-1, 1)
                y = np.array(y).reshape(-1, 1)

                sscaler.fit(data_y_dict[dataset_name][qm_name])
                yss = sscaler.transform(y)

                regr = Pipeline(
                    [
                        ("poly", PolynomialFeatures(degree=deg)),
                        ("linear", LinearRegression()),
                    ]
                )

                regr.fit(x, yss)

                ux = np.unique(x).reshape(-1, 1)
                p_poly = regr.predict(ux)

                axis.scatter(
                    x, yss, color=COLOR_MAP[dataset_name], alpha=0.1
                )
                line = axis.plot(
                    ux,
                    p_poly,
                    color=COLOR_MAP[dataset_name],
                    alpha=0.1,
                    label=dataset_name,
                )
                if dataset_name not in legends:
                    legends[dataset_name] = line
                axis.set_title(f"{qm_name}")
                # axis.set_title(
                #     f'{qm_name}-{"R^2={:.3f}".format(regr.score(x, y))}'
                # )
        axis.legend([legends[key] for key in legends], DATASET_NAMES)
        
        # export_path = (
        #     get_project_root_path()
        #     .joinpath("data")
        #     .joinpath("experiments")
        #     .joinpath("regression_analysis")
        #     .joinpath("params")
        #     .joinpath("overlap")
        #     .joinpath(f"{params_name}-{qm_name}.png")
        # )
        # export_path.parent.mkdir(parents=True, exist_ok=True)
        # plt.savefig(export_path)


A proxy artist may be used instead.
See: https://matplotlib.org/stable/tutorials/intermediate/legend_guide.html#controlling-the-legend-entries
  axis.legend([legends[key] for key in legends], DATASET_NAMES)


In [None]:
from PIL import Image, ImageDraw, ImageFont


def get_concat_h(im1, im2):
    dst = Image.new("RGB", (im1.width + im2.width, im1.height), "black")
    dst.paste(im1, (0, 0))
    dst.paste(im2, (im1.width, 0))
    return dst


def get_concat_v(im1, im2):
    dst = Image.new("RGB", (im1.width, im1.height + im2.height))
    dst.paste(im1, (0, 0))
    dst.paste(im2, (0, im1.height))
    return dst


image_path = (
    get_project_root_path()
    .joinpath("data")
    .joinpath("experiments")
    .joinpath("regression_analysis")
    .joinpath("params")
    .joinpath('overlap')
)
image_path.mkdir(parents=True, exist_ok=True)



for params_name in PARAMS_NAMES:
    images = []
    tmp = []
    for qm_name in ALL_QM_NAMES:
        img_path = image_path.joinpath(f"{params_name}-{qm_name}.png")
        img = Image.open(img_path)

        tmp.append({"image": img})

        if len(tmp) == 3:
            images.append(tmp)
            tmp = []

    dst = None
    for v in images:
        h_dst = None
        for h in v:
            if h_dst is None:
                h_dst = h["image"]
                continue
            h_dst = get_concat_h(h_dst, h["image"])
        if dst is None:
            dst = h_dst
            continue
        dst = get_concat_v(dst, h_dst)
    draw = ImageDraw.Draw(dst)
    font = ImageFont.truetype("Arial.ttf", 36)
    draw.text((40, 40), f"{params_name}", "red", font=font)
    for i, dataset_name in enumerate(DATASET_NAMES):
        draw.text((600, 40 * (i + 1)), f'{dataset_name}', COLOR_MAP[dataset_name], font=font)
    dst.save(image_path.joinpath(f"{params_name}.png"))



In [None]:
import os

for dataset_name in DATASET_NAMES:
    for params_name in PARAMS_NAMES:
        for qm_name in ALL_QM_NAMES:
            img_path = image_path.joinpath(f"{params_name}-{qm_name}.png")
            os.remove(img_path)