<a href="https://colab.research.google.com/github/AkiyonKS/AIMathBook/blob/master/notebooks/pca_and_cluster_analysis__group_name.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# クラスター分析と主成分分析

階層型クラスター分析と主成分分析を行うためのノートブックです。

クラスター分析の結果を用いて主成分スコアの色分けをします。

本ノートブックでは、グループを表すカラム名(group_name)を指定することで、グループ毎のバッチ処理が可能です。

（主成分分析は未完成）

## Google Driveに接続

毎回最初に1回実行する。認証のページが開くので手続きする。

In [1]:
# Mount Google Drive at /content/drive so files stored there can be accessed from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## ライブラリのインストール
インストールしていないライブラリがある場合はインストールする

polars<br>
https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html
<br>
polarsは毎回最初にインストールが必要なようです。

In [3]:
# Install the third-party packages used below (no versions are pinned here).
!pip install polars
!pip install scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## ライブラリのimport

毎回最初に1回実行する。
インストールが必要なものがあれば別途!pipでインストールする。

In [2]:
import os
import numpy as np
import polars as pl
import random
import seaborn as sns
import hashlib
import itertools
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, distance, fcluster


# クラスター分析のサンプル

In [None]:
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage

# Generate random sample data: 10 points with 2 features each.
data = np.random.rand(10, 2)

# Run hierarchical clustering with Ward's method.
# NOTE: despite its name, `distance_matrix` holds the linkage matrix returned by
# `linkage`, not a matrix of pairwise distances.
distance_matrix = linkage(data, method='ward')

# Pick a colour for a link from a numeric value using two fixed thresholds.
def color_func(x):
    """Return 'C1' for x <= 0.4, 'C2' for x <= 0.8 and 'C3' otherwise."""
    for upper_bound, color in ((0.4, 'C1'), (0.8, 'C2')):
        if x <= upper_bound:
            return color
    return 'C3'

# Draw the dendrogram.
# scipy calls link_color_func with the id of each merged cluster (n_samples,
# n_samples + 1, ...), not with a distance. Translate that id into the merge
# height stored in column 2 of the linkage matrix before applying the
# thresholds in color_func; otherwise every link would get the 'C3' colour.
dendrogram(distance_matrix,
           link_color_func=lambda link_id: color_func(distance_matrix[link_id - len(data), 2]))


# 階層型クラスター分析

In [105]:
def cluster_analysis(df,
                     label_col_name,
                     clust_num,
                     fig_save_dir,
                     rm_col_names,
                     plot_fig=False):
    """Cluster the rows of ``df`` with Ward's method and save a dendrogram.

    Every column except ``label_col_name`` is standardised and used as a feature.

    Args:
        df: polars DataFrame with one label column plus the feature columns.
        label_col_name: name of the column that labels each row.
        clust_num: requested number of clusters. It is lowered to the largest
            ``n_cluster`` value of the distance table when it exceeds it.
        fig_save_dir: path prefix for the PNG files; file names are appended
            by plain string concatenation, so it must end with a separator.
        rm_col_names: not used by this function; kept so existing callers
            keep working.
        plot_fig: when True the dendrogram is also shown with ``plt.show()``.

    Returns:
        ``[df_dist_vs_cluster_numbers, clust_nums_df, clust_num_r]`` where the
        first item is the table returned by ``dist_vs_cluster_numbers``, the
        second holds one ``clust_num_<k>`` column of cluster ids per table row
        (plus the label column) and the third is the cluster count actually used.
    """
    len_df = len(df)

    # Standardise the feature columns (everything except the label column).
    X = df.select(pl.all().exclude(label_col_name)).to_numpy()
    X = StandardScaler().fit_transform(X)

    # Condensed pairwise distances, then Ward linkage.
    dists = distance.pdist(X)
    Z = linkage(dists, method='ward')

    # plot_fig is not forwarded here: the distance plot is only saved to disk.
    df_dist_vs_cluster_numbers = dist_vs_cluster_numbers(Z, df, fig_save_dir, plot_fig=False)
    max_cluster = df_dist_vs_cluster_numbers.get_column('n_cluster').max()

    # Never ask for more clusters than the distance table offers.
    clust_num_r = min(clust_num, max_cluster)

    def _merge_height(n_cluster):
        """Return the 'dist' value recorded for ``n_cluster``, or inf if there is no such row."""
        heights = (df_dist_vs_cluster_numbers
                   .filter(pl.col('n_cluster') == n_cluster)
                   .get_column('dist')
                   .to_list())
        return heights[0] if heights else float('inf')

    # Links below the height recorded for (clust_num_r - 1) clusters are coloured.
    # There is no row for 0 clusters (clust_num_r == 1); the previous list-index
    # lookup raised IndexError there, now the whole tree is coloured as one group.
    color_threshold = _merge_height(clust_num_r - 1)

    # One flat clustering per row of the distance table, cut at that row's distance.
    clust_nums_list = [fcluster(Z, t=row[1], criterion='distance')
                       for row in df_dist_vs_cluster_numbers.rows()]

    clust_nums_df = pl.DataFrame(clust_nums_list)
    # Column names count down (n, n-1, ..., 1) to follow the row order of the table.
    clust_num_col_names = ['clust_num_' + str(k)
                           for k in range(len(df_dist_vs_cluster_numbers), 0, -1)]
    clust_nums_df.columns = clust_num_col_names
    clust_nums_df = clust_nums_df.with_columns(df.get_column(label_col_name).alias(label_col_name))
    clust_nums_df = clust_nums_df.select([label_col_name] + clust_num_col_names)

    # Dendrogram (skipped for more than 100 rows).
    if len_df < 101:
        plt.rcParams['figure.facecolor'] = '#ffffff'
        # int(0.3 * len_df) is 0 for fewer than 4 rows, which matplotlib rejects
        # as a figure height, so keep the height at 1 or more.
        fig = plt.figure(figsize=(15, max(1, int(0.3 * len_df))))
        dendrogram(Z,
                   labels=df[label_col_name].to_list(),
                   color_threshold=color_threshold,
                   above_threshold_color='#111111',
                   orientation='right')
        plt.tick_params(labelsize=12)
        plt.xlabel('Distance', fontsize=16)
        plt.ylabel(label_col_name, fontsize=16)
        plt.savefig(fig_save_dir + 'dendrogram.png')
        if plot_fig:
            plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    else:
        print('Finished without dendrogram (length df is too long (' + str(len_df) + ')')

    return [df_dist_vs_cluster_numbers, clust_nums_df, clust_num_r]


def dist_vs_cluster_numbers(cluster_analysis_result_df, df, fig_save_dir, plot_fig=False):
    """Tabulate and plot the number of clusters against the merge distance.

    Args:
        cluster_analysis_result_df: table-like data convertible with
            ``pl.DataFrame``; the third value of every row is used as the
            distance. ``cluster_analysis`` passes the scipy linkage matrix.
        df: the clustered DataFrame; only its length is used.
        fig_save_dir: path prefix to which 'distances.png' is appended.
        plot_fig: when True the plot is also shown with ``plt.show()``.

    Returns:
        polars DataFrame with the columns 'n_cluster' and 'dist'.
    """
    df1 = pl.DataFrame(cluster_analysis_result_df)
    dists = [row[2] for row in df1.rows()]
    # The count starts at len(df) and drops by one per row: len(df)-1, len(df)-2, ...
    first = len(df) - 1
    cluster_nums = list(range(first, first - len(dists), -1))
    df2 = pl.DataFrame({'n_cluster': cluster_nums, 'dist': dists})

    # Draw on a dedicated figure. Plotting on the implicit current figure made
    # successive calls pile their curves onto whatever figure was still open.
    fig, ax = plt.subplots()
    ax.plot(dists, cluster_nums, 'yo-')
    ax.set_title('Threshold dependency of hierarchical clustering')
    ax.set_xlabel('Distance')
    ax.set_ylabel('Num of clusters')
    fig.savefig(fig_save_dir + 'distances.png')
    if plot_fig:
        plt.show()
    plt.close(fig)

    return df2


# group_name毎にデータフレームを作成し、繰り返しクラスター分析を行うための関数を定義

In [127]:
# Build every ordering of the clustering parameters (used as sort orders for ranking).
def fetch_params_parmutations_and_titles(param_arr, desc=False):
    """Map index-permutation titles to the matching orderings of ``param_arr``.

    ``param_arr`` is sorted in place first. Each key is the permuted positions
    joined with '_' (for example '1_0') and each value is the parameter list in
    that order. With ``desc=True`` the entries come in reverse generation order,
    otherwise they are ordered by key.
    """
    param_arr.sort()  # in place: the caller's list is reordered as well

    titled_orders = {}
    for positions in itertools.permutations(range(len(param_arr))):
        title = '_'.join(str(pos) for pos in positions)
        titled_orders[title] = [param_arr[pos] for pos in positions]

    if desc:
        return dict(reversed(list(titled_orders.items())))
    return dict(sorted(titled_orders.items()))


def fetch_fct_sets_with_permutations_and_titles(fct_sets, desc=False):
    """Build the permutation table of every feature set.

    Args:
        fct_sets: list of lists of column names. Each inner list is handed to
            ``fetch_params_parmutations_and_titles``.
        desc: forwarded to ``fetch_params_parmutations_and_titles``
            (it used to be accepted but ignored).

    Returns:
        dict keyed by the column names joined with '__'; each value is the
        dict returned for that feature set. The dict is also printed.
    """
    dict_res = {}
    for fcts in fct_sets:
        # Build the permutations first so the key is joined from the list as
        # the helper leaves it.
        permutations = fetch_params_parmutations_and_titles(fcts, desc)
        dict_res['__'.join(fcts)] = permutations
    print(dict_res)
    return dict_res


# Return the distinct values of one DataFrame column as a sorted list.
def fetch_unique_values_of_df_column(df, column_name):
    """Deduplicate ``df`` on ``column_name``, sort by it and return that column as a list."""
    deduplicated = df.unique(subset=[column_name])
    ordered = deduplicated.sort(column_name)
    return ordered.get_column(column_name).to_list()


# Build a title ("name") for every row from the values of the feature columns.
def fetch_names_by_fcts(df,
                        fct_columns,
                        name_formats):
    """Format the feature values of every row and join them with '_'.

    Args:
        df: DataFrame that contains the columns listed in ``fct_columns``.
        fct_columns: feature column names, in the order they appear in the name.
        name_formats: one %-style format string per feature column, same order.

    Returns:
        Whatever ``df.select(fct_columns).apply(...)`` returns: one formatted
        string per row.

    The previous version read exactly the first two columns; any number of
    columns is supported now, and two columns give the same result as before.
    """
    def _row_to_name(row):
        return '_'.join(fmt % value for fmt, value in zip(name_formats, row))

    res = df.select(fct_columns).apply(_row_to_name)
    return res


def add_name_by_fcts_to_df(df, fct_columns, name_formats):
    """Return ``df`` with a 'name' column built from the feature columns.

    The names come from ``fetch_names_by_fcts``; the column it yields is
    expected to be called 'apply' and is renamed to 'name'.
    """
    name_frame = fetch_names_by_fcts(df, fct_columns, name_formats)
    with_names = df.with_columns(name_frame)
    return with_names.rename({"apply": "name"})


# Concatenate the cluster-analysis results of every group and join them to fcts.csv.
def union_all_group_name_cluster_results(pdir_fct_set,
                                         cluster_num,
                                         group_name):
    """Collect the per-group statistics files and join them to the feature table.

    For every group found in ``fcts.csv`` the file
    ``stats_by_cluster_nums_<k>.csv`` with the largest existing ``k`` between
    ``cluster_num`` and 2 is read. All files read are concatenated and joined
    to ``fcts.csv`` on ``group_name`` and 'name'.

    Returns:
        The joined polars DataFrame.
    """
    print('union_all_group_name_cluster_results')
    df = pl.read_csv(pdir_fct_set + 'fcts.csv')
    df_clust_nums_all = pl.DataFrame({})

    for g_name in fetch_unique_values_of_df_column(df, group_name):
        pdir_g_name = pdir_fct_set + g_name + '/'
        if not os.path.exists(pdir_g_name):
            print(pdir_g_name + 'is not exist.')
            continue

        # Try the largest cluster count first and stop at the first file found.
        for n_clusters in range(cluster_num, 1, -1):
            file_dir = pdir_g_name + 'stats_by_cluster_nums_' + str(n_clusters) + '.csv'
            if os.path.exists(file_dir):
                print('file_dir: ' + file_dir)
                df_clust_nums_all = pl.concat([df_clust_nums_all, pl.read_csv(file_dir)])
                break

    return df.join(df_clust_nums_all, on=[group_name, 'name'])


# Merge the cluster-analysis results of all feature sets into one DataFrame.
def union_all_fct_set_cluster_analysis_results(pdir,
                                               file_name,
                                               fct_sets,
                                               cluster_num):
    """Stack the ``stats_by_cluster_nums_<cluster_num>.csv`` of every feature set.

    Each inner list of ``fct_sets`` is sorted in place. The raw feature columns
    are dropped from every file and a 'fct_columns_title' column (the feature
    names joined with '__') is added before concatenation.
    """
    base_dir = pdir + file_name + '/'
    stats_file = '/stats_by_cluster_nums_' + str(cluster_num) + '.csv'
    merged = pl.DataFrame({})

    for fct_columns in fct_sets:
        print(fct_columns)
        fct_columns.sort()
        title = '__'.join(fct_columns)
        part = pl.read_csv(base_dir + title + '/' + stats_file).drop(fct_columns)
        title_column = pl.Series(name="fct_columns_title", values=[title] * len(part))
        merged = pl.concat([merged, part.with_columns(title_column)])

    return merged


# Batch-run the cluster analysis for every feature set and every group.
def cluster_analysis_batch_with_fct_sets_and_group_names(pdir,
                                                         file_name,
                                                         fct_sets,
                                                         dict_fcts,
                                                         group_name,
                                                         cluster_num,
                                                         drop_dim_columns,
                                                         plot_fig=False):
    """Cluster every group of every feature set and write the results as CSV.

    Args:
        pdir: directory prefix of the input CSV (``pdir + file_name + '.csv'``).
        file_name: input file name without extension; also names the output folder.
        fct_sets: list of lists of feature column names (each is sorted in place).
        dict_fcts: per feature column a dict with the keys 'format' (format
            string used to build the 'name' column) and 'desc' (sort direction).
        group_name: column whose values define the groups.
        cluster_num: requested number of clusters per group.
        drop_dim_columns: passed to ``cluster_analysis`` as ``rm_col_names``.
        plot_fig: passed to ``cluster_analysis``.

    Groups that already have a 'cluster_nums.csv' are skipped, as are groups
    with fewer than ``cluster_num - 1`` rows. Nothing is returned; per group
    and per feature set CSV files are written below ``pdir + file_name + '/'``.
    """
    pdir_file_name = pdir + file_name + '/'

    for fct_columns in fct_sets:
        fct_columns.sort()
        fct_columns_title = '__'.join(fct_columns)
        pdir_fct_set = pdir_file_name + fct_columns_title + '/'

        df = pl.read_csv(pdir + file_name + '.csv')
        # The returned table is not used; the call is kept for what it prints
        # and for sorting the inner lists of fct_sets.
        fetch_fct_sets_with_permutations_and_titles(fct_sets)

        sort_desc_bools = [dict_fcts[col]['desc'] for col in fct_columns]
        name_formats = [dict_fcts[col]['format'] for col in fct_columns]

        os.makedirs(pdir_file_name, exist_ok=True)
        os.makedirs(pdir_fct_set, exist_ok=True)

        df = df.select([group_name] + fct_columns)
        df = add_name_by_fcts_to_df(df, fct_columns, name_formats)
        df.write_csv(pdir_fct_set + 'fcts.csv')

        columns_for_analysis = [group_name, 'name'] + fct_columns
        unique_df = df.unique(subset=columns_for_analysis).select(columns_for_analysis)
        unique_df = unique_df.sort(*([group_name] + fct_columns), descending=[False] + sort_desc_bools)

        dict_param_permu = fetch_params_parmutations_and_titles(fct_columns, desc=True)

        # https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.dataframe.groupby.GroupBy.__iter__.html
        for g_name, sub_df in unique_df.select([group_name, 'name'] + fct_columns).groupby(group_name):
            pdir_g_name = pdir_fct_set + g_name + '/'
            if os.path.exists(pdir_g_name + 'cluster_nums.csv'):
                print('skip ' + g_name)
                continue

            print(g_name)
            sub_df = sub_df.drop(group_name)
            print(len(sub_df))
            if len(sub_df) < cluster_num - 1:
                print('skip cluster analysis len(sub_df) = ' + str(len(sub_df)) + ' is smaller than cluster_num of ' + str(cluster_num))
                continue

            os.makedirs(pdir_g_name, exist_ok=True)
            fig_save_dir = pdir_g_name + '/images/'
            os.makedirs(fig_save_dir, exist_ok=True)
            df_dist_vs_cluster_numbers, clust_nums_df, clust_num_r = cluster_analysis(sub_df,
                                                                                      'name',
                                                                                      cluster_num,
                                                                                      fig_save_dir,
                                                                                      drop_dim_columns,
                                                                                      plot_fig)
            clust_col = 'clust_num_' + str(clust_num_r)
            sub_df_with_clust_nums = sub_df.join(clust_nums_df.select(['name', clust_col]), on='name')
            cluster_nums = list(range(1, clust_num_r + 1))
            dict_stats = {'cluster_num': cluster_nums}
            stat_names = []

            # describe() every feature column within every cluster; the values
            # are stored as 'fct<i>__<statistic>' with i = position in fct_columns.
            for i_fct_column, col_name in enumerate(fct_columns):
                res_describe = [sub_df_with_clust_nums.filter(pl.col(clust_col) == i).get_column(col_name).describe()
                                for i in cluster_nums]
                if len(stat_names) < 1:
                    stat_names = stat_names + list(res_describe[0].get_column('statistic'))
                for j, stat_name in enumerate(stat_names):
                    dict_stats['fct' + str(i_fct_column) + '__' + stat_name] = [list(x.get_column('value'))[j]
                                                                                for x in res_describe]

            df_stats = pl.DataFrame(dict_stats)
            for param_title, fct_params in dict_param_permu.items():
                # Look the column up in fct_columns, which is how the 'fct<i>'
                # keys were numbered. fct_params.index(x) always gave 0, 1, ...,
                # so every permutation produced the same ranking.
                sort_cols = ['fct' + str(fct_columns.index(x)) + '__mean' for x in fct_params]
                df_stats = df_stats.sort(*sort_cols, descending=[True] * len(fct_params))
                df_cluster_nums_r = pl.DataFrame({'cluster_num': list(df_stats.get_column('cluster_num')),
                                                  'rank_by_cluster_' + param_title: cluster_nums})
                df_stats = df_cluster_nums_r.join(df_stats, on='cluster_num')

            rank_columns = ['rank_by_cluster_' + w for w in sorted(dict_param_permu.keys())]
            mean_std_columns = list(itertools.chain.from_iterable(
                ['fct' + str(x) + '__' + 'mean', 'fct' + str(x) + '__' + 'std']
                for x in range(len(fct_columns))))
            stat_columns = rank_columns + ['cluster_num', 'fct0__count'] + mean_std_columns
            sub_df_stats = df_stats.select(stat_columns)

            sub_df_clust_nums = clust_nums_df.select(['name', clust_col]).with_columns(pl.col(clust_col).cast(pl.Int64))
            sub_df_clust_nums = sub_df_clust_nums.rename({clust_col: 'cluster_num'})

            # Put the group value in front of every row, then attach the cluster statistics.
            dict_gnames = {}
            dict_gnames[group_name] = [g_name] * len(sub_df_clust_nums)
            dict_gnames['name'] = list(sub_df_clust_nums.get_column('name'))
            df_gname = pl.DataFrame(dict_gnames)
            sub_df_clust_nums = df_gname.join(sub_df_clust_nums, on='name')
            sub_df_stats = sub_df_clust_nums.join(sub_df_stats, on='cluster_num')
            sub_df_stats = sub_df_stats.rename({'fct0__count': 'n_of_stat'})

            # Save the results of this group.
            df_dist_vs_cluster_numbers.write_csv(pdir_g_name + 'dist_vs_cluster_nums.csv')
            clust_nums_df.write_csv(pdir_g_name + 'cluster_nums.csv')
            df_stats.write_csv(pdir_g_name + 'stats_all_by_cluster_nums_' + str(clust_num_r) + '.csv')
            sub_df_stats.write_csv(pdir_g_name + 'stats_by_cluster_nums_' + str(clust_num_r) + '.csv')

        df_result = union_all_group_name_cluster_results(pdir_fct_set,
                                                         cluster_num,
                                                         group_name)
        # Save the combined result of this feature set.
        save_file_name = pdir_fct_set + '/stats_by_cluster_nums_' + str(cluster_num) + '.csv'
        print(save_file_name)
        df_result.write_csv(save_file_name)
    

# 主成分分析を行う関数を定義



In [138]:
def pca_analysis(df, label_col_name):
    """Run a PCA on the standardised feature columns of ``df``.

    Args:
        df: polars DataFrame with one label column plus the feature columns.
        label_col_name: name of the label column; all other columns are features.

    Returns:
        ``[df_variance_ratio, df_pcs, df_loadings]``:
        - df_variance_ratio: columns 'pc', 'var_ratio', 'cumulative_var_ratio'.
        - df_pcs: the label column followed by the scores 'pc1', 'pc2', ...
        - df_loadings: 'feature' followed by 'pc1', 'pc2', ... taken from
          ``pca.components_.T``.
          NOTE(review): these are the raw component vectors; they are not
          rescaled by the explained variance. Confirm this is intended.
    """
    print(len(df))
    print(len(df.columns))

    # Standardise the feature columns (everything except the label column).
    X = df.select(pl.all().exclude(label_col_name)).to_numpy()
    X = StandardScaler().fit_transform(X)

    # scikit-learn rejects more components than min(n_samples, n_features),
    # so cap the count; with at least as many rows as features nothing changes.
    n_pca = min(X.shape)
    pcs = list(range(1, n_pca + 1))
    pc_names = ['pc' + str(i) for i in pcs]

    # Run the PCA.
    pca = PCA(n_components=n_pca)
    principal_components = pca.fit_transform(X)

    # Scores as a DataFrame. orient='row' is explicit because the orientation
    # of a square array with a schema would otherwise be left to polars' inference.
    df_principal = pl.DataFrame(principal_components, schema=pc_names, orient='row')
    df_pcs = df_principal.with_columns(df.get_column(label_col_name).alias(label_col_name))
    df_pcs = df_pcs.select([label_col_name] + pc_names)

    # One row per feature, one column per component (components_.T is square
    # whenever all components are kept, hence the explicit orientation).
    df_loadings = pl.DataFrame(pca.components_.T, schema=pc_names, orient='row')
    features = df.select(pl.all().exclude([label_col_name])).columns
    df_loadings = df_loadings.with_columns(pl.Series(features).alias('feature'))
    df_loadings = df_loadings.select(['feature'] + pc_names)

    # Explained-variance ratio of every component and its running total.
    var_ratio = pca.explained_variance_ratio_
    cumulative_var_ratio = np.cumsum(var_ratio).tolist()
    df_variance_ratio = pl.DataFrame({'pc': pcs,
                                      'var_ratio': var_ratio,
                                      'cumulative_var_ratio': cumulative_var_ratio})

    return [df_variance_ratio, df_pcs, df_loadings]


# Plot the PCA results.
def plot_pca_results(label_col_name,
                     fig_save_dir,
                     df_variance_ratio,
                     df_pcs,
                     df_loadings,
                     list_pc_score_colors,
                     plot_fig=False):
    """Save a variance-ratio plot and one score/loading figure per component pair.

    Args:
        label_col_name: name of the label column of ``df_pcs``.
        fig_save_dir: output path prefix (created if missing); file names are
            appended by string concatenation.
        df_variance_ratio: frame with the columns 'pc', 'var_ratio' and
            'cumulative_var_ratio'.
        df_pcs: label column plus the score columns 'pc1', 'pc2', ...
        df_loadings: frame with the columns 'pc1', 'pc2', ... (one row per feature).
        list_pc_score_colors: one value per row of ``df_pcs``; each is turned
            into the matplotlib colour 'C<value>'. When empty, integer labels
            are mapped to a fixed colour list, otherwise a 'coolwarm' gradient
            over the row order is used.
        plot_fig: when True every figure is also shown with ``plt.show()``.

    Writes 'var_ratio.png' and 'pcs_and_loadings_pc<i>_<j>.png' files. In the
    loading panel each point is annotated with its 1-based row index.
    """
    n_pca = len(df_pcs.columns) - 1
    pcs = list(range(1, n_pca + 1))
    pc_names = ['pc' + str(x) for x in pcs]
    plt.rcParams['figure.facecolor'] = '#ffffff'

    os.makedirs(fig_save_dir, exist_ok=True)

    # Variance ratio per component and its cumulative sum.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(df_variance_ratio.select(['pc']).to_numpy(),
            df_variance_ratio.select(['var_ratio']).to_numpy())
    ax.plot(df_variance_ratio.select(['pc']).to_numpy(),
            df_variance_ratio.select(['cumulative_var_ratio']).to_numpy())
    ax.set_xticks(pcs)
    ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.set_xlim(0.8, pcs[-1] + 0.2)
    ax.set_ylim(0, 1)
    ax.set_xlabel('PC')
    ax.set_ylabel('Variance Ratio')
    fig.savefig(fig_save_dir + 'var_ratio.png')
    if plot_fig:
        # plt.plot() with no data does not display anything; show the figure.
        plt.show()
    plt.close(fig)

    # Common axis limit for the score plots. Use the largest absolute score of
    # any component so that no point falls outside the axes (previously only
    # pc1 was inspected).
    max_abs = max(df_pcs[name].abs().max() for name in pc_names) * 1.05

    pc_plot_colors = []
    if len(list_pc_score_colors) == 0:
        if isinstance(df_pcs[label_col_name][0], int):
            color_list = ['red', 'blue', 'green', 'purple', 'orange', 'pink', 'brown', 'yellow']
            # Wrap around instead of raising IndexError for labels >= len(color_list).
            pc_plot_colors = [color_list[x % len(color_list)]
                              for x in df_pcs[label_col_name].to_list()]

    # The colour names do not depend on the component pair, so build them once.
    list_pc_score_colors_r = ['C' + str(x) for x in list_pc_score_colors]

    for i, j in itertools.combinations(pcs, 2):
        # Score plot.
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
        x_scores = df_pcs.select(pc_names[i-1]).to_numpy()
        y_scores = df_pcs.select([pc_names[j-1]]).to_numpy()
        if len(list_pc_score_colors) > 0:
            ax1.scatter(x=x_scores, y=y_scores, c=list_pc_score_colors_r)
        elif len(pc_plot_colors) > 0:
            ax1.scatter(x=x_scores, y=y_scores, color=pc_plot_colors)
        else:
            colors = np.linspace(0, 1, len(df_pcs))
            ax1.scatter(x=x_scores, y=y_scores, c=colors, cmap='coolwarm')
        ax1.axhline(0, color='gray', linewidth=0.5)
        ax1.axvline(0, color='gray', linewidth=0.5)
        pc_score_scale = [x * max_abs * 1.05 for x in [-1, 1]]
        ax1.set_xlim(pc_score_scale)
        ax1.set_ylim(pc_score_scale)

        xy_labels = [f"PC{x} ({df_variance_ratio.get_column('var_ratio')[x-1]*100:.1f}%)" for x in [i, j]]
        ax1.set_xlabel(xy_labels[0])
        ax1.set_ylabel(xy_labels[1])

        # Loading plot.
        marker = '+'
        marker_size = 5
        marker_color = 'gray'

        # Flatten to 1-D so annotate() receives plain scalars.
        x_coors = df_loadings.select(pc_names[i-1]).to_numpy().ravel()
        y_coors = df_loadings.select(pc_names[j-1]).to_numpy().ravel()

        ax2.scatter(x=x_coors, y=y_coors, marker=marker, s=marker_size, c=marker_color)
        for k in range(len(x_coors)):
            ax2.annotate(str(k+1), (x_coors[k], y_coors[k]), ha='center', va='center')

        ax2.axhline(0, color='gray', linewidth=0.5)
        ax2.axvline(0, color='gray', linewidth=0.5)
        ticks = [-1, -0.5, 0, 0.5, 1]
        ax2.set_xticks(ticks)
        ax2.set_yticks(ticks)
        loading_scale = [-1.1, 1.1]
        ax2.set_xlim(loading_scale)
        ax2.set_ylim(loading_scale)
        ax2.set_xlabel(xy_labels[0])
        ax2.set_ylabel(xy_labels[1])

        fig.savefig(fig_save_dir + 'pcs_and_loadings_pc' + '_'.join([str(i), str(j)]) + '.png')
        if plot_fig:
            plt.show()
        # One figure is created per component pair; close it to free memory.
        plt.close(fig)

# Functions for batch processing follow.
# Concatenate the PCA results of every group and save them as CSV.
def union_all_group_name_pca_results(pdir_fct_set,
                                     group_name):
    """Merge the per-group 'pcs', 'loadings' and 'var_ratio' CSV files.

    The groups are the distinct values of ``group_name`` in ``fcts.csv``. For
    each of the three result kinds the per-group files are read, a leading
    ``group_name`` column is added, and the concatenation is written to
    ``pdir_fct_set + '<kind>.csv'``.

    A group is skipped with a message when its folder or its result file is
    missing. Previously only the folder was checked, so a folder without PCA
    output made ``read_csv`` fail.
    """
    print('union_all_group_name_pca_results')
    df = pl.read_csv(pdir_fct_set + 'fcts.csv')
    group_names = fetch_unique_values_of_df_column(df, group_name)
    pca_params = ['pcs', 'loadings', 'var_ratio']

    for pca_param in pca_params:
        df_all = pl.DataFrame({})
        for g_name in group_names:
            pdir_g_name = pdir_fct_set + g_name + '/'
            if not os.path.exists(pdir_g_name):
                print(pdir_g_name + 'is not exist.')
                continue

            csv_path = pdir_g_name + pca_param + '.csv'
            if not os.path.exists(csv_path):
                print(csv_path + ' is not exist.')
                continue

            sub_df = pl.read_csv(csv_path)
            col_names = sub_df.columns
            sub_df = sub_df.with_columns(pl.Series(name=group_name, values=[g_name] * len(sub_df)))
            sub_df = sub_df.select([group_name] + col_names)
            df_all = pl.concat([df_all, sub_df])

        df_all.write_csv(pdir_fct_set + pca_param + '.csv')


# Merge the PCA results of all feature sets.
def union_all_fct_set_pca_results(pdir,
                                  file_name,
                                  fct_sets):
    """Stack the 'pcs', 'loadings' and 'var_ratio' CSV files of every feature set.

    Each inner list of ``fct_sets`` is sorted in place. Every file gets a
    leading 'fct_columns_title' column (the feature names joined with '__').

    Returns:
        dict mapping 'pcs', 'loadings' and 'var_ratio' to the stacked DataFrame.
    """
    base_dir = pdir + file_name + '/'
    merged_by_param = {}

    for pca_param in ['pcs', 'loadings', 'var_ratio']:
        merged = pl.DataFrame({})

        for fct_columns in fct_sets:
            fct_columns.sort()
            title = '__'.join(fct_columns)
            part = pl.read_csv(base_dir + title + '/' + pca_param + '.csv')
            original_columns = part.columns
            title_column = pl.Series(name="fct_columns_title", values=[title] * len(part))
            part = part.with_columns(title_column).select(['fct_columns_title'] + original_columns)
            merged = pl.concat([merged, part])

        merged_by_param[pca_param] = merged

    return merged_by_param


# Batch-run the PCA for every feature set and every group.
def pca_batch_with_fct_sets_and_group_names(pdir,
                                            file_name,
                                            fct_sets,
                                            dict_fcts,
                                            group_name):
    """Run ``pca_analysis`` per group and feature set and write the results as CSV.

    Args:
        pdir: directory prefix of the input CSV (``pdir + file_name + '.csv'``).
        file_name: input file name without extension; also names the output folder.
        fct_sets: list of lists of feature column names (each is sorted in place).
        dict_fcts: per feature column a dict with the keys 'format' and 'desc'.
        group_name: column whose values define the groups.

    An existing 'fcts.csv' of a feature set is reused, otherwise it is built
    from the input CSV. Groups with ``th_pca`` (2) rows or fewer and groups
    that already have a 'pcs.csv' are skipped. Nothing is returned.
    """
    pdir_file_name = pdir + file_name + '/'
    # The label column is the 'name' column created by add_name_by_fcts_to_df.
    # It used to be read from an undefined global, which raised NameError
    # unless the calling cell happened to define `label_col_name`.
    label_col_name = 'name'
    th_pca = 2

    for fct_columns in fct_sets:
        fct_columns.sort()
        fct_columns_title = '__'.join(fct_columns)
        pdir_fct_set = pdir_file_name + fct_columns_title + '/'
        sort_desc_bools = [dict_fcts[col]['desc'] for col in fct_columns]
        name_formats = [dict_fcts[col]['format'] for col in fct_columns]
        fcts_file_name = pdir_fct_set + 'fcts.csv'

        if os.path.exists(fcts_file_name):
            df = pl.read_csv(fcts_file_name)
        else:
            df = pl.read_csv(pdir + file_name + '.csv')
            os.makedirs(pdir_file_name, exist_ok=True)
            os.makedirs(pdir_fct_set, exist_ok=True)
            df = df.select([group_name] + fct_columns)
            df = add_name_by_fcts_to_df(df, fct_columns, name_formats)
            df.write_csv(fcts_file_name)
        print(df)

        columns_for_analysis = [group_name, label_col_name] + fct_columns
        unique_df = df.unique(subset=columns_for_analysis).select(columns_for_analysis)
        unique_df = unique_df.sort(*([group_name] + fct_columns), descending=[False] + sort_desc_bools)

        # https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.dataframe.groupby.GroupBy.__iter__.html
        for g_name, sub_df in unique_df.select(columns_for_analysis).groupby(group_name):
            pdir_g_name = pdir_fct_set + g_name + '/'
            if len(sub_df) <= th_pca:
                print('skip PCA analysis len(sub_df) < ' + str(th_pca))
                continue
            if os.path.exists(pdir_g_name + 'pcs.csv'):
                print('skip ' + g_name)
                continue

            sub_df = sub_df.drop(group_name)
            print(g_name)
            print(len(sub_df))
            os.makedirs(pdir_g_name, exist_ok=True)
            df_variance_ratio, df_pcs, df_loadings = pca_analysis(sub_df, label_col_name)

            # Save the results of this group.
            df_pcs.write_csv(pdir_g_name + 'pcs.csv')
            df_loadings.write_csv(pdir_g_name + 'loadings.csv')
            df_variance_ratio.write_csv(pdir_g_name + 'var_ratio.csv')

        union_all_group_name_pca_results(pdir_fct_set, group_name)


# Plot the PCA results of every feature set and group in one batch.
def plot_pca_results_batch_with_fct_sets_and_group_names(pdir,
                                                         file_name,
                                                         group_name,
                                                         plot_fig=False):
    """Call ``plot_pca_results`` for every (feature set, group) pair in 'pcs.csv'.

    Reads 'var_ratio.csv', 'pcs.csv', 'loadings.csv' and, when present,
    'clusters.csv' from ``pdir + file_name + '/'``. Scores are coloured by the
    'rank_by_cluster_0_1' column of 'clusters.csv'. When that file is missing
    or holds no rows for a pair, the pair is plotted without cluster colours;
    previously both cases raised an exception.

    Raises:
        FileNotFoundError: if one of the three PCA result files is missing.
    """
    pdir_file_name = pdir + file_name + '/'
    rank_col = 'rank_by_cluster_0_1'
    pca_param_names = ['var_ratio', 'pcs', 'loadings']
    df_results = {}

    for param_name in pca_param_names + ['clusters']:
        file_dir = pdir_file_name + param_name + '.csv'
        if os.path.exists(file_dir):
            df = pl.read_csv(file_dir)
            if param_name == 'clusters':
                df = df.select(['fct_columns_title', group_name, 'name', rank_col]).unique()
            df_results[param_name] = df

    missing = [name for name in pca_param_names if name not in df_results]
    if missing:
        raise FileNotFoundError('missing PCA result file(s) in ' + pdir_file_name + ': '
                                + ', '.join(name + '.csv' for name in missing))

    df_fct_set_group_names = (df_results['pcs']
                              .select(['fct_columns_title', group_name])
                              .unique()
                              .sort(['fct_columns_title', group_name]))

    for fct_columns_title, g_name in df_fct_set_group_names.rows():

        def _subset(frame):
            """Keep the rows of this feature set and group and drop the two key columns."""
            return (frame
                    .filter(pl.col('fct_columns_title') == fct_columns_title)
                    .filter(pl.col(group_name) == g_name)
                    .drop(['fct_columns_title', group_name]))

        df_variance_ratio = _subset(df_results['var_ratio'])
        df_pcs = _subset(df_results['pcs'])
        df_loadings = _subset(df_results['loadings'])

        # Attach the cluster rank to every score row and order the rows by it,
        # so the colour list lines up with the rows of df_pcs.
        cluster_colors = []
        if 'clusters' in df_results:
            df_clusters = _subset(df_results['clusters']).select(['name', rank_col])
            df_joined = df_pcs.join(df_clusters, on=['name']).sort(rank_col)
            if len(df_joined) > 0:
                cluster_colors = df_joined.get_column(rank_col).to_list()
                df_pcs = df_joined.drop(rank_col)
            else:
                print('no cluster result for ' + fct_columns_title + ' / ' + str(g_name)
                      + '; plotting without cluster colours')

        pdir_fct_set = pdir_file_name + fct_columns_title + '/'
        pdir_g_name = pdir_fct_set + g_name + '/'
        fig_save_dir = pdir_g_name + 'images/'
        # Arguments are passed explicitly instead of relying on dict ordering.
        plot_pca_results('name',
                         fig_save_dir,
                         df_variance_ratio,
                         df_pcs,
                         df_loadings,
                         cluster_colors,
                         plot_fig)


# パラメータを定義

In [119]:
# Requested number of clusters per group (cluster_analysis lowers it when a group offers fewer).
cluster_num = 5
# Directory prefix of the input CSV; it is concatenated directly, so keep the trailing '/'.
pdir = '/content/drive/MyDrive/csv/'
# Input file name without '.csv'; results are written below pdir + file_name + '/'.
file_name = 'access__env_daily_20230429_test3'
# Column whose values define the groups that are analysed one by one.
group_name = 'environment_uuid'
# Per feature column: 'format' is the %-format used to build the 'name' column,
# 'desc' is the sort direction used when ordering the unique rows.
dict_fcts = {
    'ac': {'format': '%.0f', 'desc': True},
    'uu': {'format': '%.0f', 'desc': True},
    'norm': {'format': '%.1f', 'desc': True},
    'angle': {'format': '%.1f', 'desc': True},
    'log_ac': {'format': '%.2f', 'desc': True},
    'log_uu': {'format': '%.2f', 'desc': True},
    'log_norm': {'format': '%.2f', 'desc': True}
}
# Feature sets: each inner list names the columns that are analysed together.
# The batch functions sort these inner lists in place.
fct_sets = [['ac', 'uu'],
            ['norm', 'angle'],
            ['log_ac', 'log_uu'],
            ['log_norm', 'angle']]
# Passed to cluster_analysis as rm_col_names, which that function does not use.
drop_dim_columns = []
# When True, cluster_analysis and plot_pca_results also show their figures instead of only saving them.
plot_fig = False


# クラスター分析を実行

In [None]:
# Run the cluster analysis for every feature set in fct_sets and every group;
# the results are written as CSV/PNG below pdir + file_name + '/'.
cluster_analysis_batch_with_fct_sets_and_group_names(pdir,
                                                     file_name,
                                                     fct_sets,
                                                     dict_fcts,
                                                     group_name,
                                                     cluster_num,
                                                     drop_dim_columns,
                                                     plot_fig)

In [None]:
# Merge the results of all feature sets and save them as clusters.csv.
res_df = union_all_fct_set_cluster_analysis_results(pdir,
                                                    file_name,
                                                    fct_sets,
                                                    cluster_num)
print(res_df)
res_df.write_csv(pdir + file_name + '/' + 'clusters.csv')

# 主成分分析を実行

In [None]:
# Label column for the PCA step: 'name' is the column created by add_name_by_fcts_to_df.
label_col_name = 'name'
# Run the PCA for every feature set in fct_sets and every group.
pca_batch_with_fct_sets_and_group_names(pdir,
                                        file_name,
                                        fct_sets,
                                        dict_fcts,
                                        group_name)


In [None]:
# Merge the PCA results of all feature sets (one DataFrame per result kind).
df_pca_results = union_all_fct_set_pca_results(pdir,
                                               file_name,
                                               fct_sets)
print(df_pca_results)

# Save every merged DataFrame as <key>.csv below pdir + file_name + '/'.
for key, item in df_pca_results.items():
    item.write_csv(pdir + file_name + '/' + key + '.csv')

In [None]:
# Save the PCA figures for every feature set and group.
plot_pca_results_batch_with_fct_sets_and_group_names(pdir,
                                                     file_name,
                                                     group_name,
                                                     plot_fig)