## 本程序统计label（配位数和平均键长）的情况。

### 程序处理对象为“_untar_fealab”文件夹中各seed的label或feature文件，输出结果在“_untar_fealab_statistics”文件夹下，包含txt文档和图片。该文件夹下会有“seeds”文件夹，内有各seed的配位数统计柱状图和第一配位键长的统计图。

### 假设需要统计的文件的目录结构为“AuPd-4000_au150-0_WT_untar”（一级目录）——“500”（二级目录）——“1/2/3/4等解压缩的文件夹”（三级目录），将本程序安放在与一级目录平行的另一个一级目录（或一级目录本身）的任意一个二级文件目录内，即可运行。

last update： 2022.5.27

contacts: zhaohf@ihep.ac.cn

# 配置环境

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

输出导入模块的版本。

In [2]:
# Report the interpreter and library versions used for this run.
for _pkg_name, _pkg_version in (
    ('python', sys.version),
    ('numpy', np.__version__),
    ('panda', pd.__version__),
    ('matplotlib', mpl.__version__),
):
    print(f'{_pkg_name} version:', _pkg_version)

python version: 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
numpy version: 1.21.5
panda version: 1.4.2
matplotlib version: 3.5.1


# 参数设置

## 设置输入和输出文件夹名

In [3]:
# Name of the source directory holding the per-seed label/feature folders
dir_src = 'AuPd-4000_au150-0_DW_R0_cncr_WT_untar_fealab'
# Suffix appended to the source name to build the output directory name
stat = '_statistics'
# Output directory name
dir_work = ''.join((dir_src, stat))
# Name of the sub-directory of the output directory that receives the per-seed results
dir_seed_work = 'seeds'

## 设置需要统计的对象

In [4]:
# label_stat maps each label name to a dictionary of settings:
#   'type'   : 'c' (continuous) or 'd' (dispersed, i.e. discrete)
#   'step'   : bin width used to count a continuous label
#   'xlabel' : x-axis label of the plot
#   'ylabel' : y-axis label of the plot
#   'title'  : title of the plot
# For example, 'type': 'c' with 'step': 0.05 means that the label is counted in bins of width 0.05;
# if min of cr1 is 2.32, then the count range will be [2.30,2.35),[2.35,2.40)...
# The x label is a raw string so that the LaTeX command '\AA' is not parsed as a
# (invalid) string escape sequence.
label_stat = {'cr1'  :  {'type'  : 'c'
                        ,'step'  : 0.1
                        ,'xlabel': r'Coordinate bond length($\AA$)'
                        ,'ylabel': 'Number'
                        ,'title' : 'CR statistics'
                        }
#             ,'cn1'  :  {'type'  : 'd'
#                        ,'step'  : 1
#                        ,'xlabel': 'Coordinate number'
#                        ,'ylabel': 'Number'
#                        ,'title' : 'CN statistics'
#                        }
             }

## 设置统计原始还是已经经过pick的label

In [5]:
# pick_count = False : count all labels
#            = True  : count only the labels that were picked, i.e. skip the samples
#                      listed in the check file of each seed (see dir_check / file_checknum)
pick_count = True
# Only used when pick_count = True: name of the per-seed directory and of the file
# inside it that lists the sample numbers to be left out of the count
dir_check = 'check'
file_checknum = 'check_num.txt'
# Tag appended to the output file names when pick_count = True ('pick' is suggested);
# it is not used when pick_count = False
prf_pick = 'pick'

## 其他一些参数

In [6]:
# Small tolerance added before truncating floats to integers (sample numbers, bin indices)
trivial = 0.0001

# 部分函数模块

## 设置绘图格式和保存图片的模块

注意这里的save_fig与神经网络中的有所不同，特别是路径和文件名参数的位置互换了。

In [7]:
# To plot pretty figures
# The IPython magic `%matplotlib inline` makes the figures produced with matplotlib.pyplot
# (when plot() is called or a figure canvas is created) appear directly in the notebook output.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)   # mpl.rc sets default settings of a group; here the 'labelsize' of 'axes'
mpl.rc('xtick', labelsize=12)  # default 'labelsize' of the 'xtick' group
mpl.rc('ytick', labelsize=12)  # default 'labelsize' of the 'ytick' group

def save_fig(IMAGES_PATH, fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """
    Save the current matplotlib figure as '<IMAGES_PATH>/<fig_id>.<fig_extension>'.
    Input :
        IMAGES_PATH   :   directory of the figure file, created if missing
                          String
        fig_id        :   file name without suffix
                          String
        tight_layout  :   whether plt.tight_layout() is applied before saving
                          Boolean
        fig_extension :   file suffix, also passed as the output format
                          String
        resolution    :   dpi passed to plt.savefig
                          numeral
    """
    os.makedirs(IMAGES_PATH, exist_ok=True)
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
# Only warnings whose message starts with "internal gelsd" are silenced
warnings.filterwarnings(action="ignore", message="^internal gelsd")

## 扩展字典

In [8]:
# 在间隔为step的网格上进行补0的字典补足。
def dict_expand(dict0, dc, step, type='all'):
    """
    Pad the count dictionary 'dict0' with zero-valued keys located on a regular grid.

    For dc = 'c' (continuous) the grid spacing is 'step'. For dc = 'd' (dispersed)
    the argument 'step' is ignored and the smallest distance between two adjacent
    keys of dict0 is used instead. Grid points are generated from the smallest
    key up to (not including) the largest key and rounded to the number of
    decimals of the step.

    Input :
        dict0   :   dictionary to be treated, keys are numbers
                    dictionary
        dc      :   property of label(key in dict0)
                    String
                    'c' for continuous
                    'd' for dispersed
                    else an error is printed and a copy of dict0 is returned
        step    :   the interval for the two adjacent label indices (dc = 'c' only)
                    numeral
        type    :   type for dictionary expansion
                    String
                    'all'  : add every missing grid point
                    'adj0' : add a missing grid point only if the grid point
                             just below it (one step to the left) is a key of dict0
                    else   : nothing is added
    Return  :   new dictionary holding the added keys (value 0) followed by the
                items of dict0; dict0 itself is not modified.
                A dictionary with fewer than two keys is returned as a copy,
                since there is nothing to fill in between.
    """
#
    trivial = 0.0001
#
    if dc != 'c' and dc != 'd':
        print()
        print(f"** Error!! label type is {dc}, Not continuous nor Dispersed!")
        return dict(dict0)
#
    keys = sorted(dict0.keys())
    if len(keys) < 2:
        return dict(dict0)
#
    if dc == 'd':
        # most sparse grid that can hold all keys: smallest gap between neighbours
        step = min(upper - lower for lower, upper in zip(keys, keys[1:]))
#
    # number of decimals of the step, used to round the generated grid points
    step_split = str(step).split(".")
    digit = len(step_split[1]) if len(step_split) == 2 else 0
#
    new = {}
    span = (keys[-1] - keys[0])/step
    if abs(span - int(span+trivial)) > trivial:
        print()
        print(f"** Error!! The max and min of keys in dictionary '{dict0}' don't satisfy the definition of step = {step}!")
#
    n_step = int(span+trivial) + 1
    n_len = len(keys)
    if n_len < n_step:
        known = set(keys)
        for c in np.arange(keys[0], keys[-1], step):
            c = round(c, digit)
            if c in known:
                continue
            if type == 'all':
                new[c] = 0
            elif type == 'adj0':
                # The left neighbour must be rounded as well: e.g. 0.3 - 0.1 gives
                # 0.19999999999999998, which would never match the key 0.2.
                if round(c - step, digit) in known:
                    new[c] = 0
    elif n_len > n_step:
        print()
        print(f"** Error!! N of items in dictionary '{dict0}' outranges the space that its key allows!")
    new.update(dict0)
    return new

## 字典提取key列表

In [9]:
# 将字典按照key的顺序排序，返回key和对应于key的value列表
def sort_dict2lists(dict0, reverse=False):
    """
    Split the dictionary 'dict0' into two parallel lists ordered by key.
    Input :
        dict0   :   dictionary with sortable keys
        reverse :   False for ascending keys, True for descending keys
    Return  :   (list of sorted keys, list of the values belonging to those keys)
    """
    list_key = list(dict0.keys())
    list_key.sort(reverse=reverse)
    list_value = [dict0[key] for key in list_key]
    return list_key, list_value

## 文件命名

In [10]:
def file_make(label, dir_seed, dc, step, pick_count, prf_pick, seedyn='y'):
    """
    Build the name (no suffix) of the output file of a label.

    The name is '<dir_seed>_<label>' when seedyn is 'y' and '<label>' otherwise,
    followed by '_step=<step>' for a continuous label (dc = 'c') and by
    '_<prf_pick>' when pick_count is true.
    """
    seed_part = f"{dir_seed}_" if seedyn == 'y' else ""
    step_part = f"_step={step}" if dc == 'c' else ""
    pick_part = f"_{prf_pick}" if pick_count else ""
    return seed_part + label + step_part + pick_part

## 离散点绘图

In [11]:
# 绘制各个cluster的CN统计图
def plot_cn(x, y, path, file, xlabel='Coordinate number', ylabel='Number', title='CN statistics'):
    """
    Draw the bar chart of a dispersed label (e.g. the coordination number CN) and save it.

    The bars are drawn at the consecutive positions 1..len(x) and labelled with
    the values of 'x'; the count is written above every bar.
    Input :
        x      : label values, one per bar
        y      : number of samples for each value of 'x'
        path   : directory in which the figure is saved
        file   : figure name without suffix, also used as legend entry
        xlabel, ylabel, title : axis labels and title of the figure
    """
    fig = plt.figure(1)
    ax1 = fig.add_subplot(1, 1, 1)
#
    counts = np.array(y)
    y_max = max(counts)
    n_bars = len(x)
    positions = np.linspace(1, n_bars, n_bars)
#
    # the more bars, the smaller (and rotated) the tick and count texts
    if n_bars <= 15:
        fontsize, rotation = 10, 0
    elif n_bars <= 30:
        fontsize, rotation = 8, 45
    else:
        fontsize, rotation = 5, 45
#
    ax1.bar(positions, counts, width=0.8, alpha=0.7, color='blue', label=file)
    for pos, count in zip(positions, counts):
        plt.text(pos, count+0.05, '%.0f' % count, ha='center', va='bottom', fontsize=fontsize)
#
    plt.legend()
    plt.tight_layout()
#
    ax1.set_xticks(positions)
    ax1.set_xticklabels(x, rotation=rotation, fontsize=fontsize)
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ax1.set_title(title)
    # leave head room for the count texts above the bars
    ax1.set_ylim(0, y_max*1.2)
#
    save_fig(path, file)
    plt.close(fig)

## 连续点绘图

In [12]:
# 绘制各个cluster的CR统计图
def plot_cr(x, y, path, file, delta, xlabel=r'Coordinate bond length($\AA$)', ylabel='Number', title='CR statistics'):
    """
    Draw the histogram of a continuous label (e.g. the bond length CR) and save it.

    Bar i stands for the bin [x[i], x[i]+delta). The bars are drawn side by side
    at the consecutive positions 1..len(x), so bins that are absent from 'x'
    do not leave a gap. The count is written above every bar.
    Input :
        x      : lower edges of the bins, sorted
        y      : number of samples in each bin of 'x'
        path   : directory in which the figure is saved
        file   : figure name without suffix, also used as legend entry
        delta  : bin width; its number of decimals is used to round the
                 upper edge of the last bin shown as last tick label
        xlabel, ylabel, title : axis labels and title of the figure
    Return : None. Nothing is drawn (a warning is printed) when 'x' is empty.
    """
    len_x = len(x)
    if len_x == 0:
        print()
        print(f"** Warning! Nothing to plot for '{file}', figure skipped.")
        return
#
    fig = plt.figure()
    ax1 = fig.add_subplot(1,1,1)
#
# n0, n1: numbers of bars above which the texts get smaller and rotated
    n0 = 13
    n1 = 26
#
    x_index = np.array(x)
    y_index = np.array(y)
    y_max = max(y_index)
#
    # number of decimals of delta; an integer step such as 1 has none
    delta_split = str(delta).split(".")
    digit = len(delta_split[1]) if len(delta_split) == 2 else 0
#
    ind = np.linspace(1,len_x,len_x)
#
    if len_x <= n0:
        fontsize = 10
        rotation = 0
    elif len_x <= n1:
        fontsize = 8
        rotation = 45
    else:
        fontsize = 5
        rotation = 45
#
    bar_width = 1 # bars touch each other, each one spans a whole bin
#
    ax1.bar(ind, y_index, width=bar_width,align='edge',alpha=0.5, color='yellow',edgecolor='red',label=file)
    for a,b in zip(ind,y_index):
        # the bar starts at 'a' (align='edge'), so its centre is at a+0.5
        ax1.text(a+0.5, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=fontsize)
#
    ax1.legend()
    plt.tight_layout()
#
    # one tick per bin edge: the lower edges plus the upper edge of the last bin
    ax1.set_xticks(np.append(ind,len_x+1))
    xx = list(np.append(x_index,round(x[-1]+delta,digit)))
    ax1.set_xticklabels(xx,rotation=rotation,fontsize=fontsize)
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ax1.set_title(title)
    # head room for the count texts; avoid a zero-height axis when all counts are 0
    ax1.set_ylim(0, y_max*1.2 if y_max > 0 else 1)
#
    save_fig(path,file)
    plt.close(fig)

## label绘图

In [13]:
def plot_label(label,label_dict,label_stat_info,path,file):
    """
    Plot the distribution of the label values counted in label_dict.
    input:
        label           :   label to be counted (not used for the plot itself)
                            String
        label_dict      :   dictionary to be ploted, {label value: count}
                            Dictionary
        label_stat_info :   dictionary of settings for label 'label' with the keys
                            'type' ('c'/'C' continuous, 'd'/'D' dispersed),
                            'xlabel', 'ylabel', 'title' and, for a continuous
                            label, 'step'
                            Dictionary
        path            :   the directory path of plot-figure file
                            String
        file            :   the name of plot-figure, without suffix
                            String
    Return  :   None; the figure is written by plot_cr (continuous) or
                plot_cn (dispersed). An error is printed for any other type.
    """
    x,y = sort_dict2lists(label_dict)
    dc = label_stat_info['type']
    xlabel = label_stat_info['xlabel']
    ylabel = label_stat_info['ylabel']
    title = label_stat_info['title']
    # the counting functions accept upper-case type letters, so accept them here too
    kind = dc.lower() if isinstance(dc, str) else dc
    if kind == 'c':
        # the step is only needed (and only read) for a continuous label
        step = label_stat_info['step']
        plot_cr(x, y, path, file, step, xlabel=xlabel, ylabel=ylabel, title=title)
    elif kind == 'd':
        plot_cn(x, y, path, file, xlabel=xlabel, ylabel=ylabel, title=title)
    else:
        print()
        print(f"** Error!! label type is {dc}, Not continuous nor Dispersed! 2")

## label统计

In [14]:
def _check_numbers(path_file_checknum, trivial):
    """Return the sample numbers of the first column of the check file as integer strings."""
    try:
        number_pd = pd.read_csv(path_file_checknum, header=None)
    except pd.errors.EmptyDataError:
        # an empty check file simply excludes nothing
        return []
    # 'trivial' guards against values such as 4.99999 being truncated to 4
    return [str(int(abs(number) + trivial)) for number in number_pd.iloc[:, 0]]


def _bin_lower_edge(value, step, trivial, digit):
    """Return the lower edge of the bin of width 'step' that contains 'value'."""
    # floor (not int) so that negative values are binned like positive ones;
    # 'trivial' keeps a value lying on a bin edge inside the bin starting there
    return round(int(np.floor((value + trivial) / step)) * step, digit)


def label_count(label, label_stat_info,
                list_seed, path_src,
                pick_count, dir_check, file_checknum, trivial
               ):
    """
    Count the values of one label over all seeds.

    For every seed the files of the directory '<path_src>/<seed>/<label>' are
    read with pandas and the first value of each file is counted. A dispersed
    label is counted by value; a continuous label is counted by the lower edge
    of its bin of width label_stat_info['step'].
    Input :
        label           :   label name, also the name of the label directory
                            String
        label_stat_info :   settings of the label, keys 'type' ('c'/'C' or 'd'/'D')
                            and, for a continuous label, 'step' (> 0)
                            Dictionary
        list_seed       :   names of the seed directories
                            List of String
        path_src        :   directory containing the seed directories
                            String
        pick_count      :   if true, the files '<label>_<number><suffix>' whose
                            number is listed in '<seed>/<dir_check>/<file_checknum>'
                            are left out of the count
                            Boolean
        dir_check       :   name of the check directory of a seed
        file_checknum   :   name of the file listing the numbers to leave out
        trivial         :   small tolerance used when truncating to integers
    Return :
        label_list_dict_seed :  one {value: count} dictionary per seed, in the
                                order of list_seed (empty if the seed has no
                                label directory or no label file)
        label_dict           :  {value: count} summed over all seeds
    Exits through sys.exit() when the type is unknown or the step is not positive.
    """
#
    dc = label_stat_info['type']
    if dc == 'd' or dc == 'D':
        dc = 'd'
        step = trivial
    elif dc == 'c' or dc == 'C':
        dc = 'c'
        step = label_stat_info['step']
        if step <= 0:
            print()
            print(f"** step = {step} <= 0, WRONG!")
            sys.exit()
    else:
        print()
        print(f"** Error!! label type is {dc}, Not continuous nor Dispersed! 4")
        # no step can be defined for an unknown type
        sys.exit()
#
    # number of decimals of the step, used to round the bin edges
    step_split = str(step).split(".")
    digit = len(step_split[1]) if len(step_split) == 2 else 0
#
    label_dict = {}
    label_list_dict_seed = []
# loop over the seeds, i.e. 100, 200, ...
    for dir_seed in list_seed:
        path_dir_seed = os.path.join(path_src, dir_seed)
        path_dir_label = os.path.join(path_dir_seed, label)
        label_dict_seed = {}
        label_list_dict_seed.append(label_dict_seed)
        if not os.path.exists(path_dir_label):
            print()
            print(f"** Error!! No directory '{label}' found in seed {dir_seed}")
            print()
            continue
# label files of this seed
        list_label = os.listdir(path_dir_label)
# leave out the labels listed in the check directory
        if pick_count:
            path_file_checknum = os.path.join(path_dir_seed, dir_check, file_checknum)
            if os.path.exists(path_file_checknum):
                # all label files are assumed to share the suffix of the first one
                suffix = os.path.splitext(list_label[0])[1] if list_label else ''
                excluded = {label + '_' + number + suffix
                            for number in _check_numbers(path_file_checknum, trivial)}
                missing = sorted(excluded.difference(list_label))
                if missing:
                    print()
                    print(f"** Warning! Checked label file(s) {missing} NOT found in seed {dir_seed}, ignored.")
                list_label = [name for name in list_label if name not in excluded]
            else:
                print()
                print(f"** Warning! File '{file_checknum}' in seed {dir_seed} for label '{label}' NOT found!")
                print(f"** All the label will be counted!")
# read the label files label_N.dat of the label directory
        for labelN in list_label:
            path_file_labelN = os.path.join(path_dir_label, labelN)
            a = pd.read_csv(path_file_labelN, header=None).iloc[0, 0]
            if dc == 'd':
                labelN_data = a
            else:
                labelN_data = _bin_lower_edge(a, step, trivial, digit)
            label_dict_seed[labelN_data] = label_dict_seed.get(labelN_data, 0) + 1
            label_dict[labelN_data] = label_dict.get(labelN_data, 0) + 1
    return label_list_dict_seed, label_dict

## label统计绘图

In [15]:
def label_count_plot(label, label_stat_info,
                     label_list_dict_seed, label_dict,
                     list_seed,
                     pick_count, prf_pick, trivial,
                     path_dir_seed_work, path_work):
    """
    Plot the counted label: one figure per seed and one figure for all seeds.

    Every dictionary is first padded by dict_expand(..., type='adj0'), named by
    file_make and drawn by plot_label.
    Input :
        label                :   label name
        label_stat_info      :   settings of the label ('type', 'step', plot texts)
        label_list_dict_seed :   {value: count} dictionaries, one per seed
        label_dict           :   {value: count} dictionary over all seeds
        list_seed            :   seed names, in the order of label_list_dict_seed
        pick_count, prf_pick :   passed to file_make to tag the file names
        trivial              :   step handed to dict_expand for a dispersed label
        path_dir_seed_work   :   directory of the per-seed figures
        path_work            :   directory of the overall figure
    Return : None. Seeds (or the overall count) without any counted value are
             skipped with a warning, since there is nothing to draw.
    Exits through sys.exit() when the type is unknown or the step is not positive.
    """
#
    dc = label_stat_info['type']
    if dc == 'd' or dc == 'D':
        dc = 'd'
        step = trivial
    elif dc == 'c' or dc == 'C':
        dc = 'c'
        step = label_stat_info['step']
        if step <= 0:
            print()
            print(f"** step = {step} <= 0, WRONG!")
            sys.exit()
    else:
        print()
        print(f"** Error!! label type is {dc}, Not continuous nor Dispersed! 5")
        # no step can be defined for an unknown type
        sys.exit()
# per-seed figures
    for iseed, dir_seed in enumerate(list_seed):
        label_dict_seed = label_list_dict_seed[iseed]
        if not label_dict_seed:
            print()
            print(f"** Warning! No label '{label}' counted in seed {dir_seed}, plot skipped.")
            continue
        label_dict_seed_exp_adj0 = dict_expand(label_dict_seed, dc, step, type='adj0')
        file_plot_seed = file_make(label,dir_seed, dc, step, pick_count, prf_pick, seedyn = 'y')
        plot_label(label, label_dict_seed_exp_adj0, label_stat_info, path_dir_seed_work, file_plot_seed)
# figure over all seeds
    if not label_dict:
        print()
        print(f"** Warning! No label '{label}' counted in any seed, plot skipped.")
        return
    label_dict_exp_adj0 = dict_expand(label_dict, dc, step, type='adj0')
    file_plot = file_make(label,'any', dc, step, pick_count, prf_pick, seedyn = 'n')
    plot_label(label, label_dict_exp_adj0, label_stat_info, path_work, file_plot)
    return

## label统计输出为文件

In [16]:
def label_count_writeout(label, label_stat_info,
                         label_list_dict_seed, label_dict,
                         list_seed,
                         pick_count, prf_pick,
                         path_work):
    """
    Print the counted label as a table and write the same table to a txt file.

    The table has one column per counted value (dispersed label) or per bin
    '[low,high)' (continuous label) and one row per seed, followed by the rows
    'Sum' (count per column over all seeds) and 'Sum_all' (total count). The
    header is repeated below the table. The row of a seed stops at its largest
    counted value; values it does not contain are left blank.
    The file is '<path_work>/<name>.txt' with <name> given by file_make.
    Input :
        label                :   label name
        label_stat_info      :   settings of the label, keys 'type' and, for a
                                 continuous label, 'step' (> 0)
        label_list_dict_seed :   {value: count} dictionaries, one per seed
        label_dict           :   {value: count} dictionary over all seeds
        list_seed            :   seed names, in the order of label_list_dict_seed
        pick_count, prf_pick :   passed to file_make to tag the file name
        path_work            :   directory of the txt file
    Return : None. Nothing is written (a warning is printed) if label_dict is empty.
    Exits through sys.exit() when the type is unknown or the step is not positive.
    """
#
    x,y = sort_dict2lists(label_dict)
    if not x:
        print()
        print(f"** Warning! No label '{label}' counted, nothing is written.")
        return
#
    dc = label_stat_info['type']
    if dc == 'd' or dc == 'D':
        dc = 'd'
        if len(x) > 1:
            step = min(upper - lower for lower, upper in zip(x, x[1:]))
        else:
            # A single value has no spacing. The step only fixes the number of
            # decimals printed in the header, so the value itself is used.
            step = x[0]
    elif dc == 'c' or dc == 'C':
        dc = 'c'
        step = label_stat_info['step']
        if step <= 0:
            print()
            print(f"** step = {step} <= 0, WRONG!")
            sys.exit()
    else:
        print()
        print(f"** Error!! label type is {dc}, Not continuous nor Dispersed! 6")
        # no step can be defined for an unknown type
        sys.exit()
#
    # number of decimals used to print the label values
    step_split = str(step).split(".")
    digit = len(step_split[1]) if len(step_split) == 2 else 0
#
    file_name = file_make(label,'anything', dc, step, pick_count, prf_pick, seedyn = 'n')
    path_file_write = os.path.join(path_work, f"{file_name}.txt")
#
# width of the first (index) column
    index_cluster = 'Cluster'
    index_sum = 'Sum'
    index_sum_all = 'Sum_all'
    n_ind = max(len(index_cluster),len(index_sum),len(index_sum_all))
    for seed in list_seed:
        n_ind = max(n_ind, len(str(seed)))
#
# width of a label value / count
    n_data = max(len(str(a)) for a in x+y)
    ndddf = f"{n_data}.{digit}f"
#
# header cells and column width
    if dc == 'd':
        # Cluster    1    2    3    4
        width = n_data
        header_cells = [f"{a:>{ndddf}}" for a in x]
    else:
        # Cluster [2.4,2.5) [2.5,2.6) [2.6,2.7)
        width = 3+2*n_data
        header_cells = []
        for a in x:
            b = round(a+step,digit)
            header_cells.append(f"[{a:>{ndddf}},{b:>{ndddf}})")
    header = f"{index_cluster:>{n_ind}}" + ' ' + ' '.join(header_cells)
    rule = '-'*n_ind + '-' + '-'.join(['-'*width]*len(x))
#
    lines = [header, rule]
# major part: one row per seed
    for j, dir_seed in enumerate(list_seed):
        label_dict_seed = label_list_dict_seed[j]
        cells = []
        if label_dict_seed:
            # the row ends at the largest value counted for this seed
            seed_max = max(label_dict_seed.keys())
            for b in x:
                if b > seed_max:
                    break
                if b in label_dict_seed:
                    cells.append(f"{label_dict_seed[b]:>{width}d}")
                else:
                    cells.append(' '*width)
        lines.append(f"{dir_seed:>{n_ind}}" + ' ' + ' '.join(cells))
# sum part
    lines.append(f"{index_sum:>{n_ind}}" + ' ' + ' '.join(f"{c:>{width}d}" for c in y))
# sum_all part
    lines.append(f"{index_sum_all:>{n_ind}}" + ' ' + f"{sum(y)}")
# rule and header again below the table
    lines.append(rule)
    lines.append(header)
#
    text = '\n'.join(lines) + '\n'
    print(text, end='')
    # the with-block closes the file even if writing fails
    with open(path_file_write,'w') as fout:
        fout.write(text)
#    fout1.close()

# 程序正文

## 当前目录

In [17]:
# Absolute path of the current working directory; the bare name displays it in the notebook
path = os.path.abspath('./')
path

'D:\\haifeng\\work_20211031\\XAS-ML\\code\\datasets setup\\jupyterbook'

In [18]:
# Absolute path of the parent of the current working directory
path_par = os.path.abspath(os.path.pardir)
path_par

'D:\\haifeng\\work_20211031\\XAS-ML\\code\\datasets setup'

## 源文件夹目录

In [19]:
# Path of the source directory
path_src = os.path.join(path_par, dir_src)
# List the entries of the source directory if it exists. Otherwise report the
# error and fall back to an empty list, so that list_seed is always defined
# for the following cells.
exist = os.path.exists(path_src)
if exist:
    list_seed = os.listdir(path_src)
else:
    list_seed = []
    print()
    print('** Error!! WT untared directory', dir_src, 'is not found!' )
    print()
#    sys.exit()
list_seed

['100',
 '1000',
 '1100',
 '1200',
 '1300',
 '1400',
 '1500',
 '1600',
 '1700',
 '1800',
 '1900',
 '200',
 '2000',
 '2100',
 '2200',
 '2300',
 '2400',
 '2500',
 '2600',
 '2700',
 '2800',
 '2900',
 '300',
 '3000',
 '3100',
 '3200',
 '3300',
 '3400',
 '3500',
 '3600',
 '3700',
 '3800',
 '3900',
 '400',
 '4000',
 '500',
 '600',
 '700',
 '800',
 '900',
 'modify.log',
 'pick.log']

In [20]:
# Drop the entries of list_seed that are plain files (e.g. log files), in place
list_seed[:] = [
    entry for entry in list_seed
    if not os.path.isfile(os.path.join(path_src, entry))
]

## 输出文件夹路径

In [21]:
# Output directory, located in the same parent directory as the source directory;
# created if it does not exist yet
path_work = os.path.join(path_par, dir_work)
os.makedirs(path_work, exist_ok=True)
path_work

'D:\\haifeng\\work_20211031\\XAS-ML\\code\\datasets setup\\AuPd-4000_au150-0_DW_R0_cncr_WT_untar_fealab_statistics'

In [22]:
# Sub-directory of the output directory for the per-seed results; created if missing
path_dir_seed_work = os.path.join(path_work, dir_seed_work)
os.makedirs(path_dir_seed_work, exist_ok=True)
path_dir_seed_work

'D:\\haifeng\\work_20211031\\XAS-ML\\code\\datasets setup\\AuPd-4000_au150-0_DW_R0_cncr_WT_untar_fealab_statistics\\seeds'

## 开始统计label分布。

a = [[1,2,3]]
b = [[4,5,6]]
a+b = [[1, 2, 3], [4, 5, 6]]

a = [], b = [1,2] ,a.append(b) = [[1, 2]]

a = [], b = [1,2], a+b = [1, 2]

int(2.6) = 2

int(-2.6) =int(-2.2)= -2

In [23]:
for label, label_stat_info in label_stat.items():
    # 1) count the label per seed and over all seeds
    label_list_dict_seed, label_dict = label_count(
        label, label_stat_info,
        list_seed, path_src,
        pick_count, dir_check, file_checknum, trivial,
    )
    # 2) plot the counts
    label_count_plot(
        label, label_stat_info,
        label_list_dict_seed, label_dict,
        list_seed,
        pick_count, prf_pick, trivial,
        path_dir_seed_work, path_work,
    )
    # 3) write the counts to a txt file
    label_count_writeout(
        label, label_stat_info,
        label_list_dict_seed, label_dict,
        list_seed,
        pick_count, prf_pick,
        path_work,
    )


** All the label will be counted!

** All the label will be counted!

** All the label will be counted!

** All the label will be counted!

** All the label will be counted!

** All the label will be counted!

** All the label will be counted!

** All the label will be counted!
Saving figure 100_cr1_step=0.1_pick
Saving figure 1000_cr1_step=0.1_pick
Saving figure 1100_cr1_step=0.1_pick
Saving figure 1200_cr1_step=0.1_pick
Saving figure 1300_cr1_step=0.1_pick
Saving figure 1400_cr1_step=0.1_pick
Saving figure 1500_cr1_step=0.1_pick
Saving figure 1600_cr1_step=0.1_pick
Saving figure 1700_cr1_step=0.1_pick
Saving figure 1800_cr1_step=0.1_pick
Saving figure 1900_cr1_step=0.1_pick
Saving figure 200_cr1_step=0.1_pick
Saving figure 2000_cr1_step=0.1_pick
Saving figure 2100_cr1_step=0.1_pick
Saving figure 2200_cr1_step=0.1_pick
Saving figure 2300_cr1_step=0.1_pick
Saving figure 2400_cr1_step=0.1_pick
Saving figure 2500_cr1_step=0.1_pick
Saving figure 2600_cr1_step=0.1_pick
Saving figure 2700_

#### CN

# Count and plot every label (no txt output in this cell).
# The counting and plotting steps that used to be written out inline here are
# the bodies of label_count and label_count_plot, so this cell calls them
# instead of keeping a second copy of the same code.
for label in label_stat.keys():
    label_stat_info = label_stat[label]
#  label count statistics
    label_list_dict_seed, label_dict = label_count(label, label_stat_info,
                list_seed, path_src,
                pick_count, dir_check, file_checknum, trivial)
#  label statistics plot: one figure per seed, then one over all seeds
    label_count_plot(label, label_stat_info,
                     label_list_dict_seed, label_dict,
                     list_seed,
                     pick_count, prf_pick, trivial,
                     path_dir_seed_work, path_work)
#end for label in label_stat.keys()