## 本程序对提取出的非等效位点进行三成分线性组合，给出组合后的chi谱、配位数文件和信息文件

### 默认已经提取出非等效位点的文件夹为“cu_crystal_different_xyz_WT_untar_symmetry”（一级目录，即源文件夹），该文件夹可由程序“symmetry_check.ipynb”运行获得。将本程序放置在与该一级目录同级（并列）的任意一个文件夹中即可运行（不要放置在“cu_crystal_different_xyz_WT_untar_symmetry”中）。

### 程序会读取文件夹“cu_crystal_different_xyz_WT_untar_symmetry”下所有格点的“chi_FEFF.dat”和“coor.dat”文件，运行后生成“cu_crystal_different_xyz_WT_untar_symmetry_comb”文件夹（一级目录，即目标文件夹），该文件夹下有“chi_FEFF”、“coor”和“info”三个文件夹，分别对应组合后体系的chi谱、结构文件和信息文件（即由哪三个结构组合而来）。

last update: 2022.5.21

修复了源文件夹（symmetry文件夹）中存在普通文件（而非文件夹）时程序出错的小问题

contacts: zhaohf@ihep.ac.cn

# 配置环境

## 导入各种模块

In [1]:
import os
import sys
import pandas as pd
import numpy as np

输出导入模块的版本。

In [2]:
# Report the interpreter and library versions used for this run.
print(f'python version: {sys.version}')
print(f'numpy version: {np.__version__}')
print(f'panda version: {pd.__version__}')

python version: 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
numpy version: 1.21.5
panda version: 1.4.2


# 设置输入文件夹和输出文件夹

## 设置源文件夹，即要处理的文件夹

In [3]:
# Name of the source folder, i.e. the folder that has already been screened
# for equivalent sites.
dir_src = 'cu_crystal_different_xyz_WT_untar_symmetry'

## 设置输出文件夹名

In [4]:
# Name of the output folder: the source folder name with the suffix '_comb'.
dir_work = dir_src + '_comb'

# 一些函数、类

## 删除某一目录下的所有文件或文件夹

In [5]:
def del_file(filepath):
    """
    Empty a directory while keeping the directory itself.

    Every entry directly inside ``filepath`` is removed. Real sub-directories
    are deleted recursively; everything else (regular files and symbolic
    links, including dangling links and links that point at a directory) is
    unlinked, so the target of a link is never touched.

    :param filepath: path of the directory to empty
    :return: None
    :raises OSError: if ``filepath`` cannot be listed or an entry cannot be
        removed
    """
    import shutil

    for name in os.listdir(filepath):
        entry_path = os.path.join(filepath, name)
        # shutil.rmtree refuses to operate on a symbolic link, so a link to a
        # directory has to go through os.remove like any other non-directory.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

## 查找文件

In [6]:
def pick_file(path_src, file):
    """
    Look for ``file`` in every second-level folder below ``path_src``.

    The expected layout is ``path_src/<seed>/<xyz>/<file>``. One entry is
    produced per ``<seed>/<xyz>`` pair; entries of ``path_src`` that are not
    directories are ignored.

    :param path_src: source folder; if it does not exist an error message is
        printed and ``sys.exit()`` is called
    :param file: name of the file to look for inside each ``<xyz>`` entry
    :return: three lists of equal length: the path of ``file`` (or the string
        ``'NaN'`` when it does not exist), the ``<seed>`` name and the
        ``<xyz>`` name of each entry
    """
    # A missing source folder is fatal: report it and stop.
    if not os.path.exists(path_src):
        print()
        print('** Error!! WT untared directory', path_src, 'is not found!' )
        print()
        sys.exit()

    list_path_file = []
    list_info_seed = []
    list_info_xyz = []
    for dir_seed in os.listdir(path_src):
        path_seed_src = os.path.join(path_src, dir_seed)
        # Plain files sitting next to the seed folders would break the
        # listing below, so only directories are descended into.
        if not os.path.isdir(path_seed_src):
            continue
        for dir_xyz in os.listdir(path_seed_src):
            path_file_src = os.path.join(path_seed_src, dir_xyz, file)
            # Keep one entry per site even when the file is missing, using
            # the placeholder 'NaN' as its path.
            found = os.path.exists(path_file_src)
            list_path_file.append(path_file_src if found else 'NaN')
            list_info_seed.append(dir_seed)
            list_info_xyz.append(dir_xyz)

    return list_path_file, list_info_seed, list_info_xyz

## 读取chi_FEFF.dat的数组

In [7]:
def read_chi_FEFF(path):
    """
    Read a whitespace-separated ``chi_FEFF.dat`` table into a NumPy array.

    The file is first parsed without a header line. If the first value of the
    first row equals 0, the file is parsed again with that first row consumed
    as the header, so that row is not part of the returned data.

    :param path: path of the file to read
    :return: 2-D ``numpy.ndarray`` holding all columns of the table
    """
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True option of pandas.read_csv.
    data_df = pd.read_csv(path, header=None, sep=r'\s+')
    if data_df.iloc[0, 0] == 0:
        # Re-read so that the leading row is taken as header and dropped.
        data_df = pd.read_csv(path, header=0, sep=r'\s+')
    return data_df.values

## 读取coor.dat文件中的数组

In [8]:
def read_coor(path_coor):
    """
    Read the numeric table of a ``coor.dat`` file.

    The first two lines of the file are skipped and the remaining
    whitespace-separated rows are returned unchanged.

    :param path_coor: path of the ``coor.dat`` file
    :return: 2-D ``numpy.ndarray`` with one row per remaining line of the file
    """
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True option of pandas.read_csv.
    coor = pd.read_csv(path_coor, sep=r'\s+', header=None, skiprows=[0, 1])
    return coor.values

# 程序正文

## 设置工作文件夹

In [9]:
# Absolute path of the current working directory
path = os.path.abspath('./')
path

'D:\\haifeng\\work_20211031\\XAS-ML\\code\\datasets setup\\jupyterbook'

In [10]:
# Absolute path of the parent of the current working directory
path_par = os.path.abspath(os.path.pardir)
path_par

'D:\\haifeng\\work_20211031\\XAS-ML\\code\\datasets setup'

In [11]:
# Build the path of the source folder (the folder already screened for
# equivalent sites) inside path_par.
path_src = os.path.join(path_par, dir_src)
exist = os.path.exists(path_src)
if not exist:
    # Only report the problem here; execution is not stopped.
    print()
    print('** Error!! WT untared directory', dir_src, 'is not found!' )
    print()
else:
    # Names of the top-level entries of the source folder.
    list_seed = os.listdir(path_src)
list_seed

['Cu-bulk',
 'Cu-fcc-22',
 'Cu-fcc-22-1',
 'Cu-fcc-23-1',
 'Cu-fcc-33',
 'Cu-fcc-33-1',
 'Cu-fcc-33-2',
 'Cu-fcc-331',
 'Cu-fcc-333',
 'Cu-fcc-34-2',
 'Cu-fcc-342',
 'Cu-fcc-42',
 'Cu-fcc-43-1',
 'Cu-fcc-43-2',
 'Cu-fcc-44-2',
 'Cu-fcc-53',
 'Cu-wulff-size13',
 'Cu-wulff-size201',
 'symmetry.dat']

In [12]:
# Create the output (work) folder inside path_par; an already existing folder
# is accepted as is.
path_work = os.path.join(path_par, dir_work)
os.makedirs(path_work, exist_ok=True)
path_work

'D:\\haifeng\\work_20211031\\XAS-ML\\code\\datasets setup\\cu_crystal_different_xyz_WT_untar_symmetry_comb'

In [13]:
# Sub-folders of the work folder:
#   chi_FEFF - combined chi_FEFF.dat files
#   coor     - the matching combined coor.dat files
#   info     - which source entries each combination was built from
dir_chi_FEFF, dir_coor, dir_info = 'chi_FEFF', 'coor', 'info'
path_chi_FEFF_work = os.path.join(path_work, dir_chi_FEFF)
path_coor_work = os.path.join(path_work, dir_coor)
path_info_work = os.path.join(path_work, dir_info)

# Make sure each sub-folder exists and starts out empty: create it when it is
# missing, then remove whatever it contains.
for path_sub in (path_chi_FEFF_work, path_coor_work, path_info_work):
    os.makedirs(path_sub, exist_ok=True)
    del_file(path_sub)

## 查找源文件

In [14]:
# Collect the "chi_FEFF.dat" path of every site below path_src, together with
# the seed and xyz folder names of each site.
file = 'chi_FEFF.dat'
list_chi_FEFF_path, list_info_seed, list_info_xyz = pick_file(path_src, file)

In [15]:
# Number of entries returned for "chi_FEFF.dat"
N_chi_FEFF = len(list_chi_FEFF_path)
N_chi_FEFF

84

In [16]:
# Collect the "coor.dat" path of every site below path_src; the seed / xyz
# name lists are reassigned by this second call.
file = 'coor.dat'
list_coor_path, list_info_seed, list_info_xyz = pick_file(path_src, file)

In [17]:
# Number of entries returned for "coor.dat"
N_coor = len(list_coor_path)
N_coor

84

In [18]:
# Print both counts when the two file lists differ in length.
if N_chi_FEFF != N_coor:
    print()
    print('*****  Attension!  ******* ')
    for label, count in (('N_chi_FEFF = ', N_chi_FEFF), ('N_coor     = ', N_coor)):
        print(label, count)
    print()

## 开始组合

chi谱都为（400,2）数组，可直接相加；而coor文件为（N_shell，4）数组，各结构的N_shell不同，无法直接相加，处理方式为将较短的数组用零扩充到三者中最大的N_shell，然后相加。相加后都除以3，即取三个结构的等权平均。

In [19]:
from itertools import combinations

# Build every unordered triple (i, j, k), i < j < k, of source sites and write
# the equal-weight average of their chi spectra and coor tables, plus an info
# file naming the three sources.

# pick_file marks a missing file with the placeholder path 'NaN'; a site needs
# both of its files to take part in a combination.
valid_sites = [
    n for n in range(N_chi_FEFF)
    if list_chi_FEFF_path[n] != 'NaN' and list_coor_path[n] != 'NaN'
]
if len(valid_sites) != N_chi_FEFF:
    print('** Warning:', N_chi_FEFF - len(valid_sites),
          'site(s) without chi_FEFF.dat or coor.dat are skipped')

# Read every input file once instead of once per combination.
chi_FEFF_data = {n: read_chi_FEFF(list_chi_FEFF_path[n]) for n in valid_sites}
coor_data = {n: read_coor(list_coor_path[n]) for n in valid_sites}

# index numbers the combined systems, starting at 1.
index = 0
# combinations() yields the triples in the same order as three nested loops
# over i < j < k.
for i, j, k in combinations(valid_sites, 3):
    index = index + 1

    # The spectra are added element by element, so the three arrays must have
    # the same shape.
    chi_FEFF = (chi_FEFF_data[i] + chi_FEFF_data[j] + chi_FEFF_data[k]) / 3.0

    # The number of rows of coor.dat differs between sites, so the shorter
    # tables are padded with zero rows up to the longest of the three before
    # they are added.
    # NOTE(review): the zero rows enter all four columns, so for shells that
    # are missing in some site the averaged SHELL / BOND_AV / DELTA values are
    # diluted by zeros as well, not only N_COOR - confirm this is intended.
    N_shell_max = max(coor_data[i].shape[0], coor_data[j].shape[0], coor_data[k].shape[0])
    coor_data_i_ex, coor_data_j_ex, coor_data_k_ex = (
        np.pad(coor_data[n], ((0, N_shell_max - coor_data[n].shape[0]), (0, 0)), 'constant')
        for n in (i, j, k)
    )
    coor = (coor_data_i_ex + coor_data_j_ex + coor_data_k_ex) / 3.0

    index_str = str(index)

    # Combined spectrum: one "column 0, column 1" line per row.
    path_chi_FEFF_chi_FEFF_N = os.path.join(path_chi_FEFF_work, 'chi_FEFF_' + index_str + '.dat')
    with open(path_chi_FEFF_chi_FEFF_N, 'wt') as fout:
        for row in chi_FEFF:
            print("{:>12.4f}{:>14.6E}".format(row[0], row[1]), file=fout)

    # Combined coor table: an empty line and a column header, then the rows.
    path_coor_coor_N = os.path.join(path_coor_work, 'coor_' + index_str + '.dat')
    with open(path_coor_coor_N, 'wt') as fout:
        print(file=fout)
        print("  SHELL  N_COOR  BOND_AV  DELTA", file=fout)
        for row in coor:
            print("{:>7.1f}{:>8.3f}{:>9.3f}{:>7.3f}".format(row[0], row[1], row[2], row[3]), file=fout)

    # Provenance: seed and xyz folder names of the three combined sites.
    path_info_info_N = os.path.join(path_info_work, 'info_' + index_str + '.dat')
    with open(path_info_info_N, 'wt') as fout:
        for n in (i, j, k):
            print(list_info_seed[n], list_info_xyz[n], file=fout)