## 本程序读取含有feature、label文件路径的csv文件（位于“datasets_csv”文件夹下），从feature和label中读取feature和label值，生成数据集文件，包括train、validation、test数据集。数据保存到“datasets”文件夹下

###  这里读取的是chi，xmu文件与chi不一样，若读取xmu，请采用“_xmu”代码

last update: 2022.05.18

相对于21版本，将文件/文件夹参数提前到前面进行设置，并调整了输入和输出文件夹相对于本代码的相对路径

contacts：zhaohf@ihep.ac.cn

# 配置环境

## 所需模块

In [1]:
import sys
import os
import pandas as pd
import numpy as np

输出导入模块的版本。

In [2]:
# Report the interpreter and library versions this run uses.
for caption, version in (
    ('python version:', sys.version),
    ('numpy version:', np.__version__),
    ('panda version:', pd.__version__),
):
    print(caption, version)

python version: 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
numpy version: 1.21.5
panda version: 1.4.2


# 设置参数

## 设置输入的csv文件和文件相对路径

In [3]:
# Folder holding the index CSV; later joined onto the working directory.
dir_csv = './../datasets_csv'
# Name of the index CSV that lists the feature and label file paths.
file_csv  = 'au150_DW_chi_FEFF.csv'

## 设置输出的用于保存数据的文件夹和文件

In [4]:
# Folder that receives the generated data set files.
dir_data = './../datasets'
# The feature file is named index + '_' + feature + suffix.
# Each label file is named index + '_' + label[j] + suffix (one file per entry of `label`).
index = 'Au'
feature = 'chi'
label = ['cn', 'cr']
suffix = '.txt'

## 设置 train、validation、test数据的比例

In [5]:
# Split the data into train / validation / test sets at a ratio of 7:2:1.
ratio_train = 0.7
ratio_valid = 0.2
# The test share is whatever remains after train and validation.
ratio_test = 1 - ratio_train - ratio_valid

# 类、函数模块

## csv--> data

In [6]:
# 对于给定的csv文件“data_file”（chi_FEFF），读取其中的feature和label，制成数据集。
import pandas as pd
import numpy as np
def create_dataset(data_file):
    """Build feature and label arrays from an index CSV of data files.

    ``data_file`` is read with ``pandas.read_csv`` and must provide the
    columns ``file_path`` and ``label_path``.  For every row of that CSV one
    feature vector and one label vector are loaded:

    * Feature: the file at ``file_path`` is parsed as a whitespace-separated
      table without a header line, and its second column becomes the feature
      vector.  When the very first value of the table equals 0, the first
      row is dropped.
    * Label: the file at ``label_path`` is parsed as a whitespace-separated
      table whose first line is skipped and whose second line is the header.
      Columns 1 and 2 of the first data row form the label.

    Parameters
    ----------
    data_file : str or path-like
        Path of the index CSV.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data_np_array, label_np_array)``, each with one entry per row of
        the index CSV.  NOTE(review): a regular 2-D feature array assumes
        every feature file has the same number of rows -- confirm.

    Raises
    ------
    IndexError
        If a label file contains no data row.
    """
    pd_dataset = pd.read_csv(data_file)
    feature_list = []
    label_list = []
    for data_file_path, label_file_path in zip(pd_dataset['file_path'],
                                               pd_dataset['label_path']):
        # --- feature ---
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True.
        data_df = pd.read_csv(data_file_path, header=None, sep=r'\s+')
        if data_df.iloc[0, 0] == 0:
            # Drop the leading row in memory instead of parsing the file a
            # second time with header=0; the remaining values are the same.
            data_df = data_df.iloc[1:]
        feature_list.append(data_df.iloc[:, 1].values)

        # --- label ---
        label_df = pd.read_csv(label_file_path, sep=r'\s+', header=0,
                               skiprows=[0])
        # Make sure the label file really carries a value, and name the
        # offending file instead of failing with an anonymous indexer error.
        if label_df.empty:
            raise IndexError(f"no label row found in {label_file_path!r}")
        label_list.append(label_df.iloc[0, 1:3].values)

    # Convert the per-file lists into numpy arrays.
    data_np_array = np.array(feature_list)
    label_np_array = np.array(label_list)
    return data_np_array, label_np_array

# 正文

## 确定当前目录和工作目录

In [7]:
# Absolute path of the current working directory; the input CSV location is
# built relative to it.
path = os.path.abspath(os.curdir)
# Bare expression: shown as the cell output when run in a notebook.
path

'D:\\haifeng\\work_20211031\\XAS-ML\\code\\neural network module\\jupyternotebook_code'

## 检查csv文件和文件相对路径

In [8]:
# Location of the index CSV, built from the working directory and the
# configured relative folder and file name.
path_dir_csv = os.path.join(path, dir_csv)
path_csv = os.path.join(path_dir_csv, file_csv)
#
# Stop early when the index CSV is missing.  isfile() also rejects a
# directory that happens to carry the expected name.
exist = os.path.isfile(path_csv)
if not exist:
    print()
    print('** Error!! cannot find csv file in', dir_csv)
    print()
    # Exit with a non-zero status so the failure is visible to the caller.
    sys.exit(1)

## 对于给定的csv文件“data_file”，读取其中的feature和label，制成数据集。

这可能需要几十秒钟（5966条谱及其配位数、平均键长，大概需要25秒）

In [9]:
# Load every feature and label listed in the index CSV into two arrays.
data_np_array, label_np_array = create_dataset(path_csv)

看一下feature值

In [10]:
data_np_array

array([[ 3.204381e-01,  3.179742e-01,  3.107868e-01, ...,  1.588940e-06,
         1.217988e-06,  8.223206e-07],
       [ 2.272433e-01,  2.254551e-01,  2.202515e-01, ..., -1.081169e-06,
        -1.537145e-06, -1.827965e-06],
       [ 2.434066e-01,  2.416660e-01,  2.365912e-01, ...,  1.017455e-06,
         9.860825e-07,  8.813536e-07],
       ...,
       [ 1.633016e-01,  1.616820e-01,  1.571817e-01, ..., -1.616210e-06,
        -1.438373e-06, -1.191754e-06],
       [ 2.649083e-01,  2.628461e-01,  2.568242e-01, ...,  5.872027e-07,
         7.930178e-07,  9.127764e-07],
       [ 2.487297e-01,  2.467340e-01,  2.409130e-01, ..., -2.460035e-06,
        -2.220299e-06, -1.854691e-06]])

In [11]:
data_np_array.shape

(5966, 400)

看一下label值

In [12]:
label_np_array

array([[ 8.   ,  2.819],
       [ 8.   ,  2.92 ],
       [ 8.   ,  2.835],
       ...,
       [12.   ,  2.864],
       [ 8.   ,  2.902],
       [ 7.   ,  2.865]])

In [13]:
label_np_array.shape

(5966, 2)

## 将 feature值和label值输出到文件中

### 创建输出文件夹

In [14]:
# Output folder, expressed relative to the current directory.
path_data = os.path.join(os.curdir, dir_data)
# exist_ok=True: an already existing folder is not an error.
os.makedirs(path_data, exist_ok=True)

### 整体feature和label文件

创建feature文件

In [15]:
# Write all feature vectors to one text file: one line per sample, every
# value in a fixed-width 14.6e field with no separator.
file_feature = index + '_' + feature + suffix
path_file_feature = os.path.join(path_data, file_feature)
# The context manager closes the file even if formatting or writing fails.
with open(path_file_feature, 'w') as fout:
    for row in data_np_array:
        fout.write(''.join(f"{value:14.6e}" for value in row))
        fout.write('\n')

创建label文件

In [16]:
# Write one text file per label column, one value per line.
# Column 0 is written as '3.0f' and column 1 as '6.3f'.
label_formats = ('3.0f', '6.3f')
for j, label_name in enumerate(label):
    file_label = index + '_' + label_name + suffix
    path_file_label = os.path.join(path_data, file_label)
    # Choose the format once per file instead of once per value.  Label
    # names beyond the first two have no format and yield an empty file,
    # exactly as before.
    label_format = label_formats[j] if j < len(label_formats) else None
    # The context manager closes the file even if formatting or writing fails.
    with open(path_file_label, 'w') as fout:
        if label_format is not None:
            fout.writelines(f"{row[j]:{label_format}}\n" for row in label_np_array)

### train、validation、test文件

In [17]:
# Number of samples per split; int() truncates, and the test split takes
# whatever is left so the three sizes always add up to the total.
N_train_full = data_np_array.shape[0]
N_train = int(N_train_full * ratio_train)
N_valid = int(N_train_full * ratio_valid)
N_test = N_train_full - N_train - N_valid

# Print the split sizes followed by the total.
for caption, count in (
    ('N. of train :', N_train),
    ('N. of validation :', N_valid),
    ('N. of test :', N_test),
):
    print(caption, count)
print('-' * 30)
print('N. of all :', N_train_full)

N. of train : 4176
N. of validation : 1193
N. of test : 597
------------------------------
N. of all : 5966


In [18]:
# Consecutive slices: train, then validation, then test.
# Positive end points are used on purpose: with N_test == 0 the former
# [N_train:-N_test] slice was empty and [-N_test:] returned the whole array.
train_feature = data_np_array[:N_train]
valid_feature = data_np_array[N_train:N_train + N_valid]
test_feature = data_np_array[N_train + N_valid:]

In [19]:
# Consecutive slices: train, then validation, then test.
# Positive end points are used on purpose: with N_test == 0 the former
# [N_train:-N_test] slice was empty and [-N_test:] returned the whole array.
train_label = label_np_array[:N_train]
valid_label = label_np_array[N_train:N_train + N_valid]
test_label = label_np_array[N_train + N_valid:]

创建train、validation、test的feature文件

In [20]:
# Split names used in the output file names, and the matching feature and
# label arrays in the same order.
t_v_t = ['train', 'valid', 'test']
array_feature_tvt = [train_feature, valid_feature, test_feature]
array_label_tvt = [train_label, valid_label, test_label]

In [21]:
# Write one feature file per split: one line per sample, every value in a
# fixed-width 14.6e field with no separator.
for split_name, split_feature in zip(t_v_t, array_feature_tvt):
    file_feature = index + '_' + feature + '_' + split_name + suffix
    path_file_feature = os.path.join(path_data, file_feature)
    # The context manager closes the file even if formatting or writing fails.
    with open(path_file_feature, 'w') as fout:
        for row in split_feature:
            fout.write(''.join(f"{value:14.6e}" for value in row))
            fout.write('\n')

创建train、validation、test的label文件

In [22]:
# Write one file per (label column, split) pair, one value per line.
# Column 0 is written as '3.0f' and column 1 as '6.3f'.
label_formats = ('3.0f', '6.3f')
for j, label_name in enumerate(label):
    # Choose the format once per label instead of once per value.  Label
    # names beyond the first two have no format and yield empty files,
    # exactly as before.
    label_format = label_formats[j] if j < len(label_formats) else None
    for split_name, split_label in zip(t_v_t, array_label_tvt):
        file_label = index + '_' + label_name + '_' + split_name + suffix
        path_file_label = os.path.join(path_data, file_label)
        # The context manager closes the file even if writing fails.
        with open(path_file_label, 'w') as fout:
            if label_format is not None:
                fout.writelines(f"{row[j]:{label_format}}\n" for row in split_label)