## 本程序读取数据集文件并对数据进行统计分析（如最大值、均值、方差，以及数据集本身的维度、大小等），同时绘制分布图。分析结果输出为txt文件和图片。

## 默认数据集文件在“datasets”文件夹下，输出结果在“datasets_analysis”文件夹下。

last update: 2021.9.6

contacts：zhaohf@ihep.ac.cn

contributor: zhaohf@ihep.ac.cn, yuqj@ihep.ac.cn

# 配置环境

## 所需的模块

In [1]:
import os
import sys
#
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from itertools import groupby

查看导入模块的版本

In [2]:
# Report the interpreter and library versions this notebook was run with.
# itertools is not listed: it ships with the interpreter, so the python
# version already covers it.
for _label, _version in (
    ('python version:', sys.version),
    ('numpy version:', np.__version__),
    ('panda version:', pd.__version__),
    ('matplotlib version:', mpl.__version__),
):
    print(_label, _version)

python version: 3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]
numpy version: 1.19.5
panda version: 1.2.4
matplotlib version: 3.3.4


## 设置输出文件夹

设置文件夹，用于放置分析结果

In [3]:
import os
# Name of the folder that receives the analysis results.
dir_attr = 'datasets_analysis'
# Build the folder path relative to the current directory and create it;
# exist_ok=True makes re-running this cell harmless when the folder exists.
path_dir_attr = os.path.join(os.curdir, dir_attr)
os.makedirs(path_dir_attr, exist_ok=True)

设置文件夹，用于放置图片，原来是“images”，现在改为都保存到一个文件夹下，即dir_attr

In [4]:
# Figures used to go to a separate 'images' folder; they now share the
# analysis output folder named by dir_attr.
#dir_images = 'images'
dir_images = dir_attr
# Path relative to the current directory; created if it does not exist yet.
path_images = os.path.join(os.curdir, dir_images)
os.makedirs(path_images, exist_ok=True)

## 设置保存图片情况

  确保绘制的图形美观、能保存下来。

In [5]:
# To plot pretty figures
# The %matplotlib inline magic makes figures drawn with matplotlib.pyplot
# (e.g. via plot(), or when a figure canvas is created) show up directly
# in the notebook/console output.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for the axis labels and the tick labels of the
# figures drawn afterwards (same effect as mpl.rc(group, labelsize=...)).
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures
#PROJECT_ROOT_DIR = "."
#path_images = os.path.join(PROJECT_ROOT_DIR, "images")
#os.makedirs(path_images, exist_ok=True)

def save_fig(fig_id, path_fig = path_images, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as <path_fig>/<fig_id>.<fig_extension>.

    fig_id        -- file name without extension; also echoed to stdout
    path_fig      -- target folder (defaults to path_images)
    tight_layout  -- when true, call plt.tight_layout() before saving
    fig_extension -- file extension, also passed to savefig as the format
    resolution    -- dpi passed to savefig
    """
    target = os.path.join(path_fig, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# 正文

## 确定当前目录和工作目录

In [6]:
import os
# Absolute path of the current directory; the dataset folder is resolved
# against it in the cells below.
path = os.path.abspath(os.curdir)
# Bare expression: the notebook echoes the value as the cell output.
path

'D:\\haifeng\\work_20210709\\ML+XAS\\code\\neural network module\\latest'

## 对数据集进行统计分析并保存

### 设置数据集的相对路径和文件名

In [8]:
# Folder that holds the dataset files.
dir_data = 'datasets'
# Every dataset kind comes as a full file plus its train/valid/test splits:
# Au_<kind>.txt, Au_<kind>_train.txt, Au_<kind>_valid.txt, Au_<kind>_test.txt.
file_data = [f"Au_{kind}{split}.txt"
             for kind in ('chi', 'cn', 'cr')
             for split in ('', '_train', '_valid', '_test')]

### 导入文件，统计分析并保存分析结果

In [9]:
import pandas as pd
import numpy as np
import sys
# For every dataset file: load it, then write its array attributes and its
# per-column statistics to <path_dir_attr>/<name>_attributes.txt.
path_dir_data = os.path.join(path, dir_data)   # same for every file
for data_file in file_data:
    path_file_data = os.path.join(path_dir_data, data_file)
    #
    # check whether the data file exists, stop if it does not.
    #======================================
    if not os.path.exists(path_file_data):
        print()
        print(f"** Error!! cannot find file {data_file} in directory {dir_data}! **")
        print()
        # Non-zero status so that a script/batch run reports the failure.
        sys.exit(1)
    #
    # read the file data (whitespace-separated columns, no header row)
    #==================================
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True.
    data_df = pd.read_csv(path_file_data, header=None, sep=r'\s+')
    # df.values, df.as_matrix() and np.array(df) all work.
    data_np_array = data_df.values
    #
    # define the name of the attribute file
    #==================================
    file_name = os.path.splitext(data_file)
    file_attr_txt = file_name[0] + '_attributes.txt'
    path_attr_txt = os.path.join(path_dir_attr, file_attr_txt)
    #
    # data attributes
    #==============================
    attr_dict1 = {
                  'file'                         : data_file,
                  'data shape'                   : data_np_array.shape,
                  'number of dimension'          : data_np_array.ndim,
                  'data number'                  : data_np_array.size,
                  'data memory (bytes)'          : data_np_array.nbytes,
                  'data element memory (bytes)'  : data_np_array.itemsize,
                  'data type'                    : data_np_array.dtype
                 }
    #
    # data properties (max, min, mean, variance, std)
    #==============================
    # axis=0 reduces along the rows, i.e. one value per column; axis=1 would
    # give one value per row.
    # var() equals mean(abs(x - x.mean())**2); std() equals sqrt(x.var()).
    # The median is the middle value of the sorted data (the mean of the two
    # middle values when the count is even).
    data_mean = np.mean(data_np_array, axis=0)
    delta = data_np_array - data_mean
    delta_abs = np.abs(delta)
    # Relative deviation from the column mean.
    # NOTE: entries equal to zero make this division yield inf/nan (numpy
    # emits a RuntimeWarning); those values then propagate into Error_*.
    error = delta / data_np_array
    error_abs = np.abs(error)
    #
    attr_dict2 = {
                  'Mean'              : data_mean,
                  'Max'               : np.max(data_np_array, axis=0),
                  'Min'               : np.min(data_np_array, axis=0),
                  'Delta_max'         : np.max(delta, axis=0),
                  'Delta_min'         : np.min(delta, axis=0),
                  '|Delta|_mean'      : np.mean(delta_abs, axis=0),
                  '|Delta|_max'       : np.max(delta_abs, axis=0),
                  '|Delta|_min'       : np.min(delta_abs, axis=0),
                  'Var'               : np.var(data_np_array, axis=0),
                  'Std'               : np.std(data_np_array, axis=0),
                  'Error_mean'        : np.mean(error, axis=0),
                  'Error_max'         : np.max(error, axis=0),
                  'Error_min'         : np.min(error, axis=0),
                  '|Error|_mean'      : np.mean(error_abs, axis=0),
                  '|Error|_max'       : np.max(error_abs, axis=0),
                  '|Error|_min'       : np.min(error_abs, axis=0),
                  'Midd'              : np.median(data_np_array, axis=0)
                 }
    #
    # write the report; the with-block closes the file even if a write fails
    #====================================
    with open(path_attr_txt, 'wt') as fout:
        for key, value in attr_dict1.items():
            print(f"  {key:<33}: {value}", file=fout)
        print(file=fout)
        # One line per statistic, one fixed-width field per data column.
        for key, column_values in attr_dict2.items():
            fout.write(f"  {key:<15}:")
            fout.write("".join(f"{value:14.6e}" for value in column_values))
            fout.write('\n')

## label详细统计和绘图

即给出label值的分布，如配位数的分布（柱状图）、平均键长的分布（柱状图）

### 离散数据统计和绘图

#### 设置需要具体分析的离散数据集的文件名，以及对应的label名称（用于绘图等标记）

如果有多种离散数据集，请针对每一种分别修改文件名和label名称，并重新运行以下两个cell；未来将把这两步整合起来。

In [10]:
# Label name used in the report heading and in the plot annotations.
label_index = 'CN'
# Discrete-label files to analyse: the full set plus its train/valid/test splits.
file_data = [f"Au_cn{split}.txt" for split in ('', '_train', '_valid', '_test')]

#### 离散数据统计和绘图

In [11]:
from itertools import groupby
# For every discrete-label file: append the per-value counts to the
# attribute report and save a bar chart of the label distribution.
path_dir_data = os.path.join(path, dir_data)   # same for every file
for data_file in file_data:
    path_file_data = os.path.join(path_dir_data, data_file)
    #
    # read the file data (whitespace-separated columns, no header row)
    #==============================
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True.
    data_df = pd.read_csv(path_file_data, header=None, sep=r'\s+')
    data_np_array = data_df.values
    labels = data_np_array.flatten()
    #
    # count label
    #==============================
    # A pandas Series is used because value_counts() tallies how often each
    # distinct value occurs; it is evaluated once and reused.
    label_counts = pd.Series(labels).value_counts()
    cn_list = list(label_counts.index)
    cn_number = list(label_counts.values)
    cn_max = max(cn_number)
    cn_list_min = min(cn_list)
    cn_list_max = max(cn_list)
    #
    # find out the attribute file
    #==============================
    file_name = os.path.splitext(data_file)
    file_attr_txt = file_name[0] + '_attributes.txt'
    path_attr_txt = os.path.join(path_dir_attr, file_attr_txt)
    #
    # save count data into attribute file
    #==============================
    # Append mode adds to an existing report (the file is created if it is
    # missing); the with-block closes the file even if a write fails.
    with open(path_attr_txt, 'a') as fout:
        print(f"\n{label_index} group in sets:", file=fout)
        # Group the sorted labels by their integer part (x//1).
        for k, g in groupby(sorted(labels), key=lambda x: x//1):
            print(f"{k:10.0f} : {len(list(g))}", file=fout)
    #
    # draw and save the bar figure
    #==========================================
    file_bar = file_name[0] + '_bar'

    fig = plt.figure()
    try:
        ax1 = fig.add_subplot(1,1,1)
        ax1.bar(cn_list,cn_number,width=0.8,alpha=0.4,color='blue',edgecolor='red',label=label_index,lw=1)
        #
        plt.xlabel(label_index)
        plt.ylabel("Number")
        plt.title(f"{file_name[0]}")
        plt.legend(loc='upper left')
        # One tick per unit step between the smallest and largest label;
        # rotation sets the tick-label angle.
        plt.xticks(np.arange(cn_list_min,cn_list_max+1,1), rotation=0)
        # 20% headroom above the tallest bar for the count annotations.
        plt.ylim(0,cn_max*1.2)
        #
        # write each count just above its bar
        for a,b in zip(cn_list,cn_number):
            plt.text(a, b+cn_max/100, f"{b:.0f}", ha='center', va= 'bottom',fontsize=10)

        save_fig(file_bar)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

Saving figure Au_cn_bar
Saving figure Au_cn_train_bar
Saving figure Au_cn_valid_bar
Saving figure Au_cn_test_bar


### 连续数据统计和绘图

#### 设置需要具体分析的连续数据集的文件名，以及对应的label名称（用于绘图等标记）

如果有多种连续数据集，请针对每一种分别修改文件名和label名称，并重新运行以下两个cell；未来将把这两步整合起来。

In [12]:
# Label name used in the report heading and in the plot annotations.
label_index = 'CR'
# Continuous-label files to analyse: the full set plus its train/valid/test splits.
file_data = [f"Au_cr{split}.txt" for split in ('', '_train', '_valid', '_test')]

#### 连续数据统计和绘图

In [13]:
from itertools import groupby
# For every continuous-label file: append the counts per 0.1-wide interval
# to the attribute report and save a bar chart of the distribution.
path_dir_data = os.path.join(path, dir_data)   # same for every file
for data_file in file_data:
    path_file_data = os.path.join(path_dir_data, data_file)
    #
    # read the file data (whitespace-separated columns, no header row)
    #==============================
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True.
    data_df = pd.read_csv(path_file_data, header=None, sep=r'\s+')
    data_np_array = data_df.values
    values = data_np_array.flatten()
    #
    # count label
    #==============================
    # Truncate every value to one decimal so that equal bins compare equal,
    # then let value_counts() tally the bins (evaluated once and reused).
    bin_counts = pd.Series(np.trunc(values*10)/10).value_counts()
    # Bars are centred on the bin: bin start + half of the 0.1 bin width.
    cr_list = list(bin_counts.index+0.05)
    cr_number = list(bin_counts.values)
    cr_max = max(cr_number)
    #
    # find out the attribute file
    #==============================
    file_name = os.path.splitext(data_file)
    file_attr_txt = file_name[0] + '_attributes.txt'
    path_attr_txt = os.path.join(path_dir_attr, file_attr_txt)
    #
    # save count data into attribute file
    #==============================
    # Append mode adds to an existing report (the file is created if it is
    # missing); the with-block closes the file even if a write fails.
    with open(path_attr_txt, 'a') as fout:
        print(f"\n{label_index} group in sets:", file=fout)
        # Group the sorted values by floor(10*x), i.e. by 0.1-wide interval.
        # NOTE(review): the chart above bins with trunc() while this report
        # bins with floor (//); the two agree for non-negative data only.
        for k, g in groupby(sorted(values*10), key=lambda x: x//1):
            print(f"{(k/10):10.1f} - {((k+1)/10-0.001):.3f} : {len(list(g))}", file=fout)
    #
    # draw and save the bar figure
    #==========================================
    file_bar = file_name[0] + '_bar'

    fig = plt.figure()
    try:
        ax1 = fig.add_subplot(1,1,1)
        ax1.bar(cr_list,cr_number,width=0.1,alpha=0.4,color='yellow',edgecolor='red',label=label_index,lw=1)
        #
        plt.xlabel(label_index)
        plt.ylabel("Number")
        plt.title(f"{file_name[0]}")
        plt.legend(loc='upper left')
        # Ticks on the bin edges (bar centre -/+ 0.05); rotation sets the
        # tick-label angle.
        plt.xticks(np.arange(min(cr_list)-0.05,max(cr_list)+0.06,0.1), rotation=0)
        # 20% headroom above the tallest bar for the count annotations.
        plt.ylim(0,cr_max*1.2)
        #
        # write each count just above its bar
        for a,b in zip(cr_list,cr_number):
            plt.text(a, b+cr_max/100, f"{b:.0f}", ha='center', va= 'bottom',fontsize=10)

        save_fig(file_bar)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
#

Saving figure Au_cr_bar
Saving figure Au_cr_train_bar
Saving figure Au_cr_valid_bar
Saving figure Au_cr_test_bar
