In [13]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Read data.csv (resolved against the current working directory) into a DataFrame.
data = pd.read_csv("data.csv")

In [15]:
# Display the loaded DataFrame.
data

Unnamed: 0,formula,elements,nsites,element_symbols,coordinates,energy,spacegroup,spacegroup_num
0,Ag11Hg9,"Ag, Hg",20,"['Ag', 'Ag', 'Ag', 'Ag', 'Ag', 'Ag', 'Ag', 'Ag...",[[ 1.609567 -0.92928553 33.90805742]\n [ 1.6...,-677.652749,P3m1,156
1,Ag2Br3,"Ag, Br",10,"['Ag', 'Ag', 'Ag', 'Ag', 'Br', 'Br', 'Br', 'Br...",[[-3.46990705e+00 -2.00392334e+00 1.22402099e...,-168.691087,R-3c,167
2,Ag2Cl3,"Ag, Cl",10,"['Ag', 'Ag', 'Ag', 'Ag', 'Cl', 'Cl', 'Cl', 'Cl...",[[-3.30941698e+00 -1.91047971e+00 1.71721380e...,-127.332050,R-3c,167
3,Ag2F,"Ag, F",3,"['Ag', 'Ag', 'F']",[[-1.83755070e-06 1.72922338e+00 3.97018472e...,-48.352742,P-3m1,164
4,Ag2F3,"Ag, F",10,"['Ag', 'Ag', 'Ag', 'Ag', 'F', 'F', 'F', 'F', '...",[[-2.87395102e+00 -1.65949299e+00 2.26233906e...,-116.183864,R-3c,167
...,...,...,...,...,...,...,...,...
10623,ZrZn16,"Zr, Zn",34,"['Zr', 'Zr', 'Zn', 'Zn', 'Zn', 'Zn', 'Zn', 'Zn...",[[-2.17422601 6.06378581 8.71633518]\n [ 6.4...,-333.573781,Cmcm,63
10624,ZrZn2,"Zr, Zn",6,"['Zr', 'Zr', 'Zn', 'Zn', 'Zn', 'Zn']",[[ 5.25615317e+00 3.71666061e+00 9.10392570e...,-83.354040,Fd-3m,227
10625,ZrZn22,"Zr, Zn",46,"['Zr', 'Zr', 'Zn', 'Zn', 'Zn', 'Zn', 'Zn', 'Zn...",[[ 4.26511414e+00 3.01589110e+00 7.38739309e...,-440.455327,Fd-3m,227
10626,ZrZn3,"Zr, Zn",8,"['Zr', 'Zr', 'Zn', 'Zn', 'Zn', 'Zn', 'Zn', 'Zn']",[[ 2.24703936e-06 3.37600716e+00 3.24801562e...,-101.493900,P6_3/mmc,194


In [2]:
from element_data import atomic_radius, electron_config_full

In [3]:
# Maps each formula to its descriptor string; filled by the row loop below.
compounds = {}

In [59]:
# Build one descriptor string per row of `data` and store it in `compounds`
# under the row's formula. Each element present contributes a segment
#     <symbol>_r<radius>_e<electron config>_p<share of sites>
# and the segments are joined with "_".
for _label, row in data.iterrows():
    formula = row['formula']
    nsites = row['nsites']

    # 'element_symbols' holds the text of a Python list such as
    # "['Ag', 'Ag', 'F']": drop the surrounding brackets, split on ", " and
    # strip the quotes to recover one symbol per site.
    site_symbols = [
        token.replace("'", "").replace('"', '')
        for token in row['element_symbols'][1:-1].split(', ')
    ]

    # Number of sites per element, in order of first appearance.
    element_count = {}
    for symbol in site_symbols:
        element_count[symbol] = element_count.get(symbol, 0) + 1

    segments = []
    for element, count in element_count.items():
        radius = atomic_radius.get(element)
        if radius is None:
            # Without this check the ":.2f" format below fails with an
            # opaque TypeError that names neither the element nor the row.
            raise ValueError(
                f"no atomic radius available for element {element!r} "
                f"(formula {formula!r})"
            )
        electronic_config = electron_config_full.get(element)
        # Share of the sites occupied by this element; it is rounded to one
        # decimal in the descriptor.
        proportion = count / nsites

        segments.append(
            f"{element}_r{radius:.2f}_e{electronic_config}_p{proportion:.1f}"
        )

    # A later row with the same formula overwrites the earlier entry.
    compounds[formula] = "_".join(segments)

In [60]:
# Collect the descriptor strings from `compounds` (in insertion order) and
# lowercase each string in full.
compounds_list = list(map(str.lower, compounds.values()))

In [61]:
# Show the first five descriptor strings.
compounds_list[:5]

['ag_r153.00_e[kr]4d10 5s1_p0.6_hg_r149.00_e[xe]4f14 5d10 6s2_p0.5',
 'ag_r153.00_e[kr]4d10 5s1_p0.4_br_r114.00_e[ar]3d10 4s2 4p5_p0.6',
 'ag_r153.00_e[kr]4d10 5s1_p0.4_cl_r99.00_e[ne]3s2 3p5_p0.6',
 'ag_r153.00_e[kr]4d10 5s1_p0.7_f_r57.00_e[he]2s2 2p5_p0.3',
 'ag_r153.00_e[kr]4d10 5s1_p0.4_f_r57.00_e[he]2s2 2p5_p0.6']

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

In [63]:
# Create the CountVectorizer; the token pattern accepts runs of one or more
# word characters.
# NOTE(review): "_" is a word character for \w while "." is not, so the
# descriptor strings are cut at ".", "[", "]" and spaces rather than at the
# "_" separators (e.g. "ag_r153" stays one token) - confirm this is intended.
vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')

In [64]:
# Fit the vectorizer on the descriptor strings, then expand the resulting
# token-count matrix into a dense array with one row per string.
token_counts = vectorizer.fit_transform(compounds_list)
X = token_counts.toarray()

In [65]:
# Dimensions of the dense count matrix: (number of strings, vocabulary size).
X.shape

(10628, 744)

In [66]:
# Token counts of the second descriptor string (row index 1).
X[1,:]

array([0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,

In [88]:
import pandas as pd

# Load the crystal-structure dataset from data.csv.
data = pd.read_csv('data.csv')

In [3]:
# Y collects one target string per kept row; compounds maps each formula to
# its descriptor string. Both are filled by the row loop below.
Y = []
compounds = {}

In [7]:
# Build the target strings (Y) and the per-formula descriptor strings
# (compounds) for every structure with at most MAX_SITES sites.

# Start both containers empty so that re-running this cell cannot append a
# second copy of every target to Y.
Y = []
compounds = {}

# Structures with more sites than this are left out of Y and compounds.
MAX_SITES = 12

# Matches signed decimals with an optional exponent (e.g. "-3.46990705e+00").
# The exponent has to be consumed together with the mantissa; otherwise
# "e+00" is split off and shows up as an extra 0.0 "coordinate".
number_re = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

# Index labels of rows whose coordinate text does not form a rectangular
# table; they are removed from `data` once the loop has finished.
malformed_rows = []

for index, row in data.iterrows():
    formula = row['formula']
    nsites = row['nsites']
    space_group_number = row['spacegroup_num']

    # 'element_symbols' holds the text of a Python list such as
    # "['Ag', 'Ag', 'F']": drop the brackets, split on ", " and strip the
    # quotes to recover one symbol per site.
    site_symbols = [
        token.replace("'", "").replace('"', '')
        for token in row['element_symbols'][1:-1].split(', ')
    ]

    # One list of floats per non-blank line of the coordinate text.
    coordinates_list = [
        [float(number) for number in number_re.findall(line)]
        for line in row['coordinates'].split('\n')
        if line.strip()
    ]

    # Rows of unequal length cannot form a float array; dtype=float makes
    # NumPy raise ValueError for them instead of building an object array.
    try:
        np.array(coordinates_list, dtype=float)
    except ValueError:
        malformed_rows.append(index)
        continue

    if nsites > MAX_SITES:
        continue

    # Pair every site symbol with its coordinates, e.g. "Ag(x, y, z)".
    # zip() stops at the shorter of the two sequences.
    coordinates_str = '_'.join(
        f"{symbol}{tuple(position)}"
        for symbol, position in zip(site_symbols, coordinates_list)
    )

    # Append the space-group number to the end of the target string.
    Y.append(f"{coordinates_str}_SG{space_group_number}")

    # Number of sites per element, in order of first appearance.
    element_count = {}
    for symbol in site_symbols:
        element_count[symbol] = element_count.get(symbol, 0) + 1

    segments = []
    for element, count in element_count.items():
        radius = atomic_radius.get(element)
        if radius is None:
            # Without this check the ":.2f" format below fails with an
            # opaque TypeError that names neither the element nor the row.
            raise ValueError(
                f"no atomic radius available for element {element!r} "
                f"(formula {formula!r})"
            )
        electronic_config = electron_config_full.get(element)
        # Share of the sites occupied by this element; it is rounded to one
        # decimal in the descriptor.
        proportion = count / nsites

        segments.append(
            f"{element}_r{radius:.2f}_e{electronic_config}_p{proportion:.1f}"
        )

    # NOTE(review): Y gains one entry per kept row, but compounds is keyed by
    # formula, so a repeated formula would leave Y longer than compounds.
    compounds[formula] = "_".join(segments)

# Remove the malformed rows in one pass instead of copying the frame once
# per bad row inside the loop.
if malformed_rows:
    data = data.drop(index=malformed_rows)

In [8]:
# Create the CountVectorizer; the token pattern accepts runs of one or more
# word characters.
# NOTE(review): this vectorizer is not used by the later cells visible here.
vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')

In [9]:
# Lowercase every target string.
Y = list(map(str.lower, Y))
# Collect the descriptor strings from `compounds` (in insertion order) and
# lowercase each string in full.
compounds_list = list(map(str.lower, compounds.values()))

In [10]:
# X is the list of lowercase descriptor strings (the same list object as
# compounds_list, not a copy).
X = compounds_list

In [11]:
# Write the target strings to Y.txt, one per line.
with open('Y.txt', 'w') as y_file:
    y_file.writelines("%s\n" % target for target in Y)

In [12]:
# Write the descriptor strings to X.txt, one per line.
with open('X.txt', 'w') as x_file:
    x_file.writelines("%s\n" % descriptor for descriptor in X)