# 通过聚类算法将总的数据划分成相似的几个组

## 10个组

In [1]:
# Tunable settings used by the cells below.
RANDOM_STATE = 42    # seed passed to both the train/test split and KMeans
TEST_SIZE = 0.3      # fraction of rows held out as the test split
CLUSTER_NUMS = 10    # number of clusters KMeans is asked to form
# Name of the dielectric-constant column that is selected as the target `y`.
DIELECTRIC_CONSTANT = 'e_electronic'

In [2]:
# 导库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [3]:
# Read the training table from its CSV file (relative path).
data = pd.read_csv(filepath_or_buffer='../../data/data_training.csv')

In [4]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,material_id,composition,crystal_system,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,...,MagpieData range Number,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity
0,mp-28967,Ba1 Pd2 S4,Monoclinic,11,0.7792,5.022717,22.595815,-1.124079,17.048334,7.118969,...,40.0,30.285714,105.0,215.0,110.0,130.428571,0.89,2.58,1.69,2.23
1,mp-766094,Nb1 O2 F1,Orthorhombic,19,2.898,3.764366,15.869706,-3.099174,17.57201,13.469477,...,33.0,16.5,57.0,164.0,107.0,88.25,1.6,3.98,2.38,3.115
2,mp-36577,Sr1 As2 S4,Triclinic,1,1.7212,3.094976,28.031499,-0.7661,18.488667,12.443616,...,22.0,24.0,105.0,195.0,90.0,121.857143,0.95,2.58,1.63,2.232857
3,mp-1102092,Na1 Fe1 S2 O8,Monoclinic,12,2.0944,2.90126,12.923684,-1.948264,9.596025,6.499905,...,18.0,11.083333,66.0,166.0,100.0,86.333333,0.93,3.44,2.51,2.953333
4,mp-720391,B1 H4 N1 F4,Orthorhombic,62,7.4812,1.860992,9.35501,-1.970766,6.216546,4.350314,...,8.0,5.2,31.0,84.0,53.0,50.7,2.04,3.98,1.94,2.98


In [5]:
# Target: the dielectric-constant column chosen in the config cell.
y = data[DIELECTRIC_CONSTANT]
# Clustering input: every column except the identifier / text columns.
# NOTE(review): the target column is still part of X, so KMeans clusters on it
# as well -- confirm this is intended.
X = data.drop(['material_id', 'composition', 'crystal_system'], axis=1)

In [6]:
# Summarise X: number of rows, column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7277 entries, 0 to 7276
Data columns (total 20 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   space_group                           7277 non-null   int64  
 1   band_gap                              7277 non-null   float64
 2   density                               7277 non-null   float64
 3   density_atomic                        7277 non-null   float64
 4   formation_energy_per_atom             7277 non-null   float64
 5   e_total                               7277 non-null   float64
 6   e_ionic                               7277 non-null   float64
 7   e_electronic                          7277 non-null   float64
 8   MagpieData minimum Number             7277 non-null   float64
 9   MagpieData maximum Number             7277 non-null   float64
 10  MagpieData range Number               7277 non-null   float64
 11  MagpieData mean N

- 先进行训练集和测试集划分

In [7]:
# Hold out TEST_SIZE of the rows as a test split; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [8]:
# KMeans estimator with CLUSTER_NUMS centroids and a fixed seed.
# n_init is pinned explicitly: leaving it unset emits a FutureWarning on fit
# because the library default is changing from 10 to 'auto'. Passing 10 keeps
# the behaviour this notebook was run with and silences the warning.
kmeans = KMeans(n_clusters=CLUSTER_NUMS, n_init=10, random_state=RANDOM_STATE)

In [9]:
# Fit the centroids on the training split only.
# 'cluster' and 'class' are bookkeeping columns that later cells add to
# X_train; dropping them when present (errors='ignore' makes this a no-op on a
# fresh run) keeps a re-run of this cell from clustering on its own labels.
# NOTE(review): features are used unscaled. The per-cluster listing shows
# clusters holding a single row with a very large e_total, which suggests the
# large-magnitude columns dominate the distance -- consider standardising.
kmeans.fit(X_train.drop(columns=['cluster', 'class'], errors='ignore'))

  super()._check_params_vs_input(X, default_n_init=10)


In [10]:
# Attach each training row's cluster id (kmeans.labels_ is in the row order of
# the data passed to fit). assign() builds a new frame instead of writing into
# the object returned by train_test_split, which can raise pandas'
# SettingWithCopyWarning on some pandas / scikit-learn version combinations.
X_train = X_train.assign(cluster=kmeans.labels_)

In [11]:
# Predict the cluster of every test row.
# Only the columns KMeans was fitted on are passed to predict(), so this cell
# can be re-run after 'cluster' (or any other extra column) has been added to
# X_test; passing the whole frame on a second run fails with a feature mismatch.
# assign() returns a new frame rather than writing into the split result.
X_test = X_test.assign(
    cluster=kmeans.predict(X_test[list(kmeans.feature_names_in_)])
)

In [12]:
# Preview the first five test rows, now carrying their cluster id.
X_test.head(n=5)

Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
6059,160,0.7921,4.086507,10.994694,-2.019264,14.078999,6.862966,7.216033,3.0,25.0,...,12.769231,66.0,139.0,73.0,93.230769,0.98,3.44,2.46,2.677692,9
6010,186,1.9661,5.059669,22.397666,-2.460065,23.198326,15.958896,7.23943,8.0,57.0,...,29.75,66.0,207.0,141.0,138.583333,1.1,3.44,2.34,2.0725,7
4475,14,0.8712,6.411909,23.33201,-1.547133,21.690907,11.098378,10.592529,29.0,57.0,...,38.5,120.0,207.0,87.0,144.75,1.1,2.55,1.45,2.025,8
3689,127,1.2992,2.349037,12.536103,-0.274379,37.388675,30.495613,6.893062,1.0,20.0,...,8.857143,31.0,176.0,145.0,105.571429,0.98,2.55,1.57,1.832857,7
4515,5,1.6206,4.385036,25.392872,-0.587003,11.310439,3.828769,7.48167,13.0,47.0,...,29.4,107.0,145.0,38.0,120.0,1.61,2.55,0.94,2.322,3


In [13]:
# Group the training rows by their assigned cluster id.
groups = X_train.groupby(by='cluster')

In [14]:
# Print the size of every training cluster and show its first five rows.
for label, members in groups:
    n_rows = members.shape[0]
    print(f'Cluster {label}: {n_rows} samples')
    display(members.head(n=5))

Cluster 0: 76 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
3238,1,1.7321,5.514924,18.847965,-0.755654,170.092113,166.365845,3.726269,7.0,81.0,...,26.0,66.0,145.0,79.0,87.0,1.62,3.44,1.82,2.885,0
2202,119,1.39,4.644875,14.418757,-3.17923,350.543507,345.804906,4.738601,3.0,57.0,...,18.0,66.0,207.0,141.0,102.272727,0.98,3.44,2.46,2.669091,0
5803,119,1.0174,6.324197,13.532553,-2.808559,184.488616,175.683643,8.804973,7.0,57.0,...,22.333333,66.0,207.0,141.0,118.166667,1.1,3.44,2.34,2.438333,0
7001,65,0.6695,7.28274,11.284284,-2.919847,179.816949,173.865535,5.951413,8.0,73.0,...,21.5,66.0,170.0,104.0,95.5,1.5,3.44,1.94,2.82,0
6846,19,1.4637,7.242374,13.289026,-2.178425,287.518989,280.885539,6.63345,8.0,74.0,...,24.5,66.0,162.0,96.0,90.0,2.36,3.44,1.08,3.17,0


Cluster 1: 1 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
260,109,0.0,7.854668,17.740081,-0.670993,57262.630716,10404.720206,46857.91051,33.0,41.0,...,37.0,119.0,164.0,45.0,141.5,1.6,2.18,0.58,1.89,1


Cluster 2: 1 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
5112,12,1.1429,5.114826,30.542419,-0.816408,25244.446111,95.629586,25148.816525,34.0,51.0,...,40.25,120.0,144.0,24.0,127.75,1.69,2.96,1.27,2.42,2


Cluster 3: 1570 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
7036,15,2.5261,4.426263,10.875263,-2.371639,8.808742,5.420374,3.388368,8.0,32.0,...,13.6,66.0,141.0,75.0,91.8,1.31,3.44,2.13,2.728,3
4169,14,5.1473,2.219458,8.511242,-2.553598,6.89891,4.268476,2.630435,3.0,8.0,...,5.428571,66.0,128.0,62.0,95.142857,0.98,3.44,2.46,2.185714,3
6157,61,0.6371,6.274966,14.895708,-0.612116,30.141843,5.864121,24.277722,15.0,34.0,...,25.333333,107.0,126.0,19.0,117.666667,1.88,2.55,0.67,2.206667,3
2571,65,0.0734,10.664275,11.463193,-1.186381,18.658185,2.437054,16.22113,8.0,78.0,...,30.7,66.0,139.0,73.0,94.3,1.55,3.44,1.89,2.903,3
1121,63,2.3991,3.218591,12.063844,-2.182629,8.007591,4.384588,3.623003,8.0,24.0,...,11.333333,66.0,141.0,75.0,90.666667,1.31,3.44,2.13,2.788333,3


Cluster 4: 7 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
497,129,3.6069,3.155165,16.912987,-3.115056,981.669996,977.424942,4.245054,8.0,21.0,...,15.333333,66.0,170.0,104.0,112.666667,1.36,3.44,2.08,2.653333,4
5936,129,0.8578,6.88389,13.981062,-2.18172,1182.46461,1175.49736,6.96725,8.0,74.0,...,24.5,66.0,162.0,96.0,90.0,2.36,3.44,1.08,3.17,4
4161,167,1.6615,8.945674,12.504217,-2.403107,1501.073704,1494.344015,6.729689,8.0,73.0,...,28.8,66.0,170.0,104.0,102.6,1.5,3.44,1.94,2.75,4
3800,191,1.1401,6.392982,15.054648,-2.176713,1273.229423,1266.841423,6.388,8.0,74.0,...,24.5,66.0,162.0,96.0,90.0,2.36,3.44,1.08,3.17,4
3368,44,0.2119,4.00099,11.992913,-2.823312,1190.504081,1255.341414,-64.837332,8.0,23.0,...,13.416667,57.0,153.0,96.0,91.25,1.63,3.98,2.35,3.061667,4


Cluster 5: 7 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
4119,62,0.0623,5.979208,22.475555,-1.660664,856.798534,4.853354,851.94518,33.0,39.0,...,35.333333,119.0,190.0,71.0,143.0,1.22,2.55,1.33,1.983333,5
1313,185,0.064,2.434313,24.538387,-0.552974,963.340396,2.199834,961.140562,11.0,33.0,...,16.5,119.0,166.0,47.0,154.25,0.93,2.18,1.25,1.2425,5
3364,227,0.0,11.695506,15.125104,-0.81939,1258.840822,42.026733,1216.814089,7.0,72.0,...,44.714286,71.0,175.0,104.0,137.428571,1.3,3.04,1.74,1.698571,5
4790,5,0.0,3.151394,12.108053,-2.228627,641.604402,19.17994,622.424463,8.0,26.0,...,11.2,66.0,132.0,66.0,80.8,1.83,3.44,1.61,3.029,5
4286,216,0.1098,9.442611,23.784505,-1.364784,669.672961,0.935531,668.73743,39.0,78.0,...,56.0,136.0,190.0,54.0,155.0,1.22,2.28,1.06,1.85,5


Cluster 6: 725 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
2628,164,3.8971,3.810445,42.690617,-1.928175,5.369542,2.132453,3.23709,20.0,53.0,...,42.0,139.0,176.0,37.0,151.333333,1.0,2.66,1.66,2.106667,6
3014,122,2.0955,4.947128,23.865003,-2.385492,20.952346,13.461579,7.490766,16.0,59.0,...,31.428571,105.0,203.0,98.0,145.857143,0.95,2.58,1.63,1.932857,6
1504,225,0.1689,3.117889,21.52529,-0.343897,23.350282,6.960672,16.38961,12.0,32.0,...,18.666667,120.0,141.0,21.0,134.0,1.31,2.01,0.7,1.543333,6
7236,98,0.1332,5.705092,25.444131,-0.061231,22.242846,7.05397,15.188876,33.0,48.0,...,38.0,119.0,144.0,25.0,127.333333,1.69,2.18,0.49,2.016667,6
1490,122,0.0,5.619185,31.593161,-0.103688,23.761439,1.448506,22.312932,30.0,51.0,...,45.5,122.0,139.0,17.0,134.75,1.65,2.05,0.4,1.9275,6


Cluster 7: 838 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
2154,225,3.0173,6.892776,14.37159,-3.590597,46.967161,42.0059,4.961261,8.0,73.0,...,25.4,66.0,215.0,149.0,116.6,0.89,3.44,2.55,2.528,7
1344,225,1.4541,3.880169,32.606816,-1.744127,9.885204,6.939711,2.945493,17.0,82.0,...,32.666667,102.0,244.0,142.0,138.444444,0.79,3.16,2.37,2.541111,7
952,160,5.9967,3.562179,16.508656,-4.088083,20.756191,18.495867,2.260324,9.0,39.0,...,16.142857,57.0,203.0,146.0,95.928571,0.82,3.98,3.16,3.162857,7
4294,225,1.3192,4.015077,25.770601,-1.128565,14.106113,10.614688,3.491425,8.0,37.0,...,27.333333,66.0,220.0,154.0,168.666667,0.82,3.44,2.62,1.693333,7
4691,164,3.8624,3.468948,17.245667,-2.370439,6.94081,4.010906,2.929904,8.0,42.0,...,16.5,66.0,220.0,154.0,98.083333,0.82,3.44,2.62,2.855833,7


Cluster 8: 1062 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
3499,33,2.1425,4.296195,19.895105,-1.534687,15.471959,8.665735,6.806224,5.0,62.0,...,23.0,84.0,198.0,114.0,119.4,1.17,2.58,1.41,2.19,8
3734,69,0.8026,3.938838,22.263024,-0.946828,10.264583,3.413772,6.850811,16.0,37.0,...,24.666667,105.0,220.0,115.0,136.888889,0.82,2.58,1.76,1.965556,8
564,73,2.6752,5.014852,10.377929,-1.092791,10.471226,5.278071,5.193154,3.0,73.0,...,13.25,71.0,170.0,99.0,111.875,0.98,3.04,2.06,1.8175,8
4559,12,1.0155,3.457608,22.599944,-1.45378,12.757161,9.513923,3.243238,8.0,37.0,...,21.384615,66.0,220.0,154.0,144.615385,0.82,3.44,2.62,1.983077,8
2004,14,6.5991,3.407601,17.98057,-3.570267,8.075949,5.988104,2.087845,9.0,37.0,...,16.8,57.0,220.0,163.0,115.5,0.82,3.98,3.16,2.77,8


Cluster 9: 806 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
5456,164,3.1183,3.792487,16.363602,-2.462943,11.347476,7.620234,3.727241,8.0,42.0,...,17.090909,66.0,175.0,109.0,91.909091,1.33,3.44,2.11,3.015455,9
4557,215,3.3499,4.02526,10.658447,-1.409948,9.04413,4.142382,4.901747,7.0,31.0,...,12.25,71.0,122.0,51.0,96.125,1.61,3.04,1.43,2.35,9
2932,129,1.473,7.588488,25.666841,-1.366896,48.63873,41.993581,6.645149,8.0,83.0,...,48.0,66.0,148.0,82.0,117.666667,2.02,3.44,1.42,2.706667,9
2412,225,1.3445,7.256076,14.916655,-1.749808,22.956366,15.567205,7.389161,8.0,78.0,...,28.222222,66.0,142.0,76.0,101.222222,1.78,3.44,1.66,2.663333,9
2534,215,4.2525,2.719423,11.344975,-2.709732,8.455842,5.881868,2.573975,4.0,14.0,...,9.085714,66.0,166.0,100.0,103.142857,0.93,3.44,2.51,2.333143,9


In [15]:
# Tag every row with the split it belongs to, so the two frames can still be
# told apart after they are concatenated for saving.
# assign() returns new frames instead of writing into the split results in
# place; 'class' is a Python keyword, hence the dict unpacking.
X_train = X_train.assign(**{'class': 'train'})
X_test = X_test.assign(**{'class': 'test'})
display(X_train.head(), X_test.head())

Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster,class
2628,164,3.8971,3.810445,42.690617,-1.928175,5.369542,2.132453,3.23709,20.0,53.0,...,139.0,176.0,37.0,151.333333,1.0,2.66,1.66,2.106667,6,train
3014,122,2.0955,4.947128,23.865003,-2.385492,20.952346,13.461579,7.490766,16.0,59.0,...,105.0,203.0,98.0,145.857143,0.95,2.58,1.63,1.932857,6,train
2154,225,3.0173,6.892776,14.37159,-3.590597,46.967161,42.0059,4.961261,8.0,73.0,...,66.0,215.0,149.0,116.6,0.89,3.44,2.55,2.528,7,train
1344,225,1.4541,3.880169,32.606816,-1.744127,9.885204,6.939711,2.945493,17.0,82.0,...,102.0,244.0,142.0,138.444444,0.79,3.16,2.37,2.541111,7,train
7036,15,2.5261,4.426263,10.875263,-2.371639,8.808742,5.420374,3.388368,8.0,32.0,...,66.0,141.0,75.0,91.8,1.31,3.44,2.13,2.728,3,train


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster,class
6059,160,0.7921,4.086507,10.994694,-2.019264,14.078999,6.862966,7.216033,3.0,25.0,...,66.0,139.0,73.0,93.230769,0.98,3.44,2.46,2.677692,9,test
6010,186,1.9661,5.059669,22.397666,-2.460065,23.198326,15.958896,7.23943,8.0,57.0,...,66.0,207.0,141.0,138.583333,1.1,3.44,2.34,2.0725,7,test
4475,14,0.8712,6.411909,23.33201,-1.547133,21.690907,11.098378,10.592529,29.0,57.0,...,120.0,207.0,87.0,144.75,1.1,2.55,1.45,2.025,8,test
3689,127,1.2992,2.349037,12.536103,-0.274379,37.388675,30.495613,6.893062,1.0,20.0,...,31.0,176.0,145.0,105.571429,0.98,2.55,1.57,1.832857,7,test
4515,5,1.6206,4.385036,25.392872,-0.587003,11.310439,3.828769,7.48167,13.0,47.0,...,107.0,145.0,38.0,120.0,1.61,2.55,0.94,2.322,3,test


In [16]:
# Stack the tagged training and test rows into one frame; each row keeps its
# original index label.
X_save = pd.concat([X_train, X_test])
# Summary of the combined frame (row count, columns, non-null counts, dtypes).
X_save.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7277 entries, 2628 to 6415
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   space_group                           7277 non-null   int64  
 1   band_gap                              7277 non-null   float64
 2   density                               7277 non-null   float64
 3   density_atomic                        7277 non-null   float64
 4   formation_energy_per_atom             7277 non-null   float64
 5   e_total                               7277 non-null   float64
 6   e_ionic                               7277 non-null   float64
 7   e_electronic                          7277 non-null   float64
 8   MagpieData minimum Number             7277 non-null   float64
 9   MagpieData maximum Number             7277 non-null   float64
 10  MagpieData range Number               7277 non-null   float64
 11  MagpieData mean Num

In [17]:
# Write the clustered, split-tagged table to the working directory; the file
# name starts with the target column's name.
# NOTE(review): index=False discards the row index, which looks like the only
# remaining link back to the identifier columns dropped earlier -- confirm that
# downstream code does not need it.
X_save.to_csv(path_or_buf=f'./{DIELECTRIC_CONSTANT}_data_cluster.csv', index=False)