# 通过聚类算法将总的数据划分成相似的几个组

## 五个组

In [27]:
# Number of clusters requested from KMeans.
CLUSTER_NUMS = 5
# Seed passed to KMeans as random_state.
RANDOM_STATE = 42

In [2]:
# 导库
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [3]:
# Load the training data
data = pd.read_csv('../../data/data_training.csv')

In [4]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,material_id,composition,crystal_system,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,...,MagpieData range Number,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity
0,mp-28967,Ba1 Pd2 S4,Monoclinic,11,0.7792,5.022717,22.595815,-1.124079,17.048334,7.118969,...,40.0,30.285714,105.0,215.0,110.0,130.428571,0.89,2.58,1.69,2.23
1,mp-766094,Nb1 O2 F1,Orthorhombic,19,2.898,3.764366,15.869706,-3.099174,17.57201,13.469477,...,33.0,16.5,57.0,164.0,107.0,88.25,1.6,3.98,2.38,3.115
2,mp-36577,Sr1 As2 S4,Triclinic,1,1.7212,3.094976,28.031499,-0.7661,18.488667,12.443616,...,22.0,24.0,105.0,195.0,90.0,121.857143,0.95,2.58,1.63,2.232857
3,mp-1102092,Na1 Fe1 S2 O8,Monoclinic,12,2.0944,2.90126,12.923684,-1.948264,9.596025,6.499905,...,18.0,11.083333,66.0,166.0,100.0,86.333333,0.93,3.44,2.51,2.953333
4,mp-720391,B1 H4 N1 F4,Orthorhombic,62,7.4812,1.860992,9.35501,-1.970766,6.216546,4.350314,...,8.0,5.2,31.0,84.0,53.0,50.7,2.04,3.98,1.94,2.98


In [5]:
# Drop the three string-valued columns (id, formula, crystal system name)
# so that only the remaining columns are passed on to clustering.
non_feature_columns = ['material_id', 'composition', 'crystal_system']
X = data.drop(columns=non_feature_columns)

In [6]:
# Summarize column dtypes and non-null counts of the feature table.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7277 entries, 0 to 7276
Data columns (total 20 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   space_group                           7277 non-null   int64  
 1   band_gap                              7277 non-null   float64
 2   density                               7277 non-null   float64
 3   density_atomic                        7277 non-null   float64
 4   formation_energy_per_atom             7277 non-null   float64
 5   e_total                               7277 non-null   float64
 6   e_ionic                               7277 non-null   float64
 7   e_electronic                          7277 non-null   float64
 8   MagpieData minimum Number             7277 non-null   float64
 9   MagpieData maximum Number             7277 non-null   float64
 10  MagpieData range Number               7277 non-null   float64
 11  MagpieData mean N

In [28]:
# Build the KMeans estimator from the notebook-level settings.
# n_init is set explicitly to 10 (the default in effect for the run shown here) so the
# result does not change when scikit-learn switches its default, and so the
# FutureWarning emitted during fit is suppressed.
kmeans = KMeans(n_clusters=CLUSTER_NUMS, random_state=RANDOM_STATE, n_init=10)

In [29]:
# Fit on the feature columns only. A later cell adds a 'cluster' column to X in place,
# so re-running this cell would otherwise feed the previous labels back in as a feature.
# On a fresh kernel the column does not exist yet and errors='ignore' makes the drop a no-op.
# NOTE(review): the features are not scaled; the cluster previews further down show three
# single-sample clusters with very large e_total values - consider scaling or outlier handling.
kmeans.fit(X.drop(columns='cluster', errors='ignore'))

  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# Store each row's cluster label in a new 'cluster' column.
# Note: this modifies X in place, so X is no longer a pure feature matrix afterwards.
X['cluster'] = kmeans.labels_

In [31]:
# Group the labelled rows by their assigned cluster.
groups = X.groupby('cluster')

In [32]:
# For every cluster, report how many rows it holds and show its first five rows.
for cluster, df in groups:
    preview = df.head()
    print(f'Cluster {cluster}: {len(df)} samples')
    display(preview)

Cluster 0: 3384 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
9,173,1.4423,5.957552,25.814819,-1.615458,17.189158,8.379924,8.809235,14.0,59.0,...,39.666667,111.0,203.0,92.0,142.083333,1.13,2.55,1.42,2.089167,0
16,148,0.0,5.698406,13.352739,-2.310143,8.953313,5.759627,3.193686,9.0,80.0,...,19.75,57.0,139.0,82.0,76.625,1.66,3.98,2.32,3.4425,0
17,152,4.0656,2.032981,27.083309,-0.268661,3.975662,1.825061,2.150601,6.0,55.0,...,14.666667,71.0,244.0,173.0,101.5,0.79,3.04,2.25,2.501667,0
19,225,0.1981,6.182079,23.64023,-1.09324,58.674453,23.928634,34.745818,20.0,79.0,...,38.0,119.0,176.0,57.0,151.75,1.0,2.54,1.54,1.68,0
21,139,4.3016,3.419061,15.23614,-2.856015,9.056451,6.910176,2.146275,9.0,30.0,...,14.857143,57.0,203.0,146.0,108.0,0.82,3.98,3.16,2.744286,0


Cluster 1: 1 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
5354,127,0.5537,4.401575,47.7105,-1.20193,126575.316823,126567.273642,8.043181,50.0,55.0,...,52.8,139.0,244.0,105.0,160.0,0.79,2.66,1.87,2.146,1


Cluster 2: 1 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
260,109,0.0,7.854668,17.740081,-0.670993,57262.630716,10404.720206,46857.91051,33.0,41.0,...,37.0,119.0,164.0,45.0,141.5,1.6,2.18,0.58,1.89,2


Cluster 3: 1 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
5112,12,1.1429,5.114826,30.542419,-0.816408,25244.446111,95.629586,25148.816525,34.0,51.0,...,40.25,120.0,144.0,24.0,127.75,1.69,2.96,1.27,2.42,3


Cluster 4: 3890 samples


Unnamed: 0,space_group,band_gap,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mean Number,MagpieData minimum CovalentRadius,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,cluster
0,11,0.7792,5.022717,22.595815,-1.124079,17.048334,7.118969,9.929365,16.0,56.0,...,30.285714,105.0,215.0,110.0,130.428571,0.89,2.58,1.69,2.23,4
1,19,2.898,3.764366,15.869706,-3.099174,17.57201,13.469477,4.102533,8.0,41.0,...,16.5,57.0,164.0,107.0,88.25,1.6,3.98,2.38,3.115,4
2,1,1.7212,3.094976,28.031499,-0.7661,18.488667,12.443616,6.045051,16.0,38.0,...,24.0,105.0,195.0,90.0,121.857143,0.95,2.58,1.63,2.232857,4
3,12,2.0944,2.90126,12.923684,-1.948264,9.596025,6.499905,3.09612,8.0,26.0,...,11.083333,66.0,166.0,100.0,86.333333,0.93,3.44,2.51,2.953333,4
4,62,7.4812,1.860992,9.35501,-1.970766,6.216546,4.350314,1.866232,1.0,9.0,...,5.2,31.0,84.0,53.0,50.7,2.04,3.98,1.94,2.98,4


In [34]:
# Save the data of clusters 0 and 4 (in the run shown above, clusters 1-3 each
# contain a single sample).
output_dir = Path('../../data/groups')
# to_csv does not create missing directories, so make sure the target folder exists.
output_dir.mkdir(parents=True, exist_ok=True)
for cluster, df in groups:
    if cluster in [0, 4]:
        df.to_csv(output_dir / f'data_training_cluster_{cluster}of{CLUSTER_NUMS}.csv', index=False)