# Cluster
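Use the K-means clustering algorithm to partition the data into several groups of similar samples, based on the top 10 most important features. The model is fitted on the training set and then used to assign clusters to the test set.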

In [1]:
# Number of groups K-means should produce.
CLUSTER_NUMS = 4
# Seed handed to K-means so repeated runs give the same clusters.
RANDOM_STATE = 42

# The 10 most important feature columns; clustering uses only these.
TOP_IMPORTANCE = [
    'MagpieData mode CovalentRadius',
    'MagpieData maximum Electronegativity',
    'MagpieData mode Number',
    'MagpieData maximum NpValence',
    'MagpieData minimum CovalentRadius',
    'MagpieData maximum Column',
    'MagpieData minimum NValence',
    'MagpieData minimum Row',
    'MagpieData maximum MendeleevNumber',
    'MagpieData minimum GSbandgap',
]

In [2]:
# import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [3]:
# Working directory of the kernel; the data paths below are built relative to it,
# so the notebook must be started from the directory that contains `Data/`.
current_path = os.getcwd()

In [4]:
# Locate the data directory relative to the current working directory.
file_dir = os.path.join(current_path, './Data')

# DFT data: build the paths of the train and test CSVs, then read them.
train_csv = os.path.join(file_dir, 'train_data', 'mp_plus_train_set.csv')
test_csv = os.path.join(file_dir, 'test_data', 'mp_plus_test_set.csv')
dft_train = pd.read_csv(train_csv)
dft_test = pd.read_csv(test_csv)



In [5]:
# K-means model producing CLUSTER_NUMS clusters; random_state seeds the
# centroid initialisation so the cluster assignments are reproducible.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and warns about the change in 1.2-1.3), so relying on the
# default makes the resulting clusters depend on the installed version.
kmeans = KMeans(n_clusters=CLUSTER_NUMS, random_state=RANDOM_STATE, n_init=10)

In [6]:
# Standardise the selected training features, then fit K-means on the scaled
# values. The fitted scaler is kept so the test set can be transformed with
# the same statistics.
scaler = StandardScaler()
train_features = dft_train[TOP_IMPORTANCE]
scaled_X = scaler.fit_transform(train_features)
kmeans.fit(scaled_X)

In [7]:
# Attach the label K-means gave each training row as a new 'cluster' column.
dft_train = dft_train.assign(cluster=kmeans.labels_)

In [8]:
# Assign test rows to clusters: reuse the scaler fitted on the training set
# (transform only, no refit), then ask the fitted K-means model for labels.
test_features = dft_test[TOP_IMPORTANCE]
scaled_test_X = scaler.transform(test_features)
dft_test['cluster'] = kmeans.predict(scaled_test_X)

In [9]:
# Preview the first five training rows, including the new 'cluster' column.
dft_train.head(n=5)

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,cluster
0,Ag8B48Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,-0.736951,...,0.0,0.0,0.0,64.0,225.0,161.0,123.461538,54.887574,64.0,3
1,Ag40Te16Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,-0.366432,...,0.0,0.0,0.0,64.0,225.0,161.0,179.411765,53.633218,225.0,0
2,Ag30P8S32Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,-0.481548,...,0.0,0.0,0.0,2.0,225.0,223.0,123.552632,80.090028,70.0,1
3,Ag4Ge2S6,12,3,36,267.880017,4.767775,22.323335,2.600055,-4.253843,-0.466513,...,0.0,0.0,0.0,70.0,225.0,155.0,147.5,77.5,70.0,1
4,Ag8H16S12O48,84,3,33,1057.273255,3.191155,12.586586,0.517819,-5.785262,-1.449207,...,0.0,0.0,0.0,12.0,225.0,213.0,75.238095,73.768707,12.0,3


In [10]:
# Preview the first five test rows, including the predicted 'cluster' column.
dft_test.head(n=5)

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,cluster
0,Ag4C2O6,12,4,11,150.041421,6.103459,12.503452,1.709619,-6.049623,-1.089885,...,0.0,0.0,0.0,12.0,225.0,213.0,113.333333,101.333333,12.0,3
1,Ag12Ge6S36O126,180,6,147,2544.837686,3.197654,14.137987,0.457826,-6.23463,-1.599755,...,0.0,0.0,0.0,12.0,225.0,213.0,44.9,46.06,12.0,3
2,Ag6P3H3O12,24,6,151,310.718968,4.997597,12.946624,1.408234,-5.859486,-1.573441,...,0.0,0.0,0.0,2.0,225.0,223.0,86.75,95.9375,12.0,3
3,Ag8Se4O12,24,4,14,379.079939,6.00462,15.794997,1.41195,-4.829366,-0.825087,...,0.0,0.0,0.0,12.0,225.0,213.0,83.333333,94.444444,12.0,3
4,Ag8Se4O12,24,4,14,347.441675,6.551405,14.476736,5.866702,-13.060735,-0.572339,...,0.0,0.0,0.0,12.0,225.0,213.0,83.333333,94.444444,12.0,3


In [11]:
# Group the training rows by their assigned cluster label.
groups = dft_train.groupby(by='cluster')

In [12]:
# Report how many training samples fall into each cluster.
for cluster, n_samples in groups.size().items():
    print(f'Cluster {cluster}: {n_samples} samples')

Cluster 0: 663 samples
Cluster 1: 1067 samples
Cluster 2: 591 samples
Cluster 3: 2050 samples


In [13]:
# Write both frames, now carrying the 'cluster' column, into the same
# sub-directories the inputs were read from. index=False keeps the DataFrame
# index out of the CSV files.
clustered_outputs = [
    (dft_train, 'train_data', 'mp_plus_cluster_train_set.csv'),
    (dft_test, 'test_data', 'mp_plus_cluster_test_set.csv'),
]
for frame, subdir, filename in clustered_outputs:
    frame.to_csv(os.path.join(file_dir, subdir, filename), index=False)