# Cluster
Use a clustering algorithm (K-means) to partition the full dataset into groups of similar samples, based on the top 10 most important features.

In [15]:
# import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [16]:
# Base directory for all relative paths used below: the process's current working directory
current_path = os.getcwd()

In [17]:
# Clustering configuration
CLUSTER_NUMS = 3    # number of K-means clusters
RANDOM_STATE = 42   # passed to KMeans as random_state

# Features used for clustering: the 'Feature' column of the XGBoost importance table.
# NOTE(review): every row of the file is used -- no top-10 truncation happens here,
# so the CSV is presumably already limited to the 10 most important features.
importance_csv = os.path.join(
    current_path,
    './metrics/feature_importance/mp_plus/plus_XGBoost_feature_importance.csv',
)
feature_importance_df = pd.read_csv(importance_csv)
top_importance = feature_importance_df['Feature'].tolist()
top_importance

['mode ThermalConductivity',
 'mode HeatVaporization',
 'maximum Electronegativity',
 'minimum HeatVaporization',
 'minimum ThermalConductivity',
 'range NfUnfilled',
 'minimum GSestBCClatcnt',
 'minimum AtomicVolume',
 'minimum MolarVolume',
 'maximum NUnfilled']

In [18]:
# Load the DFT train / test splits from the Data directory
file_dir = os.path.join(current_path, './Data')
train_csv = os.path.join(file_dir, 'rfe_train_data', 'mp_plus_train_set.csv')
test_csv = os.path.join(file_dir, 'rfe_test_data', 'mp_plus_test_set.csv')

dft_train = pd.read_csv(train_csv)
dft_test = pd.read_csv(test_csv)

# Show (rows, columns) of both splits
(dft_train.shape, dft_test.shape)

((7869, 130), (1963, 130))

In [19]:
# Configure the K-means estimator.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# (a single run for k-means++ init) in version 1.4 and emits a FutureWarning on
# 1.2-1.3 when it is left unset. Without pinning, the cluster assignments depend
# on the installed scikit-learn version even with a fixed random_state.
kmeans = KMeans(n_clusters=CLUSTER_NUMS, n_init=10, random_state=RANDOM_STATE)

In [20]:
# Standardise the selected features using statistics from the training split only,
# then fit K-means on the standardised matrix.
train_features = dft_train[top_importance]
scaler = StandardScaler().fit(train_features)
scaled_X = scaler.transform(train_features)
kmeans.fit(scaled_X)

In [21]:
# Attach each training row's cluster id as a new 'cluster' column (adds/overwrites it in place);
# kmeans.labels_ holds one label per row of the matrix that was passed to fit()
dft_train['cluster'] = kmeans.labels_

In [22]:
# Assign each test row to the nearest centroid learned on the training split.
# The already-fitted scaler is reused (transform only), so the test features are
# standardised with the training mean/variance rather than their own.
test_features = dft_test[top_importance]
scaled_test_X = scaler.transform(test_features)
dft_test['cluster'] = kmeans.predict(scaled_test_X)

In [23]:
# Preview the first rows of the training split (the 'cluster' column is the last one)
dft_train.head()

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,mean NpUnfilled,avg_dev NpUnfilled,mean NpValence,avg_dev NpValence,avg_dev NsValence,maximum ZungerPP-r_sigma,mean ZungerPP-r_sigma,avg_dev ZungerPP-r_sigma,range ZungerPP-r_sigma,cluster
0,Ag2Au4F16,22,4,14,312.794323,6.941567,14.217924,-1.966392,-14.614052,-1.201868,...,0.727273,0.396694,3.636364,1.983471,0.396694,2.66,0.994091,0.85686,2.255,2
1,Ag8B48Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,-0.736951,...,2.769231,2.059172,2.769231,2.059172,0.142012,2.375,1.015769,0.209112,1.58,2
2,Ag40Te16Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,-0.366432,...,0.647059,0.761246,1.823529,2.145329,0.484429,2.375,2.001765,0.4391,1.175,1
3,Ag30P8S32Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,-0.481548,...,1.236842,1.01385,2.394737,1.890582,0.477839,2.375,1.610921,0.60322,1.365,2
4,Ag8Bi4O12,24,3,34,366.571761,8.565436,15.273823,3.0267,-4.987277,-0.917311,...,1.5,1.0,2.5,1.666667,0.444444,2.375,1.357,0.892,1.91,2


In [24]:
# Preview the first rows of the test split (the 'cluster' column is the last one)
dft_test.head()

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,mean NpUnfilled,avg_dev NpUnfilled,mean NpValence,avg_dev NpValence,avg_dev NsValence,maximum ZungerPP-r_sigma,mean ZungerPP-r_sigma,avg_dev ZungerPP-r_sigma,range ZungerPP-r_sigma,cluster
0,Ag12Ge6S36O126,180,6,147,2544.837686,3.197654,14.137987,0.457826,-6.23463,-1.599755,...,1.933333,0.257778,3.666667,0.6,0.124444,2.375,0.755833,0.407167,1.91,2
1,Ag8H16S12O48,84,3,33,1057.273255,3.191155,12.586586,0.517819,-5.785262,-1.449207,...,1.428571,0.816327,2.857143,1.632653,0.408163,2.375,0.887143,0.482449,1.91,2
2,Ag16P8Se24,48,3,19,1108.881842,5.79337,23.101705,2.942116,-15.94416,-0.169408,...,1.5,1.0,2.5,1.666667,0.444444,2.375,1.640833,0.489444,1.135,1
3,Ag12Te6O24,42,4,15,571.396405,7.102537,13.604676,3.100366,-5.053196,-0.964478,...,1.428571,0.816327,2.857143,1.632653,0.408163,2.375,1.182857,0.820408,1.91,2
4,Ag3As1F12,16,5,2,257.772769,4.035878,16.110798,-1.540041,-4.061584,-1.801261,...,0.9375,0.351562,3.9375,1.59375,0.304688,2.375,0.8375,0.64875,1.97,2


In [25]:
# Group the training rows by their assigned cluster id
groups = dft_train.groupby('cluster')

In [26]:
# Report how many training samples fall into each cluster (ascending cluster id)
for cluster, n_samples in groups.size().items():
    print(f'Cluster {cluster}: {n_samples} samples')

Cluster 0: 783 samples
Cluster 1: 2440 samples
Cluster 2: 4646 samples


In [27]:
# Persist both splits, now carrying the 'cluster' column, next to their source files.
# The DataFrame index is not written to the CSV.
cluster_outputs = (
    (dft_train, 'rfe_train_data', 'mp_plus_cluster_train_set.csv'),
    (dft_test, 'rfe_test_data', 'mp_plus_cluster_test_set.csv'),
)
for frame, subdir, filename in cluster_outputs:
    frame.to_csv(os.path.join(file_dir, subdir, filename), index=False)