# Cluster
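Use a clustering algorithm (K-means) to partition the data, using **all the features**, into several groups of similar samples. The model is fit on the training set; test samples are then assigned to the learned clusters.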

In [1]:
# Notebook-wide settings, kept together so they are easy to tune.
CLUSTER_NUMS: int = 4    # number of clusters requested from K-means
RANDOM_STATE: int = 42   # seed handed to KMeans as random_state

In [2]:
# import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [3]:
# All data paths below are built from the kernel's current working directory.
current_path: str = os.getcwd()

In [6]:
# Resolve the train/test CSV locations under the Data directory.
file_dir = os.path.join(current_path, './Data')
train_csv = os.path.join(file_dir, 'train_data', 'mp_plus_train_set.csv')
test_csv = os.path.join(file_dir, 'test_data', 'mp_plus_test_set.csv')

# Read the DFT train and test tables into DataFrames.
dft_train = pd.read_csv(train_csv)
dft_test = pd.read_csv(test_csv)

# Cell output: (rows, columns) of each frame.
dft_train.shape, dft_test.shape


((8268, 145), (2074, 145))

In [7]:
# K-means estimator used by the fitting and prediction cells below.
# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto', so leaving it unset makes the clustering depend on the installed
# version (and emits a FutureWarning on 1.2/1.3).
# NOTE(review): if earlier outputs were produced with the 'auto' default, the
# cluster assignments may shift slightly with n_init=10 — confirm before
# overwriting previously saved cluster files.
kmeans = KMeans(n_clusters=CLUSTER_NUMS, random_state=RANDOM_STATE, n_init=10)

In [8]:
# Build the clustering feature matrix from the training set.
# 'composition' and 'band_gap' are excluded from the features. 'cluster' is
# also excluded when present: a later cell writes it onto dft_train, so without
# this guard re-running this cell would feed old cluster labels back in as a
# feature. The first drop still raises if a required column is missing.
train_features = (
    dft_train
    .drop(columns=['composition', 'band_gap'])
    .drop(columns=['cluster'], errors='ignore')
)

# Scale the features with a StandardScaler fitted on the training set.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(train_features)

# NOTE(review): every remaining column is used as a feature; if the CSV carries
# a saved index column it is included too — confirm this is intended.
kmeans.fit(scaled_X)

In [9]:
# Attach the K-means label of every training row as a 'cluster' column.
dft_train = dft_train.assign(cluster=kmeans.labels_)

In [10]:
# Assign each test sample to one of the clusters learned from the training set.
# The test features go through the scaler already fitted on the training data.
# 'cluster' is dropped when present so that re-running this cell (after it has
# written the column below) still passes the same feature columns to the scaler.
test_features = (
    dft_test
    .drop(columns=['composition', 'band_gap'])
    .drop(columns=['cluster'], errors='ignore')
)
scaled_test_X = scaler.transform(test_features)
dft_test['cluster'] = kmeans.predict(scaled_test_X)

In [11]:
# Preview the first five training rows.
dft_train.head(n=5)

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,cluster
0,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,-2.492486,...,0.0,0.0,0.0,70.0,225.0,155.0,132.0,74.4,70.0,0
1,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,-0.996232,...,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0,3
2,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,-3.138972,...,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0,0
3,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,-2.771539,...,0.422133,0.675412,0.0,12.0,229.0,217.0,98.0,103.2,12.0,0
4,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,-3.063253,...,0.0,0.0,0.0,12.0,225.0,213.0,65.0,64.0,12.0,0


In [12]:
# Preview the first five test rows.
dft_test.head(n=5)

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,cluster
0,Ac1H3,4,0,225,49.065716,7.784734,12.266429,6.594725,-20.422506,-0.671802,...,0.0,0.0,0.0,194.0,225.0,31.0,201.75,11.625,194.0,1
1,Ac2N2,4,2,186,99.880235,8.013618,24.970059,5.276132,-40.205678,-1.322563,...,0.0,0.0,0.0,194.0,225.0,31.0,209.5,15.5,194.0,0
2,Ag4Ge2Pb2S8,16,3,40,364.725495,5.680462,22.795343,3.729416,-4.371751,-0.549878,...,0.0,0.0,0.0,70.0,225.0,155.0,147.5,77.5,70.0,2
3,Ag4Hg2S2I4,12,3,36,397.001425,5.874191,33.083452,1.311221,-2.638433,-0.441444,...,0.0,0.0,0.0,64.0,225.0,161.0,135.666667,69.666667,64.0,3
4,Ag4P4Pd2O14,24,4,15,346.044808,4.761193,14.418534,1.535796,-6.480136,-1.785767,...,0.0,0.0,0.0,2.0,225.0,223.0,63.583333,80.708333,12.0,2


In [13]:
# Group the training rows by their 'cluster' value.
groups = dft_train.groupby(by='cluster')

In [14]:
# Report how many training samples fall in each cluster.
for cluster, n_samples in groups.size().items():
    print(f'Cluster {cluster}: {n_samples} samples')

Cluster 0: 3188 samples
Cluster 1: 1539 samples
Cluster 2: 1551 samples
Cluster 3: 1990 samples


In [16]:
# Write both frames to new CSVs in the same train_data / test_data folders
# the inputs were read from. index=False keeps the DataFrame index out of the files.
train_out_path = os.path.join(file_dir, 'train_data', 'mp_plus_cluster_all_train_set.csv')
test_out_path = os.path.join(file_dir, 'test_data', 'mp_plus_cluster_all_test_set.csv')
dft_train.to_csv(train_out_path, index=False)
dft_test.to_csv(test_out_path, index=False)