# Hierarchical Clustering on Boron clusters

* author: Xin Chen
* email: Bismarrck@me.com

This Jupyter notebook reproduces the work of http://pubs.acs.org/doi/abs/10.1021/acs.jctc.6b01119.

I will apply the clustering scheme to boron clusters of $\textrm{CoB}_{21}$.

## 1. Overview

## 2. Declarations

In this section we import Python modules and declare global constants.

In [1]:
%matplotlib inline

In [2]:
import re
import sys
import time

import numpy as np

## 3. Prepare the Data

In [None]:
def extract_xyz(filename, verbose=True, total_size=None, num_sites=None):
  """
  Extract energies, coordinates and forces (for later usage) from the raw file.

  The file is read line by line as a sequence of records. Each record is:
    1. a line holding only the number of atoms,
    2. a line containing `energy=<value>`,
    3. one line per atom: an element symbol, three coordinates, three columns
       that are skipped, and three force components.
  Blank lines are ignored, and any line that does not match what is expected
  at the current position is silently skipped.

  Args:
    filename: a str, the path of the raw file.
    verbose: a bool. If True, progress and timing are written to stdout.
    total_size: an int or None, the maximum number of structures to read.
      Defaults to the global `TOTAL_SIZE`.
    num_sites: an int or None, the number of atoms in each structure.
      Defaults to the global `NUM_SITES`.

  Returns:
    energies: Array[N,], the energy of each structure.
    coordinates: Array[N, num_sites, 3], a 3D array containing the atomic
      coordinates.
    forces: Array[N, num_sites, 3], a 3D array containing the atomic forces.

    Here N is `total_size`. If the file holds fewer complete structures, the
    remaining rows are left as zeros.

  Raises:
    ValueError: if an atom-count line differs from `num_sites`.

  """
  if total_size is None:
    total_size = TOTAL_SIZE
  if num_sites is None:
    num_sites = NUM_SITES

  energies = np.zeros((total_size,), dtype=float)
  coordinates = np.zeros((total_size, num_sites, 3), dtype=float)
  forces = np.zeros((total_size, num_sites, 3), dtype=float)

  # NOTE: the numeric groups accept only digits, '.' and '-', so values written
  # in exponent notation (e.g. 1.0e-3) are not recognised by these patterns.
  energy_patt = re.compile(r".*energy=([\d.-]+).*")
  # Both halves must be raw strings; otherwise `\d` and `\s` are invalid string
  # escapes and newer Python versions warn about them.
  string_patt = re.compile(
    r"([A-Za-z]{1,2})\s+([\d.-]+)\s+([\d.-]+)\s+([\d.-]+)\s+"
    r"\d+\s+\d.\d+\s+\d+\s+([\d.-]+)\s+([\d.-]+)\s+([\d.-]+)")

  # Parser state: 0 = expect the atom count, 1 = expect the energy line,
  # 2 = reading atom lines.
  stage = 0
  i = 0  # Index of the structure currently being filled.
  j = 0  # Index of the atom within the current structure.

  tic = time.time()
  if verbose:
    sys.stdout.write("Extract cartesian coordinates ...\n")

  with open(filename) as f:
    for line in f:
      # Stop as soon as the requested number of structures has been read.
      if i == total_size:
        break
      text = line.strip()
      if text == "":
        continue
      if stage == 0:
        if text.isdigit():
          n = int(text)
          if n != num_sites:
            raise ValueError("The parsed size %d != NUM_SITES" % n)
          stage += 1
      elif stage == 1:
        m = energy_patt.search(text)
        if m:
          energies[i] = float(m.group(1))
          stage += 1
      elif stage == 2:
        m = string_patt.search(text)
        if m:
          # Group 1 is the element symbol, which is not stored.
          coordinates[i, j, :] = [float(v) for v in m.group(2, 3, 4)]
          forces[i, j, :] = [float(v) for v in m.group(5, 6, 7)]
          j += 1
          if j == num_sites:
            # The structure is complete: start over with the next record.
            j = 0
            stage = 0
            i += 1
            if verbose and i % 1000 == 0:
              sys.stdout.write("\rProgress: %7d  /  %7d" % (i, total_size))

  if verbose:
    print("")
    print("Total time: %.3f s\n" % (time.time() - tic))

  return energies, coordinates, forces