<h1><center> Welcome to alternatives to ML-MotEx </center></h1>

# First import modules, set seed parameters and import functions

In [31]:
import numpy as np
import matplotlib as mpl
from multiprocessing import Pool
from functools import partial
import matplotlib.pyplot as plt
import time, shap, random
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
import pandas as pd
from ase.io import read
from diffpy.Structure import Structure, Atom
from diffpy.srfit.pdf import PDFContribution, PDFParser, PDFGenerator
from diffpy.srfit.fitbase import FitRecipe, FitResults, Profile, FitContribution
from diffpy.srreal.pdfcalculator import DebyePDFCalculator
from scipy.optimize.minpack import leastsq

np.random.seed(14)

In [17]:
def Import_Dataset(FileName):
    """This function loads a catalogue of structures with their corresponding Rwp values and split the dataset 
    into a training set and validation set with features and labels."""
    # load data
    dataset = np.loadtxt(FileName, delimiter=" ", skiprows=0)
    dataset_original = dataset.copy()

    # Split into training and validation set
    dataset_train = dataset[:int(len(dataset)*0.8)]
    dataset_val = dataset[int(len(dataset)*0.8):len(dataset)]
    
    # split data into features (X) and labels (y)
    X_train = dataset_train[:,2:len(dataset)+1]
    y_train = dataset_train[:,1]
    X_val = dataset_val[:,2:len(dataset)+1]
    y_val = dataset_val[:,1]
    
    print("Number of Training Data:", len(y_train))
    print("Number of Validation Data:", len(y_val))
        
    return X_train, y_train, X_val, y_val
    
def fitting(structure_catalogue, plot, index):
    """This function takes in a 'starting_model', and an 'index' from the 'structure_catalogue'. It generates the 
    corresponding structure and fit it to the 'Experimental_Data'."""
    
    # Read structure and divide it into two lists: Atoms we want to iterate (W) and atoms we do not iterate (O)
    stru = read(starting_model)
    xyz = stru.get_positions()
    xyz_W = xyz[:NumW].copy()
    xyz_O = xyz[NumW:len(xyz)].copy()
    keep_O = np.zeros(len(xyz_O))
    h = 0
    # Cycle through W atoms and delete W according to index 0's from permutation
    permutations = np.asarray(structure_catalogue)[:,1:]
    for j in range(len(xyz_W)):
        if permutations[index][j] == 0:
            xyz_W = np.delete(xyz_W,j - h,0)
            h = h+1   
    # Cycle through all atoms that is not iteratable and test if it is within the threshold distance. Delete atoms with no bonds
    for j in range(len(xyz_O)):        
        for k in range(len(xyz_W)):
            dist = np.linalg.norm(xyz_W[k] - xyz_O[j])
            if dist < threshold:    
                keep_O[j] = 1
                break
    h = 0            
    for j in range(len(xyz_O)):
        if keep_O[j] == 0:
            xyz_O = np.delete(xyz_O,j - h, 0)
            h += 1
            
    # Create structure for iterable (W) and non-iterable (O) atoms and combine them
    W_cluster = Structure([Atom('W', xi) for xi in xyz_W])
    O_cluster = Structure([Atom('O', xi) for xi in xyz_O])
    cluster = W_cluster + O_cluster
    
    # Make a standard cluster refinement using Diffpy-CMI
    # Import the data and make it a PDFprofile. Define the range of the data that will be used in the fit.
    pdfprofile = Profile()
    pdfparser = PDFParser()
    pdfparser.parseFile(Experimental_Data)
    pdfprofile.loadParsedData(pdfparser)
    pdfprofile.setCalculationRange(xmin = 1.6, xmax = 10)

    # Setup the PDFgenerator that calculates the PDF from the structure
    pdfgenerator_cluster = PDFGenerator("G")
    # Add the profile and both generators to the PDFcontribution
    pdfcontribution = FitContribution("pdf")
    pdfcontribution.setProfile(pdfprofile, xname="r") 
    pdfcontribution.addProfileGenerator(pdfgenerator_cluster)
    
    pdfgenerator_cluster.setQmin(0.7)
    pdfgenerator_cluster.setQmax(20)
    pdfgenerator_cluster._calc.evaluatortype = 'OPTIMIZED'
    pdfgenerator_cluster.setStructure(cluster, periodic = False)

    # Use scaling factors proportional to molar content
    pdfcontribution.setEquation('mc*G')

    # Define the recipe to do the fit and add it to the PDFcontribution
    recipe = FitRecipe()
    recipe.addContribution(pdfcontribution)

    # Avoid too much output during fitting 
    recipe.clearFitHooks()

    # Add the scale factor.
    recipe.addVar(pdfcontribution.mc, 1.0, tag = "scale")
    
    # Add the instrumental parameters to the two generators
    pdfgenerator_cluster.qdamp.value = 0.05
    pdfgenerator_cluster.qbroad.value = 0.01

    # Add the delta2 parameters, and make sure it cannot take unphysical values
    recipe.addVar(pdfgenerator_cluster.delta2, 0, name = "delta2_cluster", tag = "delta2")

    # Add ADP and "cell" for the cluster
    phase_cluster = pdfgenerator_cluster.phase
    atoms = phase_cluster.getScatterers()
    lat = phase_cluster.getLattice()

    recipe.newVar("zoomscale1", 1.0, tag = "lat")
    recipe.newVar("zoomscale2", 1.0, tag = "lat")
    recipe.newVar("zoomscale3", 1.0, tag = "lat")
    recipe.constrain(lat.a, 'zoomscale1')
    recipe.constrain(lat.b, 'zoomscale2')
    recipe.constrain(lat.c, 'zoomscale3')

    W_cluster = recipe.newVar("W_Biso_cluster1", 0.4, tag = 'adp_w')
    O_cluster = recipe.newVar("O_Biso_cluster1", 0.4, tag = 'adp_o')

    for atom in atoms:
        if atom.element.title() == "W":
            recipe.constrain(atom.Biso, W_cluster)
        elif atom.element.title() == "O":
            recipe.constrain(atom.Biso, O_cluster)

    recipe.restrain("zoomscale1", lb = 0.99, ub = 1.01, sig = 0.001)
    recipe.restrain("zoomscale2", lb = 0.99, ub = 1.01, sig = 0.001)
    recipe.restrain("zoomscale3", lb = 0.99, ub = 1.01, sig = 0.001)
    
    #free parameters are set
    recipe.fix('all')
    recipe.free("scale", "lat")

    # Turn off printout of iteration number.
    #recipe.clearFitHooks()

    # We can now execute the fit using scipy's least square optimizer.
    leastsq(recipe.residual, recipe.getValues())
    
    # We calculate the goodness-of-fit, Rwp
    g = recipe.pdf.profile.y
    gcalc = recipe.pdf.evaluate()
    rfactor1 = np.sqrt(sum((g - gcalc)**2) / sum((g)**2))
    
    # if plot == 1 it will also plot the fit
    if plot == 1:
        print ("FIT RESULTS\n")
        res1 = FitResults(recipe)
        print (res1)

        # Plot the observed and refined PDF.
        # Get the experimental data from the recipe
        r = recipe.pdf.profile.x
        gobs = recipe.pdf.profile.y

        # Get the calculated PDF and compute the difference between the calculated and measured PDF
        gcalc = recipe.pdf.evaluate()
        baseline = 1.1 * gobs.min()
        gdiff = gobs - gcalc

        # Plot!
        plt.figure()
        plt.plot(r, gobs, 'bo', label="G(r) data")
        plt.plot(r, gcalc, 'r-', label="G(r) fit")
        plt.plot(r, gdiff + baseline, 'g-', label="G(r) diff")
        plt.plot(r, np.zeros_like(r) + baseline, 'k:')
        plt.xlabel(r"$r (\AA)$")
        plt.ylabel(r"$G (\AA^{-2})$")
        plt.legend()

        plt.show()
    return rfactor1

def fitting_multiprocess(structure_catalogue, SaveName, cores=1):
    """This function runs the refinement of all the structures in the structure catalogue using multiprocessing"""
    start_time = time.time()
    values = []
    # Set up multiprocessing refinement
    fitindex = range(len(structure_catalogue))
    p = Pool(processes=cores)
    plot = 0
    func = partial(fitting, structure_catalogue, plot)
    results = p.map(func, fitindex)
    p.close()
    p.join()
    
    # Start refinement and append results to lists
    for i in fitindex:
        if i % 100 == 0:
            print ("I have now fitted: ", str(i) + " structures out of " + str(len(structure_catalogue)))
        rw = results[i]
        values.append(i)
        values.append(rw)
    values = np.reshape(values,(int(len(values)/2) , 2))
    
    # Save results in format that is suitable for Machine Learning
    print ("Best fit")
    print (values[np.argmin(values[:,1])])
    print("Total execution time: %.3fs" % (time.time()-start_time))
    Result = np.column_stack([values, np.asarray(structure_catalogue)[values[:,0].astype(int)]])
    np.savetxt(SaveName, Result)
    return Result


# First we try to weight the atom contributions without use of ML

In [18]:
saveFits = "Training_Data/structure_109725.txt" # Name of the saved fits file

# Import dataset
X_train, y_train, X_val, y_val = Import_Dataset(saveFits)

for structure_number in range(len(y_train)):
    where_one = np.where(X_train[structure_number,:] == 1)
    X_train[structure_number,where_one] = y_train[structure_number]
X_train = X_train.mean(0)

X_train[1:]

Number of Training Data: 8000
Number of Validation Data: 2000


array([0.34211816, 0.35514468, 0.34529134, 0.35309727, 0.33877448,
       0.35206208, 0.34034462, 0.35062888, 0.34047383, 0.34878394,
       0.34502808, 0.34782042, 0.33975733, 0.35009632, 0.3418833 ,
       0.35845513, 0.34278846, 0.34962394, 0.342858  , 0.34319284,
       0.34110077, 0.34727624, 0.33939427, 0.35016531])

In [19]:
AtomContributionValues = X_train

Norm_AtomContributionValues = AtomContributionValues.copy()
# Normalise the AtomContributionValues and get the RGB color in viridis.reverse
amin, amax = min(Norm_AtomContributionValues), max(Norm_AtomContributionValues)
for i, val in enumerate(Norm_AtomContributionValues):
    Norm_AtomContributionValues[i] = (val-amin) / (amax-amin)
Norm_AtomContributionValues_ph = Norm_AtomContributionValues.copy()
Norm_AtomContributionValues_ph.sort()

# Normalise such that a threshold is set on the the 10 % lowest and 10 % highest atoms
norm = mpl.colors.Normalize(vmin=Norm_AtomContributionValues_ph[round((len(Norm_AtomContributionValues))/10)], vmax=Norm_AtomContributionValues_ph[-round((len(Norm_AtomContributionValues))/10)])
cmap = mpl.cm.cividis_r

print ("Atom contribution are calculated to: ")
for i in range(1,len(AtomContributionValues)):
    m = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
    print ("Atom #", str(i) + ": ", str(AtomContributionValues[i]), " Colorcode: ", mpl.colors.rgb2hex(m.to_rgba(Norm_AtomContributionValues[i])))


Atom contribution are calculated to: 
Atom # 1:  0.3421181568279513  Colorcode:  #ddc858
Atom # 2:  0.3551446799287627  Colorcode:  #2e416d
Atom # 3:  0.3452913361911336  Colorcode:  #afa471
Atom # 4:  0.3530972691010328  Colorcode:  #4c556c
Atom # 5:  0.3387744833823796  Colorcode:  #fee838
Atom # 6:  0.3520620826769287  Colorcode:  #595e6e
Atom # 7:  0.3403446204115771  Colorcode:  #f8df3c
Atom # 8:  0.350628875409687  Colorcode:  #6b6d72
Atom # 9:  0.34047382771166057  Colorcode:  #f7de3e
Atom # 10:  0.34878393871216035  Colorcode:  #807f78
Atom # 11:  0.3450280834129964  Colorcode:  #b3a670
Atom # 12:  0.34782042132729524  Colorcode:  #8d8878
Atom # 13:  0.3397573275252268  Colorcode:  #fee838
Atom # 14:  0.3500963218754674  Colorcode:  #717274
Atom # 15:  0.341883304770805  Colorcode:  #e0cb56
Atom # 16:  0.35845512549338987  Colorcode:  #00224e
Atom # 17:  0.3427884640247057  Colorcode:  #d3c05f
Atom # 18:  0.3496239413092981  Colorcode:  #767676
Atom # 19:  0.34285799592703825  

# Now we try simply to remove each atom’s partial and comparing the Rw?  

In [35]:
starting_model = "Structure_Models/109725.xyz" # Name of the starting model file
Number_of_structures = 10000 # Number of structures made to the structure catalogue
NumW = 24 # Number of atoms that should be permuted in the starting model
threshold = 2.5 # Thredshold for W - O bond

def structure_catalogue_maker(Number_of_atoms):
    structure_catalogue = []
    for i in range(Number_of_atoms):
        my_list = np.ones((25,))
        my_list[0] = 23
        my_list[i+1] = 0
        my_list = list(my_list)
        structure_catalogue.append(my_list)
    print ("Permutations Succeeded")
    return structure_catalogue



structure_catalogue = structure_catalogue_maker(Number_of_atoms=NumW)
print ("We show the first 10 structures in the catalogue:")
structure_catalogue


Permutations Succeeded
We show the first 10 structures in the catalogue:


[[23.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0],
 [23.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0],
 [23.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0],
 [23.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0],
 [23.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0],
 [23.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  

In [33]:
Experimental_Data = "Experimental_Data/DanMAX_AlphaKeggin.gr" # Name of the experimental file
saveFits = "Training_Data/structure_109725_Alternative.txt" # Name of the saved fits file


In [46]:
Result = fitting_multiprocess(structure_catalogue, SaveName=saveFits, cores=None)
print ("The best fitting structure is:")
Result


I have now fitted:  0 structures out of 24
Best fit
[1.         0.57189986]
Total execution time: 4.194s
The best fitting structure is:


array([[ 0.        ,  0.61399193, 23.        ,  0.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  0.57189986, 23.        ,  1.        ,  0.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ],
       [ 2.        ,  0.6180346 , 23.        ,  1.        ,  1.        ,
         0.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.    

In [58]:
AtomContributionValues = []
for i in range(len(Result)):
    AtomContributionValues.append(1-Result[i,1])

In [59]:
Norm_AtomContributionValues = AtomContributionValues.copy()
# Normalise the AtomContributionValues and get the RGB color in viridis.reverse
amin, amax = min(Norm_AtomContributionValues), max(Norm_AtomContributionValues)
for i, val in enumerate(Norm_AtomContributionValues):
    Norm_AtomContributionValues[i] = (val-amin) / (amax-amin)
Norm_AtomContributionValues_ph = Norm_AtomContributionValues.copy()
Norm_AtomContributionValues_ph.sort()

# Normalise such that a threshold is set on the the 10 % lowest and 10 % highest atoms
norm = mpl.colors.Normalize(vmin=Norm_AtomContributionValues_ph[round((len(Norm_AtomContributionValues))/10)], vmax=Norm_AtomContributionValues_ph[-round((len(Norm_AtomContributionValues))/10)])
cmap = mpl.cm.cividis_r

print ("Atom contribution are calculated to: ")
for i in range(len(AtomContributionValues)):
    m = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
    print ("Atom #", str(i+1) + ": ", str(AtomContributionValues[i]), " Colorcode: ", mpl.colors.rgb2hex(m.to_rgba(Norm_AtomContributionValues[i])))


Atom contribution are calculated to: 
Atom # 1:  0.3860080717514075  Colorcode:  #e8d24f
Atom # 2:  0.4281001421674805  Colorcode:  #00224e
Atom # 3:  0.38196540095101417  Colorcode:  #fee838
Atom # 4:  0.42556716179839205  Colorcode:  #002c66
Atom # 5:  0.38963045928590356  Colorcode:  #d2c060
Atom # 6:  0.4187124169054345  Colorcode:  #36466c
Atom # 7:  0.38600807170540996  Colorcode:  #e8d24f
Atom # 8:  0.4281001420552061  Colorcode:  #00224e
Atom # 9:  0.3819654009830128  Colorcode:  #fee838
Atom # 10:  0.4255671618861764  Colorcode:  #002c66
Atom # 11:  0.38963045921410233  Colorcode:  #d2c060
Atom # 12:  0.41871241694226435  Colorcode:  #36466c
Atom # 13:  0.38600807158521844  Colorcode:  #e8d24f
Atom # 14:  0.4281001421205406  Colorcode:  #00224e
Atom # 15:  0.3819654009906399  Colorcode:  #fee838
Atom # 16:  0.4255671617868457  Colorcode:  #002c66
Atom # 17:  0.38963045924543427  Colorcode:  #d2c060
Atom # 18:  0.4187124169770613  Colorcode:  #36466c
Atom # 19:  0.3860080718206