# Formulations Design of Experiment
## Aniket Chitre

In [35]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
import itertools
from scipy.stats import qmc
from skopt.sampler import Lhs
from skopt.space import Space

In [2]:
# formulations ingredients

surfactants_list = [
 'Texapon SB 3 KC',
 'Plantapon ACG 50',
 'Plantapon LC 7',
 'Plantacare 818',
 'Plantacare 2000',
 'Dehyton MC',
 'Dehyton PK 45',
 'Dehyton ML',
 'Dehyton AB 30',
 'Plantapon Amino SCG-L',
 'Plantapon Amino KG-L',
 'Dehyquart A-CA'
]

conditioning_polymer = [
    "Luviquat Excellence",
    "Dehyquart CC6",
    "Dehyquart CC7 Benz",
    "Salcare Super 7"
]

thickener = [
    "Arlypon F",
    "Arlypon TT"
]

In [39]:
S_pairs = list(itertools.combinations(surfactants_list, 2))

In [40]:
S_pairs

[('Texapon SB 3 KC', 'Plantapon ACG 50'),
 ('Texapon SB 3 KC', 'Plantapon LC 7'),
 ('Texapon SB 3 KC', 'Plantacare 818'),
 ('Texapon SB 3 KC', 'Plantacare 2000'),
 ('Texapon SB 3 KC', 'Dehyton MC'),
 ('Texapon SB 3 KC', 'Dehyton PK 45'),
 ('Texapon SB 3 KC', 'Dehyton ML'),
 ('Texapon SB 3 KC', 'Dehyton AB 30'),
 ('Texapon SB 3 KC', 'Plantapon Amino SCG-L'),
 ('Texapon SB 3 KC', 'Plantapon Amino KG-L'),
 ('Texapon SB 3 KC', 'Dehyquart A-CA'),
 ('Plantapon ACG 50', 'Plantapon LC 7'),
 ('Plantapon ACG 50', 'Plantacare 818'),
 ('Plantapon ACG 50', 'Plantacare 2000'),
 ('Plantapon ACG 50', 'Dehyton MC'),
 ('Plantapon ACG 50', 'Dehyton PK 45'),
 ('Plantapon ACG 50', 'Dehyton ML'),
 ('Plantapon ACG 50', 'Dehyton AB 30'),
 ('Plantapon ACG 50', 'Plantapon Amino SCG-L'),
 ('Plantapon ACG 50', 'Plantapon Amino KG-L'),
 ('Plantapon ACG 50', 'Dehyquart A-CA'),
 ('Plantapon LC 7', 'Plantacare 818'),
 ('Plantapon LC 7', 'Plantacare 2000'),
 ('Plantapon LC 7', 'Dehyton MC'),
 ('Plantapon LC 7', 'Dehyt

[('Texapon SB 3 KC', 'Plantapon ACG 50'),
 ('Texapon SB 3 KC', 'Plantapon LC 7'),
 ('Texapon SB 3 KC', 'Plantacare 818'),
 ('Texapon SB 3 KC', 'Plantacare 2000'),
 ('Texapon SB 3 KC', 'Dehyton MC'),
 ('Texapon SB 3 KC', 'Dehyton PK 45'),
 ('Texapon SB 3 KC', 'Dehyton ML'),
 ('Texapon SB 3 KC', 'Dehyton AB 30'),
 ('Texapon SB 3 KC', 'Plantapon Amino SCG-L'),
 ('Texapon SB 3 KC', 'Plantapon Amino KG-L'),
 ('Texapon SB 3 KC', 'Dehyquart A-CA'),
 ('Plantapon ACG 50', 'Plantapon LC 7'),
 ('Plantapon ACG 50', 'Plantacare 818'),
 ('Plantapon ACG 50', 'Plantacare 2000'),
 ('Plantapon ACG 50', 'Dehyton MC'),
 ('Plantapon ACG 50', 'Dehyton PK 45'),
 ('Plantapon ACG 50', 'Dehyton ML'),
 ('Plantapon ACG 50', 'Dehyton AB 30'),
 ('Plantapon ACG 50', 'Plantapon Amino SCG-L'),
 ('Plantapon ACG 50', 'Plantapon Amino KG-L'),
 ('Plantapon ACG 50', 'Dehyquart A-CA'),
 ('Plantapon LC 7', 'Plantacare 818'),
 ('Plantapon LC 7', 'Plantacare 2000'),
 ('Plantapon LC 7', 'Dehyton MC'),
 ('Plantapon LC 7', 'Dehyt

In [3]:
# concentrations discretised in 1 w/w% steps

C_S1 = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
C_S2 = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
C_P = [2.0, 3.0, 4.0, 5.0, 6.0]
C_T = [2.0, 3.0, 4.0, 5.0, 6.0]

Made a couple of choices here: 

* Discretised the concentrations into 1 w/w % otherwise the combinations were becoming computationally prohibitive. 
* Similar, case with factoring out P & T; in 8 dimensions, the kernel would repeatedly die on my laptop. Furthermore, I'm going to be developing a phase stability classifier, for which I'm going to ignore P and T based on the following reasoning: i) Thickeners are developed in such a way that they can modulate the viscosity of the formulation without interefering too much with the chemistry of the formulation in other ways, in that respect only studying the concentration of thickener should be sufficient to have a rough understanding of phase stability; ii) The surfactant - polyelectrolyte interactions are defintely going to have significant control on the phase stability of the formulation, however, all the ingredients are similar in the fact that they're positively charged conditioning polymer, and so I'm assuming there electrostatic contributions should be roughly similar to the phase stabiity problem. For sure, their contribution to viscosity and turbidity will vary signficantly with a broad range of MWs being represented by the 4 conditioning polymers used in this study, however, to first approximation, I'm assuming simply the concentration of conditioning polymer should suffice. 

In [4]:
interested_combinations = surfactants_list + C_S1 + C_S2 + C_P + C_T

In [5]:
experiments = list(itertools.combinations(interested_combinations, 6))

In [6]:
DoE_list = [i for i in experiments if (i[0] in surfactants_list) and (i[1] in surfactants_list) 
            and (i[2] in C_S1) and (i[3] in C_S2) and (i[4] in C_P) and (i[5] in C_T)]

In [7]:
DoE_df = pd.DataFrame(DoE_list, columns = ['S1', 'S2','$C_{S1}$', '$C_{S2}$', '$C_{P}$', '$C_{T}$'])
DoE_df = DoE_df.drop_duplicates().reset_index()
DoE_df.drop(['index'], axis=1, inplace=True)
DoE_df

Unnamed: 0,S1,S2,$C_{S1}$,$C_{S2}$,$C_{P}$,$C_{T}$
0,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,5.0
1,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,6.0
2,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,2.0
3,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,3.0
4,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,4.0
...,...,...,...,...,...,...
80845,Plantapon Amino KG-L,Dehyquart A-CA,8.0,8.0,6.0,2.0
80846,Plantapon Amino KG-L,Dehyquart A-CA,8.0,8.0,6.0,3.0
80847,Plantapon Amino KG-L,Dehyquart A-CA,8.0,8.0,6.0,4.0
80848,Plantapon Amino KG-L,Dehyquart A-CA,8.0,8.0,6.0,5.0


Unnamed: 0,S1,S2,$C_{S1}$,$C_{S2}$,$C_{P}$,$C_{T}$
0,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,5.0
1,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,6.0
2,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,2.0
3,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,3.0
4,Texapon SB 3 KC,Plantapon ACG 50,2.0,3.0,4.0,4.0
...,...,...,...,...,...,...
80845,Plantapon Amino KG-L,Dehyquart A-CA,8.0,8.0,6.0,2.0
80846,Plantapon Amino KG-L,Dehyquart A-CA,8.0,8.0,6.0,3.0
80847,Plantapon Amino KG-L,Dehyquart A-CA,8.0,8.0,6.0,4.0
80848,Plantapon Amino KG-L,Dehyquart A-CA,8.0,8.0,6.0,5.0


In [8]:
surf_itertools = list(itertools.combinations(surfactants_list, 2))

surf_combos = [i for i in surf_itertools if (i[0] in surfactants_list) and (i[1] in surfactants_list)]

S_combo_df = pd.DataFrame(surf_combos, columns=['S1', 'S2'])
S_combo_df

Unnamed: 0,S1,S2
0,Texapon SB 3 KC,Plantapon ACG 50
1,Texapon SB 3 KC,Plantapon LC 7
2,Texapon SB 3 KC,Plantacare 818
3,Texapon SB 3 KC,Plantacare 2000
4,Texapon SB 3 KC,Dehyton MC
...,...,...
61,Dehyton AB 30,Plantapon Amino KG-L
62,Dehyton AB 30,Dehyquart A-CA
63,Plantapon Amino SCG-L,Plantapon Amino KG-L
64,Plantapon Amino SCG-L,Dehyquart A-CA


Unnamed: 0,S1,S2
0,Texapon SB 3 KC,Plantapon ACG 50
1,Texapon SB 3 KC,Plantapon LC 7
2,Texapon SB 3 KC,Plantacare 818
3,Texapon SB 3 KC,Plantacare 2000
4,Texapon SB 3 KC,Dehyton MC
...,...,...
61,Dehyton AB 30,Plantapon Amino KG-L
62,Dehyton AB 30,Dehyquart A-CA
63,Plantapon Amino SCG-L,Plantapon Amino KG-L
64,Plantapon Amino SCG-L,Dehyquart A-CA


In [9]:
df_FG_ratios = pd.read_csv('Featurisation_csv/Surfactant_FG_counts_scaled.csv', index_col=0)

In [10]:
def cosine_similarity(a, b):
    return dot(a, b)/(norm(a)*norm(b))

def tanimoto_similarity(a, b):
    return dot(a, b)/(sum(a**2) + sum(b**2) - dot(a, b))

def dice_similarity(a, b):
    return (2*dot(a, b))/(sum(a**2) + sum(b**2))

In [11]:
surf_sim_cosine   = []
surf_sim_tanimoto = []
surf_sim_dice     = []

for row in S_combo_df.index:
    a = df_FG_ratios.loc[S_combo_df['S1'][row]]
    b = df_FG_ratios.loc[S_combo_df['S2'][row]]
    surf_sim_cosine.append(cosine_similarity(a,b))
    surf_sim_tanimoto.append(tanimoto_similarity(a,b))
    surf_sim_dice.append(dice_similarity(a,b))

In [12]:
sim_array = np.array([surf_sim_cosine, surf_sim_tanimoto, surf_sim_dice])

avg_sim = np.average(sim_array, axis=0)

In [13]:
similarity_scores = {'Cosine': surf_sim_cosine, 'Tanimoto': surf_sim_tanimoto, 'Dice': surf_sim_dice, 'Average': avg_sim}
df_sim = pd.DataFrame(similarity_scores)


S_combo_df_sim = pd.concat((S_combo_df.reset_index(), df_sim), axis=1)
df_surf_pairs = S_combo_df_sim.drop(['index'], axis=1)

In [14]:
df_surf_pairs_sorted = df_surf_pairs.sort_values(by=['Cosine'], ascending=True)
df_surf_pairs_sorted

Unnamed: 0,S1,S2,Cosine,Tanimoto,Dice,Average
58,Dehyton ML,Plantapon Amino KG-L,-0.911197,-0.307273,-0.887142,-0.701871
28,Plantapon LC 7,Plantapon Amino KG-L,-0.865016,-0.289002,-0.812948,-0.655655
36,Plantacare 818,Plantapon Amino KG-L,-0.785579,-0.223657,-0.576181,-0.528472
51,Dehyton PK 45,Dehyton ML,-0.733288,-0.178452,-0.434429,-0.448723
24,Plantapon LC 7,Dehyton PK 45,-0.719857,-0.190219,-0.469803,-0.459960
...,...,...,...,...,...,...
45,Dehyton MC,Dehyton PK 45,0.878370,0.783093,0.878354,0.846605
15,Plantapon ACG 50,Dehyton PK 45,0.879380,0.778522,0.875471,0.844458
14,Plantapon ACG 50,Dehyton MC,0.907179,0.822514,0.902615,0.877436
52,Dehyton PK 45,Dehyton AB 30,0.922947,0.856877,0.922923,0.900915


Unnamed: 0,S1,S2,Cosine,Tanimoto,Dice,Average
58,Dehyton ML,Plantapon Amino KG-L,-0.911197,-0.307273,-0.887142,-0.701871
28,Plantapon LC 7,Plantapon Amino KG-L,-0.865016,-0.289002,-0.812948,-0.655655
36,Plantacare 818,Plantapon Amino KG-L,-0.785579,-0.223657,-0.576181,-0.528472
51,Dehyton PK 45,Dehyton ML,-0.733288,-0.178452,-0.434429,-0.448723
24,Plantapon LC 7,Dehyton PK 45,-0.719857,-0.190219,-0.469803,-0.459960
...,...,...,...,...,...,...
45,Dehyton MC,Dehyton PK 45,0.878370,0.783093,0.878354,0.846605
15,Plantapon ACG 50,Dehyton PK 45,0.879380,0.778522,0.875471,0.844458
14,Plantapon ACG 50,Dehyton MC,0.907179,0.822514,0.902615,0.877436
52,Dehyton PK 45,Dehyton AB 30,0.922947,0.856877,0.922923,0.900915


In [15]:
df_surf_pairs[df_surf_pairs['Average'] < -0.4]

Unnamed: 0,S1,S2,Cosine,Tanimoto,Dice,Average
2,Texapon SB 3 KC,Plantacare 818,-0.621552,-0.218531,-0.559284,-0.466456
6,Texapon SB 3 KC,Dehyton ML,-0.70082,-0.177712,-0.432237,-0.436923
11,Plantapon ACG 50,Plantapon LC 7,-0.654964,-0.186415,-0.458255,-0.433211
12,Plantapon ACG 50,Plantacare 818,-0.594816,-0.214195,-0.54516,-0.45139
23,Plantapon LC 7,Dehyton MC,-0.717565,-0.189023,-0.46616,-0.457583
24,Plantapon LC 7,Dehyton PK 45,-0.719857,-0.190219,-0.469803,-0.45996
26,Plantapon LC 7,Dehyton AB 30,-0.714714,-0.18995,-0.468982,-0.457882
28,Plantapon LC 7,Plantapon Amino KG-L,-0.865016,-0.289002,-0.812948,-0.655655
31,Plantacare 818,Dehyton MC,-0.581772,-0.203213,-0.51008,-0.431688
32,Plantacare 818,Dehyton PK 45,-0.65375,-0.223256,-0.574851,-0.483953


Unnamed: 0,S1,S2,Cosine,Tanimoto,Dice,Average
2,Texapon SB 3 KC,Plantacare 818,-0.621552,-0.218531,-0.559284,-0.466456
6,Texapon SB 3 KC,Dehyton ML,-0.70082,-0.177712,-0.432237,-0.436923
11,Plantapon ACG 50,Plantapon LC 7,-0.654964,-0.186415,-0.458255,-0.433211
12,Plantapon ACG 50,Plantacare 818,-0.594816,-0.214195,-0.54516,-0.45139
23,Plantapon LC 7,Dehyton MC,-0.717565,-0.189023,-0.46616,-0.457583
24,Plantapon LC 7,Dehyton PK 45,-0.719857,-0.190219,-0.469803,-0.45996
26,Plantapon LC 7,Dehyton AB 30,-0.714714,-0.18995,-0.468982,-0.457882
28,Plantapon LC 7,Plantapon Amino KG-L,-0.865016,-0.289002,-0.812948,-0.655655
31,Plantacare 818,Dehyton MC,-0.581772,-0.203213,-0.51008,-0.431688
32,Plantacare 818,Dehyton PK 45,-0.65375,-0.223256,-0.574851,-0.483953


In [16]:
selected_pair = []
selected_surf = []

for row in df_surf_pairs_sorted.index:
    while len(selected_pair) < 6:
        S1 = df_surf_pairs_sorted['S1'][row]
        S2 = df_surf_pairs_sorted['S2'][row]
        
        if (S1 in selected_surf) or (S2 in selected_surf):
            #print('Already in')
            pass
        else:
            #print('Both new')
            S_pair = [S1, S2]
            for s in S_pair:
                selected_surf.append(s)
            selected_pair.append(S_pair)
            #df_selected_pair = df_selected_pair.append({'S1': S1, 'S2': S2})
        break

In [17]:
Surf_DoE = []

for i in range(len(selected_pair)):
    for j in range(6):
        Surf_DoE.append(selected_pair[i])

In [18]:
selected_pair

[['Dehyton ML', 'Plantapon Amino KG-L'],
 ['Plantapon LC 7', 'Dehyton PK 45'],
 ['Plantacare 818', 'Dehyton AB 30'],
 ['Texapon SB 3 KC', 'Plantacare 2000'],
 ['Dehyton MC', 'Dehyquart A-CA'],
 ['Plantapon ACG 50', 'Plantapon Amino SCG-L']]

[['Dehyton ML', 'Plantapon Amino KG-L'],
 ['Plantapon LC 7', 'Dehyton PK 45'],
 ['Plantacare 818', 'Dehyton AB 30'],
 ['Texapon SB 3 KC', 'Plantacare 2000'],
 ['Dehyton MC', 'Dehyquart A-CA'],
 ['Plantapon ACG 50', 'Plantapon Amino SCG-L']]

In [19]:
Sample_ID = []

for i in range(1,37):
    Sample_ID.append('S'+str(i))

In [20]:
Surf_dict = {'Sample ID': Sample_ID, 'S1': [i[0] for i in Surf_DoE], 'S2': [i[1] for i in Surf_DoE], 'P': 'Luviquat Excellence', 'T': 'Arlypon TT'}
Ingredients_DoE_df = pd.DataFrame(Surf_dict)

In [21]:
l_bounds = [2.0, 2.0, 2.0, 2.0]
u_bounds = [8.0, 8.0, 6.0, 6.0]

sampler = qmc.LatinHypercube(d=4, optimization="random-cd", seed=7)
sample = sampler.random(n=36)
Conc_DoE = np.round(qmc.scale(sample, l_bounds, u_bounds),1)
Conc_DoE_df = pd.DataFrame(Conc_DoE, columns=['C_S1', 'C_S2', 'C_P', 'C_T'])

In [22]:
DoE_df = Ingredients_DoE_df.merge(Conc_DoE_df, how="inner", left_index=True, right_index=True)
DoE_df.index = np.arange(1, len(DoE_df) + 1)

In [23]:
all_ingredients =  surfactants_list + conditioning_polymer + thickener
#all_ingredients =  ['Sample'] + surfactants_list + conditioning_polymer + thickener

DoE_OT = pd.DataFrame(index=range(DoE_df.shape[0]), columns=all_ingredients)
DoE_OT[:] = 0.
DoE_OT.index = np.arange(1, len(DoE_OT) + 1)

DoE_OT.insert(0, 'Sample', Sample_ID)
DoE_OT

Unnamed: 0,Sample,Texapon SB 3 KC,Plantapon ACG 50,Plantapon LC 7,Plantacare 818,Plantacare 2000,Dehyton MC,Dehyton PK 45,Dehyton ML,Dehyton AB 30,Plantapon Amino SCG-L,Plantapon Amino KG-L,Dehyquart A-CA,Luviquat Excellence,Dehyquart CC6,Dehyquart CC7 Benz,Salcare Super 7,Arlypon F,Arlypon TT
1,S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,S5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,S6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,S7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,S8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,S9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,S10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,Sample,Texapon SB 3 KC,Plantapon ACG 50,Plantapon LC 7,Plantacare 818,Plantacare 2000,Dehyton MC,Dehyton PK 45,Dehyton ML,Dehyton AB 30,Plantapon Amino SCG-L,Plantapon Amino KG-L,Dehyquart A-CA,Luviquat Excellence,Dehyquart CC6,Dehyquart CC7 Benz,Salcare Super 7,Arlypon F,Arlypon TT
1,S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,S5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,S6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,S7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,S8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,S9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,S10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
for sample in range(1, len(DoE_df)+1):
    
    for j in ['S1', 'S2', 'P', 'T']:
    
        ing  = DoE_df[j].loc[sample]
        conc = DoE_df['C_'+j].loc[sample]
        
        DoE_OT.loc[sample][ing] = conc

In [25]:
repeats_df = pd.DataFrame(np.repeat(DoE_OT.iloc[5::6].values, 2, axis=0), columns=DoE_OT.columns)
repeats_df.index = np.arange(37, len(repeats_df) + 37)

In [26]:
DoE_initial = pd.concat([DoE_OT, repeats_df])

In [27]:
DoE_initial['Water'] = 100 - (DoE_initial.iloc[:, 1:].sum(axis=1))
DoE_initial['Water'] = round(DoE_initial['Water'].astype(float),1)

In [50]:
ingredient_list = ['Texapon SB 3 KC', 'Plantapon ACG 50', 'Plantapon LC 7',
                   'Plantacare 818', 'Plantacare 2000',
                   'Dehyton MC', 'Dehyton PK 45', 'Dehyton ML', 'Dehyton AB 30',
                   'Plantapon Amino SCG-L', 'Plantapon Amino KG-L','Dehyquart A-CA',
                   'Luviquat Excellence', 'Dehyquart CC6',
                   'Dehyquart CC7 Benz', 'Salcare Super 7', 'Arlypon F', 'Arlypon TT', 'Water']

density_dict = {'Texapon SB 3 KC': 1.128, 'Plantapon ACG 50': 1.147, 'Plantapon LC 7': 1.070,
                'Plantacare 818': 1.104, 'Plantacare 2000': 1.103,
                'Dehyton MC': 1.097, 'Dehyton PK 45': 1.062, 'Dehyton ML': 1.084, 'Dehyton AB 30': 1.031,
                'Plantapon Amino SCG-L': 1.051, 'Plantapon Amino KG-L': 1.028,'Dehyquart A-CA': 0.955,
                'Luviquat Excellence': 1.118, 'Dehyquart CC6': 1.067,
                'Dehyquart CC7 Benz': 1.024, 'Salcare Super 7': 1.121, 'Arlypon F': 0.887, 'Arlypon TT': 0.970, 'Water': 0.998}

In [29]:
DoE_initial['Sample Density'] = (1/sum(((DoE_initial[i]/100)/density_dict[i]) for i in ingredient_list))
DoE_initial['Sample Density'] = round(DoE_initial['Sample Density'].astype(float),3)

In [30]:
DoE_initial.index.name = 'ID'

In [31]:
DoE_initial

Unnamed: 0_level_0,Sample,Texapon SB 3 KC,Plantapon ACG 50,Plantapon LC 7,Plantacare 818,Plantacare 2000,Dehyton MC,Dehyton PK 45,Dehyton ML,Dehyton AB 30,...,Plantapon Amino KG-L,Dehyquart A-CA,Luviquat Excellence,Dehyquart CC6,Dehyquart CC7 Benz,Salcare Super 7,Arlypon F,Arlypon TT,Water,Sample Density
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,...,5.0,0.0,2.7,0.0,0.0,0.0,0.0,4.2,84.6,1.004
2,S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.8,0.0,...,2.2,0.0,4.0,0.0,0.0,0.0,0.0,4.6,86.4,1.004
3,S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.1,0.0,...,4.7,0.0,2.1,0.0,0.0,0.0,0.0,2.9,83.2,1.006
4,S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4,0.0,...,7.7,0.0,3.2,0.0,0.0,0.0,0.0,5.5,76.2,1.008
5,S5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,0.0,...,3.6,0.0,2.6,0.0,0.0,0.0,0.0,4.8,84.7,1.004
6,S6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.5,0.0,...,2.3,0.0,3.4,0.0,0.0,0.0,0.0,5.9,82.9,1.005
7,S7,0.0,0.0,6.9,0.0,0.0,0.0,7.1,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.3,78.7,1.01
8,S8,0.0,0.0,7.8,0.0,0.0,0.0,4.1,0.0,0.0,...,0.0,0.0,4.4,0.0,0.0,0.0,0.0,5.3,78.4,1.009
9,S9,0.0,0.0,3.1,0.0,0.0,0.0,7.4,0.0,0.0,...,0.0,0.0,2.8,0.0,0.0,0.0,0.0,2.4,84.3,1.007
10,S10,0.0,0.0,6.0,0.0,0.0,0.0,4.3,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.1,84.6,1.007


Unnamed: 0_level_0,Sample,Texapon SB 3 KC,Plantapon ACG 50,Plantapon LC 7,Plantacare 818,Plantacare 2000,Dehyton MC,Dehyton PK 45,Dehyton ML,Dehyton AB 30,...,Plantapon Amino KG-L,Dehyquart A-CA,Luviquat Excellence,Dehyquart CC6,Dehyquart CC7 Benz,Salcare Super 7,Arlypon F,Arlypon TT,Water,Sample Density
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,...,5.0,0.0,2.7,0.0,0.0,0.0,0.0,4.2,84.6,1.004
2,S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.8,0.0,...,2.2,0.0,4.0,0.0,0.0,0.0,0.0,4.6,86.4,1.004
3,S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.1,0.0,...,4.7,0.0,2.1,0.0,0.0,0.0,0.0,2.9,83.2,1.006
4,S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4,0.0,...,7.7,0.0,3.2,0.0,0.0,0.0,0.0,5.5,76.2,1.008
5,S5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,0.0,...,3.6,0.0,2.6,0.0,0.0,0.0,0.0,4.8,84.7,1.004
6,S6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.5,0.0,...,2.3,0.0,3.4,0.0,0.0,0.0,0.0,5.9,82.9,1.005
7,S7,0.0,0.0,6.9,0.0,0.0,0.0,7.1,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.3,78.7,1.01
8,S8,0.0,0.0,7.8,0.0,0.0,0.0,4.1,0.0,0.0,...,0.0,0.0,4.4,0.0,0.0,0.0,0.0,5.3,78.4,1.009
9,S9,0.0,0.0,3.1,0.0,0.0,0.0,7.4,0.0,0.0,...,0.0,0.0,2.8,0.0,0.0,0.0,0.0,2.4,84.3,1.007
10,S10,0.0,0.0,6.0,0.0,0.0,0.0,4.3,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.1,84.6,1.007


In [32]:
DoE_initial.to_csv('DoE_csv/PhD_MasterDataset_OT_Dec2022.csv')

In [33]:
surfactant_all_features = pd.read_csv('BASFSurfactants_RawFeatures.csv')
surfactant_all_features = surfactant_all_features.loc[:, ~surfactant_all_features.columns.str.contains('^Unnamed')]

FileNotFoundError: [Errno 2] No such file or directory: 'BASFSurfactants_RawFeatures.csv'

FileNotFoundError: [Errno 2] No such file or directory: 'BASFSurfactants_RawFeatures.csv'

In [None]:
df_expt = surfactant_all_features.iloc[:, -5:]
df_expt.set_index([surfactants_list], inplace=True)
#df_expt.drop(['CMC (w/w%) ', 'Min. SFT (mN/m)'], axis=1, inplace=True)

In [None]:
from sklearn.preprocessing import StandardScaler

expt_scaler = StandardScaler()
df_expt_scaled = pd.DataFrame(expt_scaler.fit_transform(df_expt), index=df_expt.index, columns=df_expt.columns)
df_expt_scaled

In [None]:
expt_sim_cosine   = []
expt_sim_tanimoto = []
expt_sim_dice     = []

for row in S_combo_df.index:
    a = df_expt_scaled.loc[S_combo_df['S1'][row]]
    b = df_expt_scaled.loc[S_combo_df['S2'][row]]
    expt_sim_cosine.append(cosine_similarity(a,b))
    expt_sim_tanimoto.append(tanimoto_similarity(a,b))
    expt_sim_dice.append(dice_similarity(a,b))

In [None]:
expt_sim_array = np.array([expt_sim_cosine, expt_sim_tanimoto, expt_sim_dice])

avg_expt_sim = np.average(expt_sim_array, axis=0)

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(avg_sim.reshape(-1,1), avg_expt_sim.reshape(-1),1)
x = np.linspace(min(min(avg_sim), min(avg_expt_sim)), max(max(avg_sim), max(avg_expt_sim))).reshape(-1,1)
y = model.predict(x)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.dpi'] = 300

plt.figure(figsize=(6,6));
plt.scatter(avg_sim, avg_expt_sim);
plt.plot(x, x, '--');
plt.xlabel('Pairwise surfactant FG similarity scores');
plt.ylabel('Pairwise surfactant experimental similarity scores');

In [41]:
l_bounds = [8.0, 8.0, 1.0, 1.0]
u_bounds = [13.0, 13.0, 3.0, 5.0]

sampler = qmc.LatinHypercube(d=4, optimization="random-cd", seed=7)
sample = sampler.random(n=60)
LHS  = np.round(qmc.scale(sample, l_bounds, u_bounds),1)
# Conc_DoE_df = pd.DataFrame(Conc_DoE, columns=['C_S1', 'C_S2', 'C_P', 'C_T'])

In [42]:
LHS_df = pd.DataFrame(LHS, columns=['C_S1', 'C_S2', 'C_P', 'C_T'])
LHS_df

Unnamed: 0,C_S1,C_S2,C_P,C_T
0,10.4,10.9,1.3,1.7
1,12.6,10.8,1.6,3.1
2,12.5,11.1,2.4,4.7
3,9.0,11.3,1.5,1.2
4,8.2,9.2,1.0,2.1
5,12.4,12.3,2.5,1.5
6,12.0,8.3,1.3,2.3
7,9.7,10.8,1.8,2.5
8,11.4,9.5,2.2,4.0
9,8.1,10.5,1.8,4.9


Unnamed: 0,C_S1,C_S2,C_P,C_T
0,10.4,10.9,1.3,1.7
1,12.6,10.8,1.6,3.1
2,12.5,11.1,2.4,4.7
3,9.0,11.3,1.5,1.2
4,8.2,9.2,1.0,2.1
5,12.4,12.3,2.5,1.5
6,12.0,8.3,1.3,2.3
7,9.7,10.8,1.8,2.5
8,11.4,9.5,2.2,4.0
9,8.1,10.5,1.8,4.9


In [48]:
DoE_initial = pd.read_csv('DoE_csv/MasterDataset_OT_InitialDoE_Feb.csv', index_col='ID')

Unnamed: 0_level_0,Sample,Texapon SB 3 KC,Plantapon ACG 50,Plantapon LC 7,Plantacare 818,Plantacare 2000,Dehyton MC,Dehyton PK 45,Dehyton ML,Dehyton AB 30,Plantapon Amino SCG-L,Plantapon Amino KG-L,Dehyquart A-CA,Luviquat Excellence,Dehyquart CC6,Dehyquart CC7 Benz,Salcare Super 7,Arlypon F,Arlypon TT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,5.0,0.0,2.7,0,0,0,0,4.2
2,S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.8,0.0,0.0,2.2,0.0,4.0,0,0,0,0,4.6
3,S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.1,0.0,0.0,4.7,0.0,2.1,0,0,0,0,2.9
4,S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4,0.0,0.0,7.7,0.0,3.2,0,0,0,0,5.5
5,S5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,0.0,0.0,3.6,0.0,2.6,0,0,0,0,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,S134,0.0,0.0,0.0,0.0,0.0,0.0,11.2,0.0,0.0,0.0,0.0,10.3,1.0,0,0,0,0,3.6
135,S135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8,12.8,0.0,0.0,0.0,1.6,0,0,0,0,4.3
136,S136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.2,0.0,11.8,0.0,0.0,1.1,0,0,0,0,3.8
137,S137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.1,0.0,0.0,11.9,0.0,2.8,0,0,0,0,1.1


Unnamed: 0_level_0,Sample,Texapon SB 3 KC,Plantapon ACG 50,Plantapon LC 7,Plantacare 818,Plantacare 2000,Dehyton MC,Dehyton PK 45,Dehyton ML,Dehyton AB 30,Plantapon Amino SCG-L,Plantapon Amino KG-L,Dehyquart A-CA,Luviquat Excellence,Dehyquart CC6,Dehyquart CC7 Benz,Salcare Super 7,Arlypon F,Arlypon TT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,5.0,0.0,2.7,0,0,0,0,4.2
2,S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.8,0.0,0.0,2.2,0.0,4.0,0,0,0,0,4.6
3,S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.1,0.0,0.0,4.7,0.0,2.1,0,0,0,0,2.9
4,S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4,0.0,0.0,7.7,0.0,3.2,0,0,0,0,5.5
5,S5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,0.0,0.0,3.6,0.0,2.6,0,0,0,0,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,S134,0.0,0.0,0.0,0.0,0.0,0.0,11.2,0.0,0.0,0.0,0.0,10.3,1.0,0,0,0,0,3.6
135,S135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8,12.8,0.0,0.0,0.0,1.6,0,0,0,0,4.3
136,S136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.2,0.0,11.8,0.0,0.0,1.1,0,0,0,0,3.8
137,S137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.1,0.0,0.0,11.9,0.0,2.8,0,0,0,0,1.1


In [49]:
DoE_initial['Water'] = 100 - (DoE_initial.iloc[:, 1:].sum(axis=1))
DoE_initial['Water'] = round(DoE_initial['Water'].astype(float),1)

In [51]:
DoE_initial['Sample Density'] = (1/sum(((DoE_initial[i]/100)/density_dict[i]) for i in ingredient_list))
DoE_initial['Sample Density'] = round(DoE_initial['Sample Density'].astype(float),3)

In [53]:
DoE_initial

Unnamed: 0_level_0,Sample,Texapon SB 3 KC,Plantapon ACG 50,Plantapon LC 7,Plantacare 818,Plantacare 2000,Dehyton MC,Dehyton PK 45,Dehyton ML,Dehyton AB 30,...,Plantapon Amino KG-L,Dehyquart A-CA,Luviquat Excellence,Dehyquart CC6,Dehyquart CC7 Benz,Salcare Super 7,Arlypon F,Arlypon TT,Water,Sample Density
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,...,5.0,0.0,2.7,0,0,0,0,4.2,84.6,1.004
2,S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.8,0.0,...,2.2,0.0,4.0,0,0,0,0,4.6,86.4,1.004
3,S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.1,0.0,...,4.7,0.0,2.1,0,0,0,0,2.9,83.2,1.006
4,S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4,0.0,...,7.7,0.0,3.2,0,0,0,0,5.5,76.2,1.008
5,S5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,0.0,...,3.6,0.0,2.6,0,0,0,0,4.8,84.7,1.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,S134,0.0,0.0,0.0,0.0,0.0,0.0,11.2,0.0,0.0,...,0.0,10.3,1.0,0,0,0,0,3.6,73.9,1.000
135,S135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8,12.8,...,0.0,0.0,1.6,0,0,0,0,4.3,71.5,1.010
136,S136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.2,0.0,...,0.0,0.0,1.1,0,0,0,0,3.8,74.1,1.011
137,S137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.1,0.0,...,11.9,0.0,2.8,0,0,0,0,1.1,74.1,1.012


Unnamed: 0_level_0,Sample,Texapon SB 3 KC,Plantapon ACG 50,Plantapon LC 7,Plantacare 818,Plantacare 2000,Dehyton MC,Dehyton PK 45,Dehyton ML,Dehyton AB 30,...,Plantapon Amino KG-L,Dehyquart A-CA,Luviquat Excellence,Dehyquart CC6,Dehyquart CC7 Benz,Salcare Super 7,Arlypon F,Arlypon TT,Water,Sample Density
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,...,5.0,0.0,2.7,0,0,0,0,4.2,84.6,1.004
2,S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.8,0.0,...,2.2,0.0,4.0,0,0,0,0,4.6,86.4,1.004
3,S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.1,0.0,...,4.7,0.0,2.1,0,0,0,0,2.9,83.2,1.006
4,S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4,0.0,...,7.7,0.0,3.2,0,0,0,0,5.5,76.2,1.008
5,S5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,0.0,...,3.6,0.0,2.6,0,0,0,0,4.8,84.7,1.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,S134,0.0,0.0,0.0,0.0,0.0,0.0,11.2,0.0,0.0,...,0.0,10.3,1.0,0,0,0,0,3.6,73.9,1.000
135,S135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8,12.8,...,0.0,0.0,1.6,0,0,0,0,4.3,71.5,1.010
136,S136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.2,0.0,...,0.0,0.0,1.1,0,0,0,0,3.8,74.1,1.011
137,S137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.1,0.0,...,11.9,0.0,2.8,0,0,0,0,1.1,74.1,1.012


In [54]:
DoE_initial.to_csv('DoE_300123.csv')