In [1]:
from itertools import combinations
import pandas as pd
import CMC_helper_function as cmc

In [2]:

# Fractions assigned to surfactant_1; surfactant_2 receives the remainder.
ratios = [0.05, 0.2, 0.5, 0.8, 0.95]

# Categories that must not be paired with each other.
_OPPOSITE_CHARGE = {'cationic': 'anionic', 'anionic': 'cationic'}


def _oppositely_charged(name_a, name_b):
    """Return True when one surfactant is cationic and the other anionic.

    Categories are read from ``cmc.surfactant_library[name]['Category']``.
    """
    category_a = cmc.surfactant_library[name_a]['Category']
    if category_a not in _OPPOSITE_CHARGE:
        return False
    return cmc.surfactant_library[name_b]['Category'] == _OPPOSITE_CHARGE[category_a]


# Every unordered pair from the library, at every ratio, minus the
# cationic/anionic pairs.
keys = list(cmc.surfactant_library)
all_combos = [
    (s1, s2, r)
    for s1, s2 in combinations(keys, 2)
    if not _oppositely_charged(s1, s2)
    for r in ratios
]

# One row per (pair, ratio): derive the complementary ratio, round both
# ratio columns to two decimals, and order rows by pair then ratio.
df = (
    pd.DataFrame(all_combos, columns=['surfactant_1', 'surfactant_2', 'surfactant_1_ratio'])
    .assign(surfactant_2_ratio=lambda frame: 1 - frame['surfactant_1_ratio'])
    .round({'surfactant_1_ratio': 2, 'surfactant_2_ratio': 2})
    .sort_values(by=['surfactant_1', 'surfactant_2', 'surfactant_1_ratio'])
    .reset_index(drop=True)
)
df


Unnamed: 0,surfactant_1,surfactant_2,surfactant_1_ratio,surfactant_2_ratio
0,CAPB,CHAPS,0.05,0.95
1,CAPB,CHAPS,0.20,0.80
2,CAPB,CHAPS,0.50,0.50
3,CAPB,CHAPS,0.80,0.20
4,CAPB,CHAPS,0.95,0.05
...,...,...,...,...
90,TTAB,CHAPS,0.05,0.95
91,TTAB,CHAPS,0.20,0.80
92,TTAB,CHAPS,0.50,0.50
93,TTAB,CHAPS,0.80,0.20


In [3]:
# Load the already-processed measurements and keep only rows whose
# surfactant_2_ratio is not 0.0 (presumably the single-surfactant runs
# are the ones being dropped -- confirm against the CSV).
existing_combos = pd.read_csv('processed_data/all_CMC_processed_data.csv')
has_second_component = existing_combos['surfactant_2_ratio'].ne(0.0)
existing_combos = existing_combos.loc[has_second_component]
print(existing_combos.shape)

# Same ordering as the candidate grid: by pair, then by surfactant_1 ratio.
existing_combos = (
    existing_combos
    .sort_values(by=['surfactant_1', 'surfactant_2', 'surfactant_1_ratio'])
    .reset_index(drop=True)
)
existing_combos

(32, 7)


Unnamed: 0,surfactant_1,assay,surfactant_1_ratio,surfactant_2,surfactant_2_ratio,CMC,R2
0,CAPB,,0.05,CHAPS,0.95,5.060144,0.962099
1,CAPB,,0.2,CHAPS,0.8,2.69027,0.918776
2,CAPB,,0.5,CHAPS,0.5,0.989354,0.906185
3,CTAB,,0.2,CAPB,0.8,0.913345,0.806361
4,CTAB,,0.95,CAPB,0.05,1.096685,0.889786
5,CTAB,,0.2,CHAPS,0.8,0.919692,0.895863
6,CTAB,,0.5,DTAB,0.5,1.672338,0.880482
7,CTAB,,0.8,DTAB,0.2,1.302299,0.924454
8,CTAB,,0.2,TTAB,0.8,2.430551,0.895319
9,CTAB,,0.8,TTAB,0.2,1.316639,0.909514


In [4]:
# Columns that together identify one binary mixture.
_KEY_COLS = ['surfactant_1', 'surfactant_2', 'surfactant_1_ratio']


def _mixture_key(s1, s2, s1_ratio):
    """Return an order-independent, rounding-tolerant key for a binary mix.

    The ratio is converted to a percentage rounded to the nearest whole
    number, so values that differ only by floating-point noise (e.g. a
    ratio parsed from CSV vs. one computed in memory) compare equal.
    The surfactant names are put in sorted order and the ratio is flipped
    to match, so (A, B, 0.2) and (B, A, 0.8) yield the same key.
    """
    pct = round(float(s1_ratio) * 100, 0)
    if s2 < s1:
        s1, s2, pct = s2, s1, 100 - pct
    return (s1, s2, pct)


# Raw (surfactant_1, surfactant_2, surfactant_1_ratio) tuples, kept
# unchanged under this name in case other cells rely on it.
existing_set = set(
    existing_combos[_KEY_COLS].itertuples(index=False, name=None)
)

# Canonical keys used for the actual membership test. Rows with a missing
# name or ratio cannot identify a mixture, so they are skipped.
existing_keys = {
    _mixture_key(s1, s2, r)
    for s1, s2, r in existing_combos[_KEY_COLS].dropna().itertuples(index=False, name=None)
}

# Flag every candidate row whose mixture has already been measured.
df['done'] = [
    _mixture_key(s1, s2, r) in existing_keys
    for s1, s2, r in df[_KEY_COLS].itertuples(index=False, name=None)
]


In [5]:
# Shape of the already-measured rows, then of the rows still to be run.
print(df.loc[df['done']].shape)
print(df.loc[~df['done']].shape)

(32, 5)
(63, 5)


In [6]:
# Write the not-yet-measured mixtures to disk (the row index is written too).
df.loc[~df['done']].to_csv('processed_data/more_pairs.csv')

In [7]:
# Preview the first 60 mixtures that still need to be measured.
df.loc[~df['done']].head(60)

Unnamed: 0,surfactant_1,surfactant_2,surfactant_1_ratio,surfactant_2_ratio,done
3,CAPB,CHAPS,0.8,0.2,False
4,CAPB,CHAPS,0.95,0.05,False
5,CTAB,CAPB,0.05,0.95,False
7,CTAB,CAPB,0.5,0.5,False
8,CTAB,CAPB,0.8,0.2,False
10,CTAB,CHAPS,0.05,0.95,False
12,CTAB,CHAPS,0.5,0.5,False
13,CTAB,CHAPS,0.8,0.2,False
14,CTAB,CHAPS,0.95,0.05,False
15,CTAB,DTAB,0.05,0.95,False
