In [1]:
# Load IPython's autoreload extension and switch it to mode 2, which (per the
# IPython docs) re-imports modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Step up to the parent directory so the relative './data/...' paths used
# later in the notebook resolve from there.
# The sentinel makes this run once per kernel: without it, every re-execution
# of the cell would climb one directory higher.
if 'PROJECT_ROOT' not in globals():
    os.chdir('../')
    PROJECT_ROOT = os.getcwd()

In [3]:
# Global figure styling: 'cm' math fontset, serif text with Times New Roman
# tried first, 13 pt font size and 300 dpi figures.
_serif_stack = ["Times New Roman"] + plt.rcParams["font.serif"]
for _key, _value in (
    ('mathtext.fontset', 'cm'),
    ("font.family", "serif"),
    ("font.serif", _serif_stack),
    ('font.size', 13),
    ('figure.dpi', 300),
):
    plt.rcParams[_key] = _value

# The Data

In [4]:
# Read the raw table from the Excel workbook; the path is relative to the
# current working directory.
data = pd.read_excel('./data/data.xlsx')
# data = data.drop(columns=['S/N'])
# Bare expression: shows the loaded frame as the cell output.
data

Unnamed: 0,S/N,cDen,Pot,Sn %,pH,C2H4,CO,H2,EtoH,FORM
0,1,150,3.5,100,14.05,0,23,12,0,61
1,2,150,3.3,80,14.05,0,23,7,0,66
2,3,150,3.2,50,14.05,0,34,5,3,52
3,4,150,3.1,10,14.05,1,42,5,2,42
4,5,150,3.0,5,14.05,4,48,5,10,19
5,6,150,3.0,3,14.05,7,50,5,11,14
6,7,150,2.8,0,14.05,15,47,11,6,11
7,8,250,4.0,100,14.05,0,22,12,0,63
8,9,250,3.8,80,14.05,0,17,10,0,70
9,10,250,3.7,50,14.05,2,36,8,2,48


In [5]:
# Split the columns by position: the first five are the features, everything
# after them is a target.
n_feature_cols = 5
features_col = list(data.columns[:n_feature_cols])
target_col = list(data.columns[n_feature_cols:])
#target_col = [target_col[0], target_col[2]]
for label, cols in (('Features: ', features_col), ('Target: ', target_col)):
    print(label, cols)

Features:  ['S/N', 'cDen', 'Pot', 'Sn %', 'pH']
Target:  ['C2H4', 'CO', 'H2', 'EtoH', 'FORM']


In [6]:
# Divide every target column by 100 (presumably percent -> fraction).
# NOTE(review): `data` is overwritten in place, so executing this cell twice
# divides by 100 twice; re-run from the load cell if it has to be repeated.
data[target_col] = data[target_col] / 100
data.head(2)

Unnamed: 0,S/N,cDen,Pot,Sn %,pH,C2H4,CO,H2,EtoH,FORM
0,1,150,3.5,100,14.05,0.0,0.23,0.12,0.0,0.61
1,2,150,3.3,80,14.05,0.0,0.23,0.07,0.0,0.66


In [7]:
# Divide the fourth feature column (features_col[3]) by 100, the same scaling
# applied to the targets. Not idempotent: a second run divides by 100 again.
sn_column = features_col[3]
data[sn_column] = data[sn_column] / 100
data.head(2)

Unnamed: 0,S/N,cDen,Pot,Sn %,pH,C2H4,CO,H2,EtoH,FORM
0,1,150,3.5,1.0,14.05,0.0,0.23,0.12,0.0,0.61
1,2,150,3.3,0.8,14.05,0.0,0.23,0.07,0.0,0.66


In [8]:
# Build a pymatgen Composition for each sample. The material is Cu-Sn, and the Sn fraction is the 'Sn %' column (index 3 of features_col).
import pymatgen.core as pmg

def create_structure(Sn_percent):
    """Return the pymatgen ``Composition`` for a Cu-Sn mix with the given Sn fraction.

    Parameters
    ----------
    Sn_percent : float
        Sn content as a fraction in [0, 1] (despite the name); the remainder
        ``1 - Sn_percent`` is assigned to Cu.

    Returns
    -------
    pymatgen.core.Composition
        Composition parsed from the formula ``Cu{1 - Sn_percent}Sn{Sn_percent}``.

    Raises
    ------
    ValueError
        If ``Sn_percent`` is outside [0, 1]. NaN is rejected as well.
    """
    # The chained comparison is False for NaN and for negative inputs, so they
    # are rejected here instead of yielding a formula with a negative amount.
    if not 0 <= Sn_percent <= 1:
        raise ValueError('Sn percent must be between 0 and 1 (inclusive)')
    return pmg.Composition(f'Cu{1-Sn_percent}Sn{Sn_percent}')

# Weight of each sample's composition: build the composition from the Sn
# fraction and read its `.weight` in a single pass over the column.
data['weight'] = data['Sn %'].apply(lambda sn_fraction: create_structure(sn_fraction).weight)
data.head(5)

Unnamed: 0,S/N,cDen,Pot,Sn %,pH,C2H4,CO,H2,EtoH,FORM,weight
0,1,150,3.5,1.0,14.05,0.0,0.23,0.12,0.0,0.61,118.71
1,2,150,3.3,0.8,14.05,0.0,0.23,0.07,0.0,0.66,107.6772
2,3,150,3.2,0.5,14.05,0.0,0.34,0.05,0.03,0.52,91.128
3,4,150,3.1,0.1,14.05,0.01,0.42,0.05,0.02,0.42,69.0624
4,5,150,3.0,0.05,14.05,0.04,0.48,0.05,0.1,0.19,66.3042


In [9]:
# Cu fraction is the complement of the Sn fraction ('Sn %' was already divided
# by 100 in an earlier cell, so both columns are on the same scale).
data['Cu %'] = 1 - data['Sn %']
data.head(2)

Unnamed: 0,S/N,cDen,Pot,Sn %,pH,C2H4,CO,H2,EtoH,FORM,weight,Cu %
0,1,150,3.5,1.0,14.05,0.0,0.23,0.12,0.0,0.61,118.71,0.0
1,2,150,3.3,0.8,14.05,0.0,0.23,0.07,0.0,0.66,107.6772,0.2


In [10]:
# Register the two engineered columns as features. The membership check keeps
# the list free of duplicates when this cell is executed more than once.
for engineered_col in ('weight', 'Cu %'):
    if engineered_col not in features_col:
        features_col.append(engineered_col)

In [11]:
# Reorder the frame so the feature columns come first and the target columns last.
column_order = features_col + target_col
data = data[column_order]
data.head(2)

Unnamed: 0,S/N,cDen,Pot,Sn %,pH,weight,Cu %,C2H4,CO,H2,EtoH,FORM
0,1,150,3.5,1.0,14.05,118.71,0.0,0.0,0.23,0.12,0.0,0.61
1,2,150,3.3,0.8,14.05,107.6772,0.2,0.0,0.23,0.07,0.0,0.66


In [12]:
# Smallest value of each feature; the first entry of features_col is skipped.
print('Minimum values of the features')
feature_mins = data[features_col[1:]].min()
print(feature_mins)

Minimum values of the features
cDen      141.000
Pot         2.800
Sn %        0.000
pH          8.020
weight     63.546
Cu %        0.000
dtype: float64


In [13]:
# Largest value of each feature; the first entry of features_col is skipped.
print('Maximum values of the features')
feature_maxs = data[features_col[1:]].max()
print(feature_maxs)

Maximum values of the features
cDen      450.00
Pot         4.70
Sn %        1.00
pH         14.05
weight    118.71
Cu %        1.00
dtype: float64


In [14]:
# Preview the first rows of the frame in its current state.
data.head()

Unnamed: 0,S/N,cDen,Pot,Sn %,pH,weight,Cu %,C2H4,CO,H2,EtoH,FORM
0,1,150,3.5,1.0,14.05,118.71,0.0,0.0,0.23,0.12,0.0,0.61
1,2,150,3.3,0.8,14.05,107.6772,0.2,0.0,0.23,0.07,0.0,0.66
2,3,150,3.2,0.5,14.05,91.128,0.5,0.0,0.34,0.05,0.03,0.52
3,4,150,3.1,0.1,14.05,69.0624,0.9,0.01,0.42,0.05,0.02,0.42
4,5,150,3.0,0.05,14.05,66.3042,0.95,0.04,0.48,0.05,0.1,0.19


In [21]:
# Display the current list of feature column names.
features_col

['S/N', 'cDen', 'Pot', 'Sn %', 'pH', 'weight', 'Cu %']

In [22]:
# Display the current list of target column names.
target_col

['C2H4', 'CO', 'H2', 'EtoH', 'FORM']

In [20]:
# Write the current frame to CSV, leaving out the row index.
# NOTE(review): the execution counts show this cell ran out of order relative
# to its neighbours; confirm the saved file with Restart Kernel -> Run All.
data.to_csv('./data/cleaned_data.csv', index=False)

In [15]:
import json
def write_to_csv(org_data,
                 idx: int = 0, name='ethane',
                 shuffle: bool = False,
                 normalize: bool = False,
                 split: bool = False, random_state: int = 42, split_ratio: float = 0.8):
    """Export the feature columns plus one target column to CSV under ``./data``.

    Reads the notebook-level lists ``features_col`` and ``target_col`` to decide
    which columns are written. The input frame is copied, never modified.

    Parameters
    ----------
    org_data : pandas.DataFrame
        Frame containing every column in ``features_col`` and the chosen target.
    idx : int
        Position in ``target_col`` of the target column to export.
    name : str
        Output folder name; files go to ``./data/{name}`` or, when
        ``normalize`` is True, ``./data/{name}_normalized``.
        NOTE(review): the default 'ethane' is paired with ``idx=0``, which is
        'C2H4' (ethylene) in this notebook; confirm the intended name.
    shuffle : bool
        Shuffle all rows (seeded with ``random_state``) before writing.
    normalize : bool
        Divide every feature except the first entry of ``features_col`` by its
        column maximum.
    split : bool
        Write ``train_data.csv`` and ``test_data.csv`` instead of one ``data.csv``.
    random_state : int
        Seed used for both the shuffle and the train/test sampling.
    split_ratio : float
        Fraction of rows sampled into the training file when ``split`` is True.

    Side effects
    ------------
    Prints a two-row preview before and after column selection, creates the
    output directory if needed, and writes the CSV file(s) plus a
    ``params.json`` that records the arguments used.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = org_data.copy()

    if normalize:
        scaled_cols = features_col[1:]
        col_max = data[scaled_cols].max()
        # A column whose maximum is 0 would become NaN/inf when divided by
        # that maximum; divide by 1 instead so the column is written unchanged.
        col_max = col_max.where(col_max != 0, 1)
        data[scaled_cols] = data[scaled_cols] / col_max
        path = f'./data/{name}_normalized'
    else:
        path = f'./data/{name}'

    print(data.head(2))

    if shuffle:
        data = data.sample(frac=1, random_state=random_state)

    # Keep every feature and only the requested target column.
    new_data = data[features_col + [target_col[idx]]]
    print(new_data.head(2))

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    if split:
        # Test rows are whatever the training sample did not pick.
        train_data = new_data.sample(frac=split_ratio, random_state=random_state)
        test_data = new_data.drop(train_data.index)
        train_data.to_csv(f'{path}/train_data.csv', index=False)
        test_data.to_csv(f'{path}/test_data.csv', index=False)
    else:
        new_data.to_csv(f'{path}/data.csv', index=False)

    # Record the arguments next to the data so the export can be reproduced.
    params = {'idx': idx,
              'name': name,
              'shuffle': shuffle,
              'normalize': normalize,
              'split': split,
              'random_state': random_state,
              'split_ratio': split_ratio}
    with open(f'{path}/params.json', 'w') as f:
        json.dump(params, f)


In [16]:
# write_to_csv(org_data=data, idx=0, name='ethane', shuffle=True, normalize=True, split=False)

In [17]:
# write_to_csv(org_data=data, idx=1, name='CO', shuffle=False, normalize=True, split=False)

In [18]:
# write_to_csv(org_data=data, idx=2, name='H2', shuffle=True, normalize=True, split=False)

In [19]:
# write_to_csv(org_data=data, idx=3, name='EthO', shuffle=True, normalize=True, split=False)