In [None]:
%pip install tensorflow

In [1]:
import pandas as pd
pd.set_option("display.max_rows", 5)
pd.set_option("display.max_columns", None)

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# function to randomly drop a specified number of non-pure points
def drop_random_non_pure_points(df, num_to_drop):
    if num_to_drop >= len(df):
        return pd.DataFrame()  # Return an empty DataFrame if trying to drop more rows than available
    drop_indices = np.random.choice(df.index, num_to_drop, replace=False)
    return df.drop(drop_indices)

In [3]:
phase_equilibrium_data = pd.read_csv("./small_sample.csv")
phase_equilibrium_data

Unnamed: 0,Temperature,Pressure,z_water,z_ABSORBENT,z_CO2,z_N2,x_water,x_ABSORBENT,x_CO2,x_N2,y_water,y_ABSORBENT,y_CO2,y_N2,LiquidPhaseFraction,VapourPhaseFraction
0,346.137936,128608.273602,0.725918,0.039956,0.018889,0.215237,0.923365,0.056842,0.019742,0.000050,0.261263,0.000216,0.016881,0.721641,0.701788,0.298212
1,433.235885,212641.747543,0.533938,0.067883,0.065971,0.332208,0.000000,0.000000,0.000000,0.000000,0.533938,0.067883,0.065971,0.332208,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,433.217292,167067.303538,0.608605,0.032875,0.056175,0.302345,0.000000,0.000000,0.000000,0.000000,0.608605,0.032875,0.056175,0.302345,0.000000,1.000000
1023,346.341999,248676.415784,0.665872,0.064523,0.024508,0.245097,0.877968,0.090078,0.031842,0.000112,0.131723,0.000163,0.006037,0.862077,0.715783,0.284217


In [27]:
#features = ['Temperature', 'Pressure', 'z_EC', 'z_DMC', 'z_Tot']
#labels = ['x_DMC', 'x_EC', 'x_Tot', 'y_EC', 'y_DMC', 'y_Tot']

moleFractionNames = ['z_water', 'z_ABSORBENT', 'z_CO2', 'z_N2']
#mWNames = ['mW_water','mW_ABSORBENT','mW_CO2','mW_N2']
#mW = [18.015,61.08,44.01, 28.01]

# drop rows which contains NaN
phase_equilibrium_data = phase_equilibrium_data.dropna()
# drop rows where all values are 0
phase_equilibrium_data = phase_equilibrium_data.loc[~(phase_equilibrium_data == 0).all(axis=1)]

# create new columns "isPureLiquid" and "isPureVapour" and "1/T"
phase_equilibrium_data['isPureLiquid'] = phase_equilibrium_data.apply(lambda row:
                                                               (row['y_water'] == 0) and
                                                               (row['y_ABSORBENT'] == 0) and
                                                               (row['y_CO2'] == 0) and
                                                               (row['y_N2'] == 0),
                                                               axis=1).astype(int)

phase_equilibrium_data['isPureVapour'] = phase_equilibrium_data.apply(lambda row:
                                                               (row['x_water'] == 0) and
                                                               (row['x_ABSORBENT'] == 0) and
                                                               (row['x_CO2'] == 0) and
                                                               (row['x_N2'] == 0),
                                                               axis=1).astype(int)

phase_equilibrium_data['inverseTemperature'] = 1/phase_equilibrium_data['Temperature']

phase_equilibrium_data

Unnamed: 0,Temperature,Pressure,z_water,z_ABSORBENT,z_CO2,z_N2,x_water,x_ABSORBENT,x_CO2,x_N2,y_water,y_ABSORBENT,y_CO2,y_N2,LiquidPhaseFraction,VapourPhaseFraction,isPureLiquid,isPureVapour,inverseTemperature
0,346.137936,128608.273602,0.725918,0.039956,0.018889,0.215237,0.923365,0.056842,0.019742,0.000050,0.261263,0.000216,0.016881,0.721641,0.701788,0.298212,0,0,0.002889
1,433.235885,212641.747543,0.533938,0.067883,0.065971,0.332208,0.000000,0.000000,0.000000,0.000000,0.533938,0.067883,0.065971,0.332208,0.000000,1.000000,0,1,0.002308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,433.217292,167067.303538,0.608605,0.032875,0.056175,0.302345,0.000000,0.000000,0.000000,0.000000,0.608605,0.032875,0.056175,0.302345,0.000000,1.000000,0,1,0.002308
1023,346.341999,248676.415784,0.665872,0.064523,0.024508,0.245097,0.877968,0.090078,0.031842,0.000112,0.131723,0.000163,0.006037,0.862077,0.715783,0.284217,0,0,0.002887


In [53]:
# checking if all mole fractions add up to 1
all_mole_frac = np.sum((phase_equilibrium_data[moleFractionNames][:]), axis=1)
np.sum(all_mole_frac)/len(phase_equilibrium_data) # if this = 1.0, then all rows have mole fractions summing to 1

1.0

In [69]:
# calculating the percentage of pure liquid and vapour points
num_pure_liq_points = len(phase_equilibrium_data[phase_equilibrium_data['isPureLiquid'] == 1]);
num_non_pure_liq_points = len(phase_equilibrium_data[phase_equilibrium_data['isPureLiquid'] == 0]);
frac_pure_liq_points = num_pure_liq_points/(num_pure_liq_points + num_non_pure_liq_points)

num_pure_vap_points = len(phase_equilibrium_data[phase_equilibrium_data['isPureVapour'] == 1]);
num_non_pure_vap_points = len(phase_equilibrium_data[phase_equilibrium_data['isPureVapour'] == 0]);
frac_pure_vap_points = num_pure_vap_points/(num_pure_vap_points + num_non_pure_vap_points)

print(str(frac_pure_liq_points*100) + '%' + " of the points are pure liquid.")
print(str(frac_pure_vap_points*100) + '%' + " of the points are pure vapour.")

0.0% of the points are pure liquid.
29.1015625% of the points are pure vapour.


In [None]:
featureNames = ['inverseTemperature', 'Pressure', 'z_water', 'z_ABSORBENT', 'z_CO2', 'z_N2','mW_water','mW_ABSORBENT','mW_CO2','mW_N2']
# featureNames = ['inverseTemperature', 'Pressure', 'z_water', 'z_ABSORBENT', 'z_CO2', 'z_N2']
targetNamesRegression = ['x_ABSORBENT', 'x_CO2','x_N2', 'y_water', 'y_ABSORBENT', 'y_CO2']
targetNamesClassification = ['isPureVapour', 'VapourPhaseFraction']

# Filter rows where both 'all_y_zero' and 'all_x_zero' are 0
phaseEquilibriumDataRegression = phaseEquilibriumData[(phaseEquilibriumData['isPureLiquid'] == 0) & (phaseEquilibriumData['isPureVapour'] == 0)]

# Filter for non-pure points
nonPurePoints = phaseEquilibriumData[(phaseEquilibriumData['isPureLiquid'] == 0) & (phaseEquilibriumData['isPureVapour'] == 0)]

# Specify the number of points to drop
num_to_drop = int(len(nonPurePoints)*0.5)  # Change this to your desired number

# Drop random non-pure points
phaseEquilibriumDataReduced = drop_random_non_pure_points(nonPurePoints, num_to_drop)

# First, identify the pure liquid and vapor points
pureLiquidPoints = phaseEquilibriumData[phaseEquilibriumData['isPureLiquid'] == 1]
pureVaporPoints = phaseEquilibriumData[phaseEquilibriumData['isPureVapour'] == 1]

# Assuming phaseEquilibriumDataReduced is the reduced set of non-pure points from the previous step
# Concatenate the pure points with the reduced non-pure points to form the classification dataset
phaseEquilibriumDataClassification = pd.concat([pureLiquidPoints, pureVaporPoints, phaseEquilibriumDataReduced])

# Reset the index of the new DataFrame, if desired
phaseEquilibriumDataClassification.reset_index(drop=True, inplace=True)

phaseEquilibriumData = phaseEquilibriumData.sort_values(by=['inverseTemperature'], ascending=[True])
phaseEquilibriumDataClassification = phaseEquilibriumDataClassification.sort_values(by=['inverseTemperature'], ascending=[True])
phaseEquilibriumDataRegression = phaseEquilibriumDataRegression.sort_values(by=['inverseTemperature'], ascending=[True])

X = phaseEquilibriumData[featureNames]
y = phaseEquilibriumData[targetNamesClassification]

X_train_classification_full, X_test_classification_full, y_train_classification_full, y_test_classification_full = train_test_split(X, y, test_size=0.2, random_state=42)

vapourPhaseFraction = 1.0

# Filter for non-pure points
nonPurePoints_X_train = X_train_classification_full[(y_train_classification_full['VapourPhaseFraction'] < vapourPhaseFraction)]
nonPurePoints_X_test = X_test_classification_full[(y_test_classification_full['VapourPhaseFraction'] < vapourPhaseFraction)]
nonPurePoints_y_train = y_train_classification_full[(y_train_classification_full['VapourPhaseFraction'] < vapourPhaseFraction)]
nonPurePoints_y_test = y_test_classification_full[(y_test_classification_full['VapourPhaseFraction'] < vapourPhaseFraction)]

percentageOfPointsToDrop = 0.0
# Drop random non-pure points
X_train_reduced = drop_random_non_pure_points(nonPurePoints_X_train, int(len(nonPurePoints_X_train)*percentageOfPointsToDrop))
X_test_reduced = drop_random_non_pure_points(nonPurePoints_X_test, int(len(nonPurePoints_X_test)*percentageOfPointsToDrop))
y_train_reduced = drop_random_non_pure_points(nonPurePoints_y_train, int(len(nonPurePoints_y_train)*percentageOfPointsToDrop))
y_test_reduced = drop_random_non_pure_points(nonPurePoints_y_test, int(len(nonPurePoints_y_test)*percentageOfPointsToDrop))


print(X_train_reduced.shape[0]/nonPurePoints_X_train.shape[0])
print(X_test_reduced.shape[0]/nonPurePoints_X_test.shape[0])
print(y_train_reduced.shape[0]/nonPurePoints_y_train.shape[0])
print(y_test_reduced.shape[0]/nonPurePoints_y_test.shape[0])
# First, identify the pure liquid and vapor points

pureVaporPoints_X_train = X_train_classification_full[(y_train_classification_full['VapourPhaseFraction'] >= vapourPhaseFraction)]
pureVaporPoints_X_test = X_test_classification_full[(y_test_classification_full['VapourPhaseFraction'] >= vapourPhaseFraction)]
pureVaporPoints_y_train = y_train_classification_full[(y_train_classification_full['VapourPhaseFraction'] >= vapourPhaseFraction)]
pureVaporPoints_y_test = y_test_classification_full[(y_test_classification_full['VapourPhaseFraction'] >= vapourPhaseFraction)]


# Assuming phaseEquilibriumDataReduced is the reduced set of non-pure points from the previous step
# Concatenate the pure points with the reduced non-pure points to form the classification dataset
X_train_classification = pd.concat([pureVaporPoints_X_train, X_train_reduced])
X_test_classification = pd.concat([pureVaporPoints_X_test, X_test_reduced])
y_train_classification = pd.concat([pureVaporPoints_y_train, y_train_reduced])
y_test_classification = pd.concat([pureVaporPoints_y_test, y_test_reduced])
print(y_train_classification_full.columns)
y_train_classification_full = y_train_classification_full.drop('VapourPhaseFraction', axis=1)
y_test_classification_full = y_test_classification_full.drop('VapourPhaseFraction', axis=1)
print(y_train_classification_full.columns)
X_train_classification_full.to_csv('./trainingData/X_train_classification_full.csv', index=False)
y_train_classification_full.to_csv('./trainingData/y_train_classification_full.csv', index=False)
X_test_classification_full.to_csv('./trainingData/X_test_classification_full.csv', index=False)
y_test_classification_full.to_csv('./trainingData/y_test_classification_full.csv', index=False)

y_train_classification = y_train_classification.drop('VapourPhaseFraction', axis=1)
y_test_classification = y_test_classification.drop('VapourPhaseFraction', axis=1)

X_train_classification.to_csv('./trainingData/X_train_classification.csv', index=False)
y_train_classification.to_csv('./trainingData/y_train_classification.csv', index=False)
X_test_classification.to_csv('./trainingData/X_test_classification.csv', index=False)
y_test_classification.to_csv('./trainingData/y_test_classification.csv', index=False)

X = phaseEquilibriumData[featureNames]
y = phaseEquilibriumData[targetNamesRegression]

X_train_regression_full, X_test_regression_full, y_train_regression_full, y_test_regression_full = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_regression_full.to_csv('./trainingData/X_train_regression_full.csv', index=False)
y_train_regression_full.to_csv('./trainingData/y_train_regression_full.csv', index=False)
X_test_regression_full.to_csv('./trainingData/X_test_regression_full.csv', index=False)
y_test_regression_full.to_csv('./trainingData/y_test_regression_full.csv', index=False)


X_train_regression = X_train_regression_full[(y_train_classification_full['isPureVapour'] == 0)]
X_test_regression = X_test_regression_full[(y_test_classification_full['isPureVapour'] == 0)]
y_train_regression = y_train_regression_full[(y_train_classification_full['isPureVapour'] == 0)]
y_test_regression = y_test_regression_full[(y_test_classification_full['isPureVapour'] == 0)]

X_train_regression.to_csv('./trainingData/X_train_regression.csv', index=False)
y_train_regression.to_csv('./trainingData/y_train_regression.csv', index=False)
X_test_regression.to_csv('./trainingData/X_test_regression.csv', index=False)
y_test_regression.to_csv('./trainingData/y_test_regression.csv', index=False)