In [1]:
import numpy as np

import pandas as pd
from pandas import read_csv

from matplotlib import pyplot as plt

from scipy.stats import zscore
from sklearn.model_selection import train_test_split

# Load Data

In [2]:
df = read_csv('new_train.csv')

In [3]:
df.head(10)

Unnamed: 0,MOFname,volume,weight,density,surface_area,void_fraction,void_volume,functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2,CO2_working_capacity
0,mof_unit_1,1116.667429,875.2406,1.301526,0.0,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.864166,6.786041,105.284502
1,mof_unit_2,2769.503842,2211.697211,1.32609,603.61,0.13794,0.104,F-OMe,10,44,57,etb,33.61678,7.147286,101.224774
2,mof_unit_3,1089.818728,773.68796,1.178856,788.5,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967,118.987011
3,mof_unit_4,2205.198301,1304.63872,0.982408,1441.53,0.21814,0.222,H-SO3H,9,17,24,sra,25.701377,6.190085,187.626004
4,mof_unit_5,1137.800963,901.73612,1.31602,0.0,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.001838,6.478063,79.210001
5,mof_unit_6,3954.659761,1543.02768,0.647909,2430.55,0.37094,0.5725,Pr-NO2,9,7,23,sra,17.146541,5.398304,55.786959
6,mof_unit_7,3565.914939,1954.749656,0.910268,1530.02,0.33337,0.3662,NH2,10,53,55,etb,18.363791,6.303857,111.690462
7,mof_unit_8,916.043907,639.11156,1.158537,1685.58,0.28458,0.2456,Br,2,5,13,pcu,12.684804,5.234732,58.989774
8,mof_unit_9,2228.882339,1422.69182,1.05992,1458.02,0.24277,0.2291,HCO,4,14,19,acs,37.040424,6.298964,135.587108
9,mof_unit_10,4275.42142,1606.196,0.623834,3276.11,0.41004,0.6573,NHMe-OH,3,13,29,pcu,10.999299,5.539327,59.739057


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68613 entries, 0 to 68612
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   MOFname               68613 non-null  object 
 1   volume                68613 non-null  float64
 2   weight                68613 non-null  float64
 3   density               68613 non-null  float64
 4   surface_area          68613 non-null  float64
 5   void_fraction         68613 non-null  float64
 6   void_volume           68613 non-null  float64
 7   functional_groups     68290 non-null  object 
 8   metal_linker          68613 non-null  int64  
 9   organic_linker1       68613 non-null  int64  
 10  organic_linker2       68613 non-null  int64  
 11  topology              68613 non-null  object 
 12  CO2/N2_selectivity    68613 non-null  float64
 13  heat_adsorption_CO2   66526 non-null  float64
 14  CO2_working_capacity  68613 non-null  float64
dtypes: float64(9), int6

In [5]:
df['metal_linker'].unique()

array([ 3, 10,  2,  9,  4,  1, 12], dtype=int64)

# Clean Data

In [6]:
# Show sum of NULL for each column
df.isnull().sum()

MOFname                    0
volume                     0
weight                     0
density                    0
surface_area               0
void_fraction              0
void_volume                0
functional_groups        323
metal_linker               0
organic_linker1            0
organic_linker2            0
topology                   0
CO2/N2_selectivity         0
heat_adsorption_CO2     2087
CO2_working_capacity       0
dtype: int64

In [7]:
# Clean na from selected columns
df['functional_groups'].fillna(value = 'empty',inplace=True)
df.dropna(inplace=True)
df.isnull().sum()

MOFname                 0
volume                  0
weight                  0
density                 0
surface_area            0
void_fraction           0
void_volume             0
functional_groups       0
metal_linker            0
organic_linker1         0
organic_linker2         0
topology                0
CO2/N2_selectivity      0
heat_adsorption_CO2     0
CO2_working_capacity    0
dtype: int64

In [8]:
print(df.max())

MOFname                 mof_unit_9999
volume                         223965
weight                        22595.9
density                       3.35179
surface_area                  7083.53
void_fraction                 0.87206
void_volume                    6.6101
functional_groups               empty
metal_linker                       12
organic_linker1                    59
organic_linker2                    59
topology                          the
CO2/N2_selectivity            914.244
heat_adsorption_CO2               inf
CO2_working_capacity          736.062
dtype: object


In [9]:
df_clean = df
df_clean.loc[df_clean['heat_adsorption_CO2'] == df_clean['heat_adsorption_CO2'].max()]
df_clean = df.drop([68611, 68612])
print(df_clean.max())

MOFname                 mof_unit_9999
volume                         223965
weight                        22595.9
density                       3.35179
surface_area                  7083.53
void_fraction                 0.87206
void_volume                    6.6101
functional_groups               empty
metal_linker                       12
organic_linker1                    59
organic_linker2                    59
topology                          the
CO2/N2_selectivity            914.244
heat_adsorption_CO2           17.1554
CO2_working_capacity          736.062
dtype: object


In [10]:
df_clean = df_clean[df_clean['surface_area'] > 0]
df_clean

Unnamed: 0,MOFname,volume,weight,density,surface_area,void_fraction,void_volume,functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2,CO2_working_capacity
1,mof_unit_2,2769.503842,2211.697211,1.326090,603.61,0.13794,0.1040,F-OMe,10,44,57,etb,33.616780,7.147286,101.224774
2,mof_unit_3,1089.818728,773.687960,1.178856,788.50,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967,118.987011
3,mof_unit_4,2205.198301,1304.638720,0.982408,1441.53,0.21814,0.2220,H-SO3H,9,17,24,sra,25.701377,6.190085,187.626004
5,mof_unit_6,3954.659761,1543.027680,0.647909,2430.55,0.37094,0.5725,Pr-NO2,9,7,23,sra,17.146541,5.398304,55.786959
6,mof_unit_7,3565.914939,1954.749656,0.910268,1530.02,0.33337,0.3662,NH2,10,53,55,etb,18.363791,6.303857,111.690462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66519,mof_unit_66520,1426.479810,1272.451540,1.481238,1343.62,0.30190,0.2038,Me-I,3,12,21,pcu,5.867674,4.485481,7.602105
66520,mof_unit_66521,23943.701366,5497.752320,0.381279,4182.24,0.66340,1.7399,HCO-Me,1,9,27,pcu,4.060772,3.605688,2.675231
66521,mof_unit_66522,14389.971556,4396.164320,0.507298,4149.64,0.57051,1.1246,OPr-Cl,1,9,20,pcu,4.313411,3.361233,-1.686092
66522,mof_unit_66523,16997.806645,3932.703680,0.384191,4326.62,0.66963,1.7430,CN,2,2,5,nbo,3.447440,2.781566,-7.546805


In [11]:
df_clean.info()
print(df.topology.unique())
# print(df.functional_groups.unique())
print(df.metal_linker.unique())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54074 entries, 1 to 66523
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   MOFname               54074 non-null  object 
 1   volume                54074 non-null  float64
 2   weight                54074 non-null  float64
 3   density               54074 non-null  float64
 4   surface_area          54074 non-null  float64
 5   void_fraction         54074 non-null  float64
 6   void_volume           54074 non-null  float64
 7   functional_groups     54074 non-null  object 
 8   metal_linker          54074 non-null  int64  
 9   organic_linker1       54074 non-null  int64  
 10  organic_linker2       54074 non-null  int64  
 11  topology              54074 non-null  object 
 12  CO2/N2_selectivity    54074 non-null  float64
 13  heat_adsorption_CO2   54074 non-null  float64
 14  CO2_working_capacity  54074 non-null  float64
dtypes: float64(9), int6

In [12]:
# , 'functional_groups', 'topology', 'metal_linker'

In [13]:
X = df_clean.drop(['MOFname', 'CO2_working_capacity'], axis = 1)
y = df_clean['CO2_working_capacity'].values

In [21]:
X = np.asarray(X).astype('float32')

ValueError: could not convert string to float: 'F-OMe'

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train

Unnamed: 0,volume,weight,density,surface_area,void_fraction,void_volume,functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2
30144,3071.301290,1334.02586,0.721259,2727.99,0.35561,0.4930,OPr-Et,12,4,13,bcu,13.646863,5.545981
64854,7365.644303,5449.96848,1.228662,1140.54,0.27981,0.2277,Ph-I,2,25,25,pcu,17.267465,5.044852
57014,1526.655071,843.34267,0.917302,1688.07,0.29359,0.3201,F-H,3,14,18,pcu,19.424196,5.534532
7339,1398.851286,1179.26508,1.399875,927.85,0.18238,0.1303,SO3H-Br,3,12,13,pcu,54.673990,7.442451
9,4275.421420,1606.19600,0.623834,3276.11,0.41004,0.6573,NHMe-OH,3,13,29,pcu,10.999299,5.539327
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14290,774.361816,736.78050,1.579951,1031.34,0.21327,0.1350,I,3,5,14,pcu,27.256049,5.940989
56321,1348.240412,1265.82140,1.559030,630.97,0.15121,0.0970,Br-COOH,3,14,16,pcu,40.418769,6.861470
48057,1784.282026,885.77296,0.824344,1997.11,0.27535,0.3340,OH-OPr,2,4,26,pcu,18.374829,5.657585
1083,2304.400999,967.59740,0.697246,2950.88,0.37112,0.5323,Pr-OMe,3,13,17,pcu,13.670052,4.964187


In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn import preprocessing

In [20]:
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(X_train))
print(normalizer.mean.numpy())

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [16]:
def build_and_compile_model(norm):
  model = Sequential([
      norm,
      Dense(64, activation='relu'),
      Dense(64, activation='relu'),
      Dense(64, activation='relu'),
      Dense(64, activation='relu'),
      Dropout(0.2),
      Dense(1)
  ])

  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
  return model

In [17]:
X

Unnamed: 0,volume,weight,density,surface_area,void_fraction,void_volume,functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2
1,2769.503842,2211.697211,1.326090,603.61,0.13794,0.1040,F-OMe,10,44,57,etb,33.616780,7.147286
2,1089.818728,773.687960,1.178856,788.50,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967
3,2205.198301,1304.638720,0.982408,1441.53,0.21814,0.2220,H-SO3H,9,17,24,sra,25.701377,6.190085
5,3954.659761,1543.027680,0.647909,2430.55,0.37094,0.5725,Pr-NO2,9,7,23,sra,17.146541,5.398304
6,3565.914939,1954.749656,0.910268,1530.02,0.33337,0.3662,NH2,10,53,55,etb,18.363791,6.303857
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66519,1426.479810,1272.451540,1.481238,1343.62,0.30190,0.2038,Me-I,3,12,21,pcu,5.867674,4.485481
66520,23943.701366,5497.752320,0.381279,4182.24,0.66340,1.7399,HCO-Me,1,9,27,pcu,4.060772,3.605688
66521,14389.971556,4396.164320,0.507298,4149.64,0.57051,1.1246,OPr-Cl,1,9,20,pcu,4.313411,3.361233
66522,16997.806645,3932.703680,0.384191,4326.62,0.66963,1.7430,CN,2,2,5,nbo,3.447440,2.781566


In [18]:
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

monitor = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1, mode='auto', restore_best_weights=True)

NameError: name 'normalizer' is not defined

In [None]:
# model = Sequential()
# model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
# model.add(Dense(64, activation='relu'))
# model.add(Dense(64, activation='relu'))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.2))
# model.add(Dense(1, activation = 'linear'))
# model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.001))

# # monitor = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1, mode='auto', restore_best_weights=True)
# # checkpointer = ModelCheckpoint(filepath='best_weights.hdf5', verbose=0)

# model.summary()

In [None]:
#  with monitor
# history = model.fit(X_train,  y_train, validation_split = 0.1, epochs=6000, callbacks=[monitor])
# without monitor
# history = dnn_model.fit(X_train,  y_train, validation_split = 0.2, epochs=40)

In [None]:
history = dnn_model.fit(X_train,  y_train, validation_split = 0.2, epochs=25)
# history = dnn_model.fit(X_train,  y_train, validation_split = 0.1, epochs=6000, callbacks=[monitor])

In [None]:
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
plt.plot(loss_values, 'b', label='training loss')
plt.plot(val_loss_values, 'r', label='val training loss')

In [None]:
normalizer.adapt(np.array(X_train))

In [None]:
y_train_pred = dnn_model.predict(X_train)
y_test_pred = dnn_model.predict(X_test)

In [None]:
print(y_test[:10])
print(y_test_pred[:10])