In [1]:
import numpy as np

import pandas as pd
from pandas import read_csv

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Load Data

In [2]:
df = read_csv('new_train.csv')

In [3]:
df.head(10)

Unnamed: 0,MOFname,volume,weight,density,surface_area,void_fraction,void_volume,functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2,CO2_working_capacity
0,mof_unit_1,1116.667429,875.2406,1.301526,0.0,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.864166,6.786041,105.284502
1,mof_unit_2,2769.503842,2211.697211,1.32609,603.61,0.13794,0.104,F-OMe,10,44,57,etb,33.61678,7.147286,101.224774
2,mof_unit_3,1089.818728,773.68796,1.178856,788.5,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967,118.987011
3,mof_unit_4,2205.198301,1304.63872,0.982408,1441.53,0.21814,0.222,H-SO3H,9,17,24,sra,25.701377,6.190085,187.626004
4,mof_unit_5,1137.800963,901.73612,1.31602,0.0,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.001838,6.478063,79.210001
5,mof_unit_6,3954.659761,1543.02768,0.647909,2430.55,0.37094,0.5725,Pr-NO2,9,7,23,sra,17.146541,5.398304,55.786959
6,mof_unit_7,3565.914939,1954.749656,0.910268,1530.02,0.33337,0.3662,NH2,10,53,55,etb,18.363791,6.303857,111.690462
7,mof_unit_8,916.043907,639.11156,1.158537,1685.58,0.28458,0.2456,Br,2,5,13,pcu,12.684804,5.234732,58.989774
8,mof_unit_9,2228.882339,1422.69182,1.05992,1458.02,0.24277,0.2291,HCO,4,14,19,acs,37.040424,6.298964,135.587108
9,mof_unit_10,4275.42142,1606.196,0.623834,3276.11,0.41004,0.6573,NHMe-OH,3,13,29,pcu,10.999299,5.539327,59.739057


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68613 entries, 0 to 68612
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   MOFname               68613 non-null  object 
 1   volume                68613 non-null  float64
 2   weight                68613 non-null  float64
 3   density               68613 non-null  float64
 4   surface_area          68613 non-null  float64
 5   void_fraction         68613 non-null  float64
 6   void_volume           68613 non-null  float64
 7   functional_groups     68290 non-null  object 
 8   metal_linker          68613 non-null  int64  
 9   organic_linker1       68613 non-null  int64  
 10  organic_linker2       68613 non-null  int64  
 11  topology              68613 non-null  object 
 12  CO2/N2_selectivity    68613 non-null  float64
 13  heat_adsorption_CO2   66526 non-null  float64
 14  CO2_working_capacity  68613 non-null  float64
dtypes: float64(9), int6

In [5]:
df['metal_linker'].unique()

array([ 3, 10,  2,  9,  4,  1, 12], dtype=int64)

# Clean Data

In [6]:
# Show sum of NULL for each column
df.isnull().sum()

MOFname                    0
volume                     0
weight                     0
density                    0
surface_area               0
void_fraction              0
void_volume                0
functional_groups        323
metal_linker               0
organic_linker1            0
organic_linker2            0
topology                   0
CO2/N2_selectivity         0
heat_adsorption_CO2     2087
CO2_working_capacity       0
dtype: int64

In [7]:
# Clean na from selected columns
df['functional_groups'].fillna(value = 'empty',inplace=True)
df.dropna(inplace=True)
df.isnull().sum()

MOFname                 0
volume                  0
weight                  0
density                 0
surface_area            0
void_fraction           0
void_volume             0
functional_groups       0
metal_linker            0
organic_linker1         0
organic_linker2         0
topology                0
CO2/N2_selectivity      0
heat_adsorption_CO2     0
CO2_working_capacity    0
dtype: int64

In [8]:
# One hot encoding

In [9]:
one_hot = pd.get_dummies(df['metal_linker'], prefix='Metal')
one_hot.head()

Unnamed: 0,Metal_1,Metal_2,Metal_3,Metal_4,Metal_9,Metal_10,Metal_12
0,0,0,1,0,0,0,0
1,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0


# Process

In [10]:
df_new = df
df.max()
df.loc[df['heat_adsorption_CO2'] == df['heat_adsorption_CO2'].max()]
df_new = df.drop([68611, 68612])

In [11]:
df_new.max()

MOFname                 mof_unit_9999
volume                         223965
weight                        22595.9
density                       3.35179
surface_area                  7083.53
void_fraction                 0.87206
void_volume                    6.6101
functional_groups               empty
metal_linker                       12
organic_linker1                    59
organic_linker2                    59
topology                          the
CO2/N2_selectivity            914.244
heat_adsorption_CO2           17.1554
CO2_working_capacity          736.062
dtype: object

In [12]:
X = df_new.drop(['MOFname', 'CO2_working_capacity', 'functional_groups', 'topology'], axis = 1)
y = df_new['CO2_working_capacity']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)

In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53219 entries, 58252 to 37135
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   volume               53219 non-null  float64
 1   weight               53219 non-null  float64
 2   density              53219 non-null  float64
 3   surface_area         53219 non-null  float64
 4   void_fraction        53219 non-null  float64
 5   void_volume          53219 non-null  float64
 6   metal_linker         53219 non-null  int64  
 7   organic_linker1      53219 non-null  int64  
 8   organic_linker2      53219 non-null  int64  
 9   CO2/N2_selectivity   53219 non-null  float64
 10  heat_adsorption_CO2  53219 non-null  float64
dtypes: float64(8), int64(3)
memory usage: 4.9 MB


In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# defind model

In [17]:
model = Sequential()
model.add(Dense(128, input_dim = 11, activation = 'relu'))
model.add(Dense(64, activation = 'relu'))

In [18]:
#  output layer
model.add(Dense(1, activation = 'linear'))

In [19]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1536      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 9,857
Trainable params: 9,857
Non-trainable params: 0
_________________________________________________________________


In [20]:
y_train

58252     60.119228
30397    185.161081
14029    482.063139
23265     71.020388
16110    115.838671
            ...    
3915     338.620533
47497    110.451893
31962    128.950371
23775     74.298313
37135    111.531494
Name: CO2_working_capacity, Length: 53219, dtype: float64

In [37]:
y_train = np.asarray(y_train)
print(y_train)

y_test = np.asarray(y_test)
print(y_test)

[ 60.11922803 185.16108148 482.06313863 ... 128.95037088  74.29831305
 111.53149353]
[101.75731112  78.90857088  98.07742471 ... 142.66724156  96.28056649
 238.90472307]


In [22]:
history = model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [23]:
# Predict on test data
predictions = model.predict(X_test_scaled[:5])
print('Predicted value: ', predictions)
print('Real value: ', y_test[:5])

Predicted value:  [[188.1457  ]
 [ 61.919033]
 [ 78.817566]
 [ 85.129005]
 [175.29692 ]]
Real value:  32184    101.757311
939       78.908571
50647     98.077425
6811      74.048057
12119    141.185184
Name: CO2_working_capacity, dtype: float64


In [24]:
mse_neural, mae_neural = model.evaluate(X_test, y_test)
print('MSE from neural net: ', mse_neural)
print('MAE from neural net: ', mae_neural)

MSE from neural net:  54199881728.0
MAE from neural net:  186395.96875


# Orther model

In [30]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Random Forest

In [26]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators =  30, random_state=30)
model.fit(X_train_scaled, y_train)

RandomForestRegressor(n_estimators=30, random_state=30)

In [27]:
y_pred_RF = model.predict(X_test_scaled)

In [42]:
print(y_pred_RF[:20])
print(y_test[:20])

[170.02809063  66.24095425  81.36634608  77.959964   159.95090471
  72.01923871 331.68901229  89.03707909 136.17556779 192.87103118
  85.03539796 150.39012776 261.78166939 132.06591009  83.69116863
 261.93852513 146.49587845  86.17333114  90.90515684 109.49938472]
[101.75731112  78.90857088  98.07742471  74.048057   141.18518431
  47.0136358  354.52879438  77.91481006  83.34764433 157.96372973
  70.53357729 155.61947767 299.73592625  97.70699294  51.6255444
 299.01404651 139.72161564  77.94666983  93.28277638  75.25047457]


In [40]:
mse_RF = mean_squared_error(y_test, y_pred_RF)
mae_RF = mean_absolute_error(y_test, y_pred_RF)
print('MSE from RF: ', mse_RF)
print('MAE from RF: ', mae_RF)

MSE from RF:  1003.4653692344247
MAE from RF:  21.64652902058078
