In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

In [2]:
# All three sheets live in the same workbook, so open and parse it only once
# instead of re-reading the file for every sheet.
with pd.ExcelFile('data/0.xlsx') as workbook:
    # .loc[:49] is label-based and inclusive, so with the default RangeIndex
    # this keeps the first 50 rows of the 'TMCs' sheet.
    tmcs = pd.read_excel(
        workbook,
        sheet_name='TMCs',
        usecols=['compounds', 'CurrentDensity', 'overpotential'],
    ).loc[:49, :]

    # Descriptor table read from 'TM_descriptors'; missing cells and '/'
    # placeholders are both replaced by 0.
    cols_tm = ['element','AN','AW','CR','AR','VR','AV','FIE','SIE','EA','EN','Nd','Nv','D','EC','TC']
    tm = (
        pd.read_excel(workbook, sheet_name='TM_descriptors', usecols=cols_tm)
        .fillna(0)
        .replace('/', 0)
    )

    # Descriptor table read from 'C_descriptors'; missing cells become 0.
    cols_c = ['element','AN','AW','CR','AR','VR','AV','FIE','SIE','EA','EN','Np','Nv','D']
    c = pd.read_excel(workbook, sheet_name='C_descriptors', usecols=cols_c).fillna(0)

# Formulas of the candidate compounds to predict for.
new_tmcs = pd.read_csv('new_TMCs_1024.csv')['compound']

In [3]:
import re
import json

def parse_compound(compound):
    """Split a chemical formula string into per-element amounts.

    An element symbol is one uppercase letter followed by any number of
    lowercase letters. It may be followed by an integer or decimal amount;
    a symbol without an amount counts as 1.0. A symbol that appears more
    than once has its amounts added together.

    Args:
        compound: Formula text, e.g. ``'Ti0.5V1.5C'``.

    Returns:
        dict mapping each element symbol to its float amount, keyed in order
        of first appearance. An input without any symbol yields ``{}``.
    """
    # The amount group only accepts text float() can parse ('2', '2.', '2.5',
    # '.5'); a stray '.' or a doubled dot is no longer captured as an amount.
    pattern = r'([A-Z][a-z]*)(\d+\.?\d*|\.\d+)?'
    result = {}
    for element, count in re.findall(pattern, compound):
        amount = float(count) if count else 1.0
        # Accumulate so a repeated symbol does not overwrite its earlier amount.
        result[element] = result.get(element, 0.0) + amount

    return result

def get_res(compound_name):
    """Return the element-to-amount mapping that ``parse_compound`` builds for ``compound_name``."""
    return parse_compound(compound_name)

In [29]:
# Constant appended as the last feature of every row.
# NOTE(review): presumably the current density the predictions are made for
# (the training sheet has a 'CurrentDensity' column) — confirm.
TRAILING_FEATURE = 50


def _scaled_descriptors(table, element, amount):
    """Return the descriptor values of ``element`` in ``table`` multiplied by ``amount``.

    Raises:
        KeyError: if ``table`` has no row whose 'element' equals ``element``.
    """
    rows = table[table['element'] == element]
    if rows.empty:
        # Without this check the row would silently come out too short.
        raise KeyError("element %r not found in descriptor table" % (element,))
    # The first flattened entry is skipped (assumes 'element' is the first
    # column of the sheet — TODO confirm); the rest is scaled by the amount.
    return rows.values.flatten()[1:] * float(amount)


# Each row is [formula, descriptors from `tm`..., descriptors from `c`..., TRAILING_FEATURE].
train_data = []
for formula in new_tmcs.values:
    composition = get_res(formula)
    symbols = list(composition.keys())
    amounts = list(composition.values())
    if len(symbols) < 2:
        raise ValueError("compound %r must contain at least two elements" % (formula,))

    # With three or more elements the first two are looked up in `tm` and
    # summed; with two elements only the first one is. The element right
    # after them is looked up in `c`; any further elements are ignored.
    n_tm = 2 if len(symbols) > 2 else 1
    tm_feat = _scaled_descriptors(tm, symbols[0], amounts[0])
    if n_tm == 2:
        tm_feat = tm_feat + _scaled_descriptors(tm, symbols[1], amounts[1])
    c_feat = _scaled_descriptors(c, symbols[n_tm], amounts[n_tm])

    train_data.append([formula] + list(tm_feat) + list(c_feat) + [TRAILING_FEATURE])

In [30]:
# Feature matrix for prediction: drop the leading compound name from every row
# and build the array directly as floats. Converting the mixed name/number
# rows with a plain np.array(...) would turn every feature into a string.
predict_data = np.array([row[1:] for row in train_data], dtype=float)
predict_data.shape

(16164, 29)

In [31]:
# Show the feature matrix with every entry converted to float.
predict_data_as_float = predict_data.astype(float)
print(predict_data_as_float)

[[2.10000e+01 4.49559e+01 1.70000e+00 ... 6.00000e+00 1.42900e-03
  5.00000e+01]
 [2.10000e+01 4.49559e+01 1.70000e+00 ... 1.20000e+01 2.85800e-03
  5.00000e+01]
 [2.10000e+01 4.49559e+01 1.70000e+00 ... 1.80000e+01 4.28700e-03
  5.00000e+01]
 ...
 [4.20000e+01 9.79835e+01 1.68000e+00 ... 1.20000e+01 3.92000e+00
  5.00000e+01]
 [4.25000e+01 9.95266e+01 1.65000e+00 ... 1.20000e+01 3.92000e+00
  5.00000e+01]
 [4.30000e+01 9.94100e+01 1.64000e+00 ... 1.20000e+01 3.92000e+00
  5.00000e+01]]


In [32]:
# Import kept from the original cell; the pickled object is presumably an
# XGBoost model, so xgboost has to be installed for the load below — confirm.
from xgboost import XGBRegressor

# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
with open('model/XGBoost_tuning_5.pkl', 'rb') as f:
    model = pickle.load(f)

# Cast explicitly to float so the model always receives a numeric matrix.
y_pred = model.predict(predict_data.astype(float))

In [7]:
# NOTE(review): this cell reassigns `model` and `y_pred`, which the XGBoost
# cell above also sets. When the notebook is run top to bottom, the export
# cell below therefore receives the predictions made here — confirm that
# this is intended.
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
with open('model/rf_tuning_4.pkl', 'rb') as f:
    model = pickle.load(f)

# Cast explicitly to float, as the XGBoost cell does, so the model never
# receives non-numeric input.
y_pred = model.predict(predict_data.astype(float))

In [33]:
import os

# Compound names are the first entry of every row in train_data.
compound_names = np.array([row[0] for row in train_data])

# Keep the predictions numeric. Stacking them next to the names in a single
# array would turn them into strings, and sorting strings orders them
# character by character (e.g. '100.0' before '20.0') instead of by value.
y_pred = np.array(y_pred).reshape(-1, 1)
predicted = y_pred[:, 0]
sorted_indices = np.argsort(predicted, kind='stable')

# Ascending by predicted value, one row per compound.
df_sorted_compound = pd.DataFrame({
    'compound': compound_names[sorted_indices],
    'overpotential': predicted[sorted_indices],
})

# exist_ok avoids the separate existence check before creating the folder.
os.makedirs('predict/', exist_ok=True)
df_sorted_compound.to_csv('predict/predict_5_50.csv', index=False)