In [5]:
import json
import numpy as np
import pandas as pd
import pickle

In [6]:
def extract_data_from_json(json_file):
    """Load the material lookup tables stored in a JSON file.

    Args:
        json_file: Path to a JSON file whose top-level value is an object.

    Returns:
        A 4-tuple ``(key_report_cas, natrual_oils, all_key_report_cas,
        key_natrual_oils)`` taken from the top-level keys
        ``'key_report_cas'``, ``'natrual_oils'``, ``'all_key_report_cas'`` and
        ``'key_recipe_code'`` respectively. A key that is absent yields an
        empty dict.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Be explicit about the encoding: without it, open() falls back to the
    # platform's locale encoding and non-ASCII content may fail to decode.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # The file handle is no longer needed once the document is parsed.
    key_report_cas = data.get('key_report_cas', {})
    natrual_oils = data.get('natrual_oils', {})
    all_key_report_cas = data.get('all_key_report_cas', {})
    # Note the differing key name: the "key oils" table lives under 'key_recipe_code'.
    key_natrual_oils = data.get("key_recipe_code", {})

    return key_report_cas, natrual_oils, all_key_report_cas, key_natrual_oils

In [7]:

# Load the four lookup tables returned by extract_data_from_json.
json_file = 'data/key_materials.json'  # Replace with your JSON file path
key_report_cas, natrual_oils, all_key_report_cas, key_natrual_oils = extract_data_from_json(json_file)
# Lengths of the zero vectors allocated by the extract_* functions.
# NOTE(review): presumably each value matches the size of the corresponding
# lookup table loaded above — confirm; an index >= the length would raise IndexError.
NUM_KEY_CAS = 110
NUM_ALL_KEY_CAS = 820
NUM_NATURAL_OILS = 235
NUM_KEY_OILS = 5

In [8]:

def extract_vector_from_report(key_report_cas, report, num_features=None):
    """Build the key-CAS feature vector for one analysis report.

    Area percentages are summed per CAS value, and each sum is written at the
    position that ``key_report_cas`` assigns to that CAS. CAS values that are
    not keys of ``key_report_cas`` are ignored. Matching is exact, so the
    mapping keys must use the same CAS formatting as the report.

    Args:
        key_report_cas: Mapping from CAS value to integer position in the
            output vector.
        report: DataFrame with at least the columns ``'CAS'`` and
            ``'面积百分比'``.
        num_features: Length of the returned vector. When omitted, the
            module-level ``NUM_KEY_CAS`` is used (the original behaviour).

    Returns:
        1-D float ``numpy`` array of length ``num_features``; positions with
        no matching CAS stay 0.

    Raises:
        KeyError: If ``report`` lacks one of the required columns.
        IndexError: If a mapped position is outside the vector.
    """
    if num_features is None:
        num_features = NUM_KEY_CAS
    input_vector = np.zeros(num_features)

    # One aggregated value per CAS; a CAS that appears on several rows
    # contributes the sum of its area percentages.
    area_by_cas = report.groupby('CAS')['面积百分比'].sum()
    for cas, area in area_by_cas.items():
        if cas in key_report_cas:
            input_vector[key_report_cas[cas]] = area
    return input_vector

In [9]:

def extract_all_vector_from_report(key_report_cas, report, num_features=None):
    """Build the full-CAS feature vector for one analysis report.

    Area percentages are summed per CAS value, and each sum is written at the
    position that ``key_report_cas`` assigns to that CAS. CAS values that are
    not keys of ``key_report_cas`` are ignored. Matching is exact, so the
    mapping keys must use the same CAS formatting as the report.

    Args:
        key_report_cas: Mapping from CAS value to integer position in the
            output vector.
        report: DataFrame with at least the columns ``'CAS'`` and
            ``'面积百分比'``.
        num_features: Length of the returned vector. When omitted, the
            module-level ``NUM_ALL_KEY_CAS`` is used (the original behaviour).

    Returns:
        1-D float ``numpy`` array of length ``num_features``; positions with
        no matching CAS stay 0.

    Raises:
        KeyError: If ``report`` lacks one of the required columns.
        IndexError: If a mapped position is outside the vector.
    """
    if num_features is None:
        num_features = NUM_ALL_KEY_CAS
    input_vector = np.zeros(num_features)

    # One aggregated value per CAS; a CAS that appears on several rows
    # contributes the sum of its area percentages.
    area_by_cas = report.groupby('CAS')['面积百分比'].sum()
    for cas, area in area_by_cas.items():
        if cas in key_report_cas:
            input_vector[key_report_cas[cas]] = area
    return input_vector

In [10]:
def extract_vector_from_recipe(natrual_oils, recipe, num_features=None):
    """Build the natural-oil usage vector for one recipe.

    Each row's component code (``'元件品号'``, compared as a string) is looked
    up in ``natrual_oils``; on a hit, the row's amount (``'组成用量'``) is
    written at the mapped position. Codes that are not keys of
    ``natrual_oils`` are ignored. If a code occurs on several rows, the last
    row wins (amounts are not summed).

    Args:
        natrual_oils: Mapping from component code (string) to integer position
            in the output vector.
        recipe: DataFrame with the columns ``'元件品号'`` and ``'组成用量'``.
        num_features: Length of the returned vector. When omitted, the
            module-level ``NUM_NATURAL_OILS`` is used (the original behaviour).

    Returns:
        1-D float ``numpy`` array of length ``num_features``; positions with
        no matching component stay 0.

    Raises:
        KeyError: If a non-empty ``recipe`` lacks one of the required columns.
        IndexError: If a mapped position is outside the vector.
    """
    if num_features is None:
        num_features = NUM_NATURAL_OILS
    output_feature = np.zeros(num_features)
    if len(recipe) == 0:
        return output_feature

    # Walk the two columns directly instead of using iterrows(): iterrows()
    # builds one Series per row, which upcasts integer codes to float when the
    # frame is all-numeric (98086 -> '98086.0'), so the lookup would miss.
    for raw_code, percentage in zip(recipe['元件品号'], recipe['组成用量']):
        code = str(raw_code)
        if code in natrual_oils:
            output_feature[natrual_oils[code]] = percentage
    return output_feature

In [11]:
def extract_key_vector_from_recipe(natrual_oils, recipe, num_features=None):
    """Build the key-oil usage vector for one recipe.

    Each row's component code (``'元件品号'``, compared as a string) is looked
    up in ``natrual_oils``; on a hit, the row's amount (``'组成用量'``) is
    written at the mapped position. Codes that are not keys of
    ``natrual_oils`` are ignored. If a code occurs on several rows, the last
    row wins (amounts are not summed).

    Args:
        natrual_oils: Mapping from component code (string) to integer position
            in the output vector.
        recipe: DataFrame with the columns ``'元件品号'`` and ``'组成用量'``.
        num_features: Length of the returned vector. When omitted, the
            module-level ``NUM_KEY_OILS`` is used (the original behaviour).

    Returns:
        1-D float ``numpy`` array of length ``num_features``; positions with
        no matching component stay 0.

    Raises:
        KeyError: If a non-empty ``recipe`` lacks one of the required columns.
        IndexError: If a mapped position is outside the vector.
    """
    if num_features is None:
        num_features = NUM_KEY_OILS
    output_feature = np.zeros(num_features)
    if len(recipe) == 0:
        return output_feature

    # Walk the two columns directly instead of using iterrows(): iterrows()
    # builds one Series per row, which upcasts integer codes to float when the
    # frame is all-numeric (98086 -> '98086.0'), so the lookup would miss.
    for raw_code, percentage in zip(recipe['元件品号'], recipe['组成用量']):
        code = str(raw_code)
        if code in natrual_oils:
            output_feature[natrual_oils[code]] = percentage
    return output_feature

In [12]:
# Paths to a sample report / recipe workbook pair.
# NOTE(review): neither name is referenced by any other cell visible here — confirm they are still needed.
sample_report = 'test_sample/sample_report.xlsx'
sample_recipe = 'test_sample/sample_recipe.xlsx'

In [13]:
# Restore the cached tables from the pickle bundle.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load database.pkl from a trusted source.
with open('data/database.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# The file is fully read at this point, so the remaining work happens outside
# the `with` block.
analysisReportDf = loaded_data['analysisReportDf']
formulaDf = loaded_data['formulaDf']

# Coerce the two numeric columns used for feature extraction to float.
analysisReportDf['面积百分比'] = analysisReportDf['面积百分比'].astype(float)
formulaDf['组成用量'] = formulaDf['组成用量'].astype(float)

In [14]:
# Preview the first rows of the formula table (one row per recipe component).
print(formulaDf.head())

     主件品号      主件品名                                  元件ID   元件品号  \
0  A00446  仿-粉香（水溶）  48DBFF66-04CB-4381-0672-154924518262  98086   
1  A00446  仿-粉香（水溶）  B3FCB979-1446-45BA-217B-1549245C316F  98061   
2  A00446  仿-粉香（水溶）  D305E665-DA05-4914-00AC-1549248F38EF  97099   
3  A00446  仿-粉香（水溶）  E7152084-CC7B-42A9-4AF3-154924A76F6E  97071   
4  A00446  仿-粉香（水溶）  F166E9CE-A38C-4783-C3ED-154924B32452  97055   

           元件品名        CAS号   组成用量   底数    Period          单位成本      成本金额  
0     异甲基紫罗兰酮70    127-51-5  0.220  100  2025-06   156.76243113  0.344877  
1        甲基紫罗兰酮    127-42-4  0.070  100  2025-06   223.47466395  0.156432  
2  羟基香草醛/羟醛（进口）    107-75-5  0.136  100  2025-06   334.96232957  0.455548  
3          波洁红醛  18127-01-0  0.080  100  2025-06   716.43315237  0.573146  
4          兔耳草醛    103-95-7  0.300  100  2025-06   314.34678257  0.943040  


In [15]:
# Preview the first rows of the analysis-report table.
print(analysisReportDf.head())

   Unnamed: 0      时间                              名称          CAS   匹配度  \
0         0.0   6.719                          丙二醇/PG  000057-55-6  91.0   
1         1.0   9.579  2-Propanol, 1-(2-propenyloxy)-  021460-36-6  81.0   
2         2.0  12.199         丁酸异丁酯/ISOBUTYL BUTYRATE  000539-90-2  79.0   
3         3.0  13.482                苯甲醛/BENZALDEHYDE  000100-52-7  63.0   
4         4.0  13.628               甲位蒎烯/ALPHA PINENE  000080-56-8  94.0   

   面积百分比     分析编号  
0  0.020  FC13521  
1  0.050  FC13521  
2  0.002  FC13521  
3  0.001  FC13521  
4  0.080  FC13521  


In [24]:
# Spot check: build the full-CAS vector for the single report with analysis number 'FC13525'.
report = analysisReportDf[analysisReportDf['分析编号'] == 'FC13525'].reset_index(drop=True)
input_vector = extract_all_vector_from_report(all_key_report_cas, report)
print(input_vector)

[0.000e+00 0.000e+00 0.000e+00 2.600e-01 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 7.887e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 7.000e-02 0.000e+00 1.000e-03 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 5.000e-02 0.000e+00 2.000e-02
 0.000e+00 4.000e-03 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 9.000e-02 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 7.400e-01 0.000e+00
 0.000e+00 4.000e-03 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.970e+00 0.000e+00 0.000e+00
 0.000

In [171]:
# Spot check: build the natural-oil vector for the single recipe with main item code 'P13412'.
recipe = formulaDf[formulaDf['主件品号'] == 'P13412'].reset_index(drop=True)
output_feature = extract_vector_from_recipe(natrual_oils, recipe)
print(output_feature)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [16]:
# Build the (input, target) training arrays from the report -> recipe pairing sheet.
report_to_recipe = pd.read_excel('data/report_recipe.xlsx').reset_index(drop=True)

input_vectors = []
output_features = []
for _, row in report_to_recipe.iterrows():
    # Analysis number identifying the report rows in analysisReportDf.
    report_id = row['分析报告-分析编号']

    # Main item code identifying the recipe rows in formulaDf.
    recipe_id = row['配方-调香师编号']

    
    report = analysisReportDf[analysisReportDf['分析编号'] == report_id].reset_index(drop=True)
    recipe = formulaDf[formulaDf['主件品号'] == recipe_id].reset_index(drop=True)
    
    # Input: key-CAS area vector; target: key-oil usage vector.
    input_vector = extract_vector_from_report(key_report_cas, report)
    output_feature = extract_key_vector_from_recipe(key_natrual_oils, recipe)

    # Drop pairs where either vector sums to zero (e.g. no matching report
    # rows, or no key CAS / key oil present).
    if input_vector.sum() == 0 or output_feature.sum() == 0:
        continue

    input_vectors.append(input_vector)
    output_features.append(output_feature)

# Stack the per-pair vectors into 2-D arrays (one row per retained pair).
input_vectors = np.array(input_vectors)
output_features = np.array(output_features)

# Save the processed data
np.save('data/input.npy', input_vectors)
np.save('data/key_output.npy', output_features)

In [17]:
# Sanity check: (number of retained pairs, NUM_KEY_CAS).
print(input_vectors.shape)

(448, 110)


In [18]:
# Sanity check: (number of retained pairs, NUM_KEY_OILS).
print(output_features.shape)


(448, 5)


In [21]:
# Inspect the first input/target pair.
print(input_vectors[0])
print(output_features[0])

[0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 1.000e-02 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.100e-01
 1.800e-01 3.900e-01 8.700e-01 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 6.000e-01 0.000e+00 0.000e+00 7.000e-02 2.000e-01 3.000e-02
 5.470e+00 5.780e+00 2.200e-01 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 6.000e-02 3.000e-02 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 2.180e+00 5.000e-02 8.400e-01 9.300e-02 4.000e-02
 3.000e-02 2.500e-01 0.000e+00 2.000e-02 0.000e+00 0.000e+00 0.000e+00
 0.000

In [19]:
import torch
from model import SimpleBinaryClassifier
# NOTE(review): StandardScaler is not used by any cell visible here — confirm it is still needed.
from sklearn.preprocessing import StandardScaler

# Use Apple's Metal (MPS) backend when it is available, otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [24]:
# Rebuild the classifier with the same widths as the feature vectors
# (NUM_KEY_CAS inputs, NUM_KEY_OILS outputs) instead of repeating the literals.
model = SimpleBinaryClassifier(NUM_KEY_CAS, NUM_KEY_OILS)
# map_location="cpu" deserialises the checkpoint tensors onto the CPU regardless
# of the device they were saved from, so loading does not fail on a machine
# without that accelerator; the model is moved to `device` below.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load("key_cas_key_oil_compound_to_oil_binary_model.pth", map_location="cpu")
)
# Inference mode: disables dropout and makes BatchNorm use its running statistics.
model.eval()
# Left as the last bare expression so the cell displays the module summary.
model.to(device)

SimpleBinaryClassifier(
  (model): Sequential(
    (0): Linear(in_features=110, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=32, out_features=5, bias=True)
  )
)

In [33]:
# Run the model on the first 10 samples and print predictions next to the targets.
# NOTE(review): range(10) assumes at least 10 samples exist in input_vectors — confirm.
for i in range(10):
    # 2. Inference
    x_new = input_vectors[i]
    x_new = x_new.reshape(1, -1)  # Reshape to match model input
    # presumably mirrors the log1p preprocessing applied at training time — TODO confirm
    x_new = np.log1p(x_new)

    y_new = output_features[i]
    y_new = y_new.reshape(1, -1)  # Reshape to match model output
    y_new = np.log1p(y_new)

    x_tensor = torch.tensor(x_new, dtype=torch.float32).to(device)

    # No gradients are needed for inference; move the result back to the CPU for numpy.
    with torch.no_grad():
        y_pred = model(x_tensor)
        y_pred_np = y_pred.cpu().numpy()

    # 3. Reverse output transformation
    # expm1 is the inverse of log1p.
    # NOTE(review): y_pred_original is on the expm1 scale while y_new printed
    # below is still log1p-transformed, so the two "Raw Results" rows are on
    # different scales — confirm this is intended.
    y_pred_original = np.expm1(y_pred_np)

    # Loop-invariant decision threshold applied to the back-transformed prediction.
    threshold = 0.1

    print('Raw Results')
    print(y_pred_original)
    print(y_new)
    print('Tresholded Results')
    # Predicted presence (> threshold) versus actual presence (> 0) per output.
    print(y_pred_original > threshold)
    print(y_new > 0)
    print('-------------------')

Raw Results
[[-0.8342449  -0.97847515  1.9521244   0.83136576 -0.6404208 ]]
[[0.         0.         0.40546511 0.26236426 0.        ]]
Tresholded Results
[[False False  True  True False]]
[[False False  True  True False]]
-------------------
Raw Results
[[-0.8225541  -0.99903566 -0.9409154  -0.9662168  20.012938  ]]
[[0.         0.         0.         0.         0.43825493]]
Tresholded Results
[[False False False False  True]]
[[False False False False  True]]
-------------------
Raw Results
[[ 4.9215074  -0.9978858  -0.9945512  -0.6596781  -0.98317844]]
[[2.62466859 0.         0.         0.         0.        ]]
Tresholded Results
[[ True False False False False]]
[[ True False False False False]]
-------------------
Raw Results
[[ 0.84547985 -0.99940205 -0.9997676  -0.60522735 -0.7856935 ]]
[[0.30010459 0.         0.         0.         0.        ]]
Tresholded Results
[[ True False False False False]]
[[ True False False False False]]
-------------------
Raw Results
[[-0.78341794 -0.982

In [30]:
# Re-threshold with a stricter cut-off.
# NOTE(review): y_pred_original and y_new are not defined in this cell; among
# the cells visible here they are only assigned inside the inference loop, so
# this reuses whatever its last executed iteration left behind.
threshold = 0.3
print(y_pred_original > threshold)
print(y_new > 0)

[[False False False False  True]]
[[False False False False  True]]


In [160]:
# Mean squared error between sample 0's raw target and a back-transformed prediction.
# NOTE(review): y_pred_original is whatever was last assigned elsewhere (in the
# visible cells, the last iteration of the inference loop) — confirm it really
# corresponds to sample 0 before trusting this number.
y = output_features[0]
print(np.mean((y - y_pred_original) ** 2))

0.02153033398463458


In [4]:
# Read the master CAS sheet from the workbook; its 'CAS' column is used by a later cell.
all_key_cas = pd.read_excel('data/PEP天然油识别(1).xlsx', sheet_name='主要CAS总表8.15').reset_index(drop=True)

In [10]:
# Give every CAS value its position in the master list as its vector index.
list_all_key_cas = list(all_key_cas['CAS'])
print(len(list_all_key_cas))
# enumerate replaces the hand-maintained counter. If a CAS value is repeated in
# the list, the last occurrence's position wins (same as the original loop).
dict_all_key_cas = {cas: index for index, cas in enumerate(list_all_key_cas)}

820


In [14]:
# Re-read the lookup-table JSON as a plain dict so a new entry can be added.
# The encoding is explicit so decoding does not depend on the platform's
# locale default.
with open('data/key_materials.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

print(type(data))

<class 'dict'>


In [15]:
# Add (or overwrite) the full CAS -> index table in the loaded JSON document.
data['all_key_report_cas'] = dict_all_key_cas

In [16]:
# Write the updated document out, indented for readability.
# NOTE(review): this writes to "data.json" in the working directory, whereas the
# cells that read the lookup tables open 'data/key_materials.json' — confirm the
# file is copied/renamed afterwards, otherwise the new table is never picked up.
with open("data.json", "w") as f:
    json.dump(data, f, indent=4)