In [1]:
from pandas import read_csv, DataFrame
import numpy as np
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.preprocessing import MinMaxScaler

#### **Create Grid records**

In [2]:
import numpy as np
import pandas as pd

dataset = pd.read_excel("./895_RecordsDescriptorsBoiling.xlsx")

polymer_name = 'PET'
solvent_name = 'Cyrene'
sample_type = "fiber"

# Filter dataset for polymer/solvent of interest
df_onlySolventPolymer= dataset[(dataset["Polymer"] == polymer_name) & (dataset["Solvent"] == solvent_name) & (dataset["Sample type"] == sample_type)].copy()

# Step 1: Save original records with actual Dissolution
df_original_records = df_onlySolventPolymer.copy()
df_original_records["Source"] = "Original"

# Get solvent melting & boiling from dataset
solvent_boiling = df_onlySolventPolymer["Solvent boilling (K)"].iloc[0]
solvent_melting = df_onlySolventPolymer["Solvent melting (K)"].iloc[0]

time_vals = np.arange(1, 8 * 60 + 1, 5) # 0–480 minutes
temp_lower = max(dataset["Temperature (K)"].min(), solvent_melting)
print("Dataset Temperature:", dataset["Temperature (K)"].min())
print("Solvent Melting:", solvent_melting)

temp_vals = np.arange(temp_lower, solvent_boiling + 1, 5) 

# Step 3: Create grid records
template = df_onlySolventPolymer.iloc[0].copy()
grid_records = []

for t in time_vals:
    for temp in temp_vals:
        row = template.copy()
        row["Time (min)"] = t
        row["Temperature (K)"] = temp
        grid_records.append(row)

df_grid = pd.DataFrame(grid_records)
df_grid["Source"] = "GridSearch"

# Step 5: Combine original + grid
df_out = pd.concat([df_original_records, df_grid], ignore_index=True)

# Step 6: Save to Excel
output_file = f"dataset_gridsearch_{polymer_name}_{solvent_name}.xlsx"
df_out.to_excel(output_file, index=False)

print("Excel file created:", output_file)

Dataset Temperature: 293.15
Solvent Melting: 255.15
Excel file created: dataset_gridsearch_PET_Cyrene.xlsx


#### **Obtain Trained Model**

In [3]:
dataset = pd.read_excel("./895_RecordsDescriptorsBoiling.xlsx")

# Encoding
dissolve_encoding: dict[str, int] = {"NO": 0,"YES": 1}
sample_type_enconding: dict[str, int] = {"pellet": 0, "waste": 1, "fiber": 2, "film": 3, "powder": 4}

encoding: dict[str, dict[str, int]] = {
    "Dissolution": dissolve_encoding,
    "Sample type": sample_type_enconding
}
df: DataFrame = dataset.replace(encoding, inplace=False)

# Normalization
# Remove Polymer and Solvent Identifier Columns
df = df.drop(columns=["Polymer_ID", "Solvent_ID", "Polymer", "Solvent", "Solvent boilling (K)"])

# Separate features (X) from target "Dissolve" (y)
X = df.drop(columns=["Dissolution"])
y = df["Dissolution"]

# Apply log(1+x) Transformation
X_log = np.log1p(X)

# Scale to [0,1] with MinMax Normalization
min_max_scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
X_scaled = min_max_scaler.fit_transform(X_log)

# Rebuild Dataset merging normalized features with target
df_log_minmax = DataFrame(X_scaled, columns = X.columns, index= X.index)
df_log_minmax["Dissolution"] = y

# Separate features (X) from target "Dissolve" (y)
X = df_log_minmax.drop(columns=["Dissolution"])
y = df_log_minmax["Dissolution"]

# Treinar modelo
param_grid = {'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 25, 'subsample': 0.9}
model = GradientBoostingClassifier(**param_grid, random_state = 4)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state = 5)
model.fit(X_train, y_train)


  df: DataFrame = dataset.replace(encoding, inplace=False)


#### **Normalize GridSearch Dataset**

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Enconding 
df_out_encoded = df_out.replace(encoding, inplace=False)

# Keep raw Time and Temp columns for reference
df_raw = df_out_encoded[["Time (min)", "Temperature (K)"]].copy()
df_raw = df_raw.rename(columns={
    "Time (min)": "Time (min) Unormalized",
    "Temperature (K)": "Temperature (K) Unormalized"
})

drop_cols = ["Polymer_ID", "Solvent_ID", "Polymer", "Solvent", "Source",'Solvent boilling (K)']
df_encoded = df_out_encoded.drop(columns=[c for c in drop_cols if c in df_out_encoded.columns])

y = df_encoded["Dissolution"]
X = df_encoded.drop(columns=["Dissolution"])

X_log = np.log1p(X)

# --- Step 7: MinMax scale ---
#scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
X_scaled = min_max_scaler.transform(X_log)

# --- Step 8: Rebuild normalized dataframe with raw values ---
df_log_minmax = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
df_log_minmax["Dissolution"] = y
df_to_test = pd.concat([df_log_minmax, df_raw, df_out[["Source"]]], axis=1)

df_to_test.to_excel(f"./dataset_gridsearch_normalized_{polymer_name}_{solvent_name}.xlsx", index = False)

  df_out_encoded = df_out.replace(encoding, inplace=False)


In [5]:
import numpy as np

# Make a copy so we don't overwrite accidentally
df_scored = df_to_test.copy()

# Build feature matrix (exclude columns not used by model)
X_all = df_scored.drop(columns=["Dissolution", 'Time (min) Unormalized','Temperature (K) Unormalized', 'Source'])

# Predict probabilities for all records
probs = model.predict_proba(X_all)[:, 1]

# Overwrite Dissolution only for GridSearch
df_scored["Dissolution"] = df_scored["Dissolution"].astype(float)
df_scored.loc[df_scored["Source"] == "GridSearch", "Dissolution"] = probs[df_scored["Source"] == "GridSearch"]

# Save to Excel
output_file = f"./dataset_gridsearch_scored_{polymer_name}_{solvent_name}.xlsx"
df_scored.to_excel(output_file, index=False)

print("Excel file created:", output_file)


Excel file created: ./dataset_gridsearch_scored_PET_Cyrene.xlsx


In [6]:
#df_scored["Dissolution"] = (df_scored["Dissolution"] >= 0.5).astype(int)

In [7]:
import numpy as np
import plotly.graph_objs as go

fig = go.Figure()

# --- Plot Original points (real experimental YES/NO) ---
for outcome, color, marker in [(0, "red", "circle"), (1, "green", "diamond")]:
    subset = df_scored[(df_scored["Source"] == "Original") & (df_scored["Dissolution"] == outcome)]
    fig.add_trace(go.Scatter3d(
        x=subset['Time (min) Unormalized'],             
        y=subset['Temperature (K) Unormalized'],
        z=[outcome] * len(subset),
        mode="markers",
        marker=dict(size=6, color=color, symbol=marker, opacity=1),
        name=f"Original Dissolution {outcome}"
    ))

# --- Prepare GridSearch data for surface ---
grid_subset = df_scored[df_scored["Source"] == "GridSearch"]

# Create sorted unique axis values
time_vals = np.sort(grid_subset['Time (min) Unormalized'].unique())
temp_vals = np.sort(grid_subset['Temperature (K) Unormalized'].unique())

# Pivot to a 2D matrix (Temperature as rows, Time as columns)
Z = grid_subset.pivot_table(
    index='Temperature (K) Unormalized',
    columns='Time (min) Unormalized',
    values='Dissolution'
).values

# Add probability surface
fig.add_trace(go.Surface(
    x=time_vals,
    y=temp_vals,
    z=Z,
    colorscale="Viridis",
    opacity=0.7,
    name="GridSearch Probability Surface",
    showscale=True
))

# --- Layout ---
fig.update_layout(
    scene=dict(
        xaxis=dict(title="Time (min)", range=[0, df_scored['Time (min) Unormalized'].max()]),
        yaxis=dict(
            title="Temperature (K)",
            range=[df_scored['Temperature (K) Unormalized'].min(),
                   df_scored['Temperature (K) Unormalized'].max()]
        ),
        zaxis=dict(title="Dissolution / Probability", range=[0, 1])
    ),
    legend=dict(x=0, y=1),
    margin=dict(l=0, r=0, b=0, t=40),
    autosize=True,
    height=800, width=900,
    scene_aspectmode="cube"
)

fig.show()
