In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, make_scorer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the semicolon-separated source table; the trailing expression makes the
# notebook display its (n_rows, n_columns) shape.
dataset = pd.read_csv(".csv", sep=";")
dataset.shape

##### **Variable Encoding**

In [None]:
# Integer codes for the two categorical columns.
# "Dissolve" is the binary target; "Sample_type" gets one integer per category.
dissolve_encoding: dict[str, int] = {"NO": 0, "YES": 1}
sample_type_enconding: dict[str, int] = {"pellet": 0, "waste": 1, "fiber": 2, "film": 3, "powder": 4}

# Column name -> {raw label -> integer code}
encoding: dict[str, dict[str, int]] = {
    "Dissolve": dissolve_encoding,
    "Sample_type": sample_type_enconding,
}

# Encode on a copy so `dataset` keeps the raw labels and this cell stays re-runnable.
#
# Series.map is used instead of DataFrame.replace: swapping strings for ints via
# `replace` depends on an implicit object -> integer downcast that pandas has
# deprecated (FutureWarning since pandas 2.2). Without that downcast the columns
# would stay object dtype and the numeric steps later in the notebook would fail.
# `map` with a dict takes its dtype from the mapping values, so the encoded
# columns are numeric by construction.
df: DataFrame = dataset.copy()
for column, mapping in encoding.items():
    labels = df[column]

    # Fail early, with a readable message, on labels the mapping does not know;
    # `map` would otherwise turn them into NaN without any warning.
    unexpected = set(labels.dropna().unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains labels with no encoding: "
            f"{sorted(unexpected, key=str)}"
        )

    # Missing values stay missing (NaN), as they did with `replace`.
    df[column] = labels.map(mapping)

##### **Variable Normalization**

In [4]:
# Remove the polymer and solvent identifier columns so they are not used as features.
# errors="ignore" keeps this cell re-runnable: `df` is rebound here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=["Polymer_id", "Solvent_id"], errors="ignore")

# Separate features (X) from target "Dissolve" (y)
X = df.drop(columns=["Dissolve"])
y = df["Dissolve"]

# log1p is only finite for inputs strictly greater than -1. Anything below -1
# becomes NaN, which MinMaxScaler would pass through silently, so stop here
# with an explicit message instead.
out_of_domain = (X <= -1).any()
if out_of_domain.any():
    raise ValueError(
        "log1p requires values > -1; offending columns: "
        f"{out_of_domain[out_of_domain].index.tolist()}"
    )

# Apply log(1+x) transformation to compress large feature ranges
X_log = np.log1p(X)

# Scale every feature to [0, 1] with min-max normalization.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/validation/test splits performed later in the notebook, so held-out
# rows contribute to the min/max. Tree-based models are largely insensitive to
# monotonic rescaling, but a scale-sensitive model would need the scaler to be
# fitted on the training split only.
min_max_scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
X_scaled = min_max_scaler.fit_transform(X_log)

# Rebuild the dataset: normalized features (original column names and index)
# plus the untouched target column
df_log_minmax = DataFrame(X_scaled, columns=X.columns, index=X.index)
df_log_minmax["Dissolve"] = y

In [5]:
# Re-derive the modelling inputs from the normalized table:
# y is the "Dissolve" target, X is every remaining (scaled) column.
y = df_log_minmax["Dissolve"]
X = df_log_minmax.drop("Dissolve", axis="columns")

##### **Decision Tree Model**

In [None]:
# Fixed Decision Tree hyperparameters. Despite the name, `param_grid` holds a
# single configuration (not a search grid); it is unpacked directly into the
# estimator constructor below.
param_grid = {
    'criterion': 'gini',
    'max_depth': 12,
    'max_features': 0.5,
    'max_leaf_nodes': 123,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'splitter': 'best'
}

# Per-iteration accuracies, one entry per random split.
# (The never-populated `recall_test_list` / `best_params_list` accumulators
# were removed: nothing in this cell writes to or reads from them.)
acc_val_list = []
acc_train_list = []
acc_test_list = []

# Repeated stratified hold-out: the loop index seeds both the data splits and
# the model, so every iteration is reproducible but uses a different partition.
n_iterations = 5
for i in range(n_iterations):
    # Split data into Train (70%) and a temporary hold-out (30%)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.30, random_state=i, stratify=y
    )
    # Halve the hold-out into Validation (15%) and Test (15%)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=i, stratify=y_temp
    )

    print(f"\n{'='*30}\nStarting Iteration {i+1}\n{'='*30}")

    # Train the model on the training split only
    model = DecisionTreeClassifier(**param_grid, random_state=i)
    model.fit(X_train, y_train)

    # Predictions for each split
    y_val_pred = model.predict(X_val)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Accuracy for each split
    acc_val = accuracy_score(y_val, y_val_pred)
    acc_train = accuracy_score(y_train, y_train_pred)
    acc_test = accuracy_score(y_test, y_test_pred)

    # Store metrics for the final summary
    acc_val_list.append(acc_val)
    acc_train_list.append(acc_train)
    acc_test_list.append(acc_test)

    # Show iteration results
    print(f"Running parameters:  {param_grid}")
    print(f"Accuracy - Validation:  {acc_val:.4f}")
    print(f"Accuracy - Train:     {acc_train:.4f}")
    print(f"Accuracy - Test:      {acc_test:.4f}")


# Mean and standard deviation across all iterations
# (np.std defaults to the population standard deviation, ddof=0)
print("\n=== Final Results ===")
print(f"Accuracy - Validation: Mean = {np.mean(acc_val_list):.4f}, Standard Deviation = {np.std(acc_val_list):.4f}")
print(f"Accuracy - Train:     Mean = {np.mean(acc_train_list):.4f}, Standard Deviation = {np.std(acc_train_list):.4f}")
print(f"Accuracy - Test:      Mean = {np.mean(acc_test_list):.4f}, Standard Deviation = {np.std(acc_test_list):.4f}")

##### **Gradient Boosting Model**

In [None]:
# Fixed Gradient Boosting hyperparameters. Despite the name, `param_grid` holds
# a single configuration (not a search grid); it is unpacked directly into the
# estimator constructor below.
param_grid = {
    'learning_rate': 0.1,
    'loss': 'exponential',
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 10,
    'n_estimators': 250,
    'subsample': 0.8
}

# Per-iteration accuracies, one entry per random split.
# (The never-populated `recall_test_list` / `best_params_list` accumulators
# were removed: nothing in this cell writes to or reads from them.)
acc_val_list = []
acc_train_list = []
acc_test_list = []

# Repeated stratified hold-out: the loop index seeds both the data splits and
# the model, so every iteration is reproducible but uses a different partition.
n_iterations = 5
for i in range(n_iterations):
    # Split data into Train (70%) and a temporary hold-out (30%)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.30, random_state=i, stratify=y
    )
    # Halve the hold-out into Validation (15%) and Test (15%)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=i, stratify=y_temp
    )

    print(f"\n{'='*30}\nStarting Iteration {i+1}\n{'='*30}")

    # Train the model on the training split only
    model = GradientBoostingClassifier(**param_grid, random_state=i)
    model.fit(X_train, y_train)

    # Predictions for each split
    y_val_pred = model.predict(X_val)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Accuracy for each split
    acc_val = accuracy_score(y_val, y_val_pred)
    acc_train = accuracy_score(y_train, y_train_pred)
    acc_test = accuracy_score(y_test, y_test_pred)

    # Store metrics for the final summary
    acc_val_list.append(acc_val)
    acc_train_list.append(acc_train)
    acc_test_list.append(acc_test)

    # Show iteration results
    print(f"Running parameters:  {param_grid}")
    print(f"Accuracy - Validation:  {acc_val:.4f}")
    print(f"Accuracy - Train:     {acc_train:.4f}")
    print(f"Accuracy - Test:      {acc_test:.4f}")


# Mean and standard deviation across all iterations
# (np.std defaults to the population standard deviation, ddof=0)
print("\n=== Final Results ===")
print(f"Accuracy - Validation: Mean = {np.mean(acc_val_list):.4f}, Standard Deviation = {np.std(acc_val_list):.4f}")
print(f"Accuracy - Train:     Mean = {np.mean(acc_train_list):.4f}, Standard Deviation = {np.std(acc_train_list):.4f}")
print(f"Accuracy - Test:      Mean = {np.mean(acc_test_list):.4f}, Standard Deviation = {np.std(acc_test_list):.4f}")