In [None]:
import pandas as pd
import numpy as np
import kpi_features as kf
import kpi_mls as km
import shap
from sklearn.linear_model import LinearRegression
from sklearn import tree, svm, neighbors, ensemble
from sklearn.tree import ExtraTreeRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
# Load the raw table; it must provide a "SMILES" column and the property column.
raw_data = pd.read_csv("your_data.csv")

# Name of the property (target) column in the CSV
props = 'your_props'

# Keep only the rows where BOTH the SMILES string and the property value are
# present. Filtering the two columns independently would pair a molecule with
# another row's property value as soon as their missing-value patterns differ
# (and zip() would silently truncate to the shorter sequence).
valid_rows = raw_data.dropna(subset=["SMILES", props])
smiles = valid_rows["SMILES"].values
props_list = valid_rows[props].values

# One feature vector per molecule, alongside its matching property value
X = [kf.extract_features(smi) for smi in smiles]
y = list(props_list)

# Sanity check: both lengths must be identical
print(len(X))
print(len(y))

# Feature frame: column labels come from the same module that extracted the
# features (`kf`); the previously referenced `xf` is not defined in this notebook.
data_x = pd.DataFrame(np.array(X), columns=kf.feature_names())
# NOTE(review): the target column is labelled 'MP' regardless of `props` --
# confirm that kf.plot_pearson expects this label before changing it.
data_y = pd.DataFrame(np.array(y), columns=['MP'])

# Features and target side by side (both frames carry a default RangeIndex)
data = pd.concat([data_x, data_y], axis=1)

# Pearson-correlation plot via kf.plot_pearson; 'PRGn' and 'tmp' are passed
# through unchanged (presumably colormap and output name -- confirm against
# kpi_features).
kf.plot_pearson(data, 'PRGn', 'tmp')


In [None]:
# Input CSV locations (despite the *_dir names, these are file paths)
train_dir, test_dir = "your_train.csv", "your_test.csv"

# Training split: load the table, then keep element [1] of what
# kf.features_standard returns as the dataset handed to the models.
train_data = pd.read_csv(train_dir)
train_standardized = kf.features_standard(train_data, props)
train_dataset = train_standardized[1]

# Test split: the same two steps on the test table
test_data = pd.read_csv(test_dir)
test_standardized = kf.features_standard(test_data, props)
test_dataset = test_standardized[1]

In [None]:
# --- Build every candidate regressor up front -------------------------------
# The module-level names are kept so other cells can still refer to the models.
model_linear_regression = LinearRegression()
model_decision_tree_regression = tree.DecisionTreeRegressor()
model_svm = svm.SVR()
model_k_neighbor = neighbors.KNeighborsRegressor()
model_random_forest_regressor = ensemble.RandomForestRegressor(n_estimators=100)
model_adaboost_regressor = ensemble.AdaBoostRegressor(n_estimators=50)
model_gradient_boosting_regressor = ensemble.GradientBoostingRegressor(n_estimators=100)
model_bagging_regressor = ensemble.BaggingRegressor()
model_extra_tree_regressor = ExtraTreeRegressor()
# Every MLP hyper-parameter is spelled out explicitly, one per line.
model_MLPRegressor = MLPRegressor(
    hidden_layer_sizes=(100, ),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    power_t=0.5,
    max_iter=200,
    shuffle=True,
    random_state=None,
    tol=0.0001,
    verbose=False,
    warm_start=False,
    momentum=0.9,
    nesterovs_momentum=True,
    early_stopping=False,
    validation_fraction=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    n_iter_no_change=10,
    max_fun=15000,
)

# --- Evaluate them one after another, in the original order -----------------
# Each (label, model) pair is handed to km.try_different_method together with
# the shared train/test datasets and the property name.
# %matplotlib inline
for label, regressor in (
    ("LR", model_linear_regression),                # 1. Linear Regression
    ('DT', model_decision_tree_regression),         # 2. DecisionTree
    ('SVM', model_svm),                             # 3. SVM
    ('KNN', model_k_neighbor),                      # 4. kNN
    ('RF', model_random_forest_regressor),          # 5. RandomForest
    ('AdaBoost', model_adaboost_regressor),         # 6. Adaboost
    ('GB', model_gradient_boosting_regressor),      # 7. GBRT
    ('Bagging', model_bagging_regressor),           # 8. Bagging
    ('Extra Trees', model_extra_tree_regressor),    # 9. ExtraTree
    ('MLP', model_MLPRegressor),                    # 10. Neural network
):
    km.try_different_method(train_dataset, test_dataset, props, regressor, label)

In [None]:
def calculate_shap_values(train_dataset, train_data, props, xm):
    """
    Fit a random forest on the training set and compute SHAP values with it.

    Parameters:
    train_dataset (object): Training dataset exposing the feature matrix as `.X`
        and the targets as `.y`.
    train_data (DataFrame): Training table handed to `xm.features_standard`.
    props: Property identifier forwarded unchanged to `xm.features_standard`.
    xm (module): Module providing `features_standard(train_data, props)`;
        element [0] of its return value is the input that gets explained.

    Returns:
    shap_values: Whatever `shap.TreeExplainer(...).shap_values(...)` returns
        for the standardized features.
    """

    # 100-tree random forest, trained on the prepared dataset
    forest = ensemble.RandomForestRegressor(n_estimators=100)
    forest.fit(train_dataset.X, train_dataset.y)

    # Load SHAP's JavaScript support (only needed for notebook visualizations)
    shap.initjs()

    # Tree-based explainer wrapped around the fitted forest
    tree_explainer = shap.TreeExplainer(forest)

    # Explain the standardized training features
    return tree_explainer.shap_values(xm.features_standard(train_data, props)[0])

# Example usage (pass the module that provides `features_standard`; this notebook calls it via `kf`):
# shap_values = calculate_shap_values(train_dataset, train_data, props, kf)

In [None]:
def plot_and_save_shap_summary(shap_values, train_data, props, a=1.8, cmap="viridis"):
    """
    Draw a SHAP summary plot and save the per-feature mean |SHAP| ranking to CSV.

    Parameters:
    shap_values (array or list of arrays): SHAP values for the input features.
        The plot receives the value as passed; for the CSV ranking only the
        first element is used when a list is given.
    train_data (DataFrame): Training table handed to `kf.features_standard`.
    props: Property identifier forwarded unchanged to `kf.features_standard`.
    a (float): Scaling factor for the plot size; the figure is
        (2.35 * a, 4 * a).
    cmap (str): Colormap for the SHAP plot.

    Returns:
    None. Side effects: renders the summary plot, writes
    'sorted_shap_values.csv' to the working directory and prints a
    confirmation message.
    """

    # Standardize features with the same module (`kf`) the notebook uses to
    # build the train/test datasets, so the plotted values match the features
    # the models were given.
    standardized_features = kf.features_standard(train_data, props)[0]

    # Feature labels, in the order produced by the feature module
    feature_names = kf.feature_names()

    # Summary plot limited to the 10 most important features
    shap.summary_plot(shap_values, standardized_features, feature_names=feature_names,
                      max_display=10, plot_size=(2.35 * a, 4 * a), cmap=cmap)

    # Multi-output explainers return one array per output; rank on the first one
    if isinstance(shap_values, list):
        shap_values = shap_values[0]

    # Mean absolute SHAP value per feature (averaged over axis 0)
    shap_abs_mean = np.abs(shap_values).mean(axis=0)

    # Pair each feature name with its mean |SHAP| value
    df_shap = pd.DataFrame(list(zip(feature_names, shap_abs_mean)), columns=['Feature Name', 'SHAP Value'])

    # Most influential features first
    df_shap_sorted = df_shap.sort_values(by='SHAP Value', ascending=False)

    # Persist the ranking
    df_shap_sorted.to_csv('sorted_shap_values.csv', index=False)

    print('Sorted SHAP values have been saved to "sorted_shap_values.csv".')
