In [None]:
# installing other requirements
!pip install loguru
!pip install LIME
!pip install shap
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from loguru import logger
from sklearn.feature_selection import RFE
import lime
import lime.lime_tabular
import shap
from shap import Explainer, TreeExplainer, Explanation
from shap.plots import waterfall
from shap.maskers import Independent

# Preprocessing

In [None]:
# Read the patient records into a DataFrame and preview the first rows.
# NOTE(review): absolute path (looks like a Colab-style /content upload) —
# adjust it when running the notebook in another environment.
df = pd.read_csv("/content/cancer patient data sets.csv")
df.head()

Separate the prediction target (`Level`) from the features, and drop two identifier columns (`Patient Id` and `index`) that should not be used as features.

In [None]:
# Keep the target column aside before removing it from the feature frame.
level = df['Level']

# Remove the target together with the two identifier columns.
df = df.drop(columns=['index', 'Patient Id', 'Level'])

# Distinct target values, in order of first appearance in the data.
classes = level.unique()

Scaling the dataset

In [None]:
# Split first, then standardize.
# Fitting the scaler on the full frame (as was done before) lets validation and
# test rows influence the mean/variance used for training data, i.e. leakage.
# The splits use the same sizes and random_state as before, so row membership
# of each split is unchanged.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(df, level, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_temp, y_train_temp, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training split only.
scaler = StandardScaler().fit(X_train)

# Apply the train-fitted scaler to every split; keep column names and the
# original row index so later cells can still align rows with `level`.
# `df` itself is left unscaled; later cells only read its column names.
X_train, X_valid, X_test = (
    pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
    for part in (X_train, X_valid, X_test)
)

# Keep references to the pandas versions of each split (with column names and
# row index) under *_shap names before the working copies become arrays.
X_train_shap, X_valid_shap, X_test_shap = X_train, X_valid, X_test
y_train_shap, y_valid_shap, y_test_shap = y_train, y_valid, y_test

# Working copies as plain NumPy arrays.
X_train, X_valid, X_test = map(np.array, (X_train, X_valid, X_test))
y_train, y_valid, y_test = map(np.array, (y_train, y_valid, y_test))

# Encode the string labels as integers and then as one-hot rows.
# Both encoders are fitted on the training labels only and merely *applied* to
# the validation and test labels, so every split shares one label -> index
# mapping. (Re-fitting per split would silently change the mapping whenever a
# split is missing a class.)
label_encoder = LabelEncoder()
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn only knows the `sparse` keyword.
    onehot_encoder = OneHotEncoder(sparse=False)

y_encoded_train = label_encoder.fit_transform(y_train)
y_one_hot_train = onehot_encoder.fit_transform(y_encoded_train.reshape(-1, 1))

# transform() raises on labels never seen in training instead of remapping them.
y_encoded_valid = label_encoder.transform(y_valid)
y_one_hot_valid = onehot_encoder.transform(y_encoded_valid.reshape(-1, 1))

y_encoded_test = label_encoder.transform(y_test)
y_one_hot_test = onehot_encoder.transform(y_encoded_test.reshape(-1, 1))

# Random Forest with Grid Search

In [None]:
# Hyper-parameter search for the random forest via exhaustive grid search.

# Candidate values for each hyper-parameter.
param_grid = {
    'n_estimators': [5, 10, 50, 100, 150],
    'max_depth': [1, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the forest so the search gives the same answer on every run; without a
# seed, near-tied configurations can swap places between runs.
rf_classifier = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search; n_jobs=-1 runs the independent fits in parallel.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit on the training split (one-hot encoded targets).
grid_search.fit(X_train, y_one_hot_train)

# Report the winning configuration and its mean cross-validated score.
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 5}
Best Score: 1.0


In [None]:
# Final forest used for the metrics, RFE and LIME sections below.
# random_state is fixed so that metrics and explanations are reproducible
# across "Restart & Run All".
# NOTE(review): n_estimators=50 is hard-coded and differs from the value in
# the recorded grid-search output (5) — confirm this is intentional.
rf_classifier = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

# Train on the one-hot encoded targets.
rf_classifier.fit(X_train, y_one_hot_train)

Metrics calculation on validation dataset

In [None]:
# Predict the validation split and decode the predictions to label strings:
# indicator matrix -> column index of the maximum -> original label.
# NOTE: the following cell repeats these steps before computing the metrics.
valid_predictions = label_encoder.inverse_transform(
    np.argmax(rf_classifier.predict(X_valid), axis=1)
)


In [None]:
# Evaluate the model on the validation split.

# The forest was fitted on one-hot targets, so predict() returns one indicator
# column per class; argmax converts each row back to an encoded class index.
# NOTE(review): a row predicted as all zeros would be mapped to index 0 here.
valid_predictions = label_encoder.inverse_transform(
    np.argmax(rf_classifier.predict(X_valid), axis=1)
)

# The ground truth is simply the raw validation labels; no encode/decode
# round-trip is needed.
truth_valid = y_valid

# Class names in the encoder's (sorted) order. Used below to pin the row and
# column order of the report and the confusion matrix, and reused for the
# heatmap tick labels.
label_map = label_encoder.classes_

valid_accuracy = accuracy_score(truth_valid, valid_predictions)
print("Validation Accuracy:", valid_accuracy)
# Micro-averaged scores; for single-label multiclass data these coincide with
# accuracy.
valid_precision = precision_score(truth_valid, valid_predictions, average='micro')
print("Validation Precision:", valid_precision)
valid_recall = recall_score(truth_valid, valid_predictions, average='micro')
print("Validation Recall:", valid_recall)
valid_f1 = f1_score(truth_valid, valid_predictions, average='micro')
print("Validation F1:", valid_f1)

# Pass labels and target_names together so each report row carries the name of
# the class it actually describes (`classes` is in order of appearance, which
# need not match the sorted order used by the report).
report = classification_report(
    truth_valid, valid_predictions, labels=label_map, target_names=label_map
)
print(f"Classification Report: \n{report}")

# Fix the label order so the matrix rows/columns line up with the tick labels.
cm = confusion_matrix(truth_valid, valid_predictions, labels=label_map)
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=label_map, yticklabels=label_map)
plt.title("Heatmap for Lung Cancer Prediction for Validation Dataset")
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()

Metrics calculation on test dataset

In [None]:
# Evaluate the model on the held-out test split.

# The forest was fitted on one-hot targets, so predict() returns one indicator
# column per class; argmax converts each row back to an encoded class index.
test_predictions = label_encoder.inverse_transform(
    np.argmax(rf_classifier.predict(X_test), axis=1)
)

# The ground truth is simply the raw test labels.
truth_test = y_test

# Class names in the encoder's (sorted) order, defined here so this cell does
# not depend on a variable left over from the validation cell.
label_map = label_encoder.classes_

test_accuracy = accuracy_score(truth_test, test_predictions)
print("Test Accuracy:", test_accuracy)

# Micro-averaged scores; for single-label multiclass data these coincide with
# accuracy.
test_precision = precision_score(truth_test, test_predictions, average='micro')
print("Test Precision:", test_precision)
test_recall = recall_score(truth_test, test_predictions, average='micro')
print("Test Recall:", test_recall)
test_f1 = f1_score(truth_test, test_predictions, average='micro')
print("Test F1:", test_f1)

# Pass labels and target_names together so each report row carries the name of
# the class it actually describes.
report = classification_report(
    truth_test, test_predictions, labels=label_map, target_names=label_map
)
print(f"Classification Report: \n{report}")

# Fix the label order so the matrix rows/columns line up with the tick labels.
cm = confusion_matrix(truth_test, test_predictions, labels=label_map)
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=label_map, yticklabels=label_map)
plt.title("Heatmap for Lung Cancer Prediction for Testing Dataset")
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()

In [None]:
# Number of rows in the training split.
train_size = len(X_train)
print("Training dataset size:", train_size)


Training dataset size: 640


# RFE for Random Forest

Recursive Feature Elimination (RFE) is a feature selection technique that can be used with the Random Forest algorithm. RFE aims to select the most relevant features by recursively eliminating less important features from the dataset. It works by training a model, such as a Random Forest, and ranking the features based on their importance or contribution to the model's performance. The least important features are then removed, and the process is repeated until the desired number of features is reached.

In [None]:
# Recursive Feature Elimination (RFE) with the random forest as the ranking model.

# Number of features RFE should keep. The previous setting kept *all* features
# (len(df.columns)), so nothing was ever eliminated and every ranking came out
# as 1. Two matches the intent stated in the original inline comment.
N_FEATURES_TO_SELECT = 2

rfe = RFE(estimator=rf_classifier, n_features_to_select=N_FEATURES_TO_SELECT)

# Fit the RFE object to the training data
rfe.fit(X_train, y_one_hot_train)

# Column positions of the retained features, and the rank of every feature
# (1 = retained; larger numbers were eliminated earlier).
selected_feature_indices = rfe.get_support(indices=True)
feature_rankings = rfe.ranking_

# Print the selected feature indices and their rankings
print("Selected feature indices:", selected_feature_indices)
print("Feature rankings:", feature_rankings)


In [None]:
# Impurity-based importance of each feature, as reported by the fitted forest.
importances = rf_classifier.feature_importances_

# Spread of each feature's importance over the individual trees of the forest.
std = np.array(
    [estimator.feature_importances_ for estimator in rf_classifier.estimators_]
).std(axis=0)

# Label the importances with the feature (column) names.
forest_importances = pd.Series(data=importances, index=df.columns)

# Bar chart of the importances, with the per-tree spread drawn as error bars.
fig, ax = plt.subplots()
forest_importances.plot(kind="bar", yerr=std, ax=ax)
ax.set(title="Feature importances", ylabel="Mean decrease in impurity")

# Tighten the layout so rotated tick labels are not clipped.
fig.tight_layout()

# LIME


LIME (Local Interpretable Model-Agnostic Explanations) is a popular technique for explaining the predictions of machine learning models. It provides interpretable explanations for individual predictions by approximating the behavior of the underlying model in the local neighborhood of the instance being explained.

In [None]:
# Build the LIME explainer from the (scaled) training data.
# class_names must follow the column order of the model output, which is the
# encoder's sorted class order — not `classes`, which is in order of first
# appearance in the data. random_state makes the sampled neighbourhoods, and
# therefore the explanations, reproducible.
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=list(df.columns),
    class_names=list(label_encoder.classes_),
    random_state=42,
)

**N instances on Test Data**

In [None]:
# LIME explanations for the first N test instances.


def _class_probabilities(model, X):
    """Return an (n_samples, n_classes) matrix of class probabilities.

    A forest fitted on one-hot targets is a multi-output model: its
    predict_proba() returns one (n_samples, 2) array per class. This helper
    keeps the probability of the positive indicator (value 1) for each class
    and stacks them column-wise. If predict_proba() already returns a single
    array (single-output model) it is returned unchanged.
    """
    proba = model.predict_proba(X)
    if isinstance(proba, np.ndarray):
        return proba
    columns = []
    for class_proba, indicator_values in zip(proba, model.classes_):
        positive = np.flatnonzero(np.asarray(indicator_values) == 1)
        # An output that never saw a positive example has probability 0.
        columns.append(class_proba[:, positive[0]] if positive.size
                       else np.zeros(class_proba.shape[0]))
    return np.column_stack(columns)


def _predict_fn(X):
    """Probability function handed to LIME (LIME expects probabilities, not hard labels)."""
    return _class_probabilities(rf_classifier, X)


# Number of leading test instances to explain, and bars to draw per instance.
samples_class = 11
top_features_to_plot = 8
print("first ", samples_class," samples labels", y_test[:samples_class])

# Predicted probabilities and decoded classes for the whole test set.
pred_probs = _class_probabilities(rf_classifier, X_test)
pred_classes = label_encoder.inverse_transform(np.argmax(pred_probs, axis=1))

# Show complete tables (set once, not on every iteration).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

n_features = X_test.shape[1]

# Slicing copes with a test set shorter than `samples_class`.
for i, sample_instance in enumerate(X_test[:samples_class]):
    # Predicted class (encoded index) and its probability for this instance.
    pred_prob = pred_probs[i]
    pred_class = int(np.argmax(pred_prob))
    class_value = label_encoder.inverse_transform([pred_class])
    prob = pred_prob[pred_class]

    # Explain the class that was actually predicted. Without `labels`, LIME
    # explains class index 1 regardless of the prediction.
    exp = explainer.explain_instance(sample_instance,
                                     _predict_fn,
                                     labels=(pred_class,),
                                     num_features=n_features)

    # One row per feature condition, ordered by absolute weight.
    exp_df = pd.DataFrame(exp.as_list(label=pred_class), columns=['Feature', 'Weight'])
    exp_df['Absolute Weight'] = exp_df['Weight'].abs()
    exp_df = exp_df.sort_values('Absolute Weight', ascending=False)
    exp_df = exp_df.reset_index(drop=True)

    print('Sample Instance', i+1, 'Class:', class_value)
    print('Predicted Probability:', prob)

    print('LIME Explanation:')
    display(exp_df)
    print('-----------------------\n')

    # Plot the strongest contributions only.
    exp_df = exp_df.iloc[:top_features_to_plot]
    plt.figure(figsize=(8, 6))
    plt.bar(exp_df['Feature'], exp_df['Weight'])
    plt.xlabel('Feature')
    plt.ylabel('Weight')
    plt.title('Feature Weights')
    plt.xticks(rotation=45)
    plt.show()






**Aggregated LIME results for all 3 classes on Test Data**

In [None]:
# Aggregated LIME explanations per predicted class on the test set.


def _class_probabilities(model, X):
    """Return an (n_samples, n_classes) matrix of class probabilities.

    A forest fitted on one-hot targets is a multi-output model: its
    predict_proba() returns one (n_samples, 2) array per class. This helper
    keeps the probability of the positive indicator (value 1) for each class
    and stacks them column-wise. If predict_proba() already returns a single
    array (single-output model) it is returned unchanged.
    """
    proba = model.predict_proba(X)
    if isinstance(proba, np.ndarray):
        return proba
    columns = []
    for class_proba, indicator_values in zip(proba, model.classes_):
        positive = np.flatnonzero(np.asarray(indicator_values) == 1)
        # An output that never saw a positive example has probability 0.
        columns.append(class_proba[:, positive[0]] if positive.size
                       else np.zeros(class_proba.shape[0]))
    return np.column_stack(columns)


def _predict_fn(X):
    """Probability function handed to LIME (LIME expects probabilities, not hard labels)."""
    return _class_probabilities(rf_classifier, X)


# Number of features shown in the correlation heatmap and the bar chart.
total_features = 8

n_features = X_test.shape[1]
feature_names = np.array(explainer.feature_names)

# Predicted probabilities, encoded class indices and decoded labels for the test set.
pred_probs = _class_probabilities(rf_classifier, X_test)
pred_class_ids = np.argmax(pred_probs, axis=1)
pred_classes = label_encoder.inverse_transform(pred_class_ids)

# Show complete tables.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

for class_id, class_name in enumerate(label_encoder.classes_):
    # Test rows the model assigns to this class.
    ids = np.flatnonzero(pred_class_ids == class_id)
    if ids.size == 0:
        print('No test instances predicted as class', class_name)
        continue

    # One row per instance, one column per feature *in original feature order*.
    # exp.as_list() is sorted by importance per instance and labels features
    # with instance-specific bin conditions, so stacking it positionally would
    # mix weights of different features; as_map() keys weights by feature index.
    all_values = np.zeros((ids.size, n_features), dtype=np.float32)
    for row, test_idx in enumerate(ids):
        exp = explainer.explain_instance(X_test[test_idx],
                                         _predict_fn,
                                         labels=(class_id,),
                                         num_features=n_features)
        for feature_id, weight in exp.as_map()[class_id]:
            all_values[row, feature_id] = weight

    # Mean weight of every feature over the instances of this class, and the
    # features with the largest absolute mean weight.
    values = all_values.mean(axis=0)
    top = np.argsort(-np.abs(values))[:total_features]

    # Correlation between the weights of the top features across instances
    # (needs at least two instances to be defined).
    if ids.size > 1:
        corr = np.corrcoef(all_values[:, top], rowvar=False)
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr, annot=True, cmap='coolwarm', yticklabels=feature_names[top],
                    xticklabels=feature_names[top])
        plt.show()

    # Table of mean weights, ordered by absolute value.
    exp_df = pd.DataFrame({'Feature': feature_names, 'Weight': values})
    exp_df['Absolute Weight'] = exp_df['Weight'].abs()
    exp_df = exp_df.sort_values('Absolute Weight', ascending=False)
    exp_df = exp_df.reset_index(drop=True)

    print('Sample Instances', ids.size, 'Class:', class_name)
    # Average over the class rather than the value of whichever instance came last.
    print('Mean Predicted Probability:', pred_probs[ids, class_id].mean())
    print('LIME Explanation:')
    display(exp_df)
    print('-----------------------\n')

    # Plot the strongest mean contributions only.
    exp_df = exp_df.iloc[:total_features]
    plt.figure(figsize=(8, 6))
    plt.bar(exp_df['Feature'], exp_df['Weight'])
    plt.xlabel('Feature')
    plt.ylabel('Weight')
    plt.title('Feature Weights')
    plt.xticks(rotation=45)
    plt.show()


# SHAP

In [None]:
# Re-fit the forest for the SHAP section, this time directly on the string
# labels (y_train) instead of the one-hot matrix.
# NOTE: this rebinds `rf_classifier`; cells above that expect the one-hot model
# must not be re-run after this point without re-running the notebook in order.
# random_state is fixed so the SHAP values are reproducible.
rf_classifier = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
# Train the classifier
rf_classifier.fit(X_train, y_train)

In [None]:
# Show which label string corresponds to encoded indices 0, 1 and 2.
label_encoder.inverse_transform(np.arange(3))

In [None]:
# SHAP explainer for the fitted forest (this rebinds `explainer`, which held
# the LIME explainer in the section above).
explainer = Explainer(rf_classifier)


**SHAP on Test Data**

In [None]:
# SHAP values of every test row for every class, computed once.
# (The earlier version computed them twice and discarded the first result.)
shap_values = shap.TreeExplainer(rf_classifier).shap_values(X_test_shap)

# Depending on the SHAP version, multiclass output is either a list with one
# (rows, features) array per class or a single (rows, features, classes)
# array. summary_plot interprets a 3-D array as interaction values, so convert
# it to the per-class list form.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

# Label the legend with the actual class names instead of "Class 0/1/2".
shap.summary_plot(shap_values, X_test_shap, class_names=list(rf_classifier.classes_))



**N instances on Test Data**

In [None]:
# Per-instance SHAP tables for the first N test rows.

# Define the number of samples to display
samples = 11
print("first ", samples," samples labels",  y_test[:samples])
# Row labels (original DataFrame index) of the test rows
indices = list(X_test_shap.index)

# Create the SHAP explainer and compute values for the whole test set
explainer = TreeExplainer(rf_classifier)
sv = explainer(X_test_shap)

# Class explained for each row: the class the model predicts for it.
# (Previously the SHAP values of class 0 were paired with the base value of
# class 1, which do not belong together.)
class_names = rf_classifier.classes_
pred_class_ids = np.argmax(rf_classifier.predict_proba(X_test_shap.values), axis=1)
row_positions = np.arange(len(indices))

# sv.values is indexed (row, feature, class) and sv.base_values (row, class);
# pick, for every row, the slice of its predicted class so values and base
# value refer to the same class.
exp = Explanation(
    sv.values[row_positions, :, pred_class_ids],
    sv.base_values[row_positions, pred_class_ids],
    data=X_test_shap.values,
    feature_names=X_test_shap.columns
)


def _shap_row_frame(position):
    """Build the per-feature SHAP table for the test row at `position`."""
    return pd.DataFrame({
        'row_id': indices[position],
        'feature': X_test_shap.columns,
        'feature_value': X_test_shap.iloc[position],
        'explained_class': class_names[pred_class_ids[position]],
        'base_value': exp.base_values[position],
        'shap_values': exp.values[position]
    })


if samples > 0:
    # Never iterate past the end of the test set.
    for i in range(min(samples, len(indices))):
        display(_shap_row_frame(i))

        print('----------------------------')
else:
    # Non-positive `samples` is used as a single row position, as before.
    display(_shap_row_frame(samples))


**Aggregated SHAP on Test Data**

In [None]:
# Aggregated SHAP values per true class on the test set.

# Number of features shown in the correlation heatmap.
total_features = 8

# Iterate over the classes in the model's own order so that `class_idx`
# addresses the matching slice of the SHAP output. (Previously slice 1 was used
# for every label.)
for class_idx, label in enumerate(rf_classifier.classes_):
    print("Class", label)

    # Test rows whose true label is the current class
    df_test = X_test_shap[X_test_shap.index.isin(level[level == label].index)]
    if df_test.empty:
        print("No test rows with this label; skipping.")
        continue

    # Background data for the explainer: the filtered rows themselves
    masker = Independent(df_test, max_samples=X_test.shape[0])

    # Create the SHAP explainer using the random forest classifier and the masker
    explainer = TreeExplainer(rf_classifier, data=masker)

    # Expected (base) value of the current class
    bv = explainer.expected_value[class_idx]

    # SHAP values of the filtered rows; keep only the current class,
    # giving a (rows, features) matrix in the original column order
    sv = explainer(df_test, check_additivity=False)
    class_shap = sv.values[:, :, class_idx]

    # Long-format table: one line per (row, feature).
    # Stored under its own name so the global feature frame `df` is not overwritten.
    shap_long = pd.DataFrame({
        'row_id': df_test.index.values.repeat(df_test.shape[1]),
        'feature': df_test.columns.to_list() * df_test.shape[0],
        'feature_value': df_test.values.flatten(),
        'base_value': bv,
        'shap_values': class_shap.flatten()
    })

    # Mean feature value, base value and SHAP value per feature
    df_mean = shap_long.groupby('feature').mean().drop(["row_id"], axis=1).T

    # Display the mean SHAP values for each feature
    display(df_mean)

    # Top features by absolute mean SHAP value. Ranking and selection both use
    # the original column order of `df_test`, so the selected columns and their
    # names always refer to the same features (the grouped table above is
    # sorted alphabetically and must not be used to index `class_shap`).
    mean_shap = class_shap.mean(axis=0)
    indices = np.argsort(-np.abs(mean_shap))[:total_features]
    top_names = df_test.columns[indices]

    # A correlation needs at least two rows.
    if df_test.shape[0] < 2:
        print("Fewer than two rows; skipping correlation heatmap.")
        continue

    # Correlation between the SHAP values of the selected features across rows
    all_values = np.asarray(class_shap[:, indices], dtype=np.float32)
    corr = np.corrcoef(all_values, rowvar=False)

    # Plot the heatmap of the correlation matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', yticklabels=top_names, xticklabels=top_names)
    plt.show()
