In [None]:
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; every CSV path used below lives under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Load the train and test tables from the mounted Drive.
# Both reads go through the same call; the generator is unpacked into the two frames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Filters_colorectal/selected_train_features_with_plof_wo_cluster.csv",
        "/content/drive/MyDrive/Filters_colorectal/selected_test_features_with_plof_wo_cluster.csv",
    )
)

In [None]:
# Columns excluded from the model inputs: the label itself plus the
# "plof_scores" and "loss" columns.
non_feature_columns = ["plof_scores", "label", "loss"]

# Feature matrices: everything except the excluded columns.
X_train = train.drop(columns=non_feature_columns)
X_test = test.drop(columns=non_feature_columns)

# Targets: the "label" column of each split.
y_train, y_test = train["label"], test["label"]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest fitted on the training split. It is scored on the test split below
# and its predict_proba is the black-box function handed to LIME.
rfc = RandomForestClassifier(
    n_estimators=1200,
    max_depth=20,
    min_samples_leaf=1,
    random_state=42,  # fixed seed so a re-run rebuilds the same forest
    n_jobs=-1,        # build/query the 1200 trees on all cores; only affects speed, not the fitted model
)

# Alternative model kept for reference (uncomment to use it instead of the forest):
# from sklearn.svm import SVC

# rfc = SVC(
#     kernel='rbf',
#     C=1.5,
#     gamma='scale',
#     probability=True,
#     decision_function_shape='ovr'
# )
rfc.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted model on the held-out test split.
accuracy = rfc.score(X=X_test, y=y_test)
print("Model accuracy with selected features:", accuracy)

Model accuracy with selected features: 0.92


In [None]:
!pip install lime
import lime
import lime.lime_tabular

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=b4c1cdc9a72991fd1d71c64590799d369300c38a958443c65b05afbc51f9b107
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
# Feature names in the same column order as the X_train.values matrix given to LIME.
feature_names = X_train.columns
# LIME expects class names ordered like the classifier's class indices. scikit-learn
# stores its classes in sorted order, while Series.unique() returns them in order of
# first appearance, so sort them here to keep the names attached to the right class.
class_names = np.unique(y_train)

In [None]:
# Instantiate a LIME explainer object
# random_state is fixed because LIME explains by sampling perturbed rows around each
# instance; without a seed the weights (and the resulting ranking) change on every run.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    mode='classification',
    feature_names=feature_names,
    class_names=class_names,
    feature_selection='auto',
    random_state=42,
)

In [None]:
# Select a subset of instances for explanation
# At most 200 rows: DataFrame.sample raises when asked for more rows than exist
# (sampling is without replacement), so cap the request at the size of the test set.
explaining_instances = X_test.sample(n=min(200, len(X_test)), random_state=42)

In [None]:
# Generate explanations for the selected instances
# Rows are passed as plain 1-D NumPy arrays: LIME's tabular explainer documents the
# instance as a numpy array, and a pandas Series only works through positional integer
# indexing on a labelled Series, which newer pandas versions deprecate.
# num_features is the full column count, so every feature receives a weight.
num_explained_features = len(X_train.columns)
explanations = [
    explainer.explain_instance(
        instance_row,
        rfc.predict_proba,
        num_features=num_explained_features,
    )
    for instance_row in explaining_instances.to_numpy()
]



In [None]:
# Extract Feature Importance Scores, aligned to the column order of X_train
# explanation.as_list() is sorted by |weight| separately for every instance, so its
# k-th entry refers to a different feature from one explanation to the next. Averaging
# those lists position-by-position, or pairing them with X_train.columns, mixes features.
# explanation.as_map() keeps the feature index, so each weight can be placed in the
# slot of the column it belongs to.
explained_label = 1  # the label as_list() reads by default
feature_importances = []
for explanation in explanations:
    # One slot per X_train column; a feature missing from the explanation stays at 0.
    importance_scores = np.zeros(len(X_train.columns))
    for feature_index, weight in explanation.as_map()[explained_label]:
        importance_scores[feature_index] = weight
    feature_importances.append(importance_scores)

In [None]:
# Calculate the average importance scores across all instances
# LIME weights are signed (pushing towards or away from the explained class). A plain
# mean lets opposite signs cancel and ranks strongly negative features last, so the
# magnitude of each weight is averaged instead.
average_importances = np.mean(np.abs(feature_importances), axis=0)

In [None]:
# Rank features by their average LIME importance and export the top fraction
def feat(per):
    """Save train/test CSVs restricted to the top ``per`` fraction of features.

    Features are ranked by ``average_importances`` (highest first), which is paired
    with ``X_train.columns`` and therefore must hold one score per column, in column
    order. The selected feature columns plus the label column are written to the
    ``LIME`` folder on the mounted Drive, one file for train and one for test.

    Parameters
    ----------
    per : float
        Fraction of the feature columns to keep, e.g. ``0.5`` for the top half.

    Returns
    -------
    list
        Names of the selected feature columns, most important first.
    """
    import os  # local import: only needed to create the output folder

    feature_ranking = pd.DataFrame(
        {'Feature': X_train.columns, 'Importance': average_importances}
    ).sort_values(by='Importance', ascending=False)

    # Round before truncating so float noise (e.g. 0.29 * 100 == 28.999999999999996)
    # cannot silently drop one feature; genuine fractions are still truncated.
    n_selected = int(round(len(X_train.columns) * per, 9))
    selected_features = feature_ranking.head(n_selected)['Feature'].tolist()

    # Selected feature columns followed by the label column.
    train_out = pd.concat([X_train[selected_features], y_train], axis=1)
    test_out = pd.concat([X_test[selected_features], y_test], axis=1)

    # Percentage used in the file names. Rounded to hide float noise such as
    # 55.00000000000001; the result is still a float, so 0.2 keeps producing "20.0".
    tm = round(per * 100, 6)

    # NOTE(review): the file names say "svm" although the active model in this notebook
    # is a random forest; the names are kept so existing files/readers keep working.
    out_dir = "/content/drive/MyDrive/Filters_colorectal/LIME"
    os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing folders
    train_path = f"{out_dir}/train_{tm}_lime_svm_selected.csv"
    test_path = f"{out_dir}/test_{tm}_lime_svm_selected.csv"
    train_out.to_csv(train_path, index=False)
    test_out.to_csv(test_path, index=False)

    return selected_features

In [None]:
# Fractions of the ranked features to keep; one train/test CSV pair is written per entry.
ls = [0.2, 0.4, 0.5, 0.6, 0.8]
for fraction in ls:
    feat(fraction)