In [1]:
!pip install bayesian-optimization
!pip install shap lime

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6
Collecting shap
  Downloading shap-0.45.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (538 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m538.2/538.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Building wheels for collected packages: lime
 

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import r2_score, accuracy_score
from bayes_opt import BayesianOptimization
import shap
import lime , lime.lime_tabular

In [None]:
#@title Preprocessing database

irrelevant_columns = [
    "ID", "id", "Index", "index", "Serial Number", "serial_number",
    "Address", "address", "Href", "href", "Timestamp", "timestamp",
    "Creation Date", "creation_date", "Last Updated Date", "last_updated_date",
    "Version", "version", "Checksum", "checksum", "Year", "year"
    "Row ID", "row_id", "Record ID", "record_id", "Customer ID", "customer_id",
    "Client ID", "client_id", "Account ID", "account_id", "Transaction ID", "transaction_id",
    "Email", "email", "Phone Number", "phone_number", "Website", "website",
    "Fax", "fax", "IP Address", "ip_address", "MAC Address", "mac_address",
    "Social Security Number", "social_security_number", "Driver's License", "drivers_license",
    "ID_", "id_", "Index_", "index_", "Serial_Number", "serial_number_",
    "Address_", "address_", "Href_", "href_", "Timestamp_", "timestamp_",
    "Creation_Date", "creation_date_", "Last_Updated_Date", "last_updated_date_",
    "Version_", "version_", "Checksum_", "checksum_",
    "Row_ID", "row_id_", "Record_ID", "record_id_", "Customer_ID", "customer_id_",
    "Client_ID", "client_id_", "Account_ID", "account_id_", "Transaction_ID", "transaction_id_",
    "Email_", "email_", "Phone_Number", "phone_number_", "Website_", "website_",
    "Fax_", "fax_", "IP_Address", "ip_address_", "MAC_Address", "mac_address_",
    "Social_Security_Number", "social_security_number_", "Driver's_License", "drivers_license_"
]

ordinal_data = {
    'low':1,
    'medium':2,
    'moderate':2,
    'high':3,
    # Education Level
    'high school diploma': 1,
    'associate\'s degree': 2,
    'bachelor\'s degree': 3,
    'master\'s degree': 4,
    'doctorate degree': 5,
    # Income Level
    'low income': 1,
    'middle income': 2,
    'high income': 3,
    # Customer Satisfaction
    'very dissatisfied': 1,
    'dissatisfied': 2,
    'neutral': 3,
    'satisfied': 4,
    'very satisfied': 5,
    # Likert Scale
    'strongly disagree': 1,
    'disagree': 2,
    'neither agree nor disagree': 3,
    'agree': 4,
    'strongly agree': 5,
    # Job Seniority
    'entry-level': 1,
    'mid-level': 2,
    'senior-level': 3,
    'executive-level': 4,
    # Severity of Illness/Condition
    'mild': 1,
    'moderate': 2,
    'severe': 3,
    # Temperature
    'cold': 1,
    'warm': 2,
    'hot': 3,
    'very hot': 4,
    # Customer Rating
    '1 star': 1,
    '2 stars': 2,
    '3 stars': 3,
    '4 stars': 4,
    '5 stars': 5,
    # Likelihood of Purchase
    'very unlikely': 1,
    'unlikely': 2,
    'likely': 4,
    'very likely': 5,
    # Degree of Agreement
    'strongly disagree': 1,
    'disagree': 2,
    'neutral': 3,
    'agree': 4,
    'strongly agree': 5,
    # Pain Scale
    'no pain': 1,
    'mild pain': 2,
    'moderate pain': 3,
    'severe pain': 4,
    'extreme pain': 5,
    # Likelihood of Recommendation
    'very unlikely to recommend': 1,
    'unlikely to recommend': 2,
    'likely to recommend': 4,
    'very likely to recommend': 5,
    # Quality Ratings
    'poor quality': 1,
    'fair quality': 2,
    'good quality': 3,
    'very good quality': 4,
    'excellent quality': 5,
    # Customer Service Experience
    'very poor': 1,
    'poor': 2,
    'average': 3,
    'good': 4,
    'excellent': 5,
    # Ease of Use
    'very difficult': 1,
    'difficult': 2,
    'easy': 4,
    'very easy': 5,
    # Likelihood of Churn
    'very unlikely to churn': 1,
    'unlikely to churn': 2,
    'likely to churn': 4,
    'very likely to churn': 5,
    # Satisfaction with Product/Service
    'not satisfied': 1,
    'slightly satisfied': 2,
    'moderately satisfied': 3,
    'extremely satisfied': 5,
    # Risk Levels
    'low risk': 1,
    'moderate risk': 2,
    'high risk': 3,
    # Performance Ratings
    'below expectations': 1,
    'meeting expectations': 2,
    'exceeding expectations': 3 }

In [None]:
#@title Data Preprocessing

def remove_irrelevant_columns(data, irrelevant_columns):
  columns_to_delete = [col for col in data.columns if col in irrelevant_columns]

  if len(columns_to_delete)>0:
    data.drop(columns=columns_to_delete, inplace=True)

  return data


def remove_duplicate_values(data):
  if data.duplicated().any():
    data.drop_duplicates(inplace=True)

  return data


def remove_constant_values(data):
  constant_columns = [col for col in data.columns if data[col].nunique() == 1]

  if len(constant_columns) > 0:
    data.drop(columns=constant_columns, inplace=True)

  return data


def remove_string_numerical(data):
  string_num_cols = [col for col in data.columns if data[col].dtype == 'object' and data[col].str.isnumeric().all()]

  if len(string_num_cols) > 0:
    data[string_num_cols] = data[string_num_cols].apply(pd.to_numeric)

  return data


def remove_object_numerical(data):
  import re
  object_cols = data.select_dtypes(include=['object']).columns

  if len(object_cols) > 0:
    for col in object_cols:
      numerical_values = data[col].apply(lambda x: re.findall(r'\d+\.\d+|\d+', str(x)))
      numeric_col = col + '_numeric'
      data[numeric_col] = numerical_values.apply(lambda x: float(x[0]) if x else None)

  return data


def missing_values(data, threshold=0.5, k_neighbors=5):
  numerical_cols = data.select_dtypes(include=['number']).columns
  categorical_cols = data.select_dtypes(include=['object']).columns

  if data.isnull().any().any():
      missing_percentage = data.isnull().mean()

      if (missing_percentage > threshold).any():
        imputer = KNNImputer(n_neighbors=k_neighbors)
        strategy = 'knn'
      else:
        imputer = SimpleImputer(strategy='mean')
        strategy = 'mean'
      if strategy == 'knn':
        imputer = KNNImputer(n_neighbors=k_neighbors)
      else:
        imputer = SimpleImputer(strategy=strategy)
      data[numerical_cols] = imputer.fit_transform(data[numerical_cols])
      mode_imputer = SimpleImputer(strategy='most_frequent')
      data[categorical_cols] = mode_imputer.fit_transform(data[categorical_cols])

  return data


# def convert_datetime(data):
#     object_cols = data.select_dtypes(include=['object']).columns

#     if len(object_cols) > 0:
#         for col in object_cols:
#             try:
#                 data[col] = pd.to_datetime(data[col])
#                 data[col + '_numeric'] = data[col].astype('int64') // 10**9
#                 data.drop(columns=[col], inplace=True)
#             except (ValueError, TypeError):
#                 pass

#     return data


def encode_objects(data):
  categorical_columns = data.select_dtypes(include=['object']).columns

  if len(categorical_columns) > 0:
    for col in categorical_columns:
      unique_values_count = data[col].nunique()

      if unique_values_count == 2:
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col])

      elif unique_values_count <= 7:
          if all(value in ordinal_data for value in data[col].str.lower()):
            data[col] = data[col].str.lower().map(ordinal_data)
          elif any(word in data[col].str.lower() for word in ['low', 'medium', 'moderate', 'high']):
            data[col] = data[col].apply(lambda x: ordinal_data[x.lower()] if x.lower() in ordinal_data else x)
          else:
            encoder = OneHotEncoder(sparse_output=False, drop='first')
            encoded_values = encoder.fit_transform(data[[col]])
            col_names = [f"{col}_{value}" for value in encoder.categories_[0][1:]]
            df = pd.DataFrame(encoded_values, columns=col_names)
            data = pd.concat([data, df], axis=1)
            data.drop(columns=[col], inplace=True)

  return data


def adjust_values(data):
  scaler = StandardScaler()
  scaler.fit_transform(data)

  return data


#master function
def automatic_data_preprocessing(data, threshold=0.5, k_neighbors=5):

    data = remove_irrelevant_columns(data, irrelevant_columns)
    data = remove_duplicate_values(data)
    data = remove_constant_values(data)
    data = remove_string_numerical(data)
    data = remove_object_numerical(data)
    data = missing_values(data, threshold, k_neighbors)
    # data = convert_datetime(data)
    data = encode_objects(data)
    data = adjust_values(data)

    return data

In [13]:
dataset = pd.read_csv('thyroid_cancer_data.csv')
# dataset = automatic_data_preprocessing(data)

print("Column Names with Indexes:")
for idx, col_name in enumerate(dataset.columns):
    print(f"Index {idx}: {col_name}")
target_col_idx = int(input("Enter the index of the target variable column: "))

X = dataset.drop(dataset.columns[target_col_idx], axis=1)
y = dataset.iloc[:, target_col_idx]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print()
print('X shape : ',X.shape)
print('y shape : ',y.shape)
print('X_train shape : ',X_train.shape)
print('X_test shape : ',X_test.shape)
print('y_train shape : ',y_train.shape)
print('y_test shape : ',y_test.shape)

Column Names with Indexes:
Index 0: mean_radius
Index 1: mean_texture
Index 2: mean_perimeter
Index 3: mean_area
Index 4: mean_smoothness
Index 5: diagnosis
Enter the index of the target variable column: 5

X shape :  (569, 5)
y shape :  (569,)
X_train shape :  (455, 5)
X_test shape :  (114, 5)
y_train shape :  (455,)
y_test shape :  (114,)


In [None]:
#@title Optimizer Functions

# # Decision Tree Regressor Optimization
# def optimize_dtr(X_train, X_test, y_train, y_test):
#     def dtr_optimizer(max_depth, min_samples_split, min_samples_leaf):
#         dtr = DecisionTreeRegressor(max_depth=int(max_depth), min_samples_split=int(min_samples_split),
#                                      min_samples_leaf=int(min_samples_leaf))
#         dtr.fit(X_train, y_train)
#         y_pred = dtr.predict(X_test)
#         return r2_score(y_test, y_pred), {'max_depth': int(max_depth),
#                                           'min_samples_split': int(min_samples_split),
#                                           'min_samples_leaf': int(min_samples_leaf)}

#     dtr_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
#                   'min_samples_leaf': (1, 20)}

#     bayes_dtr = BayesianOptimization(f=dtr_optimizer, pbounds=dtr_bounds, random_state=42)
#     bayes_dtr.maximize(init_points=10, n_iter=10)
#     best_params = bayes_dtr.max['params']

#     dtr_final = DecisionTreeClassifier(max_depth=int(round(best_params['max_depth'])),
#                                        min_samples_split=int(best_params['min_samples_split']),
#                                        min_samples_leaf=int(best_params['min_samples_leaf']))
#     dtr_final.fit(X_train, y_train)

#     y_pred = dtr_final.predict(X_test)
#     performance = r2_score(y_test, y_pred)

#     return dtr_final, performance


# # Random Forest Regressor Optimization
# def optimize_rfr(X_train, X_test, y_train, y_test):
#     def rfr_optimizer(n_estimators, max_depth, min_samples_split, min_samples_leaf):
#         rfr = RandomForestRegressor(n_estimators=int(round(n_estimators)), max_depth=int(round(max_depth)),
#                                      min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf))
#         rfr.fit(X_train, y_train)
#         y_pred = rfr.predict(X_test)
#         return r2_score(y_test, y_pred)

#     rfr_bounds = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
#                   'min_samples_leaf': (1, 20)}

#     bayes_rfr = BayesianOptimization(f=rfr_optimizer, pbounds=rfr_bounds, random_state=42)
#     bayes_rfr.maximize(init_points=10, n_iter=10)
#     best_params = bayes_rfr.max['params']

#     rfr_final = RandomForestRegressor(n_estimators=int(round(best_params['n_estimators'])),
#                                       max_depth=int(round(best_params['max_depth'])),
#                                       min_samples_split=int(best_params['min_samples_split']),
#                                       min_samples_leaf=int(best_params['min_samples_leaf']))
#     rfr_final.fit(X_train, y_train)

#     y_pred = rfr_final.predict(X_test)
#     performance = r2_score(y_test, y_pred)

#     return rfr_final, performance


# # Gradient Boosting Regressor Optimization
# def optimize_gbr(X_train, X_test, y_train, y_test):
#     def gbr_optimizer(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
#         gbr = GradientBoostingRegressor(n_estimators=int(round(n_estimators)), learning_rate=learning_rate,
#                                          max_depth=int(round(max_depth)), min_samples_split=int(min_samples_split),
#                                          min_samples_leaf=int(min_samples_leaf), subsample=subsample)
#         gbr.fit(X_train, y_train)
#         y_pred = gbr.predict(X_test)
#         return r2_score(y_test, y_pred)

#     gbr_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
#                   'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}

#     bayes_gbr = BayesianOptimization(f=gbr_optimizer, pbounds=gbr_bounds, random_state=42)
#     bayes_gbr.maximize(init_points=10, n_iter=10)
#     best_params = bayes_gbr.max['params']

#     gbr_final = GradientBoostingRegressor(n_estimators=int(round(best_params['n_estimators'])),
#                                            learning_rate=best_params['learning_rate'],
#                                            max_depth=int(round(best_params['max_depth'])),
#                                            min_samples_split=int(best_params['min_samples_split']),
#                                            min_samples_leaf=int(best_params['min_samples_leaf']),
#                                            subsample=best_params['subsample'])
#     gbr_final.fit(X_train, y_train)

#     y_pred = gbr_final.predict(X_test)
#     performance = r2_score(y_test, y_pred)

#     return gbr_final, performance


# # Decision Tree Classifier Optimization
# def optimize_dtc(X_train, X_test, y_train, y_test):
#     def dtc_optimizer(max_depth, min_samples_split, min_samples_leaf):
#         dtc = DecisionTreeClassifier(max_depth=int(round(max_depth)), min_samples_split=int(min_samples_split),
#                                       min_samples_leaf=int(min_samples_leaf))
#         dtc.fit(X_train, y_train)
#         y_pred = dtc.predict(X_test)
#         return accuracy_score(y_test, y_pred)

#     dtc_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
#                   'min_samples_leaf': (1, 20)}

#     bayes_dtc = BayesianOptimization(f=dtc_optimizer, pbounds=dtc_bounds, random_state=42)
#     bayes_dtc.maximize(init_points=10, n_iter=10)
#     best_params = bayes_dtc.max['params']

#     dtc_final = DecisionTreeClassifier(max_depth=int(round(best_params['max_depth'])),
#                                        min_samples_split=int(best_params['min_samples_split']),
#                                        min_samples_leaf=int(best_params['min_samples_leaf']))
#     dtc_final.fit(X_train, y_train)

#     y_pred = dtc_final.predict(X_test)
#     performance = accuracy_score(y_test, y_pred)

#     return dtc_final, performance


# # Random Forest Classifier Optimization
# def optimize_rfc(X_train, X_test, y_train, y_test):
#     def rfc_optimizer(n_estimators, max_depth, min_samples_split, min_samples_leaf):
#         rfc = RandomForestClassifier(n_estimators=int(round(n_estimators)), max_depth=int(round(max_depth)),
#                                       min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf))
#         rfc.fit(X_train, y_train)
#         y_pred = rfc.predict(X_test)
#         return accuracy_score(y_test, y_pred)

#     rfc_bounds = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
#                   'min_samples_leaf': (1, 20)}

#     bayes_rfc = BayesianOptimization(f=rfc_optimizer, pbounds=rfc_bounds, random_state=42)
#     bayes_rfc.maximize(init_points=10, n_iter=10)
#     best_params = bayes_rfc.max['params']

#     rfc_final = RandomForestClassifier(n_estimators=int(round(best_params['n_estimators'])),
#                                        max_depth=int(round(best_params['max_depth'])),
#                                        min_samples_split=int(best_params['min_samples_split']),
#                                        min_samples_leaf=int(best_params['min_samples_leaf']))
#     rfc_final.fit(X_train, y_train)

#     y_pred = rfc_final.predict(X_test)
#     performance = accuracy_score(y_test, y_pred)

#     return rfc_final, performance


# # Gradient Boosting Classifier Optimization
# def optimize_gbc(X_train, X_test, y_train, y_test):
#     def gbc_optimizer(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
#         gbc = GradientBoostingClassifier(n_estimators=int(n_estimators), learning_rate=learning_rate,
#                                          max_depth=int(max_depth), min_samples_split=int(min_samples_split),
#                                          min_samples_leaf=int(min_samples_leaf), subsample=subsample)
#         gbc.fit(X_train, y_train)
#         y_pred = gbc.predict(X_test)
#         return accuracy_score(y_test, y_pred), {'n_estimators': int(n_estimators),
#                                                 'learning_rate': learning_rate,
#                                                 'max_depth': int(max_depth),
#                                                 'min_samples_split': int(min_samples_split),
#                                                 'min_samples_leaf': int(min_samples_leaf),
#                                                 'subsample': subsample}

#     gbc_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
#                   'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}

#     bayes_gbc = BayesianOptimization(f=gbc_optimizer, pbounds=gbc_bounds, random_state=42)
#     bayes_gbc.maximize(init_points=10, n_iter=10)
#     best_params = bayes_gbc.max['params']
#     accuracy = bayes_gbc.max['target']

#     gbc_final = GradientBoostingClassifier(n_estimators=int(round(best_params['n_estimators'])),
#                                            learning_rate=best_params['learning_rate'],
#                                            max_depth=int(round(best_params['max_depth'])),
#                                            min_samples_split=int(best_params['min_samples_split']),
#                                            min_samples_leaf=int(best_params['min_samples_leaf']),
#                                            subsample=best_params['subsample'])
#     gbc_final.fit(X_train, y_train)

#     y_pred = gbc_final.predict(X_test)
#     performance = accuracy_score(y_test, y_pred)

#     return gbc_final, performance

In [4]:
# Decision Tree Classifier Optimization
def optimize_dtc(X_train, X_test, y_train, y_test):
    def dtc_optimizer(max_depth, min_samples_split, min_samples_leaf):
        dtc = DecisionTreeClassifier(max_depth=int(round(max_depth)), min_samples_split=int(min_samples_split),
                                      min_samples_leaf=int(min_samples_leaf))
        dtc.fit(X_train, y_train)
        y_pred = dtc.predict(X_test)
        return accuracy_score(y_test, y_pred)

    dtc_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
                  'min_samples_leaf': (1, 20)}

    bayes_dtc = BayesianOptimization(f=dtc_optimizer, pbounds=dtc_bounds, random_state=42)
    bayes_dtc.maximize(init_points=10, n_iter=10)
    params = bayes_dtc.max['params']
    return params

In [5]:
best_params = optimize_dtc(X_train, X_test, y_train, y_test)
model = DecisionTreeClassifier(max_depth=int(round(best_params['max_depth'])),
                                   min_samples_split=int(best_params['min_samples_split']),
                                   min_samples_leaf=int(best_params['min_samples_leaf']))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
performance = accuracy_score(y_test, y_pred)
print(performance)

|   iter    |  target   | max_depth | min_sa... | min_sa... |
-------------------------------------------------------------
| [0m1        [0m | [0m0.9561   [0m | [0m19.35    [0m | [0m19.06    [0m | [0m15.18    [0m |
| [0m2        [0m | [0m0.9123   [0m | [0m30.33    [0m | [0m3.964    [0m | [0m4.808    [0m |
| [0m3        [0m | [0m0.9561   [0m | [0m3.846    [0m | [0m17.46    [0m | [0m12.82    [0m |
| [0m4        [0m | [0m0.9386   [0m | [0m35.7     [0m | [0m1.391    [0m | [0m19.46    [0m |
| [0m5        [0m | [0m0.9211   [0m | [0m41.79    [0m | [0m5.034    [0m | [0m5.273    [0m |
| [0m6        [0m | [0m0.9211   [0m | [0m9.987    [0m | [0m6.781    [0m | [0m11.45    [0m |
| [0m7        [0m | [0m0.9211   [0m | [0m22.17    [0m | [0m6.533    [0m | [0m13.01    [0m |
| [0m8        [0m | [0m0.9211   [0m | [0m7.835    [0m | [0m6.551    [0m | [0m8.595    [0m |
| [0m9        [0m | [0m0.9561   [0m | [0m23.35    [0m 

In [21]:
explainer = shap.Explainer(model)
shap_values = explainer(X)
shap.plots.beeswarm(shap_values)

ValueError: The beeswarm plot does not support plotting explanations with instances that have more than one dimension!