<a href="https://colab.research.google.com/github/Awais-mohammad/ADAS/blob/master/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd
import warnings
import shap
import matplotlib.pyplot as plt
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import re

# Silence all Python warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 1000)
# Load the diamonds table and discard exact duplicate rows.
df = pd.read_csv('/kaggle/input/diamonds/diamonds.csv').drop_duplicates()

# Regression target: base-10 logarithm of the price. Computed with the
# vectorized ufunc on the whole column instead of a per-row Python lambda.
main_label = 'log10_price'
df[main_label] = np.log10(df['price'])

# Data transformation
def _snap_log10(value, steps):
    """Return log10(value) rounded to the nearest multiple of 1/steps."""
    return 1/steps*round(steps*np.log10(value))


def _bucket_of_five(value):
    """Return value rounded to the nearest multiple of 5, as a string."""
    return str(5*round(1/5*value))


# log10 of the carat weight on a 0.02-wide grid.
df['log10_carat'] = df['carat'].apply(lambda carat: _snap_log10(carat, 50))

# Keep only rows whose three dimensions are all strictly positive.
positive_dims = (df['x'] > 0) & (df['y'] > 0) & (df['z'] > 0)
df = df[positive_dims]

# log10 of the mean of x and y on a 0.1-wide grid.
df['log10mean'] = (df['x'] + df['y']).apply(lambda xy_sum: _snap_log10(0.5*xy_sum, 10))

# depth and table become string labels in steps of 5.
for col in ['depth', 'table']:
    df[col] = df[col].apply(_bucket_of_five)

# Group infrequent labels of each string column under 'Other'.
# NOTE(review): tol is presumably a fraction of rows, so 20/n_rows would
# correspond to roughly 20 rows -- confirm against the feature_engine docs.
rare_tol = 20.0/df.shape[0]
for col in ['cut', 'color', 'clarity', 'depth', 'table']:
    df[col] = df[col].fillna('None')
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=60, replace_with='Other', tol=rare_tol)
    df[col] = encoder.fit_transform(df[[col]])

# The raw columns have been replaced by the derived features above.
cols2drop = ['price', 'carat', 'x', 'y', 'z']
df = df.drop(columns=cols2drop)

# Machine learning
# Target vector (flattened to 1-D) and feature matrix without the label.
y = df[main_label].values.reshape(-1,)
X = df.drop([main_label], axis=1)

# Categorical features are the object-dtype columns of X; CatBoost is given
# their positional indices. They are derived from X (not df) so the indices
# always refer to the frame that is actually passed to the model.
cat_cols = X.select_dtypes(include=['object']).columns
cat_cols_idx = [X.columns.get_loc(c) for c in cat_cols]

# Half of the rows are held out for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

train_pool = Pool(X_train, y_train, cat_features=cat_cols_idx)
test_pool = Pool(X_test, y_test, cat_features=cat_cols_idx)

model = CatBoostRegressor(iterations=700, depth=5, verbose=0, learning_rate=0.1, loss_function='RMSE')
model.fit(train_pool)
y_train_pred = model.predict(train_pool)
y_test_pred = model.predict(test_pool)

# RMSE is taken as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new releases.
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_train_pred)))
rmse_test = float(np.sqrt(mean_squared_error(y_test, y_test_pred)))
print(f"RMSE score for train {round(rmse_train, 3)} dex, and for test {round(rmse_test, 3)} dex")

# Explanations with SHAP values
# Build a tree explainer for the fitted model, compute SHAP values for the
# test features and draw the summary plot.
shap.initjs()
ex = shap.TreeExplainer(model)
shap_values = ex.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

# The label is log10(price), so raising 10 to a value converts it back to a
# price before printing.
# NOTE(review): ex.expected_value is presumably the explainer's baseline
# prediction in the same log10 units -- confirm against the SHAP docs.
expected_values = ex.expected_value
predicted_usd = round(10**expected_values)
print(f"Average predicted price is {predicted_usd} USD")
actual_usd = round(10**np.mean(y_test))
print(f"Average actual price is {actual_usd} USD")

def show_shap(col, shap_values=shap_values, label=main_label, X_test=X_test, ylabel='dex'):
    """Plot and print the SHAP contribution of one feature, grouped by its values.

    For every distinct value of ``col`` in ``X_test`` the mean, standard
    deviation and count of that column's SHAP values are computed. The means
    are drawn as an error-bar plot (standard deviation as the error bar) and
    the resulting table is printed, sorted by descending mean.

    Args:
        col: Name of a column of ``X_test``.
        shap_values: 2-D array indexed as ``[row, column]``, with columns in
            the same order as ``X_test.columns``.
        label: Name of the predicted label; used only in the plot title.
        X_test: Feature frame whose rows correspond to ``shap_values`` rows.
        ylabel: Text for the y axis of the plot.

    Raises:
        ValueError: If ``col`` is not one of ``X_test``'s columns.
    """
    col_idx = X_test.columns.tolist().index(col)

    # Only the grouping column and its SHAP values are needed, so the other
    # features are not copied.
    df_infl = X_test[[col]].copy()
    df_infl['shap_'] = shap_values[:, col_idx]

    # Select 'shap_' before aggregating: mean()/std() over the whole frame
    # would also hit the string-valued feature columns, which raises
    # TypeError on pandas >= 2.0. One groupby pass yields all three statistics.
    df_res = (
        df_infl.groupby(col)['shap_']
        .agg(gain='mean', gain_std='std', count='count')
        .round({'gain': 4, 'gain_std': 4})
        .sort_values('gain', ascending=False)
        .rename_axis('col')
    )

    plt.figure(figsize=(12, 9))
    plt.errorbar(df_res.index, df_res['gain'], yerr=df_res['gain_std'], fmt="o", color="r")
    plt.title(f'SHAP values for column {col}, label {label}')
    plt.ylabel(ylabel)
    plt.tick_params(axis="x", rotation=90)
    plt.show()
    print(df_res)

# For every feature of the test set, print its name framed by blank lines
# and then show its SHAP breakdown.
for feature in X_test.columns:
    print(f"\n{feature}\n")
    show_shap(feature, shap_values, label=main_label, X_test=X_test)