In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the dataset from a CSV file given by a path relative to the working
# directory, then preview the first rows.
df = pd.read_csv('SampleSuperstore.csv')
df.head()

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Per-column dtype and non-null count overview.
df.info()

In [None]:
# Summary statistics of the frame as loaded.
df.describe()

In [None]:
# Columns excluded from the analysis; removed by name via the `columns` keyword.
drop_cols = ['Discount', 'Country', 'City']
df = df.drop(columns = drop_cols)

In [None]:
# Summary statistics again, now without the dropped columns.
df.describe()

In [None]:
# (rows, columns) before duplicate removal.
df.shape

In [None]:
# Remove exact duplicate rows, keeping the first occurrence (the default).
# Rows are compared on the columns currently present in df.
df = df.drop_duplicates()

In [None]:
# (rows, columns) after duplicate removal.
df.shape

In [None]:
# Order rows from highest to lowest Profit and preview the top of the ranking.
df = df.sort_values('Profit', ascending = False)
df.head()

In [None]:
# Keep only the float64/int64 columns; this numeric view feeds the
# distribution plots and correlations.
df_num = df.select_dtypes(include = ['float64', 'int64'])
df_num.head()

In [None]:
# One histogram per numeric column.
# The figure is created inside the loop: plt.show() finishes the current
# figure, so a single plt.figure() before the loop would size only the first
# plot and leave the rest at the default size.
for i in df_num.columns:
    plt.figure(figsize = (7,7))
    plt.title(i)
    plt.hist(df[i], bins = 10)
    plt.xlabel(i)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Density-normalised histogram with a KDE overlay for every numeric column.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement that produces the same kind of figure.
for i in df_num.columns:
    plt.title(i)
    sns.histplot(df[i], bins = 10, kde = True, stat = 'density')
    plt.show()

In [None]:
# Number of rows per Region value.
region_group = df.groupby('Region').size()
region_group

In [None]:
# Region is an unordered category, so the per-region counts are drawn as bars;
# a line connecting regions would suggest a trend that does not exist.
region_group.plot(kind = 'bar', title = 'Region_by_group')

In [None]:
# Correlation of every numeric column with Profit.
# Profit's self-correlation is removed by label rather than by position, so
# the result does not depend on Profit being the last numeric column.
df_num_corr = df_num.corr()['Profit'].drop('Profit')
df_num_corr

In [None]:
# Scatter each numeric column against Profit, at most five columns per
# pairplot row so the grid stays readable.
numeric_cols = df_num.columns
for i in range(0, len(numeric_cols), 5):
    sns.pairplot(df_num, x_vars = numeric_cols[i:i+5], y_vars = ['Profit'])

In [None]:
# Regression plot of Profit against each numeric column.
# x and y are passed by keyword: recent seaborn releases accept them as
# keyword-only arguments, and the keyword form also works on older releases.
for i in df_num.columns:
    sns.regplot(x = df[i], y = df['Profit'])
    plt.show()

In [None]:
# Re-check dtypes and non-null counts before selecting model features.
df.info()

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Columns available for feature selection.
df.columns

In [None]:
# Model inputs: the listed columns of df, selected by name into X.
features = ['Ship Mode', 'Segment', 'Region', 'Category', 'Sub-Category', 'Quantity']
X = df[features]
X.head()

In [None]:
# Per-feature flag: True if the column contains at least one missing value.
X.isnull().any()

In [None]:
# Number of distinct values per feature.
X.nunique()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the Sales column. Hold out 25% of the rows for validation, with a
# fixed seed so the split is repeatable.
y = df['Sales']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 1,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on explicit copies: the frames returned by train_test_split are slices
# of X, and assigning columns into them triggers pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_val = X_val.copy()

# Integer-encode every feature column. Each column gets its own encoder, fitted
# on the training split only and then applied to the validation split. The
# fitted encoders are kept in `encoders` so codes can be mapped back to the
# original labels later (encoders[col].inverse_transform / .classes_).
# NOTE: transform() raises ValueError if the validation split contains a value
# that never occurs in the training split.
encoders = {}
for col in X_train.columns:
    labelencoder = LabelEncoder()
    X_train[col] = labelencoder.fit_transform(X_train[col])
    X_val[col] = labelencoder.transform(X_val[col])
    encoders[col] = labelencoder


In [None]:
# Preview the training features after encoding.
X_train.head()

In [None]:
import xgboost
import shap
# Load SHAP's JavaScript so interactive plots can render in the notebook.
shap.initjs()
# Gradient-boosted regressor: 1000 trees with a learning rate of 0.001.
# NOTE(review): with a step size this small the ensemble may stay close to its
# initial prediction — confirm against validation error.
model = xgboost.XGBRegressor(n_estimators = 1000, learning_rate = 0.001)
model.fit(X_train, y_train)

In [None]:
# NOTE(review): this cell monkey-patches save_raw() on the booster instance.
# It looks like a workaround for a SHAP/XGBoost version incompatibility when
# SHAP reads the serialized model — confirm it is still needed (and still
# correct) for the installed library versions.
mybooster = model.get_booster()
# Serialize the booster once and drop the first 4 bytes of the raw buffer
# (presumably a format header — verify against the XGBoost version in use).
model_bytearray = mybooster.save_raw()[4:]
def myfun(self=None):
    """Return the truncated raw model bytes captured when this cell ran."""
    return model_bytearray

# Replace save_raw on this instance so later callers get the truncated buffer.
mybooster.save_raw = myfun

In [None]:
# Build a tree explainer from the booster and compute SHAP values for every
# row of the training features.
explainer = shap.TreeExplainer(mybooster)
shap_values = explainer.shap_values(X_train)

In [None]:
# Force plot for a single prediction: row 0 of shap_values and of X_train.
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])

In [None]:
# Force plot over all training rows at once.
shap.force_plot(explainer.expected_value, shap_values, X_train)

In [None]:
# SHAP dependence plot for the 'Region' feature.
shap.dependence_plot("Region", shap_values, X_train)

In [None]:
# SHAP dependence plot for every feature column, in column order.
feature_names = X_train.columns
for col in feature_names:
    shap.dependence_plot(col, shap_values, X_train)

In [None]:
# Summary plot of the SHAP values for all features.
shap.summary_plot(shap_values, X_train)

In [None]:
# Same summary, drawn with the "bar" plot type.
shap.summary_plot(shap_values, X_train, plot_type="bar")