In [108]:
import numpy as np
import pandas as pd


from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

from omnixai.explainers.tabular import TabularExplainer
from omnixai.data.tabular import Tabular



In [136]:
categorical_features+numerical_feature

['Area', 'Age', 'Type', 'Price Range', 'Capacity', 'Number of Menu Items']

In [137]:
#Model selection function
def regressor_selection(X,y, metric = 'r2'):    
    pipe = Pipeline([('regressor' , RandomForestRegressor())])    
    param_grid = ''
    param = [        
                
        {'regressor' : [RandomForestRegressor()],
        'regressor__n_estimators' : [100,200,500],
        'regressor__max_depth' : list( range(5,25,5) ),
        'regressor__min_samples_split' : list( range(4,12,2) )
        },
        
        {'regressor' : [KNeighborsRegressor()],
         'regressor__n_neighbors' : [5,10,20,30],
         'regressor__p' : [1,2] 
        },
        {
         'regressor' : [Lasso(max_iter=500)],
         'regressor__alpha' : [0.001,0.01,0.1,1,10,100,1000]         
        }
            ]
    param_grid = param    
    clf = GridSearchCV(pipe, param_grid = param_grid, 
                       cv = 5, n_jobs=-1,scoring = metric)    
    best_clf = clf.fit(X, y)
    
    return(best_clf.best_params_['regressor'])



#Reading Data
url=  'Restaurant_Profitability_Training_Data.csv'
df = pd.read_csv(url)

Target = 'Profit'
categorical_features = ['Area', 'Age', 'Type','Price Range','Capacity','Number of Menu Items']
numerical_feature = []
target = 'Profit'

label=df[target]
data= df[categorical_features+numerical_feature]



#Data Preprocessing
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))
                                      ,('scaler', StandardScaler())])
categorical_transformer = OneHotEncoder(categories='auto')

encoder = ColumnTransformer(
    transformers=[
        ('numerical', numeric_transformer, numerical_feature),
        ('categorical', categorical_transformer, categorical_features)])
encoder.fit(data)

#Model Building and Selection
clf = regressor_selection(encoder.transform(data),label, metric = 'r2')
model = clf.fit(encoder.transform(data),label)                

#
tabular_data = Tabular(
    data=data,
    categorical_columns=categorical_features
 
)

SyntaxError: invalid syntax (155560165.py, line 47)

In [111]:
#Wrapper function for prediction
def Profitability_Prediction(Areas, Ages, Types, Price_Ranges, Capacities, Items):    
    input_data = np.column_stack([Areas, Ages, Types, Price_Ranges, Capacities, Items])
    X = pd.DataFrame(input_data,columns=['Area', 'Age', 'Type','Price Range','Capacity','Number of Menu Items'])
    result = model.predict(encoder.transform(X))
    return result.tolist()


In [112]:
def pre_processing(tabular_data):
    X = tabular_data.to_pd()    
    return encoder.transform(X)

In [138]:
# Initialize a TabularExplainer
explainers = TabularExplainer(
   explainers=["lime", "shap"], # The explainers to apply
   mode="regression",                             # The task type
   data=tabular_data,                                   # The data for initializing the explainers
   model=model,                                       # The ML model to explain
   preprocess=pre_processing,     # Converts raw features into the model inputs
    params={
        "lime": {"kernel_width": 3},
        "shap": {"nsamples": 100}
    }
   
)

In [154]:
def Profitability_Explainers(Areas, Ages, Types, Price_Ranges, Capacities, Items):    
    input_data = np.column_stack([Areas, Ages, Types, Price_Ranges, Capacities, Items])
    X = pd.DataFrame(input_data,columns=['Area', 'Age', 'Type','Price Range','Capacity','Number of Menu Items'])
    
    tabular_data = Tabular(
    data=X,
    categorical_columns=categorical_features
 
        )
    explanations = explainers.explain(tabular_data)
    return explanations['shap'].get_explanations()[0]['scores']

In [155]:
Areas, Ages, Types, Price_Ranges, Capacities, Items = ['Downtown'],['Developed'],['Cafeteria'],['$'],['90-100'],['Salad & Sandwich only']

In [158]:
Profitability_Explainers(Areas, Ages, Types, Price_Ranges, Capacities, Items)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.21it/s]


[-16032.586610081848,
 -14727.565925624636,
 -12518.838368002485,
 1458.5810453150418,
 1313.2527373121823,
 3.4650000427821013]

In [159]:
Profitability_Explainers(Areas, Ages, Types, Price_Ranges, Capacities, Items)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.19it/s]


[-16032.586610081848,
 -14727.565925624636,
 -12518.838368002485,
 1458.5810453150418,
 1313.2527373121823,
 3.4650000427821013]

In [115]:
tabular_data[0:1]

       Area        Age       Type Price Range Capacity   Number of Menu Items
0  Downtown  Developed  Cafeteria          $$   90-100  Salad & Sandwich only

In [69]:
# Initialize a TabularExplainer
explainers = TabularExplainer(
   explainers=["lime", "shap"], # The explainers to apply
   mode="regression",                             # The task type
   data=tabular_data,                                   # The data for initializing the explainers
   model=model,                                       # The ML model to explain
   preprocess=pre_processing     # Converts raw features into the model inputs
   
)

Using 1044 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [70]:
explanations = explainers.explain(tabular_data[0:1])


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.38s/it]


In [102]:
a = explanations['lime'].get_explanations()

In [107]:
a[0]['scores']

[-21685.906853960903,
 -13196.63689255275,
 -8164.708935031015,
 7574.361170665834,
 3056.098963629456,
 136.9408841443397]

In [96]:
a

"{'instanc"

In [85]:

analyzer = PredictionAnalyzer(
    mode="regression",
    test_data=tabular_data[0:1],                           # The test dataset (a `Tabular` instance)
    test_targets=np.asarray([40147]),                      # The test labels (a numpy array)
    model=model,                                   # The ML model
    preprocess=pre_processing  # Converts raw features into the model inputs
)

In [87]:
prediction_explanations = analyzer.explain()

R^2 score is not well-defined with less than two samples.


In [88]:
prediction_explanations

{'metric': <omnixai.explanations.prediction.metrics.MetricExplanation at 0x189ffa13dc0>,
 'residual': <omnixai.explanations.prediction.residual.ResidualExplanation at 0x189ffe5cf40>}