# Notebook example

Installing some necessary packages:

In [1]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!jupyter labextension install @jupyter-widgets/jupyterlab-manager



Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


An error occured.
ValueError: Please install nodejs 5+ and npm before continuing installation. nodejs may be installed using conda or directly from the nodejs website.
See the log file for details:  C:\Users\gusta\AppData\Local\Temp\jupyterlab-debug-04zgl4me.log


In [2]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/3d/1b/83e5dc0021d12884e9998999945e156cf3628a79dacecaed2ede9f3107cb/xgboost-1.3.3-py3-none-win_amd64.whl (95.2MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.3.3


**It is necessary to change the working directory so the project structure works properly:**

In [1]:
import sys
sys.path.append("../../")

From this point, it's on you!

---

In [2]:
import pandas as pd

from ml.data_source.spreadsheet import Spreadsheet
from ml.preprocessing.preprocessing import Preprocessing
from ml.model.trainer import TrainerSklearn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

Could not import lightgbm, required if using LGBMExplainableModel


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.


In [3]:
df = Spreadsheet().get_data('../../../data/raw/train.csv')

In [4]:
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age'], dtype='object')

In [5]:
p = Preprocessing()

In [6]:
df = p.clean_data(df)
df = p.categ_encoding(df)

INFO:root:Cleaning data
INFO:root:Category encoding


In [7]:
df.head()

Unnamed: 0,Survived,Age,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0,22.0,0,0,1,0,1
1,1,38.0,1,0,0,1,0
2,1,26.0,0,0,1,1,0
3,1,35.0,1,0,0,1,0
4,0,35.0,0,0,1,0,1


In [8]:
X = df.drop(columns=["Survived"])
y = df["Survived"]

In [9]:
# Ensure the same random state passed to TrainerSkleran().train()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((499, 6), (215, 6), (499,), (215,))

In [10]:
rf = TrainerSklearn().train(X, y, classification=True, 
                            algorithm=RandomForestClassifier, 
                            preprocessing=p,
                           data_split=('train_test', {'test_size':.3}),
                           random_state=123)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
INFO:interpret_community.TabularExplainer:Initialized valid explainer TreeExplainer with args {'explain_subset': None, 'features': ['Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male'], 'classes': None}


In [11]:
rf.get_metrics()

{'accuracy': 0.8232558139534883,
 'f1': 0.7500000000000001,
 'precision': 0.7808219178082192,
 'recall': 0.7215189873417721,
 'roc_auc': 0.8634121370067014}

In [12]:
rf.get_columns()

In [13]:
rf.predict_proba(X_test, binary=True)

array([1.        , 0.45271429, 0.43433333, 0.93333333, 0.29      ,
       0.0097619 , 0.08943685, 0.        , 0.0525    , 0.98      ,
       0.65514286, 0.10844703, 0.10844703, 0.97      , 0.87      ,
       0.29193651, 0.01      , 0.42744644, 1.        , 0.0300202 ,
       0.30402381, 1.        , 0.46507143, 0.73488095, 0.56333333,
       1.        , 0.01339054, 1.        , 0.        , 0.35177655,
       0.        , 0.        , 1.        , 0.11603523, 0.        ,
       0.74980952, 0.42744644, 0.13549914, 0.62743681, 0.        ,
       0.055     , 0.        , 0.29346825, 0.02      , 1.        ,
       0.1125    , 0.21      , 1.        , 1.        , 0.36449206,
       0.08943685, 0.38      , 0.        , 0.02      , 1.        ,
       0.42744644, 1.        , 0.        , 0.10844703, 0.00461538,
       1.        , 1.        , 0.08943685, 0.        , 0.49278571,
       0.        , 0.00461538, 1.        , 0.        , 0.42744644,
       0.08943685, 0.02      , 0.17333333, 0.31885714, 0.43383

In [14]:
# Predicting new data
def predict_new(X, model, probs=True):
    X = p.clean_data(X)
    X = p.categ_encoding(X)
    
    columns = model.get_columns()
    for col in columns:
        if col not in X.columns:
            X[col] = 0
    print(X)
    if probs:
        return model.predict_proba(X)
    else:
        return model.predict(X)

In [15]:
new_data = pd.DataFrame({
    'Pclass':3,
    'Sex': 'male',
    'Age':4
}, index=[0])

new_data

Unnamed: 0,Pclass,Sex,Age
0,3,male,4


In [16]:
predict_new(new_data, rf)

INFO:root:Cleaning data
INFO:root:Category encoding


   Age  Pclass_3  Sex_male  Pclass_1  Pclass_2  Sex_female
0    4         1         1         0         0           0


array([[0.68114286, 0.31885714]])

**Get local explainer for each instance:**

In [30]:
# Get local explainer
res = rf.local_interpret(X_test, len(X_test.columns))

In [31]:
res

Unnamed: 0,Importance_Name_0,Importance_Name_1,Importance_Name_2,Importance_Name_3,Importance_Name_4,Importance_Name_5,Importance_Value_0,Importance_Value_1,Importance_Value_2,Importance_Value_3,Importance_Value_4,Importance_Value_5
0,Sex_female,Sex_male,Pclass_3,Pclass_2,Age,Pclass_1,0.235407,0.212511,0.131872,0.049253,-0.019605,-0.028737
1,Sex_male,Sex_female,Age,Pclass_2,Pclass_3,Pclass_1,0.107810,0.105021,0.027756,-0.024243,-0.076479,-0.173282
2,Pclass_3,Pclass_1,Pclass_2,Age,Sex_male,Sex_female,0.136886,0.054153,0.035203,-0.040919,-0.093182,-0.107176
3,Sex_female,Sex_male,Age,Pclass_2,Pclass_1,Pclass_3,0.218018,0.210792,0.206967,-0.012758,-0.035676,-0.073309
4,Sex_male,Sex_female,Age,Pclass_2,Pclass_3,Pclass_1,0.125001,0.124176,0.092246,-0.010514,-0.068209,-0.133400
...,...,...,...,...,...,...,...,...,...,...,...,...
210,Age,Sex_female,Sex_male,Pclass_2,Pclass_3,Pclass_1,0.197178,0.135364,0.130705,-0.002840,-0.014871,-0.026237
211,Sex_female,Sex_male,Pclass_1,Pclass_3,Age,Pclass_2,0.163068,0.161115,0.099112,0.085461,0.072337,-0.000391
212,Sex_female,Age,Pclass_1,Pclass_2,Sex_male,Pclass_3,0.097598,0.095317,0.090221,0.087228,0.079541,-0.030605
213,Pclass_3,Pclass_1,Sex_female,Sex_male,Pclass_2,Age,0.127740,0.054102,0.043235,0.041091,0.036320,-0.113190
