In [None]:
!pip install lime -q
!pip install tensorflow -q

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lime.lime_text import LimeTextExplainer
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

# Interpretation

Understanding and diagnosing your machine-learning models - Gaël Varoquaux - [youtube](https://youtu.be/kbj3llSbaVA)

In [None]:
# Location of the CPS 1985 wages data file.
url = 'http://lib.stat.cmu.edu/datasets/CPS_85_Wages'

# One description per column, in file order. Only the tag before the ':' is
# used later (lower-cased) as the DataFrame column name.
cols = [
    'EDUCATION: Number of years of education.',
    'SOUTH: Indicator variable for Southern Region (1=Person lives in 		South, 0=Person lives elsewhere).',
    'SEX: Indicator variable for sex (1=Female, 0=Male).',
    'EXPERIENCE: Number of years of work experience.',
    'UNION: Indicator variable for union membership (1=Union member, 		0=Not union member).',
    'WAGE: Wage (dollars per hour).',
    'AGE: Age (years).',
    'RACE: Race (1=Other, 2=Hispanic, 3=White).',
    'OCCUPATION: Occupational category (1=Management, 		2=Sales, 3=Clerical, 4=Service, 5=Professional, 6=Other).',
    'SECTOR: Sector (0=Other, 1=Manufacturing, 2=Construction).',
    'MARR: Marital Status (0=Unmarried,  1=Married)',
]

In [None]:
# Load the wages table, skipping the first 27 and last 6 lines of the file
# (presumably descriptive text surrounding the data -- verify against the file).
# engine='python' is requested explicitly: both sep=None (delimiter sniffing)
# and skipfooter are unsupported by the default C parser, so without it pandas
# emits a ParserWarning and falls back to the Python engine anyway.
raw = pd.read_csv(
    url,
    skiprows=27,
    skipfooter=6,
    sep=None,
    header=None,
    engine='python',
)

# Column names are the lower-cased tag before the ':' in each description.
raw.columns = [c.split(':')[0].lower() for c in cols]

raw.head()

Log-transform the wage to deal with multiplicative factors (this also makes its distribution closer to normal):

In [None]:
# Work on a copy so the loaded frame `raw` is left untouched.
data = raw.copy()

# Regression target: base-10 logarithm of the hourly wage.
data['y'] = np.log10(data['wage'])

## What do you see below?

In [None]:
import seaborn as sns

# Pairwise plots of the target against the continuous features.
pair_vars = ['y', 'age', 'education', 'experience']
sns.pairplot(data=data, vars=pair_vars)

Strong correlation between age & experience.

Variables are related to each other

Univariate feature selection = marginal links
- single feature + target
- not conditional

Linear models = conditional links = conditional correlations
- remove the effect of features on other features
- hard if features are correlated

Conditioning
- if I remove the effect of experience on age - is there anything left?

Linear model = conditional correlations

In [None]:

# Target is the log10 wage; features are every column except the target and
# the raw wage it was derived from.
y = data['y']
x = data.drop(columns=['y', 'wage'])

# Ridge regression with built-in cross-validation (default settings).
model = RidgeCV().fit(x, y)
params = model.coef_

# Horizontal bar chart: one bar per feature coefficient, labelled by column.
positions = np.arange(params.size)
plt.barh(positions, params)
plt.yticks(positions, x.columns)
plt.tight_layout()

These coefficients are conditional links between the target (log wage) and each feature, conditioning on the other features

Let's try with L1 reg:

In [None]:
# L1-regularised linear model, cross-validated over 3 folds; max_iter is
# raised to 10000 iterations.
model = LassoCV(cv=3, max_iter=10000).fit(x, y)
params = model.coef_

# Same horizontal bar chart as for the ridge model: one bar per coefficient.
positions = np.arange(params.size)
plt.barh(positions, params)
plt.yticks(positions, x.columns)
plt.tight_layout()

Now occupation and experience have disappeared!
- experience can be explained by age

Correlated features + L1 -> unstable feature selection!

## LIME

[marcotcr/lime](https://github.com/marcotcr/lime) - explores linear, local decision boundaries of features

For a linear model, interpretation is global (constant)

For non-linear models, the interpretation must be local

Explains **single predictions**
- local explanation
- local means linear is a good approximation

LIME can be used for tabular data, images and text

### LIME for NLP

[Tutorial](https://marcotcr.github.io/lime/tutorials/Lime%20-%20basic%20usage%2C%20two%20class%20case.html)

```
The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon messages posted before and after a specific date.
```

In [None]:
# Keep only two newsgroups so the task is a binary classification.
cats = ['alt.atheism', 'soc.religion.christian']

# Fetch the predefined train and test subsets for those categories.
tr, te = (
    fetch_20newsgroups(subset=split, categories=cats)
    for split in ('train', 'test')
)

print(tr.target_names)

In [None]:
# TF-IDF encoder; lowercase=False keeps the original casing of tokens.
enc = TfidfVectorizer(lowercase=False)

# Fit on the training documents only, then reuse the fitted encoder for the
# test documents (the tuple is evaluated left to right, so the fit comes first).
tr_vec, te_vec = enc.fit_transform(tr.data), enc.transform(te.data)

In [None]:
# Inspect the TF-IDF vector of the second training document in dense form.
# `.toarray()` returns a regular ndarray, whereas `.todense()` returns the
# discouraged `numpy.matrix` type.
tr_vec[1].toarray()

Train a forest:

In [None]:
# Random forest on the TF-IDF features. random_state is fixed so that the
# fitted forest -- and therefore the F1 score and every explanation derived
# from it -- is the same on each Restart & Run All.
rf = RandomForestClassifier(n_estimators=50, verbose=0, n_jobs=4, random_state=0)
rf.fit(tr_vec, tr.target)

# Weighted F1 on the held-out test subset.
pred = rf.predict(te_vec)
f1_score(te.target, pred, average='weighted')

In [None]:
# Chain the fitted vectoriser and forest so raw text can be passed straight to
# predict_proba.
c = make_pipeline(enc, rf)

# Sanity check: class probabilities for the first test document.
first_doc = te.data[0]
print(c.predict_proba([first_doc]))

In [None]:
# LIME explains a prediction by perturbing the input text at random; fixing
# random_state makes the explanation repeatable across runs.
explainer = LimeTextExplainer(class_names=tr['target_names'], random_state=0)

# Explain a single test document, keeping the 6 most influential tokens.
idx = 83
exp = explainer.explain_instance(te.data[idx], c.predict_proba, num_features=6)
print('Document id: %d' % idx)
# Column 1 of predict_proba is the second class in `cats`.
print('Probability(christian) =', c.predict_proba([te.data[idx]])[0,1])
print('True class: %s' % tr['target_names'][te.target[idx]])

exp.show_in_notebook(text=True)