# Explainable Machine Learning

## Preprocessing

### Loading the dataset

In [1]:
import pandas as pd
import os

In [20]:
# Load the raw SMS export (latin-1 encoded) and discard every column that
# contains a missing value. `.copy()` makes df_sms an independent frame so
# later column assignments act on it directly.
df_sms = (
    pd.read_csv('spam_data.csv', encoding='latin-1')
    .dropna(axis=1)
    .copy()
)
# The two remaining columns are the class label and the message text.
df_sms.columns = ['label', 'message']
df_sms

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [21]:
# Per-label summary of the message column: count, number of unique texts,
# most frequent text and how often it occurs.
messages_by_label = df_sms.groupby('label')
messages_by_label.describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [25]:
# Encode the target as integers: ham -> 0, spam -> 1.
# A plain dict `.map` returns NaN for every value that is not a key, so running
# this cell a second time (labels already 0/1) would wipe the column. Falling
# back to the value itself makes the cell safe to re-run.
label_to_id = {'ham': 0, 'spam': 1}
df_sms['label'] = df_sms['label'].map(lambda value: label_to_id.get(value, value))
# Fail loudly if the data contains a label other than ham/spam.
assert df_sms['label'].isin([0, 1]).all(), "unexpected value in 'label' column"

### Tokenization

In [26]:
import gensim
import nltk

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

# Lemmatizer and stemmer shared by the tokenization step.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer(language='english')
# Make sure the WordNet corpus is available locally; kept as the last
# expression so the cell shows the download status.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ceyx\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
def tokenize(text):
    """Turn a raw message into a list of normalized tokens.

    The text is split with gensim's ``simple_preprocess``. Tokens found in
    gensim's ``STOPWORDS`` are discarded; every remaining token is lemmatized
    as a verb (``'v'``) and then stemmed with the Snowball stemmer.

    Returns the processed tokens in their original order.
    """
    return [
        stemmer.stem(lemmatizer.lemmatize(word, 'v'))
        for word in simple_preprocess(text)
        if word not in STOPWORDS
    ]

In [33]:
# Tokenize every SMS; the result is a Series of token lists indexed like df_sms.
messages = df_sms['message'].apply(tokenize)

### Bag-of-Words & TfIdf

In [35]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.matutils import corpus2dense

In [41]:
# Build the vocabulary from the tokenized SMS messages. The cell previously
# read `movie_overviews`, a name that is never defined in this notebook.
dictionary = Dictionary(messages)
# Prune the vocabulary: drop tokens that occur in fewer than 5 messages or in
# more than 50% of them, then keep at most the 100000 most frequent tokens.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [42]:
# Bag-of-words vector, as (token_id, count) pairs, for each tokenized SMS.
# Iterates over `messages`; the former `movie_overviews` name does not exist
# in this notebook.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in messages]

In [43]:
# Fit TF-IDF weights on the bag-of-words corpus and re-weight every document.
tfidf_model = TfidfModel(bow_corpus)
tfidf_sparse = tfidf_model[bow_corpus]
# Densify to a (n_messages, vocabulary_size) matrix. `num_terms` is taken from
# the pruned dictionary rather than a hard-coded 100000, which would allocate
# a far larger matrix whose extra columns are all zero.
tfidf_corpus = corpus2dense(
    tfidf_sparse,
    num_terms=len(dictionary),
    num_docs=len(bow_corpus),
).T

## Explainable Machine Learning algorithms

### White-box models

#### Logistic Regression

#### Decision Tree

### Black-box model interpretation - Random Forest

#### Global surrogate model - Logistic Regression

#### Local surrogate model - Decision Tree

### Variable importance - Permutation importance