# Basic Text Features

In [1]:
import sys
import string
import nltk

import numpy as np
from typing import Iterable

import sklearn
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.feature_selection import chi2

In [2]:
print(f'sklearn.__version__={sklearn.__version__}')

sklearn.__version__=1.3.2


In [4]:
# NLTK stemmers (this cell uses nltk, not spaCy)
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

porter    = PorterStemmer()
lancaster = LancasterStemmer()
# nltk.download()

In [5]:
words = [
    "dogs", "destabilize", "misunderstanding", "railroad", "moonlight",
    "football", "pass", "passing", "friendship", "friends", "friendships",
    "passed", "trouble", "troubling", "care", "believes",
]
preprocess = [porter, lancaster]

# Column width of the printed table.
len_bin = 20
# One left-aligned, fixed-width field per column: "{0:20}{1:20}{2:20}".
col_formater = "".join("{%d:%d}" % (position, len_bin) for position in range(3))

# Header row: the class name of each stemmer, in the same order as `preprocess`.
print(col_formater.format("Word", *(type(stemmer).__name__ for stemmer in preprocess)))
print("")
# One row per word: the word followed by its stem under each stemmer.
for w in words:
    print(col_formater.format(w, *(stemmer.stem(w) for stemmer in preprocess)))

Word                PorterStemmer       LancasterStemmer    

dogs                dog                 dog                 
destabilize         destabil            dest                
misunderstanding    misunderstand       misunderstand       
railroad            railroad            railroad            
moonlight           moonlight           moonlight           
football            footbal             footbal             
pass                pass                pass                
passing             pass                pass                
friendship          friendship          friend              
friends             friend              friend              
friendships         friendship          friend              
passed              pass                pass                
trouble             troubl              troubl              
troubling           troubl              troubl              
care                care                car                 
believes            bel

### Stemming

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

def stem(sentence):
    """Tokenize ``sentence`` with ``word_tokenize`` and Porter-stem every token.

    Each stemmed token is followed by a single space, so the returned string
    ends with a trailing space (and is empty when there are no tokens).
    """
    return "".join(porter.stem(token) + " " for token in word_tokenize(sentence))


In [7]:
s = "J.K. Rowling wrote Harry Potter. She never expected the book to be famous."
stem(s)

'j.k. rowl wrote harri potter . she never expect the book to be famou . '

In [8]:
sent_tokenize(s)

['J.K. Rowling wrote Harry Potter.',
 'She never expected the book to be famous.']

In [9]:
# Be careful when splitting sentences: a plain split on "." also breaks abbreviations such as "J.K."
s.split(".")

['J',
 'K',
 ' Rowling wrote Harry Potter',
 ' She never expected the book to be famous',
 '']

In [10]:
word_tokenize(s)

['J.K.',
 'Rowling',
 'wrote',
 'Harry',
 'Potter',
 '.',
 'She',
 'never',
 'expected',
 'the',
 'book',
 'to',
 'be',
 'famous',
 '.']

### Lemmatization


Lemmatization uses a vocabulary and a morphological analysis of words to remove inflectional endings only, with the goal of returning each word to its base (dictionary) form, known as the lemma.


`Lemmatize(saw) = see`


We will use a lemmatizer from WordNet (https://wordnet.princeton.edu), available through nltk.


In [7]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [8]:
sentence = "I was running and eating. This was a terrible idea."
punctuations="?:!.,;"
# Build a new list instead of calling .remove() on the list being iterated:
# removing during iteration shifts the remaining items, so the element right
# after every removed token is skipped (two adjacent punctuation tokens would
# leave the second one in place).
sentence_words = [word for word in nltk.word_tokenize(sentence)
                  if word not in punctuations]


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\35796/nltk_data'
    - 'c:\\Users\\35796\\AppData\\Local\\Programs\\Python\\Python310\\nltk_data'
    - 'c:\\Users\\35796\\AppData\\Local\\Programs\\Python\\Python310\\share\\nltk_data'
    - 'c:\\Users\\35796\\AppData\\Local\\Programs\\Python\\Python310\\lib\\nltk_data'
    - 'C:\\Users\\35796\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))

Notice that the words did not change!

This is because there was no context. If we give a part of speech type then the lemmatizer will do what we would expect.

In [None]:
sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word, pos="v")))

In [None]:
words = ["dogs","destabilize","misunderstanding","railroad","moonlight",
         "football","pass","passing","friendship", "friends", "friendships",
         "passed","trouble","troubling","care", "believes"]
preprocess = [porter, lancaster, wordnet_lemmatizer]

len_bin = 20
col_formater = "{0:len_bin}{1:len_bin}{2:len_bin}{3:len_bin}".replace("len_bin",str(len_bin))
print(col_formater.format("Word", porter.__class__.__name__, lancaster.__class__.__name__, wordnet_lemmatizer.__class__.__name__))
print("")
for w in words:
    print( col_formater.format(w, porter.stem(w), lancaster.stem(w), wordnet_lemmatizer.lemmatize(w)))

## Features for documents


### From docs to feature vectors: Make your own countvectorizer


Let us build a simple document classifier featurizing each document by word counts


In [None]:
import sklearn.linear_model
import sklearn.model_selection
import sklearn.pipeline
import sklearn.feature_extraction
import sklearn.datasets
import scipy
import scipy.sparse as sp

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load each split once and reuse the returned Bunch for both the documents
# (.data) and the labels (.target), instead of calling the loader once per
# attribute.
newsgroups_train = sklearn.datasets.fetch_20newsgroups(subset="train")
newsgroups_test  = sklearn.datasets.fetch_20newsgroups(subset="test")

# `X` is kept for backward compatibility; the loader's default subset is "train".
X = newsgroups_train

X_train = newsgroups_train.data
y_train = newsgroups_train.target
X_test  = newsgroups_test.data
y_test  = newsgroups_test.target

In [None]:
x = X_train[0]

In [None]:
np.unique(y_train)

In [None]:
print(x)

In [None]:
y_train[0]

### Tiny function to create a feature matrix of word counts (feature counting)


In [None]:
from collections import defaultdict

docs = [['hello', 'world', 'hello'], ['goodbye', 'cruel', 'teacher', 'goodbye']]

def prepare_word_counts_with_dict(docs: Iterable[Iterable[str]], verbose=False):
    """Count words per document and return the counts as CSR components.

    Parameters
    ----------
    docs : iterable of iterables of str
        Each element is one already-tokenized document (a sequence of words).
    verbose : bool, default False
        When True, print ``data``, ``ind_col`` and ``ind_ptr`` before returning.

    Returns
    -------
    (data, ind_col, ind_ptr) : tuple of lists
        ``data`` holds the per-document word counts, ``ind_col`` the vocabulary
        index of each count, and ``ind_ptr`` the offsets where each document
        starts and ends inside ``data`` / ``ind_col``. The tuple can be passed
        directly to ``sp.csr_matrix``.
    """
    ind_ptr = [0]
    ind_col = []
    data = []
    vocabulary = {}

    for doc in docs:
        word_ind_counter = defaultdict(int)  # word -> count, for this document only
        for word in doc:
            # A new word gets the next free column index; known words keep theirs.
            vocabulary.setdefault(word, len(vocabulary))
            word_ind_counter[word] += 1

        data.extend(word_ind_counter.values())
        ind_col.extend(vocabulary[w] for w in word_ind_counter)
        # The row ends where the column indices collected so far end.
        ind_ptr.append(len(ind_col))

    if verbose:
        print(data)
        print(ind_col)
        print(ind_ptr)
    return (data, ind_col, ind_ptr)

sp.csr_matrix(prepare_word_counts_with_dict(docs, verbose=True)).toarray()

In [None]:
docs = [['hello', 'world', 'hello'], ['goodbye', 'cruel', 'teacher', 'goodbye']]

def prepare_word_counts_with_dict(docs: Iterable[Iterable[str]], verbose=False):
    """Count words per document and return the counts as CSR components.

    Parameters
    ----------
    docs : iterable of iterables of str
        Each element is one already-tokenized document (a sequence of words).
    verbose : bool, default False
        When True, print the vocabulary (size and content) and the three
        returned lists before returning.

    Returns
    -------
    (data, ind_col, ind_ptr) : tuple of lists
        ``data`` holds the per-document word counts, ``ind_col`` the vocabulary
        index of each count, and ``ind_ptr`` the offsets where each document
        starts and ends inside ``data`` / ``ind_col``. The tuple can be passed
        directly to ``sp.csr_matrix``.
    """
    ind_ptr = [0]
    ind_col = []
    data = []
    vocabulary = {}

    for doc in docs:
        word_ind_counter = defaultdict(int)  # word -> count, for this document only
        for word in doc:
            # A new word gets the next free column index; known words keep theirs.
            vocabulary.setdefault(word, len(vocabulary))
            word_ind_counter[word] += 1

        data.extend(word_ind_counter.values())
        ind_col.extend(vocabulary[w] for w in word_ind_counter)
        # The row ends where the column indices collected so far end.
        ind_ptr.append(len(ind_col))

    if verbose:
        print('len vocab =', len(vocabulary))
        print('vocab =', vocabulary)
        print('data =', data)
        print('ind_ptr =', ind_ptr)
        print('ind_col =', ind_col)

    return (data, ind_col, ind_ptr)

In [None]:
prepare_word_counts_with_dict(docs)

In [None]:
sp.csr_matrix(prepare_word_counts_with_dict(docs)).toarray()

We can create a bigger dataset to benchmark

In [None]:
docs_big = docs * 1000

In [None]:
%%timeit 
prepare_word_counts_with_dict(docs_big)

In [None]:
sp.csr_matrix(prepare_word_counts_with_dict(docs_big))

### Tiny function to create a feature matrix of word counts  (no feature counting)

We can create a CSR word count matrix without explicitly counting each word count.

- **Note**: `sp.csr_matrix` adds together entries of `data` that share the same (row, column) coordinate according to `ind_col` and `ind_ptr`, so appending a `1` for every word occurrence yields the word count.

In [None]:
docs = [['hello', 'world', 'hello'], ['goodbye', 'cruel', 'teacher']]

def prepare_word_counts0(docs: Iterable[Iterable[str]]):
    """Return CSR components with one entry per word occurrence (no counting).

    Parameters
    ----------
    docs : iterable of iterables of str
        Each element is one already-tokenized document (a sequence of words).

    Returns
    -------
    (data, ind_col, ind_ptr) : tuple of lists
        ``data`` contains a ``1`` for every word occurrence, ``ind_col`` the
        vocabulary index of that occurrence, and ``ind_ptr`` the offsets where
        each document starts and ends. Repeated words in a document appear as
        repeated column indices rather than as a single aggregated count.
    """
    ind_ptr = [0]
    ind_col = []
    data = []
    vocabulary = {}

    for doc in docs:
        for w in doc:
            if w in vocabulary:
                index = vocabulary[w]
            else:
                # First time this word is seen: give it the next free column.
                index = len(vocabulary)
                vocabulary[w] = index
            ind_col.append(index)
            data.append(1)
        ind_ptr.append(len(ind_col))
    return (data, ind_col, ind_ptr)

In [None]:
data, ind_col, ind_ptr = prepare_word_counts0(docs)
print(data)
print(ind_col)
print(ind_ptr)

In [None]:
sp.csr_matrix(prepare_word_counts0(docs)).toarray()

In [None]:
from collections import defaultdict
docs = [['hello', 'world', 'hello'], ['goodbye', 'cruel', 'teacher']]

def prepare_word_counts1(docs: Iterable[Iterable[str]]):
    """Return CSR components with one entry per word occurrence (no counting).

    Same output as an explicit "if word in vocabulary / else" lookup, but the
    vocabulary index is obtained with a single ``dict.setdefault`` call.

    Parameters
    ----------
    docs : iterable of iterables of str
        Each element is one already-tokenized document (a sequence of words).

    Returns
    -------
    (data, ind_col, ind_ptr) : tuple of lists
        ``data`` contains a ``1`` for every word occurrence, ``ind_col`` the
        vocabulary index of that occurrence, and ``ind_ptr`` the offsets where
        each document starts and ends.
    """
    ind_ptr = [0]
    ind_col = []
    data = []
    # A plain dict is enough: setdefault never uses a defaultdict's factory.
    vocabulary = {}

    for doc in docs:
        for w in doc:
            # Known word -> its index; new word -> next free index (and store it).
            index = vocabulary.setdefault(w, len(vocabulary))
            ind_col.append(index)
            data.append(1)
        ind_ptr.append(len(ind_col))

    return (data, ind_col, ind_ptr)

In [None]:
# `prepare_word_counts` is not defined anywhere in this notebook; the function
# defined in the previous cell is `prepare_word_counts1`.
sp.csr_matrix(prepare_word_counts1(docs)).toarray()

#### Benchmarking  approaches

In [None]:
docs_big = docs*1000

In [None]:
%%timeit 
prepare_word_counts0(docs_big)

In [None]:
%%timeit 
prepare_word_counts1(docs_big)

In [None]:
%%timeit 
prepare_word_counts_with_dict(docs_big)

# Customising Vectorizer classes

- **preprocessor**: a callable that takes an entire document as input (as a single string), and returns a possibly transformed version of the document, still as an entire string. This can be used to remove HTML tags, lowercase the entire document, etc.


- **tokenizer**: a callable that takes the output from the preprocessor and splits it into tokens, then returns a list of these.


- **analyzer**: a callable that replaces the preprocessor and tokenizer. The default analyzers all call the preprocessor and tokenizer, but custom analyzers will skip this. N-gram extraction and stop word filtering take place at the analyzer level, so a custom analyzer may have to reproduce these steps.

##### Example of how to encode sparse matrix fast


Notice that in order to build our data as a matrix we need to use sparse matrices due to the high dimensionality (number of words/features) of the vocabulary.

Here is a small example that illustrates how to build a `csr_matrix` (compressed sparse row matrix) quickly.

In [None]:
X = np.array([[1,0,2],[2,1,0],[0,1,3]])
X

In [None]:
data = [1,2,2,1,1,3]
row  = [0,0,1,1,2,2]
# The CSR row pointer equivalent to `row` would be ind_ptr = [0,2,4,6] (two stored values per row)
col = [0,2,0,1,1,2]
sp.csr_matrix( (data,(row,col)), shape=(3,3) ).todense()

#### Exercise: Build a Simple countvectorizer

Complete methods `fit` and `transform`


In [None]:
X_train[4], y_train[4]

In [None]:
# "David's car"

In [None]:
import scipy
import scipy.sparse as sp
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from collections import defaultdict
import re
stemmer =  SnowballStemmer(language='english')

class SimpleCountVectorizer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Minimal bag-of-words vectorizer producing a sparse document/word count matrix.

    Every document goes through three steps: a document cleaner, a tokenizer
    and a per-word transformer (e.g. a stemmer). The resulting words are
    mapped to column positions through ``word_to_pos``.

    Parameters
    ----------
    min_word_counts : int, default 1
        Stored as given. None of the methods of this class use it.
    doc_cleaner_pattern : str or compiled pattern
        Every match is replaced by a single space before tokenizing.
        Ignored when ``doc_cleaner_func`` is given.
    token_pattern : str
        Regular expression whose matches are the tokens.
        Ignored when ``tokenizer_func`` is given.
    dtype : numpy dtype, default np.float32
        dtype of the matrices returned by ``transform`` and ``fit_transform``.
    doc_cleaner_func : callable or None
        Custom ``doc -> doc`` cleaner used instead of ``doc_cleaner_pattern``.
    tokenizer_func : callable or None
        Custom ``doc -> sequence of tokens`` used instead of ``token_pattern``.
    word_transformer_func : callable or None
        Custom ``word -> word`` function applied to every token.

    Attributes
    ----------
    word_to_pos : dict
        Maps each word seen while fitting to its column index.
    n_features : int
        Number of columns; set by ``fit`` / ``fit_transform``.
    """

    def __init__(self,
                 min_word_counts=1,
                 doc_cleaner_pattern=r"[^a-zA-Z]",
                 token_pattern=r"(?u)\b\w\w+\b",
                 dtype=np.float32,
                 doc_cleaner_func=None,
                 tokenizer_func=None,
                 word_transformer_func=None):

        # Type of a compiled regular expression (kept for backward compatibility).
        self._retype = type(re.compile('hello, world'))

        self.min_word_counts     = min_word_counts
        self.doc_cleaner_pattern = doc_cleaner_pattern
        self.token_pattern       = token_pattern
        self.dtype               = dtype

        self.doc_cleaner_func      = doc_cleaner_func
        self.tokenizer_func        = tokenizer_func
        self.word_transformer_func = word_transformer_func

        self.word_to_pos = {}

    def build_doc_cleaner(self, lower=True):
        """
        Returns a function that cleans undesirable substrings in a string.
        It also lowers the input string if lower=True
        """
        if self.doc_cleaner_func:
            return self.doc_cleaner_func

        # re.compile accepts both a pattern string and an already compiled
        # pattern, so no type check is needed here.
        clean_doc_pattern = re.compile(self.doc_cleaner_pattern)

        if lower:
            return lambda doc: clean_doc_pattern.sub(" ", doc).lower()
        return lambda doc: clean_doc_pattern.sub(" ", doc)

    def build_tokenizer(self):
        """Returns a function that splits a string into a sequence of tokens"""
        if self.tokenizer_func:
            return self.tokenizer_func

        token_pattern = re.compile(self.token_pattern)
        return lambda doc: token_pattern.findall(doc)

    def build_word_transformer(self):
        """Returns a stemmer or lemmatizer if object has any"""
        if self.word_transformer_func:
            return self.word_transformer_func
        return lambda word: word

    def tokenize(self, doc):
        """Cleans and tokenizes a single document (the word transformer is not applied)"""
        doc_cleaner   = self.build_doc_cleaner()
        doc_tokenizer = self.build_tokenizer()
        return doc_tokenizer(doc_cleaner(doc))

    def _build_analyzer(self):
        """Returns a function mapping a raw document to its list of transformed words"""
        doc_cleaner      = self.build_doc_cleaner()
        doc_tokenizer    = self.build_tokenizer()
        word_transformer = self.build_word_transformer()
        return lambda doc: [word_transformer(word) for word in doc_tokenizer(doc_cleaner(doc))]

    def _count_triplets(self, X, word_to_pos, grow_vocabulary):
        """
        Counts words per document and returns (values, rows, cols, n_docs).

        When grow_vocabulary is True unseen words are added to word_to_pos,
        otherwise words missing from word_to_pos are ignored.
        """
        analyze = self._build_analyzer()

        sp_data     = []
        row_indices = []
        col_indices = []
        n_docs      = 0

        for doc in X:
            word_ind_counter = defaultdict(int)  # column index -> count, for this document
            for word in analyze(doc):
                if grow_vocabulary:
                    pos = word_to_pos.setdefault(word, len(word_to_pos))
                elif word in word_to_pos:
                    pos = word_to_pos[word]
                else:
                    continue  # word not seen while fitting
                word_ind_counter[pos] += 1

            sp_data.extend(word_ind_counter.values())
            row_indices.extend([n_docs] * len(word_ind_counter))
            col_indices.extend(word_ind_counter.keys())
            n_docs += 1

        return sp_data, row_indices, col_indices, n_docs

    def fit(self, X, y=None):
        """
        Learns the word -> column mapping from a list of documents.

        y is accepted only so the signature matches the usual fit(X, y)
        convention; it is ignored. Returns self.
        """
        assert isinstance(X,list), "X is expected to be a list of documents"

        analyze = self._build_analyzer()
        word_to_pos = {}
        for doc in X:
            for word in analyze(doc):
                # Words get consecutive positions in order of first appearance.
                word_to_pos.setdefault(word, len(word_to_pos))

        self.word_to_pos = word_to_pos
        self.n_features = len(self.word_to_pos)
        return self

    def transform(self, X):
        """
        Implements a transform where counts are created at runtime and kept with a dict

        Words that were not seen while fitting are ignored. Returns a CSR
        matrix with one row per document, n_features columns and dtype
        self.dtype.
        """
        # Read first so an unfitted vectorizer fails before doing any work.
        n_features = self.n_features

        sp_data, row_indices, col_indices, n_docs = self._count_triplets(
            X, self.word_to_pos, grow_vocabulary=False)

        encoded_X = sp.csr_matrix((sp_data,(row_indices,col_indices)),
                                   shape=(n_docs, n_features),
                                   dtype=self.dtype)

        return encoded_X

    def fit_transform(self, X, y=None):
        """
        Learns the vocabulary and returns the count matrix of X in a single pass.

        The returned matrix has the same layout as the one produced by
        transform: explicit shape (documents x n_features), dtype self.dtype
        and one stored entry per (document, word) pair.
        """
        word_to_pos = {}
        sp_data, row_indices, col_indices, n_docs = self._count_triplets(
            X, word_to_pos, grow_vocabulary=True)

        self.word_to_pos = word_to_pos
        self.n_features = len(self.word_to_pos)

        # Explicit shape/dtype: without them the shape cannot be inferred when
        # no word was found, and the dtype would differ from transform's.
        X_transformed = sp.csr_matrix((sp_data,(row_indices,col_indices)),
                                       shape=(n_docs, self.n_features),
                                       dtype=self.dtype)
        return X_transformed

###  Training a document classifier with `SimpleCountVectorizer`

In [None]:
vainilla_count_vectorizer = SimpleCountVectorizer( doc_cleaner_func=lambda doc: doc)
vainilla_count_vectorizer.fit(X_train)

In [None]:
vainilla_count_vectorizer.transform(X_train)

In [None]:
X_train_ = vainilla_count_vectorizer.transform(X_train)
logistic = sklearn.linear_model.LogisticRegression(C=0.1, max_iter=50)
logistic.fit(X_train_, y_train)

In [None]:
np.mean(logistic.predict(X_train_) == y_train.flatten())

#### I) No Stemmer and no doc_cleaner

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vainilla_count_vectorizer = CountVectorizer()

logistic = sklearn.linear_model.LogisticRegression(C=0.1)

model_pipe_0 = sklearn.pipeline.Pipeline([("countvectorizer", vainilla_count_vectorizer),
                                         ("logisticregression", logistic)])

In [None]:
model_pipe_0.steps

In [None]:
model_pipe_0.fit(X_train[0:100],y_train[0:100])

In [None]:
model_pipe_0.predict(X_train[0:100])

In [None]:
%%time
model_pipe_0.fit(X_train,y_train)
y_test_pred  = model_pipe_0.predict(X_test)
y_train_pred = model_pipe_0.predict(X_train)

In [None]:
model_pipe_0.steps[0][1].transform(X_train)

In [None]:
acc_train_0 = np.mean(y_train == y_train_pred)
acc_test_0 = np.mean(y_test == y_test_pred)
print("Accuracy train: {}    Accuracy test: {}".format(acc_train_0, acc_test_0))

#### II) No stemmer but doc_cleaner

In [None]:
simple_count_vectorizer = SimpleCountVectorizer(doc_cleaner_pattern=re.compile("[^a-zA-Z]"))
logistic = sklearn.linear_model.LogisticRegression(C=0.1)

model_pipe_1 = sklearn.pipeline.Pipeline([("countvectorizer", simple_count_vectorizer),
                                        ("logisticregression", logistic)])

In [None]:
%%time
model_pipe_1.fit(X_train,y_train)

In [None]:
y_test_pred  = model_pipe_1.predict(X_test)
y_train_pred = model_pipe_1.predict(X_train)

acc_train_1 = np.mean(y_train == y_train_pred)
acc_test_1 = np.mean(y_test == y_test_pred)

print("Accuracy train: {}    Accuracy test: {}".format(acc_train_1, acc_test_1))

#### III) Use a SnowballStemmer

In [None]:
simple_count_vectorizer_stemmer = SimpleCountVectorizer(word_transformer_func= SnowballStemmer('english').stem,
                                                        doc_cleaner_pattern=re.compile("[^a-zA-Z]"))

logistic = sklearn.linear_model.LogisticRegression(C=0.1)

model_pipe_2 = sklearn.pipeline.Pipeline([("countvectorizer", simple_count_vectorizer_stemmer),
                                        ("logisticregression", logistic)],
                                         )#memory='/Users/Shared/sklearn_mem/')

In [None]:
%%time
model_pipe_2.fit(X_train,y_train)

y_test_pred  = model_pipe_2.predict(X_test)
y_train_pred = model_pipe_2.predict(X_train)

acc_train_2 = np.mean(y_train == y_train_pred)
acc_test_2  = np.mean(y_test == y_test_pred)

print("Accuracy train: {}    Accuracy test: {}".format(acc_train_2, acc_test_2))

#### Table with results for each pipeline

In [None]:
import pandas as pd

In [None]:
# Accuracy table: one column per pipeline, one row per data split.
df_results = pd.DataFrame(
    {
        "no clean no stem":   [acc_train_0, acc_test_0],
        "yes clean no stem":  [acc_train_1, acc_test_1],
        "yes clean yes stem": [acc_train_2, acc_test_2],
    },
    index=["train", "test"],
)

In [None]:
df_results


###   Ngram features with Sklearn vectorizer


####  IV) Training a document classifier with sklearn `CountVectorizer`

In [None]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
logistic = sklearn.linear_model.LogisticRegression(C=0.1)

model_pipe_3 = sklearn.pipeline.Pipeline([("countvectorizer", count_vectorizer),
                                          ("logisticregression", logistic)])


In [None]:
%%time
model_pipe_3.fit(X_train,y_train)

y_test_pred  = model_pipe_3.predict(X_test)
y_train_pred = model_pipe_3.predict(X_train)

acc_train_3 = np.mean(y_train == y_train_pred)
acc_test_3  = np.mean(y_test == y_test_pred)

print("Accuracy train: {}    Accuracy test: {}".format(acc_train_3, acc_test_3))

In [None]:
df_results["sklearn countvectorizer"] = [acc_train_3, acc_test_3]

In [None]:
df_results

#### V) Training a document classifier with sklearn `CountVectorizer` and ngrams

In [None]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,2))
logistic = sklearn.linear_model.LogisticRegression(C=0.1)

model_pipe_4 = sklearn.pipeline.Pipeline([("countvectorizer", count_vectorizer),
                                          ("logisticregression", logistic)])

In [None]:
%%time
model_pipe_4.fit(X_train,y_train)

y_test_pred  = model_pipe_4.predict(X_test)
y_train_pred = model_pipe_4.predict(X_train)

acc_train_4 = np.mean(y_train == y_train_pred)
acc_test_4  = np.mean(y_test == y_test_pred)

print("Accuracy train: {}    Accuracy test: {}".format(acc_train_4, acc_test_4))

In [None]:
model_pipe_4.steps[0][1].transform(X_train[0:1])

In [None]:
df_results["sklearn countvectorizer 2gram"] = [acc_train_4, acc_test_4]

In [None]:
df_results

In [None]:
%matplotlib inline
df_results.T["test"].plot(kind="barh", xlim=(0.79,0.83))

##  Feature selection

###  SelectKbest 

In [None]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,2))
feature_selector = SelectKBest(chi2, k = 700000)
logistic = sklearn.linear_model.LogisticRegression(C=0.1)

model_pipe_5 = sklearn.pipeline.Pipeline([("count_vectorizer", count_vectorizer),
                                          ("feature_selector", feature_selector),
                                          ("logisticregression", logistic)])

In [None]:
%%time
model_pipe_5.fit(X_train, y_train)

In [None]:
acc_train = np.mean(model_pipe_5.predict(X_train) == y_train)
acc_test = np.mean(model_pipe_5.predict(X_test) == y_test)
df_results["sklearn countvectorizer 2gram + selection"] = [acc_train, acc_test]

In [None]:
df_results.T["test"].plot(kind="barh", xlim=(0.79,0.83))

###  Feature Union

In [None]:
simple_count_vectorizer_stemmer = SimpleCountVectorizer(word_transformer_func= SnowballStemmer('english').stem,
                                                        doc_cleaner_pattern=re.compile("[^a-zA-Z]"))

count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,2))

In [None]:
union = sklearn.pipeline.FeatureUnion([("simple_count_vectorizer_stemmer", simple_count_vectorizer_stemmer),
                                       ("count_vectorizer", count_vectorizer)])

In [None]:
logistic = sklearn.linear_model.LogisticRegression(C=0.1)
feature_selector = SelectKBest(chi2, k = 700000)
model_pipe_6 = sklearn.pipeline.Pipeline([("union_vectorizers", union),
                                          ("feature_selector", feature_selector),
                                          ("logisticregression", logistic)])

In [None]:
%%time
model_pipe_6.fit(X_train, y_train)

In [None]:
acc_train = np.mean(model_pipe_6.predict(X_train) == y_train)
acc_test = np.mean(model_pipe_6.predict(X_test) == y_test)
df_results["Feature union + selection"] = [acc_train, acc_test]

In [None]:
df_results.T["test"].plot(kind="barh", xlim=(0.79,0.83))