# Cleanup Pipeline 3

Adding ngrams to custom **DocumentToWordCounterTransformer** class.

---

In [1]:
import re
import os
import time
import json

import numpy as np
import pandas as pd

import urlextract
from html import unescape

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import cleanup_module as Cmod

from scipy.sparse import csr_matrix
from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split

---

### Hybrid approach

In [2]:
def expand_contractions(text, contractions_map):
    
    pattern = re.compile('({})'.format('|'.join(contractions_map.keys())), 
                        flags=re.IGNORECASE|re.DOTALL)
    
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_map.get(match)\
                                if contractions_map.get(match)\
                                else contractions_map.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def is_ascii(doc):
    try:
        doc.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [3]:
class DocumentToNgramCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, expand_contractions=True, lower_case=True, 
                 replace_usernames=True, unescape_html=True, 
                 replace_urls=True, replace_numbers=True, 
                 remove_junk=True, remove_punctuation=True, 
                 replace_emojis=True, replace_nonascii=True, 
                 remove_stopwords=True, lemmatization=True,
                 n_grams=2 # defaults to bigram
                ): 
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation
        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization
        self.n_grams = n_grams
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for doc in X:
            if self.lower_case:
                doc = doc.lower()
            if self.expand_contractions and contractions_map is not None:
                doc = expand_contractions(doc, contractions_map)
            if self.replace_usernames:
                doc = re.sub(r'^@([^\s]+)',' USERNAME ', doc)
            if self.unescape_html:
                doc = unescape(doc)
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(doc)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    doc = doc.replace(url, ' URL ')
            if self.replace_numbers:
                doc = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', doc)
            if self.remove_junk:
                pattern = r'\¥|\â|\«|\»|\Ñ|\Ð|\¼|\½|\¾|\!|\?|\¿|\x82\
                            |\x83|\x84|\x85|\x86|\x87|\x88|\x89|\
                            |\x8a|\x8b|\x8c|\x8d|\x8e|\°|\µ|\´|\º|\¹|\³'
                doc = re.sub(pattern,'', doc)
            if self.remove_punctuation:
                doc = re.sub(r'\W+', ' ', doc, flags=re.M)
            if self.replace_emojis:
                doc = re.sub(r'[^\x00-\x7F]+', ' EMOJI ', doc)
            if self.replace_nonascii:
                if is_ascii(doc) == False:
                    doc = ' NONASCII '
            # tokenize
            tokens = doc.split()
            if self.remove_stopwords:
                stop_words = ['a','an','and','are','as','at','be','by','for','from',
                              'has','he','in','is','it','its','of','on','that','the',
                              'to','was','were','will','with']
                tokens = [t for t in tokens if t not in stop_words]
            if self.lemmatization and lemmatizer is not None:
                unigrams = [lemmatizer.lemmatize(t) for t in tokens]
            if self.n_grams:
                for i in range(self.n_grams+1):
                    grams = ngrams(word_tokenize(doc), i)
                    grams = ['_'.join(gram) for gram in grams]
                    tokens = [*tokens, *grams]
            # include counts
            tokens_counts = Counter(tokens)
            # append to list
            X_transformed.append(tokens_counts)
        return np.array(X_transformed)

In [4]:
with open("contractions_map.json") as f:
    contractions_map = json.load(f)

url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [5]:
corpus = ['You love me', 
          'You do not love me',
          'You really really love food']

In [6]:
wordvec = DocumentToNgramCounterTransformer(n_grams=3)
X_trans = wordvec.fit_transform(corpus)

In [7]:
X_trans

array([Counter({'you': 3, 'love': 3, 'me': 3, 'you_love': 1, 'love_me': 1, 'you_love_me': 1}),
       Counter({'you': 3, 'do': 3, 'not': 3, 'love': 3, 'me': 3, 'you_do': 1, 'do_not': 1, 'not_love': 1, 'love_me': 1, 'you_do_not': 1, 'do_not_love': 1, 'not_love_me': 1}),
       Counter({'really': 6, 'you': 3, 'love': 3, 'food': 3, 'you_really': 1, 'really_really': 1, 'really_love': 1, 'love_food': 1, 'you_really_really': 1, 'really_really_love': 1, 'really_love_food': 1})],
      dtype=object)

In [8]:
pipe = Pipeline([('counter', DocumentToNgramCounterTransformer(n_grams=3)),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=20)),
                 ('tfidf', TfidfTransformer())])

In [9]:
pipe_counter = pipe['counter'].fit_transform(corpus)

In [10]:
pipe_bow = pipe['bow'].fit(pipe_counter)

In [11]:
pipe_bow.vocabulary_

{'you': 1,
 'love': 2,
 'me': 3,
 'really': 4,
 'do': 5,
 'not': 6,
 'food': 7,
 'love_me': 8,
 'you_love': 9,
 'you_love_me': 10,
 'you_do': 11,
 'do_not': 12,
 'not_love': 13,
 'you_do_not': 14,
 'do_not_love': 15,
 'not_love_me': 16,
 'you_really': 17,
 'really_really': 18,
 'really_love': 19,
 'love_food': 20}

In [12]:
bow = pipe_bow.transform(pipe_counter)

In [13]:
bow.toarray() # first col is "words missing from vocab"

array([[0, 3, 3, 3, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 3, 3, 3, 0, 3, 3, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [3, 3, 3, 0, 6, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]],
      dtype=int32)

In [14]:
 # IDF for the pipe_bow.vocabulary_
[np.around(x,3) for x in pipe['tfidf'].fit(bow).idf_[1:]]

[1.0,
 1.0,
 1.288,
 1.693,
 1.693,
 1.693,
 1.693,
 1.288,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693]

In [15]:
tfidf = pipe['tfidf'].fit_transform(bow)

In [16]:
[np.around(x,4) for x in tfidf.toarray()]

[array([0.    , 0.4725, 0.4725, 0.6084, 0.    , 0.    , 0.    , 0.    ,
        0.2028, 0.2667, 0.2667, 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.2951, 0.2951, 0.3799, 0.    , 0.4996, 0.4996, 0.    ,
        0.1266, 0.    , 0.    , 0.1665, 0.1665, 0.1665, 0.1665, 0.1665,
        0.1665, 0.    , 0.    , 0.    , 0.    ]),
 array([0.3742, 0.221 , 0.221 , 0.    , 0.7484, 0.    , 0.    , 0.3742,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.1247, 0.1247, 0.1247, 0.1247])]

In [17]:
# entire pipeline produces same result but does't save IDF or vocab
end_res = pipe.fit_transform(corpus)
[np.around(x,4) for x in end_res.toarray()]

[array([0.    , 0.4725, 0.4725, 0.6084, 0.    , 0.    , 0.    , 0.    ,
        0.2028, 0.2667, 0.2667, 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.2951, 0.2951, 0.3799, 0.    , 0.4996, 0.4996, 0.    ,
        0.1266, 0.    , 0.    , 0.1665, 0.1665, 0.1665, 0.1665, 0.1665,
        0.1665, 0.    , 0.    , 0.    , 0.    ]),
 array([0.3742, 0.221 , 0.221 , 0.    , 0.7484, 0.    , 0.    , 0.3742,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.1247, 0.1247, 0.1247, 0.1247])]

### POC: sample 10% of the training data

About 120,000 instances.

In [18]:
# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

# sample 10%
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

# create array
X_array = np.array(X.iloc[:, 2]).ravel()

In [19]:
X_array.shape

(119747,)

In [20]:
# full pipe
pipe = Pipeline([('counter', DocumentToNgramCounterTransformer()), # bigrams
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=1000)),
                 ('tfidf', TfidfTransformer())])

In [21]:
start_time = time.time()

X_end = pipe.fit_transform(X_array)

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} min {secs:0.0f} sec')

Elapsed: 2 min 55 sec


In [22]:
X_end

<119747x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 1539899 stored elements in Compressed Sparse Row format>

In [23]:
[np.around(x,3) for x in X_end.toarray()[:10,:6]]

[array([0.487, 0.169, 0.   , 0.137, 0.   , 0.   ]),
 array([0.501, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.653, 0.   , 0.   , 0.   , 0.177, 0.   ]),
 array([0.503, 0.   , 0.114, 0.094, 0.   , 0.   ]),
 array([0.481, 0.141, 0.069, 0.171, 0.   , 0.111]),
 array([0.098, 0.136, 0.   , 0.11 , 0.   , 0.   ]),
 array([0.311, 0.115, 0.113, 0.   , 0.   , 0.181]),
 array([0.332, 0.   , 0.065, 0.053, 0.   , 0.   ]),
 array([0.61 , 0.077, 0.   , 0.125, 0.   , 0.   ]),
 array([0.398, 0.092, 0.   , 0.149, 0.   , 0.145])]

In [24]:
# step by step for vocab and idf
start_time = time.time()

pipe_counter = pipe['counter'].fit_transform(X_array)

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} min {secs:0.0f} sec')

Elapsed: 2 min 48 sec


In [25]:
start_time = time.time()

pipe_bow = pipe['bow'].fit(pipe_counter) 

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} min {secs:0.0f} sec')

Elapsed: 0 min 4 sec


In [26]:
pipe_bow.vocabulary_size

1000

In [27]:
for ix, w in enumerate(pipe_bow.vocabulary_.items()):
    if ix < 10:
        print(w)

('i', 1)
('USERNAME', 2)
('to', 3)
('the', 4)
('NUMBER', 5)
('my', 6)
('you', 7)
('not', 8)
('a', 9)
('is', 10)


In [28]:
start_time = time.time()

bow = pipe_bow.transform(pipe_counter)

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} min {secs:0.0f} sec')

Elapsed: 0 min 4 sec


In [29]:
bow.toarray()[:10] # misses too many words?

array([[16,  3,  0, ...,  0,  0,  0],
       [13,  0,  0, ...,  0,  0,  0],
       [17,  0,  0, ...,  0,  0,  0],
       ...,
       [28,  0,  3, ...,  0,  0,  0],
       [44,  3,  0, ...,  0,  0,  0],
       [24,  3,  0, ...,  0,  0,  0]], dtype=int32)

In [30]:
[np.around(x,4) for x in pipe['tfidf'].fit(bow).idf_[:10]] # IDFs

[1.0062, 1.8634, 1.8303, 2.2596, 2.32, 2.935, 2.7413, 2.866, 2.8091, 2.5682]

In [31]:
start_time = time.time()

tfidf = pipe['tfidf'].fit_transform(bow)

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} min {secs:0.0f} sec')

Elapsed: 0 min 0 sec


In [32]:
tfidf

<119747x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 1539899 stored elements in Compressed Sparse Row format>

In [33]:
[np.around(x,3) for x in tfidf.toarray()[:10,:6]] 

[array([0.487, 0.169, 0.   , 0.137, 0.   , 0.   ]),
 array([0.501, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.653, 0.   , 0.   , 0.   , 0.177, 0.   ]),
 array([0.503, 0.   , 0.114, 0.094, 0.   , 0.   ]),
 array([0.481, 0.141, 0.069, 0.171, 0.   , 0.111]),
 array([0.098, 0.136, 0.   , 0.11 , 0.   , 0.   ]),
 array([0.311, 0.115, 0.113, 0.   , 0.   , 0.181]),
 array([0.332, 0.   , 0.065, 0.053, 0.   , 0.   ]),
 array([0.61 , 0.077, 0.   , 0.125, 0.   , 0.   ]),
 array([0.398, 0.092, 0.   , 0.149, 0.   , 0.145])]

### Train and evaluate couple quick models on bigrams

In [34]:
pipe = Pipeline([('counter', DocumentToNgramCounterTransformer()), # bigrams
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=50000)), # more realistic vocab size
                 ('tfidf', TfidfTransformer())])

In [35]:
start_time = time.time()

X1 = pipe['counter'].fit_transform(X_array)

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} min {secs:0.0f} sec')

Elapsed: 2 min 49 sec


In [36]:
X2 = pipe['bow'].fit(X1)

In [37]:
for i, w in enumerate(X2.vocabulary_):
    if i > 49980:
        print(i, w)

49981 money_but
49982 brut
49983 kats
49984 wuzzup
49985 cutiest
49986 thers
49987 scarecrow
49988 cleanliness
49989 twitterwall
49990 shuttles
49991 rolla
49992 gardner
49993 surgury
49994 nozzle
49995 printers
49996 entertainer
49997 stephy
49998 selfridges
49999 peppermint


In [38]:
X3 = pipe['bow'].fit_transform(X1) # BoW

In [39]:
X4 = pipe['tfidf'].fit_transform(X3) # Tfidf

In [40]:
y_array = y.iloc[:,0].ravel()

### Naive Bayes

In [41]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

NB_clf = MultinomialNB()

# BoW with bigrams
score = cross_val_score(NB_clf, X3, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.7812 (+/- 0.0032)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [42]:
# Tfidf with bigrams
score = cross_val_score(NB_clf, X4, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.7835 (+/- 0.0035)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


### Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# BoW with bigrams
score = cross_val_score(log_clf, X3, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.6min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.3min finished


Accuracy: 0.7786 (+/- 0.0033)


In [44]:
# Tfidf with bigrams
score = cross_val_score(log_clf, X4, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    5.2s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.7s finished


Accuracy: 0.7973 (+/- 0.0032)


---