# Cleanup Pipeline 3

Adding bigrams to custom **DocumentToWordCounterTransformer** class.

---

In [1]:
import re
import os
import time
import json

import numpy as np
import pandas as pd

import urlextract
from nltk import ngrams
from nltk.stem import WordNetLemmatizer

import cleanup_module as Cmod
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split

---

### Hybrid approach

In [2]:
from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

def expand_contractions(text, contractions_map):
    
    pattern = re.compile('({})'.format('|'.join(contractions_map.keys())), 
                        flags=re.IGNORECASE|re.DOTALL)
    
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_map.get(match)\
                                if contractions_map.get(match)\
                                else contractions_map.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def is_ascii(doc):
    try:
        doc.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [3]:
class DocumentToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, expand_contractions=True, lower_case=True, 
                 replace_usernames=True, unescape_html=True, 
                 replace_urls=True, replace_numbers=True, 
                 remove_junk=True, remove_punctuation=True, 
                 replace_emojis=True, replace_nonascii=True, 
                 remove_stopwords=True, lemmatization=True,
                 bigrams=True):
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation
        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization
        self.bigrams = bigrams
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for doc in X:
            if self.lower_case:
                doc = doc.lower()
            if self.expand_contractions and contractions_map is not None:
                doc = expand_contractions(doc, contractions_map)
            if self.replace_usernames:
                doc = re.sub(r'^@([^\s]+)',' USERNAME ', doc)
            if self.unescape_html:
                doc = unescape(doc)
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(doc)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    doc = doc.replace(url, ' URL ')
            if self.replace_numbers:
                doc = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', doc)
            if self.remove_junk:
                pattern = r'\¥|\â|\«|\»|\Ñ|\Ð|\¼|\½|\¾|\!|\?|\¿|\x82\
                            |\x83|\x84|\x85|\x86|\x87|\x88|\x89|\
                            |\x8a|\x8b|\x8c|\x8d|\x8e|\°|\µ|\´|\º|\¹|\³'
                doc = re.sub(pattern,'', doc)
            if self.remove_punctuation:
                doc = re.sub(r'\W+', ' ', doc, flags=re.M)
            if self.replace_emojis:
                doc = re.sub(r'[^\x00-\x7F]+', ' EMOJI ', doc)
            if self.replace_nonascii:
                if is_ascii(doc) == False:
                    doc = ' NONASCII '
            # tokenize
            tokens = doc.split()
            if self.remove_stopwords:
                stop_words = ['a','an','and','are','as','at','be','by','for','from',
                              'has','he','in','is','it','its','of','on','that','the',
                              'to','was','were','will','with']
                tokens = [t for t in tokens if t not in stop_words]
            if self.lemmatization and lemmatizer is not None:
                unigrams = [lemmatizer.lemmatize(t) for t in tokens]
            if self.bigrams:
                bigrams = ngrams(word_tokenize(doc), 2)
                bigrams = ['_'.join(grams) for grams in bigrams]
                tokens = [*tokens, *bigrams]       
            # include counts
            tokens_counts = Counter(tokens)
            # append to list
            X_transformed.append(tokens_counts)
        return np.array(X_transformed)

In [4]:
with open("contractions_map.json") as f:
    contractions_map = json.load(f)

url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [5]:
corpus = ['You love me', 
          'You do not love me',
          'You really really love food']

In [6]:
wordvec = DocumentToWordCounterTransformer()
X_trans = wordvec.fit_transform(corpus)

In [7]:
X_trans

array([Counter({'you': 1, 'love': 1, 'me': 1, 'you_love': 1, 'love_me': 1}),
       Counter({'you': 1, 'do': 1, 'not': 1, 'love': 1, 'me': 1, 'you_do': 1, 'do_not': 1, 'not_love': 1, 'love_me': 1}),
       Counter({'really': 2, 'you': 1, 'love': 1, 'food': 1, 'you_really': 1, 'really_really': 1, 'really_love': 1, 'love_food': 1})],
      dtype=object)

In [8]:
pipe = Pipeline([('counter', DocumentToWordCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=20)),
                 ('tfidf', TfidfTransformer())])

In [9]:
pipe_counter = pipe['counter'].fit_transform(corpus)

In [10]:
pipe_bow = pipe['bow'].fit(pipe_counter)

In [11]:
pipe_bow.vocabulary_

{'you': 1,
 'love': 2,
 'me': 3,
 'love_me': 4,
 'really': 5,
 'you_love': 6,
 'do': 7,
 'not': 8,
 'you_do': 9,
 'do_not': 10,
 'not_love': 11,
 'food': 12,
 'you_really': 13,
 'really_really': 14,
 'really_love': 15,
 'love_food': 16}

In [12]:
bow = pipe_bow.transform(pipe_counter)

In [13]:
bow.toarray() # first col is "words missing from vocab"

array([[0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0]],
      dtype=int32)

In [14]:
 # IDF for the pipe_bow.vocabulary_
[np.around(x,3) for x in pipe['tfidf'].fit(bow).idf_[1:]]

[1.0,
 1.0,
 1.288,
 1.288,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 2.386,
 2.386,
 2.386,
 2.386]

In [15]:
tfidf = pipe['tfidf'].fit_transform(bow)

In [16]:
[np.around(x,4) for x in tfidf.toarray()]

[array([0.    , 0.3496, 0.3496, 0.4501, 0.4501, 0.    , 0.5919, 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.2256, 0.2256, 0.2905, 0.2905, 0.    , 0.    , 0.382 ,
        0.382 , 0.382 , 0.382 , 0.382 , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.1897, 0.1897, 0.    , 0.    , 0.6422, 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.3211, 0.3211, 0.3211, 0.3211,
        0.3211, 0.    , 0.    , 0.    , 0.    ])]

In [17]:
# entire pipeline produces same result but does't save IDF or vocab
end_res = pipe.fit_transform(corpus)
[np.around(x,4) for x in end_res.toarray()]

[array([0.    , 0.3496, 0.3496, 0.4501, 0.4501, 0.    , 0.5919, 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.2256, 0.2256, 0.2905, 0.2905, 0.    , 0.    , 0.382 ,
        0.382 , 0.382 , 0.382 , 0.382 , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.1897, 0.1897, 0.    , 0.    , 0.6422, 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.3211, 0.3211, 0.3211, 0.3211,
        0.3211, 0.    , 0.    , 0.    , 0.    ])]

### POC: sample 10% of the training data

About 120,000 instances.

In [18]:
# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

# sample 10%
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

# create array
X_array = np.array(X.iloc[:, 2]).ravel()

In [19]:
X_array.shape

(119747,)

In [20]:
# full pipe
pipe = Pipeline([('counter', DocumentToWordCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=1000)),
                 ('tfidf', TfidfTransformer())])

X_end = pipe.fit_transform(X_array)

In [21]:
X_end

<119747x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 1279478 stored elements in Compressed Sparse Row format>

In [22]:
[np.around(x,3) for x in X_end.toarray()[:10,:6]]

[array([0.636, 0.117, 0.   , 0.   , 0.   , 0.   ]),
 array([0.727, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.816, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.706, 0.   , 0.085, 0.   , 0.   , 0.   ]),
 array([0.728, 0.1  , 0.049, 0.078, 0.   , 0.   ]),
 array([0.318, 0.118, 0.   , 0.   , 0.   , 0.   ]),
 array([0.568, 0.087, 0.086, 0.138, 0.   , 0.   ]),
 array([0.594, 0.   , 0.051, 0.   , 0.   , 0.   ]),
 array([0.872, 0.052, 0.   , 0.   , 0.076, 0.   ]),
 array([0.583, 0.067, 0.   , 0.106, 0.   , 0.   ])]

In [23]:
# step by step for vocab and idf
pipe_counter = pipe['counter'].fit_transform(X_array)

In [24]:
pipe_bow = pipe['bow'].fit(pipe_counter) 
pipe_bow.vocabulary_size

1000

In [25]:
for ix, w in enumerate(pipe_bow.vocabulary_.items()):
    if ix < 10:
        print(w)

('i', 1)
('USERNAME', 2)
('NUMBER', 3)
('my', 4)
('you', 5)
('not', 6)
('am', 7)
('have', 8)
('i_am', 9)
('me', 10)


In [26]:
bow = pipe_bow.transform(pipe_counter)

In [27]:
bow.toarray()[:10] # misses too many words?

array([[10,  1,  0, ...,  0,  0,  0],
       [ 7,  0,  0, ...,  0,  0,  0],
       [ 9,  0,  0, ...,  0,  0,  0],
       ...,
       [21,  0,  1, ...,  0,  0,  0],
       [31,  1,  0, ...,  0,  0,  0],
       [16,  1,  0, ...,  0,  0,  0]], dtype=int32)

In [28]:
[np.around(x,4) for x in pipe['tfidf'].fit(bow).idf_[:10]] # IDFs

[1.0094, 1.8634, 1.8303, 2.935, 2.7413, 2.866, 2.9798, 3.2305, 3.2467, 3.3431]

In [29]:
tfidf = pipe['tfidf'].fit_transform(bow)

In [30]:
tfidf

<119747x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 1279478 stored elements in Compressed Sparse Row format>

In [31]:
[np.around(x,3) for x in tfidf.toarray()[:10,:6]] 

[array([0.636, 0.117, 0.   , 0.   , 0.   , 0.   ]),
 array([0.727, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.816, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.706, 0.   , 0.085, 0.   , 0.   , 0.   ]),
 array([0.728, 0.1  , 0.049, 0.078, 0.   , 0.   ]),
 array([0.318, 0.118, 0.   , 0.   , 0.   , 0.   ]),
 array([0.568, 0.087, 0.086, 0.138, 0.   , 0.   ]),
 array([0.594, 0.   , 0.051, 0.   , 0.   , 0.   ]),
 array([0.872, 0.052, 0.   , 0.   , 0.076, 0.   ]),
 array([0.583, 0.067, 0.   , 0.106, 0.   , 0.   ])]

### Train and evaluate couple quick models

In [32]:
pipe = Pipeline([('counter', DocumentToWordCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=50000)), # more realistic vocabulary size
                 ('tfidf', TfidfTransformer())])

In [33]:
X1 = pipe['counter'].fit_transform(X_array)

In [34]:
X2 = pipe['bow'].fit(X1)

In [35]:
for i, w in enumerate(X2.vocabulary_):
    if i > 49990:
        print(i, w)

49991 probably_already
49992 booked_the
49993 gambling
49994 hair_lol
49995 lol_xoxo
49996 so_watching
49997 an_age
49998 drove_by
49999 the_fat


In [36]:
X3 = pipe['bow'].fit_transform(X1)

In [37]:
X3 # BoW

<119747x50001 sparse matrix of type '<class 'numpy.int32'>'
	with 2224008 stored elements in Compressed Sparse Row format>

In [38]:
X4 = pipe['tfidf'].fit_transform(X3) # Tfidf

In [39]:
y_array = y.iloc[:,0].ravel()

### Naive Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

NB_clf = MultinomialNB()

# BoW with bigrams
score = cross_val_score(NB_clf, X3, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.7885 (+/- 0.0018)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished


In [41]:
# Tfidf with bigrams
score = cross_val_score(NB_clf, X4, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.7911 (+/- 0.0028)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


### Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# BoW with bigrams
score = cross_val_score(log_clf, X3, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.7899 (+/- 0.0023)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.4min finished


In [43]:
# Tfidf with bigrams
score = cross_val_score(log_clf, X4, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.8008 (+/- 0.0025)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.2s finished


---