# Cleanup Pipeline 3

*Purpose*
- adding bigrams to custom **DocumentToWordCounterTransformer** class. Note that this implementation keeps unigrams. TF-IDF is then performed on top of this Bag-of-upto-Bigrams representation.

*Issues*
- unable to remove stopwords from bigrams.
- unable to select bigrams only option.

*Results*
- using the new **DocumentToBigramCounterTransformer** class on 10% of the training data (120k instances) and a vocabulary of 50k tokens (so 50k features) yields the following accuracies with 10-fold cross-validation:

| Model | Representation | Accuracy | Variance |
|:---|:---|:---|:---|
|Naive Bayes |Bag-of-upto-Bigrams | 0.7890 |(+/- 0.0022)|
|Naive Bayes |BoB + TF-IDF| 0.7908 |(+/- 0.0024)|
|Logistic Regr. |Bag-of-upto-Bigrams  | 0.7895 |(+/- 0.0021)|
|Logistic Regr. |BoB + TF-IDF| 0.8019 |(+/- 0.0024)|

---

In [2]:
import re
import os
import time
import json

import numpy as np
import pandas as pd

import urlextract
from nltk import ngrams
from nltk.stem import WordNetLemmatizer

import cleanup_module as Cmod
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split

---

### Hybrid approach

In [3]:
from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

def expand_contractions(text, contractions_map):
    
    pattern = re.compile('({})'.format('|'.join(contractions_map.keys())), 
                        flags=re.IGNORECASE|re.DOTALL)
    
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_map.get(match)\
                                if contractions_map.get(match)\
                                else contractions_map.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def is_ascii(doc):
    try:
        doc.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [4]:
class DocumentToBigramCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, expand_contractions=True, lower_case=True, 
                 replace_usernames=True, unescape_html=True, 
                 replace_urls=True, replace_numbers=True, 
                 remove_junk=True, remove_punctuation=True, 
                 replace_emojis=True, replace_nonascii=True, 
                 remove_stopwords=True, lemmatization=True,
                 bigrams=True):
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation
        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization
        self.bigrams = bigrams
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for doc in X:
            if self.lower_case:
                doc = doc.lower()
            if self.expand_contractions and contractions_map is not None:
                doc = expand_contractions(doc, contractions_map)
            if self.replace_usernames:
                doc = re.sub(r'^@([^\s]+)',' USERNAME ', doc)
            if self.unescape_html:
                doc = unescape(doc)
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(doc)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    doc = doc.replace(url, ' URL ')
            if self.replace_numbers:
                doc = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', doc)
            if self.remove_junk:
                pattern = r'\¥|\â|\«|\»|\Ñ|\Ð|\¼|\½|\¾|\!|\?|\¿|\x82\
                            |\x83|\x84|\x85|\x86|\x87|\x88|\x89|\
                            |\x8a|\x8b|\x8c|\x8d|\x8e|\°|\µ|\´|\º|\¹|\³'
                doc = re.sub(pattern,'', doc)
            if self.remove_punctuation:
                doc = re.sub(r'\W+', ' ', doc, flags=re.M)
            if self.replace_emojis:
                doc = re.sub(r'[^\x00-\x7F]+', ' EMOJI ', doc)
            if self.replace_nonascii:
                if is_ascii(doc) == False:
                    doc = ' NONASCII '
            # tokenize
            tokens = doc.split()
            if self.remove_stopwords:
                stop_words = ['a','an','and','are','as','at','be','by','for','from',
                              'has','he','in','is','it','its','of','on','that','the',
                              'to','was','were','will','with']
                tokens = [t for t in tokens if t not in stop_words]
            if self.lemmatization and lemmatizer is not None:
                tokens = [lemmatizer.lemmatize(t) for t in tokens]
            if self.bigrams:
                bigrams = ngrams(word_tokenize(doc), 2)
                bigrams = ['_'.join(grams) for grams in bigrams]
                tokens = [*tokens, *bigrams]
            # include counts
            tokens_counts = Counter(tokens)
            # append to list
            X_transformed.append(tokens_counts)
        return np.array(X_transformed)

In [5]:
with open("contractions_map.json") as f:
    contractions_map = json.load(f)

url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [9]:
corpus = ['You the love me', 
          'You do not love me',
          'You really really love food']

In [10]:
wordvec = DocumentToBigramCounterTransformer()
X_trans = wordvec.fit_transform(corpus)

In [11]:
X_trans

array([Counter({'you': 1, 'love': 1, 'me': 1, 'you_the': 1, 'the_love': 1, 'love_me': 1}),
       Counter({'you': 1, 'do': 1, 'not': 1, 'love': 1, 'me': 1, 'you_do': 1, 'do_not': 1, 'not_love': 1, 'love_me': 1}),
       Counter({'really': 2, 'you': 1, 'love': 1, 'food': 1, 'you_really': 1, 'really_really': 1, 'really_love': 1, 'love_food': 1})],
      dtype=object)

In [12]:
pipe = Pipeline([('counter', DocumentToBigramCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=20)),
                 ('tfidf', TfidfTransformer())])

In [13]:
pipe_counter = pipe['counter'].fit_transform(corpus)

In [14]:
pipe_bow = pipe['bow'].fit(pipe_counter)

In [15]:
pipe_bow.vocabulary_

{'you': 1,
 'love': 2,
 'me': 3,
 'love_me': 4,
 'really': 5,
 'you_the': 6,
 'the_love': 7,
 'do': 8,
 'not': 9,
 'you_do': 10,
 'do_not': 11,
 'not_love': 12,
 'food': 13,
 'you_really': 14,
 'really_really': 15,
 'really_love': 16,
 'love_food': 17}

In [16]:
bow = pipe_bow.transform(pipe_counter)

In [17]:
bow.toarray() # first col is "words missing from vocab"

array([[0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0]],
      dtype=int32)

In [18]:
 # IDF for the pipe_bow.vocabulary_
[np.around(x,3) for x in pipe['tfidf'].fit(bow).idf_[1:]]

[1.0,
 1.0,
 1.288,
 1.288,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 1.693,
 2.386,
 2.386,
 2.386]

In [19]:
tfidf = pipe['tfidf'].fit_transform(bow)

In [20]:
[np.around(x,4) for x in tfidf.toarray()]

[array([0.    , 0.3008, 0.3008, 0.3874, 0.3874, 0.    , 0.5094, 0.5094,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.2256, 0.2256, 0.2905, 0.2905, 0.    , 0.    , 0.    ,
        0.382 , 0.382 , 0.382 , 0.382 , 0.382 , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.1897, 0.1897, 0.    , 0.    , 0.6422, 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.3211, 0.3211, 0.3211,
        0.3211, 0.3211, 0.    , 0.    , 0.    ])]

In [21]:
# entire pipeline produces same result but does't save IDF or vocab
end_res = pipe.fit_transform(corpus)
[np.around(x,4) for x in end_res.toarray()]

[array([0.    , 0.3008, 0.3008, 0.3874, 0.3874, 0.    , 0.5094, 0.5094,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.2256, 0.2256, 0.2905, 0.2905, 0.    , 0.    , 0.    ,
        0.382 , 0.382 , 0.382 , 0.382 , 0.382 , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.1897, 0.1897, 0.    , 0.    , 0.6422, 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.3211, 0.3211, 0.3211,
        0.3211, 0.3211, 0.    , 0.    , 0.    ])]

### POC: sample 10% of the training data

About 120,000 instances.

In [22]:
# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

# sample 10%
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

# create array
X_array = np.array(X.iloc[:, 2]).ravel()

In [23]:
X_array.shape

(119747,)

In [24]:
# full pipe
pipe = Pipeline([('counter', DocumentToBigramCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=1000)),
                 ('tfidf', TfidfTransformer())])

X_end = pipe.fit_transform(X_array)

In [25]:
X_end

<119747x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 1299526 stored elements in Compressed Sparse Row format>

In [26]:
[np.around(x,3) for x in X_end.toarray()[:10,:6]]

[array([0.549, 0.113, 0.   , 0.   , 0.   , 0.   ]),
 array([0.736, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.674, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.706, 0.   , 0.085, 0.   , 0.   , 0.   ]),
 array([0.754, 0.099, 0.049, 0.078, 0.   , 0.   ]),
 array([0.319, 0.118, 0.   , 0.   , 0.   , 0.   ]),
 array([0.571, 0.088, 0.086, 0.138, 0.   , 0.   ]),
 array([0.573, 0.   , 0.052, 0.   , 0.   , 0.   ]),
 array([0.86 , 0.053, 0.   , 0.   , 0.078, 0.   ]),
 array([0.6  , 0.069, 0.   , 0.109, 0.   , 0.   ])]

In [27]:
# step by step for vocab and idf
X_counter = pipe['counter'].fit_transform(X_array)

In [28]:
X_counter_fit = pipe['bow'].fit(X_counter) 
X_counter_fit.vocabulary_size

1000

In [29]:
for ix, tuple_ in enumerate(X_counter_fit.vocabulary_.items()):
    if ix < 10 or ix > X_counter_fit.vocabulary_size-10:
        print(tuple_)

('i', 1)
('USERNAME', 2)
('NUMBER', 3)
('my', 4)
('you', 5)
('not', 6)
('am', 7)
('have', 8)
('i_am', 9)
('me', 10)
('to_NUMBER', 992)
('watching_the', 993)
('worst', 994)
('make_me', 995)
('oh_no', 996)
('message', 997)
('my_friends', 998)
('running', 999)
('you_too', 1000)


In [30]:
X_bow = pipe_bow.transform(X_counter)

In [31]:
X_bow.toarray()[6:16,:6] # first row is how many words the dictionary missed: turns out a lot

array([[26,  0,  0,  0,  0,  0],
       [35,  0,  0,  0,  0,  0],
       [41,  0,  0,  0,  0,  1],
       [33,  0,  0,  0,  0,  0],
       [12,  2,  0,  0,  0,  0],
       [11,  0,  0,  0,  0,  0],
       [35,  2,  0,  0,  0,  0],
       [22,  0,  0,  0,  0,  0],
       [10,  2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0,  0]], dtype=int32)

In [32]:
[np.around(x,4) for x in pipe['tfidf'].fit(X_bow).idf_[:10]] # IDFs

[1.0, 2.866, 4.2015, 3.3384, 8.2987, 4.5142, 8.3891, 8.6858, 3.6736, 2.9798]

In [33]:
tfidf = pipe['tfidf'].fit_transform(X_bow)

In [34]:
tfidf

<119747x21 sparse matrix of type '<class 'numpy.float64'>'
	with 188390 stored elements in Compressed Sparse Row format>

In [35]:
[np.around(x,3) for x in tfidf.toarray()[6:15,:6]]

[array([0.993, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.874, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.994, 0.   , 0.   , 0.   , 0.   , 0.109]),
 array([1., 0., 0., 0., 0., 0.]),
 array([0.902, 0.431, 0.   , 0.   , 0.   , 0.   ]),
 array([1., 0., 0., 0., 0., 0.]),
 array([0.982, 0.161, 0.   , 0.   , 0.   , 0.   ]),
 array([1., 0., 0., 0., 0., 0.]),
 array([0.868, 0.497, 0.   , 0.   , 0.   , 0.   ])]

### Train and evaluate couple quick models

In [36]:
pipe = Pipeline([('counter', DocumentToBigramCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=50000)), # more realistic vocabulary size
                 ('tfidf', TfidfTransformer())])

In [37]:
X_counter = pipe['counter'].fit_transform(X_array)

In [38]:
X_counter_fit = pipe['bow'].fit(X_counter)

In [39]:
for ix, tuple_ in enumerate(X_counter_fit.vocabulary_.items()):
    if ix < 10 or ix > X_counter_fit.vocabulary_size-10:
        print(tuple_)

('i', 1)
('USERNAME', 2)
('NUMBER', 3)
('my', 4)
('you', 5)
('not', 6)
('am', 7)
('have', 8)
('i_am', 9)
('me', 10)
('and_nearly', 49992)
('wishing_everyone', 49993)
('in_twitterland', 49994)
('in_december', 49995)
('mouthed', 49996)
('claw', 49997)
('ass_day', 49998)
('bagus', 49999)
('follow_gw', 50000)


In [40]:
X_bow = pipe['bow'].fit_transform(X_counter)
X_bow

<119747x50001 sparse matrix of type '<class 'numpy.intc'>'
	with 2232287 stored elements in Compressed Sparse Row format>

In [41]:
X_tfidf = pipe['tfidf'].fit_transform(X_bow)
X_tfidf

<119747x50001 sparse matrix of type '<class 'numpy.float64'>'
	with 2232287 stored elements in Compressed Sparse Row format>

In [42]:
y_array = y.iloc[:,0].ravel()

### Naive Bayes

In [43]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

NB_clf = MultinomialNB()

# BoW with bigrams
score = cross_val_score(NB_clf, X_bow, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.7890 (+/- 0.0022)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [44]:
# Tfidf with bigrams
score = cross_val_score(NB_clf, X_tfidf, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.7908 (+/- 0.0024)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


### Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# BoW with bigrams
score = cross_val_score(log_clf, X_bow, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   20.1s remaining:   13.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   29.7s finished


Accuracy: 0.7895 (+/- 0.0021)


In [46]:
# Tfidf with bigrams
score = cross_val_score(log_clf, X_tfidf, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.2s finished


Accuracy: 0.8019 (+/- 0.0024)


---