# Twitter Sentiment Analysis - POC
---

## 4. Cleanup Pipeline - version 2

Repurposing the original count vectorizer to become a TF-IDF vectorizer ended up as a hybrid approach: a pipeline that is part custom (count vectorizer) and part sklearn (TF-IDF).

This is a dev notebook, not the final one.

In [1]:
import re
import os
import time
import json

import numpy as np
import pandas as pd

import urlextract
from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

In [2]:
# functions
def expand_contractions(text, contractions_map):
    """Expand the contractions found in ``text`` and drop leftover apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contractions_map : dict of str -> str
        Maps a contraction (e.g. ``"can't"``) to its expansion
        (e.g. ``"cannot"``). Keys are matched literally and
        case-insensitively; an empty map is allowed.

    Returns
    -------
    str
        ``text`` with every known contraction expanded. The first character
        of each match is kept so its original casing survives. All remaining
        ``'`` characters are removed afterwards.
    """
    # Empty keys would make the pattern match the empty string everywhere.
    keys = [key for key in contractions_map if key]
    if keys:
        # Case-insensitive fallback table; an exact-case key always wins.
        lowered_map = {}
        for key in keys:
            lowered_map.setdefault(key.lower(), contractions_map[key])

        # Longest keys first so "he'd've" is not shadowed by "he'd", and
        # escaped so regex metacharacters in a key are matched literally.
        keys.sort(key=len, reverse=True)
        pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in keys)),
                             flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expansion = contractions_map.get(match) or lowered_map.get(match.lower())
            if not expansion:
                # Nothing usable to substitute: leave the text untouched.
                return match
            return match[0] + expansion[1:]

        text = pattern.sub(expand_match, text)

    return re.sub("'", "", text)

def is_ascii(doc):
    """Return True when ``doc`` contains only ASCII characters, False otherwise.

    The text is encoded as UTF-8 and the resulting bytes are decoded as ASCII;
    a decoding failure means at least one non-ASCII character is present.
    """
    utf8_bytes = doc.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True

In [3]:
class DocumentToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Clean raw documents and convert each one into a ``Counter`` of tokens.

    Every constructor flag switches one cleaning step on or off; the steps run
    in the order in which they appear in ``_document_to_counter``.

    The transformer reads three module-level helpers at transform time:
    ``contractions_map`` (dict), ``url_extractor`` (object exposing
    ``find_urls``) and ``lemmatizer`` (object exposing ``lemmatize``). They
    must be defined before ``transform`` is called; a step whose helper is
    ``None`` is skipped.

    Parameters
    ----------
    expand_contractions : bool
        Expand contractions via the module-level ``expand_contractions``.
    lower_case : bool
        Lower-case the document.
    replace_usernames : bool
        Replace ``@mentions`` with the token ``USERNAME``.
    unescape_html : bool
        Decode HTML entities.
    replace_urls : bool
        Replace URLs with the token ``URL``.
    replace_numbers : bool
        Replace numbers with the token ``NUMBER``.
    remove_junk : bool
        Delete a fixed set of junk characters.
    remove_punctuation : bool
        Replace runs of non-word characters with a single space.
    replace_emojis : bool
        Replace runs of non-ASCII characters with the token ``EMOJI``.
    replace_nonascii : bool
        Replace a document that still holds non-ASCII text with ``NONASCII``.
    remove_stopwords : bool
        Drop the stop words listed in ``_STOP_WORDS``.
    lemmatization : bool
        Lemmatize tokens and merge the counts of tokens sharing a lemma.
    """

    # 25 semantically non-selective words from the Reuters-RCV1 dataset
    _STOP_WORDS = ('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
                   'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
                   'to', 'was', 'were', 'will', 'with')
    # Fragments left behind when a contraction was split instead of expanded
    _LEFTOVERS = ('t', 's', 'd', 'm', 've', 're', 'll')

    # A mention is any whitespace-delimited token starting with "@"
    _USERNAME_RE = re.compile(r'(?<!\S)@\S+')
    # Integer, decimal and exponent forms are all a single NUMBER
    _NUMBER_RE = re.compile(r'\d+(?:\.\d+)?(?:[eE][+-]?\d+)?')
    # One character class instead of an alternation spread over several lines:
    # a backslash-newline inside a raw string is NOT a line continuation, so
    # the previous multi-line pattern never matched "\x82" on its own.
    _JUNK_RE = re.compile(r'[¥â«»ÑÐ¼½¾!?¿\x82-\x8e°µ´º¹³]')
    _NON_WORD_RE = re.compile(r'\W+', flags=re.M)
    _NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')

    def __init__(self, expand_contractions=True, lower_case=True,
                 replace_usernames=True, unescape_html=True,
                 replace_urls=True, replace_numbers=True,
                 remove_junk=True, remove_punctuation=True,
                 replace_emojis=True, replace_nonascii=True,
                 remove_stopwords=True, lemmatization=True):
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation
        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a numpy array holding one token ``Counter`` per document in ``X``."""
        return np.array([self._document_to_counter(doc) for doc in X])

    def _document_to_counter(self, doc):
        """Apply the enabled cleaning steps to one document and count its tokens."""
        if self.lower_case:
            doc = doc.lower()
        if self.expand_contractions and contractions_map is not None:
            doc = expand_contractions(doc, contractions_map)
        if self.replace_usernames:
            doc = self._USERNAME_RE.sub(' USERNAME ', doc)
        if self.unescape_html:
            doc = unescape(doc)
        if self.replace_urls and url_extractor is not None:
            # Longest URLs first, so a URL that is a prefix of another one
            # cannot leave a partial replacement behind.
            unique_urls = set(url_extractor.find_urls(doc))
            for url in sorted(unique_urls, key=len, reverse=True):
                doc = doc.replace(url, ' URL ')
        if self.replace_numbers:
            doc = self._NUMBER_RE.sub(' NUMBER ', doc)
        if self.remove_junk:
            doc = self._JUNK_RE.sub('', doc)
        if self.remove_punctuation:
            doc = self._NON_WORD_RE.sub(' ', doc)
        if self.replace_emojis:
            # NOTE(review): when remove_punctuation is on, non-word symbols
            # were already turned into spaces above, so only non-ASCII word
            # characters reach this step - confirm the intended step order.
            doc = self._NON_ASCII_RE.sub(' EMOJI ', doc)
        if self.replace_nonascii and not is_ascii(doc):
            doc = ' NONASCII '

        word_counts = Counter(doc.split())
        if self.remove_stopwords:
            for word in self._STOP_WORDS:
                word_counts.pop(word, None)
        if self.expand_contractions:
            # Previously this list was built but never applied.
            for fragment in self._LEFTOVERS:
                word_counts.pop(fragment, None)
        if self.lemmatization and lemmatizer is not None:
            lemmatized_word_counts = Counter()
            for word, count in word_counts.items():
                lemmatized_word_counts[lemmatizer.lemmatize(word)] += count
            word_counts = lemmatized_word_counts
        return word_counts

In [4]:
# Helper objects read as module-level names by
# DocumentToWordCounterTransformer.transform.
# The encoding is set explicitly so the JSON is read the same way on every
# platform instead of relying on the locale's default encoding.
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [5]:
from scipy.sparse import csr_matrix

class WordCounterToRawTfidfTransformer(BaseEstimator, TransformerMixin):
    """Turn per-document word counters into a sparse matrix of raw (un-normalized) tf-idf.

    Column 0 collects every word that is not part of the fitted vocabulary;
    columns 1..``vocabulary_size`` hold the vocabulary words, most frequent
    (by document frequency) first.

    Parameters
    ----------
    vocabulary_size : int, default=1000
        Maximum number of words kept in the vocabulary.

    Attributes
    ----------
    most_common_ : list of (word, document frequency)
        The vocabulary words with the number of fitted documents containing them.
    vocabulary_ : dict of word -> column index (starting at 1)
    n_docs_ : int
        Number of documents seen in ``fit``.
    idf_ : numpy.ndarray
        Smoothed inverse document frequency per column,
        ``log((n_docs_ + 1) / (df + 1)) + 1``. Entry 0 belongs to the
        out-of-vocabulary column, whose ``df`` is the number of fitted
        documents holding at least one out-of-vocabulary word.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn the vocabulary and the IDF weights from the word counters in ``X``."""
        docs = list(X)
        total_count = Counter()
        for word_count in docs:
            for word, count in word_count.items():
                # A word contributes at most 1 per document: document frequency
                total_count[word] += min(count, 1)
        most_common = total_count.most_common()[:self.vocabulary_size]
        vocabulary = {word: index + 1 for index, (word, count) in enumerate(most_common)}

        doc_freqs = np.zeros(len(most_common) + 1)
        doc_freqs[0] = sum(
            1 for word_count in docs
            if any(count > 0 and word not in vocabulary
                   for word, count in word_count.items())
        )
        for index, (word, count) in enumerate(most_common):
            doc_freqs[index + 1] = count

        self.most_common_ = most_common
        self.vocabulary_ = vocabulary
        self.n_docs_ = len(docs)
        # Same smoothing as sklearn's TfidfTransformer(smooth_idf=True)
        self.idf_ = np.log((self.n_docs_ + 1) / (doc_freqs + 1)) + 1
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)`` with raw tf-idf values.

        ``X`` is the output of ``DocumentToWordCounterTransformer`` (one word
        counter per document). The term frequency is the raw count, weighted
        by the IDF learned in ``fit``; no row normalization is applied.
        """
        idf = self.idf_
        rows, cols, tfidf_raw = [], [], []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                col = self.vocabulary_.get(word, 0)  # 0 = out-of-vocabulary column
                rows.append(row)
                cols.append(col)
                tfidf_raw.append(count * idf[col])
        # Several out-of-vocabulary words of one document share (row, 0);
        # csr_matrix sums duplicate entries.
        return csr_matrix((tfidf_raw, (rows, cols)),
                          shape=(len(X), self.vocabulary_size + 1))

In [6]:
corpus = ['You love me', 
          'You do not love me',
          'You really really love food']

In [7]:
# Clean the toy corpus into one word Counter per document
X_wordcounts = DocumentToWordCounterTransformer().fit_transform(corpus)
# NOTE: X_wordvec holds the fitted transformer itself, not a matrix
X_wordvec = WordCounterToRawTfidfTransformer(vocabulary_size=7).fit(X_wordcounts)
# Attributes worth inspecting: X_wordvec.vocabulary_size, X_wordvec.vocabulary_, X_wordvec.most_common_
# Raw tf-idf matrix; column 0 is reserved for words missing from the vocabulary
X_output = X_wordvec.transform(X_wordcounts)

In [8]:
[np.around(x,3) for x in X_output.todense()[:,1:]]

[array([[1.   , 1.   , 1.288, 0.   , 0.   , 0.   , 0.   ]]),
 array([[1.   , 1.   , 1.288, 1.693, 1.693, 0.   , 0.   ]]),
 array([[1.   , 1.   , 0.   , 0.   , 0.   , 3.386, 1.693]])]

### Raw Tfidf

In [9]:
# One column per document, indexed by vocabulary word; the first matrix
# column (words missing from the vocabulary) is skipped.
raw_rows = X_output.toarray()
df = pd.DataFrame(
    {'doc {} vector'.format(pos + 1): raw_rows[pos][1:] for pos in range(3)},
    index=pd.Index(list(X_wordvec.vocabulary_), name='vocabulary --->'),
)
df.T

vocabulary --->,you,love,me,do,not,really,food
doc 1 vector,1.0,1.0,1.287682,0.0,0.0,0.0,0.0
doc 2 vector,1.0,1.0,1.287682,1.693147,1.693147,0.0,0.0
doc 3 vector,1.0,1.0,0.0,0.0,0.0,3.386294,1.693147


In [10]:
# 3rd doc
[np.around(x,3) for x in X_output.toarray()[2, 1:]]

[1.0, 1.0, 0.0, 0.0, 0.0, 3.386, 1.693]

In [11]:
def get_L2norm(vec):
    """Scale ``vec`` to unit Euclidean (L2) length.

    Parameters
    ----------
    vec : sequence of numbers

    Returns
    -------
    list
        The elements of ``vec`` divided by its L2 norm. A vector whose norm
        is zero (all zeros, or empty) is returned unchanged instead of
        producing NaNs from a division by zero.
    """
    values = np.asarray(vec, dtype=float)
    norm = np.sqrt(np.sum(values ** 2))
    if norm == 0:
        return list(values)
    return list(values / norm)

In [12]:
tfidf_norm = [get_L2norm(vec) for vec in X_output.toarray()]

### Normalized Tfidf

In [13]:
# Same layout as the raw table, built from the L2-normalized vectors; the
# first entry of each vector (words missing from the vocabulary) is skipped.
df = pd.DataFrame(
    {'doc {} vector'.format(pos + 1): tfidf_norm[pos][1:] for pos in range(3)},
    index=pd.Index(list(X_wordvec.vocabulary_), name='vocabulary --->'),
)
df.T

vocabulary --->,you,love,me,do,not,really,food
doc 1 vector,0.522842,0.522842,0.673255,0.0,0.0,0.0,0.0
doc 2 vector,0.32631,0.32631,0.420183,0.55249,0.55249,0.0,0.0
doc 3 vector,0.247433,0.247433,0.0,0.0,0.0,0.83788,0.41894


**ISSUES**:

- Is it only possible to normalize after returning the raw Tfidf?
- Is there a way to calculate the L2 norm in the transform method of the WordCounterToVectorTransformer?
- Previous attempts failed as they didn't take into account the 0s, since the nested for loops return the dense bits
- Also needs to retain IDF for vocabulary_ (not the IDFs for all the terms)

---

### Using TfidfVectorizer

`TfidfVectorizer` does not expose a `most_common_` attribute: its features are ordered alphabetically rather than by frequency... this might be a problem.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names

['do', 'food', 'love', 'me', 'not', 'really', 'you']

In [15]:
for i in X.toarray():
    print(np.around(i, 3))

[0.    0.    0.523 0.673 0.    0.    0.523]
[0.552 0.    0.326 0.42  0.552 0.    0.326]
[0.    0.419 0.247 0.    0.    0.838 0.247]


In [16]:
# sequential use, saves IDFs
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vocabulary = ['do', 'food', 'love', 'me', 'not', 'really', 'you']
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
                 ('tfid', TfidfTransformer())]).fit(corpus)

pipe['count'].transform(corpus).toarray()

array([[0, 0, 1, 1, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 1],
       [0, 1, 1, 0, 0, 2, 1]], dtype=int64)

In [17]:
[np.around(x, 3) for x in pipe['tfid'].idf_] # IDFs

[1.693, 1.693, 1.0, 1.288, 1.693, 1.693, 1.0]

In [18]:
for i in pipe.transform(corpus).toarray():
    print(np.around(i, 3))

[0.    0.    0.523 0.673 0.    0.    0.523]
[0.552 0.    0.326 0.42  0.552 0.    0.326]
[0.    0.419 0.247 0.    0.    0.838 0.247]


### Hybrid approach

In [19]:
import cleanup_module_POC as Cmod

pipe = Pipeline([('counter', Cmod.DocumentToWordCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=7)),
                 ('tfidf', TfidfTransformer())])

In [20]:
pipe_counter = pipe['counter'].fit_transform(corpus)

In [21]:
pipe_bow = pipe['bow'].fit(pipe_counter)

In [22]:
pipe_bow.vocabulary_

{'you': 1, 'love': 2, 'me': 3, 'really': 4, 'do': 5, 'not': 6, 'food': 7}

In [23]:
bow = pipe_bow.transform(pipe_counter)

In [24]:
bow.toarray() # first col is "words missing from vocab"

array([[0, 1, 1, 1, 0, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 1, 0, 2, 0, 0, 1]], dtype=int32)

In [25]:
 # IDF for the pipe_bow.vocabulary_
[np.around(x,3) for x in pipe['tfidf'].fit(bow).idf_[1:]]

[1.0, 1.0, 1.288, 1.693, 1.693, 1.693, 1.693]

In [26]:
tfidf = pipe['tfidf'].fit_transform(bow)

In [27]:
[np.around(x,3) for x in tfidf.toarray()]

[array([0.   , 0.523, 0.523, 0.673, 0.   , 0.   , 0.   , 0.   ]),
 array([0.   , 0.326, 0.326, 0.42 , 0.   , 0.552, 0.552, 0.   ]),
 array([0.   , 0.247, 0.247, 0.   , 0.838, 0.   , 0.   , 0.419])]

In [28]:
# entire pipeline produces same result but doesn't save IDF or vocab
end_res = pipe.fit_transform(corpus)
[np.around(x,4) for x in end_res.toarray()]

[array([0.    , 0.5228, 0.5228, 0.6733, 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.3263, 0.3263, 0.4202, 0.    , 0.5525, 0.5525, 0.    ]),
 array([0.    , 0.2474, 0.2474, 0.    , 0.8379, 0.    , 0.    , 0.4189])]

### Using small POC sample

In [29]:
# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

# sample for dev: test_size=0.9999 sends 99.99% of the rows to X_rest / y_rest,
# so X / y keep only a tiny, reproducible (random_state=42) subset
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9999, random_state=42)
# create a 1-D array from the third column of X (position 2)
# presumably the tweet text - see the sample printed in the next cell
X_array = np.array(X.iloc[:, 2]).ravel()

In [30]:
X_array[:2]

array(["@isaimperial I KEEP LISTENING TO EMONESS. ))) I think I'm turning EMO. I need you guys to make me happy.  WHYYYYY",
       '@BRIGHT_RAINBOW lol you always watch it '], dtype=object)

In [31]:
X_array.shape

(119,)

In [32]:
# full pipe
X_end = pipe.fit_transform(X_array)

In [33]:
X_end

<119x8 sparse matrix of type '<class 'numpy.float64'>'
	with 314 stored elements in Compressed Sparse Row format>

In [34]:
[np.around(x,3) for x in X_end.toarray()[:10,]]

[array([0.796, 0.525, 0.119, 0.   , 0.   , 0.   , 0.196, 0.196]),
 array([0.655, 0.   , 0.393, 0.   , 0.   , 0.   , 0.645, 0.   ]),
 array([0.633, 0.537, 0.163, 0.   , 0.   , 0.   , 0.   , 0.534]),
 array([0.966, 0.191, 0.174, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([1., 0., 0., 0., 0., 0., 0., 0.]),
 array([0.958, 0.   , 0.287, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([1., 0., 0., 0., 0., 0., 0., 0.]),
 array([0.961, 0.   , 0.115, 0.167, 0.   , 0.   , 0.189, 0.   ]),
 array([0.939, 0.207, 0.   , 0.   , 0.277, 0.   , 0.   , 0.   ]),
 array([0.981, 0.   , 0.196, 0.   , 0.   , 0.   , 0.   , 0.   ])]

In [35]:
# step by step for vocab and idf
pipe_counter = pipe['counter'].fit_transform(X_array)

In [36]:
pipe_bow = pipe['bow'].fit(pipe_counter) # Careful w vocabulary_size, 7 is way too low...
pipe_bow.vocabulary_

{'i': 1, 'USERNAME': 2, 'NUMBER': 3, 'my': 4, 'not': 5, 'you': 6, 'am': 7}

In [37]:
bow = pipe_bow.transform(pipe_counter)

In [38]:
bow.toarray()[:10] # misses too many words!

array([[12,  4,  1,  0,  0,  0,  1,  1],
       [ 3,  0,  1,  0,  0,  0,  1,  0],
       [ 7,  3,  1,  0,  0,  0,  0,  2],
       [10,  1,  1,  0,  0,  0,  0,  0],
       [ 7,  0,  0,  0,  0,  0,  0,  0],
       [ 6,  0,  1,  0,  0,  0,  0,  0],
       [ 5,  0,  0,  0,  0,  0,  0,  0],
       [15,  0,  1,  1,  0,  0,  1,  0],
       [ 9,  1,  0,  0,  1,  0,  0,  0],
       [ 9,  0,  1,  0,  0,  0,  0,  0]], dtype=int32)

In [39]:
[np.around(x,4) for x in pipe['tfidf'].fit(bow).idf_[1:]] # IDFs!

[1.9808, 1.7985, 2.6094, 2.652, 2.6964, 2.9543, 2.9543]

In [40]:
tfidf = pipe['tfidf'].fit_transform(bow)

In [41]:
tfidf

<119x8 sparse matrix of type '<class 'numpy.float64'>'
	with 314 stored elements in Compressed Sparse Row format>

In [42]:
[np.around(x,3) for x in tfidf.toarray()[:10,:]] # skip first col? Maybe missing words is informative!

[array([0.796, 0.525, 0.119, 0.   , 0.   , 0.   , 0.196, 0.196]),
 array([0.655, 0.   , 0.393, 0.   , 0.   , 0.   , 0.645, 0.   ]),
 array([0.633, 0.537, 0.163, 0.   , 0.   , 0.   , 0.   , 0.534]),
 array([0.966, 0.191, 0.174, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([1., 0., 0., 0., 0., 0., 0., 0.]),
 array([0.958, 0.   , 0.287, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([1., 0., 0., 0., 0., 0., 0., 0.]),
 array([0.961, 0.   , 0.115, 0.167, 0.   , 0.   , 0.189, 0.   ]),
 array([0.939, 0.207, 0.   , 0.   , 0.277, 0.   , 0.   , 0.   ]),
 array([0.981, 0.   , 0.196, 0.   , 0.   , 0.   , 0.   , 0.   ])]

### Train and evaluate a couple of quick models to test the `fit_transform` method

In [49]:
pipe = Pipeline([('counter', Cmod.DocumentToWordCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=500)), # little bigger vocab
                 ('tfidf', TfidfTransformer())])

In [50]:
X_train_transformed = pipe.fit_transform(X_array)

In [51]:
X_train_transformed

<119x501 sparse matrix of type '<class 'numpy.float64'>'
	with 1161 stored elements in Compressed Sparse Row format>

In [52]:
# First column of y as a 1-D numpy array; Series.ravel() is deprecated in
# pandas 2.2, to_numpy() returns the same array.
y_array = y.iloc[:,0].to_numpy()

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_array, cv=10, verbose=1, scoring='accuracy')
print('Mean accuracy: ' + str(round(score.mean(),4)))

Mean accuracy: 0.5712


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [54]:
from sklearn.naive_bayes import MultinomialNB

NB_clf = MultinomialNB()
score = cross_val_score(NB_clf, X_train_transformed, y_array, cv=10, verbose=1, scoring='accuracy')
print('Mean accuracy: ' + str(round(score.mean(),4)))

Mean accuracy: 0.5462


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


---