# Twitter Sentiment Analysis - POC
---

## 4. Cleanup Pipeline - version 2


In [27]:
import re
import os
import time
import json

import numpy as np
import pandas as pd

import urlextract
from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

# load minimally prepared X, y train subsets
#raw_path = os.path.join("..","data","1_raw","sentiment140")
#X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
#y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))
#
## sample for dev
#X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9999, random_state=158)
## create array
#X_array = np.array(X.iloc[:, 2]).ravel()

In [31]:
# functions
def expand_contractions(text, contractions_map):
    
    pattern = re.compile('({})'.format('|'.join(contractions_map.keys())), 
                        flags=re.IGNORECASE|re.DOTALL)
    
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_map.get(match)\
                                if contractions_map.get(match)\
                                else contractions_map.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def is_ascii(doc):
    try:
        doc.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [33]:
class DocumentToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, expand_contractions=True, lower_case=True, 
                 replace_usernames=True, unescape_html=True, 
                 replace_urls=True, replace_numbers=True, 
                 remove_junk=True, remove_punctuation=True, 
                 replace_emojis=True, replace_nonascii=True, 
                 remove_stopwords=True, lemmatization=True):
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation
        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for doc in X:
            if self.lower_case:
                doc = doc.lower()
            if self.expand_contractions and contractions_map is not None:
                doc = expand_contractions(doc, contractions_map)
            if self.replace_usernames:
                doc = re.sub(r'^@([^\s]+)',' USERNAME ', doc)
            if self.unescape_html:
                doc = unescape(doc)
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(doc)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    doc = doc.replace(url, ' URL ')
            if self.replace_numbers:
                doc = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', doc)
            if self.remove_junk:
                pattern = r'\¥|\â|\«|\»|\Ñ|\Ð|\¼|\½|\¾|\!|\?|\¿|\x82\
                            |\x83|\x84|\x85|\x86|\x87|\x88|\x89|\
                            |\x8a|\x8b|\x8c|\x8d|\x8e|\°|\µ|\´|\º|\¹|\³'
                doc = re.sub(pattern,'', doc)
            if self.remove_punctuation:
                doc = re.sub(r'\W+', ' ', doc, flags=re.M)
            if self.replace_emojis:
                doc = re.sub(r'[^\x00-\x7F]+', ' EMOJI ', doc)
            if self.replace_nonascii:
                if is_ascii(doc) == False:
                    doc = ' NONASCII '
            word_counts = Counter(doc.split())
            if self.remove_stopwords:
                # 25 semantically non-selective words from the Reuters-RCV1 dataset
                stop_words = ['a','an','and','are','as','at','be','by','for','from',
                              'has','he','in','is','it','its','of','on','that','the',
                              'to','was','were','will','with']
                for word in stop_words:
                    try:
                        word_counts.pop(word)
                    except KeyError:
                        continue
            if self.expand_contractions:
                    leftovers = ['t','s','d','m','ve','re','ll','']
            if self.lemmatization and lemmatizer is not None:
                lemmatized_word_counts = Counter()
                for word, count in word_counts.items():
                    lemmatized_word = lemmatizer.lemmatize(word)
                    lemmatized_word_counts[lemmatized_word] += count
                word_counts = lemmatized_word_counts      
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [34]:
with open("contractions_map.json") as f:
    contractions_map = json.load(f)

url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [674]:
from scipy.sparse import csr_matrix

class WordCounterToTfidfVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 1)  
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows, cols, tfidf = [], [], []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                col = self.vocabulary_.get(word, 0)
                cols.append(col)
                corpus_size = len(X)
                tf = count/self.vocabulary_size
                df = [ct for w, ct in self.most_common_][col-1]
                idf = np.log(corpus_size/df)+1
                tfidf.append(round(tf*idf,4))
        return csr_matrix((tfidf, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [692]:
corpus = ['one duck got sauce',
          'one duck duck got no sauce',
          'elfs got no sauce',
          'duck sauce weird weird']

In [675]:
X_wordcounts = DocumentToWordCounterTransformer().fit_transform(corpus)
X_wordvec = WordCounterToTfidfVectorTransformer(vocabulary_size=7).fit(X_wordcounts)
X_wordvec.vocabulary_

{'sauce': 1, 'duck': 2, 'got': 3, 'one': 4, 'no': 5, 'elf': 6, 'weird': 7}

In [684]:
#X_wordvec.vocabulary_size #X_wordvec.vocabulary_ #X_wordvec.most_common_
X_output = X_wordvec.transform(X_wordcounts)

In [677]:
for row, val in enumerate(X_output.toarray()):
    print(val)

[0.     0.1429 0.184  0.184  0.2419 0.     0.     0.    ]
[0.     0.1429 0.3679 0.184  0.2419 0.2419 0.     0.    ]
[0.     0.1429 0.     0.184  0.     0.2419 0.3409 0.    ]
[0.     0.1429 0.184  0.     0.     0.     0.     0.6818]


In [None]:
# test by returning tf, df, idf, all separately
# remember to retain idf

### Using TfidfVectorizer

In [680]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ['one duck got sauce',
          'one duck duck got no sauce',
          'elfs got no sauce',
          'duck sauce weird weird']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names()

['duck', 'elfs', 'got', 'no', 'one', 'sauce', 'weird']

In [683]:
X.toarray()

array([[0.48829139, 0.        , 0.48829139, 0.        , 0.60313701,
        0.39921021, 0.        ],
       [0.67729143, 0.        , 0.33864572, 0.41829483, 0.41829483,
        0.27686506, 0.        ],
       [0.        , 0.65919112, 0.42075315, 0.51971385, 0.        ,
        0.34399327, 0.        ],
       [0.29505684, 0.        , 0.        , 0.        , 0.        ,
        0.2412283 , 0.92452711]])

In [689]:
vocabulary = ['duck', 'elfs', 'got', 'no', 'one', 'sauce', 'weird']
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
                 ('tfid', TfidfTransformer())]).fit(corpus)

pipe['count'].transform(corpus).toarray()

array([[1, 0, 1, 0, 1, 1, 0],
       [2, 0, 1, 1, 1, 1, 0],
       [0, 1, 1, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 2]], dtype=int64)

In [690]:
pipe['tfid'].idf_

array([1.22314355, 1.91629073, 1.22314355, 1.51082562, 1.51082562,
       1.        , 1.91629073])

In [691]:
pipe.transform(corpus).todense()

matrix([[0.48829139, 0.        , 0.48829139, 0.        , 0.60313701,
         0.39921021, 0.        ],
        [0.67729143, 0.        , 0.33864572, 0.41829483, 0.41829483,
         0.27686506, 0.        ],
        [0.        , 0.65919112, 0.42075315, 0.51971385, 0.        ,
         0.34399327, 0.        ],
        [0.29505684, 0.        , 0.        , 0.        , 0.        ,
         0.2412283 , 0.92452711]])

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("document_to_wordcount", DocumentToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_array)

In [25]:
X_train_transformed

<119x1001 sparse matrix of type '<class 'numpy.int32'>'
	with 1208 stored elements in Compressed Sparse Row format>

In [26]:
y_array = y.iloc[:,0].ravel()

In [27]:
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_array, cv=5, verbose=3, scoring='accuracy')
print('Mean accuracy: ' + str(score.mean()))

[CV]  ................................................................
[CV] .................................... , score=0.708, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.500, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.750, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.583, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.739, total=   0.0s
Mean accuracy: 0.6561594202898551


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [28]:
from sklearn.naive_bayes import MultinomialNB

NB_clf = MultinomialNB()
score = cross_val_score(NB_clf, X_train_transformed, y_array, cv=5, verbose=3, scoring='accuracy')
print('Mean accuracy: ' + str(score.mean()))

[CV]  ................................................................
[CV] .................................... , score=0.792, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.667, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.667, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.667, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.696, total=   0.0s
Mean accuracy: 0.697463768115942


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


---

In [29]:
# time notebook
mins, secs = divmod(time.time() - start_notebook, 60)
print(f'Total running time: {mins:0.0f} minute(s) and {secs:0.0f} second(s).')

Total running time: 0 minute(s) and 9 second(s).
