# Cleanup Pipeline 2

This notebook is for developing the cleanup pipeline, not implementing it. The implementation lives in `cleanup_module.py`, which is imported into a notebook or script.

*Purpose*
- Preprocessing is performed for a TF-IDF representation. Text to DFM representations are explained in more detail in this [Document Term Matrices notebook.](10.extra_Document_Term_Matrices.ipynb)

*Results*
- In a hybrid approach, I create a pipeline that repurposes the custom **DocumentToWordCounterTransformer** class from the previous notebook and includes sklearn's **TfidfTransformer**.  

---

In [26]:
import re
import os
import time
import json

import numpy as np
import pandas as pd

import urlextract
from nltk.stem import WordNetLemmatizer

import cleanup_module as Cmod
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split

In [27]:
# Load the contractions map from a JSON file in the working directory.
# JSON is UTF-8 by specification; pinning the encoding keeps the load from
# depending on the platform's default locale encoding.
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

# Shared helper instances: a URL extractor and a WordNet lemmatizer.
url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [28]:
# Three-document toy corpus used to check the pipeline by hand.
corpus = [
    'You love me',
    'You do not love me',
    'You really really love food',
]

---

### Hybrid approach

In [29]:
# Hybrid pipeline: custom word counter -> capped-vocabulary count vectors -> sklearn TF-IDF.
# vocabulary_size=7 covers every distinct word of the toy corpus.
pipe = Pipeline(
    steps=[
        ('counter', Cmod.DocumentToWordCounterTransformer()),
        ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=7)),
        ('tfidf', TfidfTransformer()),
    ]
)

In [30]:
# Run only the first step so its intermediate output can be inspected.
pipe_counter = pipe.named_steps['counter'].fit_transform(corpus)

In [31]:
# Fit the bag-of-words step on the counter output; the fitted step exposes vocabulary_.
pipe_bow = pipe.named_steps['bow'].fit(pipe_counter)

In [32]:
# Word -> column-index mapping learned by the 'bow' step (indices start at 1).
pipe_bow.vocabulary_

{'you': 1, 'love': 2, 'me': 3, 'really': 4, 'do': 5, 'not': 6, 'food': 7}

In [33]:
# Convert the counter output into a sparse count matrix using the fitted vocabulary.
bow = pipe_bow.transform(pipe_counter)

In [34]:
# Dense view of the counts: one row per document, column 0 reserved for
# out-of-vocabulary words, remaining columns follow the vocabulary_ indices.
bow.toarray() # first col is "words missing from vocab"

array([[0, 1, 1, 1, 0, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 1, 0, 2, 0, 0, 1]], dtype=int32)

In [35]:
 # IDF for the pipe_bow.vocabulary_
# Skip index 0 (the out-of-vocabulary column) so the IDFs line up with vocabulary_.
[np.around(idf, decimals=3) for idf in pipe.named_steps['tfidf'].fit(bow).idf_[1:]]

[1.0, 1.0, 1.288, 1.693, 1.693, 1.693, 1.693]

In [36]:
# Fit the TF-IDF step on the count matrix and weight it in one call.
tfidf = pipe.named_steps['tfidf'].fit_transform(bow)

In [37]:
# One rounded TF-IDF vector per document.
[np.around(doc_vec, decimals=3) for doc_vec in tfidf.toarray()]

[array([0.   , 0.523, 0.523, 0.673, 0.   , 0.   , 0.   , 0.   ]),
 array([0.   , 0.326, 0.326, 0.42 , 0.   , 0.552, 0.552, 0.   ]),
 array([0.   , 0.247, 0.247, 0.   , 0.838, 0.   , 0.   , 0.419])]

In [38]:
# Whole pipeline in a single call: same values as the step-by-step run above,
# but no intermediate vocab/IDF variables are kept along the way.
end_res = pipe.fit_transform(corpus)
[np.around(doc_vec, decimals=4) for doc_vec in end_res.toarray()]

[array([0.    , 0.5228, 0.5228, 0.6733, 0.    , 0.    , 0.    , 0.    ]),
 array([0.    , 0.3263, 0.3263, 0.4202, 0.    , 0.5525, 0.5525, 0.    ]),
 array([0.    , 0.2474, 0.2474, 0.    , 0.8379, 0.    , 0.    , 0.4189])]

### Using small POC sample

In [59]:
# Read the minimally prepared training features and labels.
raw_path = os.path.join("..", "data", "1_raw", "sentiment140")
X_train, y_train = (
    pd.read_csv(os.path.join(raw_path, csv_name))
    for csv_name in ("X_train.csv", "y_train.csv")
)

# Keep 0.1% of the rows as a small development sample (seeded for repeatability).
X, X_rest, y, y_rest = train_test_split(
    X_train,
    y_train,
    test_size=0.999,
    random_state=42,
)

# Flat 1-D array built from the column at position 2
# (presumably the raw text -- confirm against the CSV header).
X_array = np.ravel(np.array(X.iloc[:, 2]))

In [60]:
# Size of the development sample (1-D: one entry per sampled row).
X_array.shape

(1197,)

In [61]:
# full pipe
# Rebuild the pipeline for the real sample. The toy pipeline defined earlier is
# capped at vocabulary_size=7, so reusing it on a fresh kernel would push almost
# every token into the out-of-vocabulary column. vocabulary_size=500 matches the
# 501-column matrix and the vocabulary_size value recorded in the outputs below.
pipe = Pipeline([('counter', Cmod.DocumentToWordCounterTransformer()),
                 ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=500)),
                 ('tfidf', TfidfTransformer())])
X_end = pipe.fit_transform(X_array)

In [62]:
# Show the sparse-matrix summary (shape, dtype, stored elements) of the pipeline output.
X_end

<1197x501 sparse matrix of type '<class 'numpy.float64'>'
	with 9799 stored elements in Compressed Sparse Row format>

In [66]:
# Rounded peek at the first 10 documents and first 6 columns (column 0 = out-of-vocabulary).
[np.around(doc_vec, decimals=3) for doc_vec in X_end[:10, :6].toarray()]

[array([0.605, 0.   , 0.127, 0.786, 0.   , 0.   ]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0.302, 0.21 , 0.   , 0.   , 0.   , 0.   ]),
 array([0.144, 0.25 , 0.121, 0.   , 0.   , 0.   ]),
 array([0.438, 0.   , 0.056, 0.   , 0.   , 0.   ]),
 array([0.232, 0.   , 0.   , 0.   , 0.289, 0.   ]),
 array([0.319, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.272, 0.315, 0.   , 0.   , 0.225, 0.   ]),
 array([0.087, 0.302, 0.146, 0.   , 0.   , 0.   ]),
 array([0.116, 0.   , 0.   , 0.   , 0.   , 0.   ])]

In [67]:
# Re-run the steps one at a time so the fitted vocabulary and IDF can be inspected.
pipe_counter = pipe.named_steps['counter'].fit_transform(X_array)

In [71]:
# Fit the bag-of-words step on the sample, then show its vocabulary cap.
pipe_bow = pipe.named_steps['bow'].fit(pipe_counter)
pipe_bow.vocabulary_size

500

In [74]:
# Print the first ten (word, index) pairs of the fitted vocabulary.
for entry in list(pipe_bow.vocabulary_.items())[:10]:
    print(entry)

('i', 1)
('USERNAME', 2)
('NUMBER', 3)
('my', 4)
('you', 5)
('not', 6)
('have', 7)
('am', 8)
('me', 9)
('just', 10)


In [75]:
# Convert the counter output for the sample into a sparse count matrix.
bow = pipe_bow.transform(pipe_counter)

In [76]:
# First ten documents; the large counts in column 0 are out-of-vocabulary words.
bow[:10].toarray() # misses too many words of course

array([[8, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [5, 2, 0, ..., 0, 0, 0],
       ...,
       [3, 2, 0, ..., 0, 0, 0],
       [1, 2, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [79]:
# First ten IDF weights by column. Position 0 is the out-of-vocabulary column,
# so positions 1-9 pair with vocabulary indices 1-9 ('i' ... 'me') printed above.
[np.around(idf, decimals=4) for idf in pipe.named_steps['tfidf'].fit(bow).idf_[:10]] # IDFs

[1.1111,
 1.9314,
 1.8619,
 2.8899,
 2.7605,
 2.9294,
 2.9123,
 3.1684,
 3.3093,
 3.4537]

In [80]:
# Fit the TF-IDF step on the sample counts and weight them in one call.
tfidf = pipe.named_steps['tfidf'].fit_transform(bow)

In [81]:
# Sparse-matrix summary; should match the shape and stored-element count of X_end.
tfidf

<1197x501 sparse matrix of type '<class 'numpy.float64'>'
	with 9799 stored elements in Compressed Sparse Row format>

In [83]:
# Same 10 x 6 peek as for X_end, to confirm the step-by-step run gives identical values.
[np.around(doc_vec, decimals=3) for doc_vec in tfidf[:10, :6].toarray()]

[array([0.605, 0.   , 0.127, 0.786, 0.   , 0.   ]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0.302, 0.21 , 0.   , 0.   , 0.   , 0.   ]),
 array([0.144, 0.25 , 0.121, 0.   , 0.   , 0.   ]),
 array([0.438, 0.   , 0.056, 0.   , 0.   , 0.   ]),
 array([0.232, 0.   , 0.   , 0.   , 0.289, 0.   ]),
 array([0.319, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 array([0.272, 0.315, 0.   , 0.   , 0.225, 0.   ]),
 array([0.087, 0.302, 0.146, 0.   , 0.   , 0.   ]),
 array([0.116, 0.   , 0.   , 0.   , 0.   , 0.   ])]

### Train and evaluate a couple of quick models to test the `fit_transform` method

In [86]:
# Fresh pipeline for the quick model test, with a 1000-word vocabulary cap.
pipe = Pipeline(
    steps=[
        ('counter', Cmod.DocumentToWordCounterTransformer()),
        ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=1000)),  # default vocab
        ('tfidf', TfidfTransformer()),
    ]
)

In [87]:
# Fit all three steps on the sampled text and keep the resulting TF-IDF matrix.
X_train_transformed = pipe.fit_transform(X_array)

In [88]:
# Sparse-matrix summary: expect vocabulary_size + 1 columns (extra out-of-vocabulary column).
X_train_transformed

<1197x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 10881 stored elements in Compressed Sparse Row format>

In [89]:
# Labels as a flat 1-D NumPy array taken from the first column of y.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() returns
# the same values without the deprecation warning.
y_array = y.iloc[:, 0].to_numpy()

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a liblinear logistic regression on the TF-IDF features.
# NOTE(review): the features were fitted on the whole sample before CV, so the
# vocabulary and IDF have seen the validation folds; treat the score as a smoke test.
log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(
    estimator=log_clf,
    X=X_train_transformed,
    y=y_array,
    scoring='accuracy',
    cv=10,
    verbose=1,
)
print('Mean accuracy: ' + str(round(score.mean(), 4)))

Mean accuracy: 0.6984


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [91]:
from sklearn.naive_bayes import MultinomialNB

# Same 10-fold accuracy check, this time with a multinomial naive Bayes classifier.
NB_clf = MultinomialNB()
score = cross_val_score(
    estimator=NB_clf,
    X=X_train_transformed,
    y=y_array,
    scoring='accuracy',
    cv=10,
    verbose=1,
)
print('Mean accuracy: ' + str(round(score.mean(), 4)))

Mean accuracy: 0.7068


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


---