# Pre-process Test Set

---


### Setup

In [1]:
import os
import time
import json

import numpy as np
import pandas as pd

from datetime import datetime

# Stamp the notebook with the local date of the current run.
dt_object = datetime.fromtimestamp(time.time())
# The text form is "YYYY-MM-DD HH:MM:SS[.ffffff]": drop the fractional
# seconds, then separate the date part from the time part.
day, T = dt_object.isoformat(sep=' ').partition('.')[0].split(' ')
print('Revised on: ' + day)

Revised on: 2021-02-17


### Load Data

In [25]:
import urlextract
from nltk.stem import WordNetLemmatizer

def load_data(data, raw_path=None):
    """Load the first column of ``<raw_path>/<data>.csv`` as a NumPy array.

    Parameters
    ----------
    data : str
        File name without the ``.csv`` extension (e.g. ``"X_train"``).
    raw_path : str, optional
        Directory that holds the CSV file. Defaults to ``data/1_raw``
        relative to the current working directory.

    Returns
    -------
    numpy.ndarray
        One-dimensional, writable array with the values of the first column.
        Any other columns in the file are ignored.
    """
    if raw_path is None:
        raw_path = os.path.join("data", "1_raw")
    out_dfm = pd.read_csv(os.path.join(raw_path, data + ".csv"))
    # Series.ravel() is deprecated in pandas >= 2.2; to_numpy(copy=True)
    # returns an independent array that callers may modify safely.
    out_arr = out_dfm.iloc[:, 0].to_numpy(copy=True)
    return out_arr

# Read the four raw splits (features and labels for train and test), in that order.
X_train, y_train, X_test, y_test = (
    load_data(split_name)
    for split_name in ("X_train", "y_train", "X_test", "y_test")
)

In [27]:
def make_int(y_array):
    """Encode string labels as integers: 'ham' -> 0, 'spam' -> 1.

    Works on a copy, so the array passed in is left untouched. The final
    integer cast raises if any entry cannot be converted to an int.
    """
    labels = y_array.copy()
    for text_label, code in (('ham', 0), ('spam', 1)):
        labels[labels == text_label] = code
    return labels.astype('int')

# Integer-encoded copies of the labels; the string arrays stay available.
y_test_int, y_train_int = make_int(y_test), make_int(y_train)

# Load the contractions map used for custom cleanup.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

In [31]:
# Spot-check the encoding: original string labels next to their integer codes.
y_train, y_train_int

(array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object),
 array([0, 1, 0, ..., 0, 0, 0]))

### BoW and Tfidf

**Intentional error, kept for comparison?** Two things go wrong when the test set is vectorized on its own:

- test data is different
- IDF is different

In [53]:
import custom.clean_preprocess as cp
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Three named steps; the cells below pull them out by name and run them one
# at a time instead of calling pipe.fit() on the whole pipeline.
pipe = Pipeline(steps=[
    ('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
    # careful here (author's warning on the vocabulary step)
    ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
    # very careful here (author's warning on the TF-IDF step)
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
])

In [61]:
# Run the 'counter' step on the training messages, then fit the 'bot' step on
# those training counts so its vocabulary comes from training data only.
X_train_counter = pipe.named_steps['counter'].fit_transform(X_train)
X_train_transformer = pipe.named_steps['bot'].fit(X_train_counter)

In [76]:
# First ten entries of the fitted vocabulary, in iteration order.
list(X_train_transformer.vocabulary_)[:10]

['NUM', 'i', 'you', 'u', 'me', 'not', 'my', 'your', 'am', 'have']

In [67]:
# Vectorize the training counts with the transformer fitted on them above.
X_train_bot = X_train_transformer.transform(X_train_counter)

In [69]:
# Only *transform* the test messages: the 'counter' step was already fitted on
# the training set, and no pipeline step should ever be re-fitted on test data.
X_test_counter = pipe['counter'].transform(X_test)

In [71]:
# Reuse the transformer fitted on the training counts so the test matrix has
# the same vocabulary (same columns, same order) as the training matrix.
X_test_bot = X_train_transformer.transform(X_test_counter) # use train transformer for same vocabulary!

In [72]:
# Inspect the test matrix: the repr reports its shape and number of stored elements.
X_test_bot

<1672x2001 sparse matrix of type '<class 'numpy.intc'>'
	with 24657 stored elements in Compressed Sparse Row format>

In [74]:
# Peek at the top-left 10x10 corner of the training matrix as a dense array.
X_train_bot[:10,:10].toarray()

array([[16,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [60,  7,  0,  0,  1,  1,  1,  0,  0,  0],
       [24,  0,  0,  1,  0,  0,  0,  0,  1,  0],
       [13,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [21,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [23,  0,  0,  0,  0,  0,  0,  1,  1,  0],
       [11,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  0,  1,  0,  0,  0,  1,  0,  0,  0],
       [12,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  0,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

In [75]:
# Same 10x10 corner for the test matrix, for side-by-side comparison with train.
X_test_bot[:10,:10].toarray()

array([[30,  0,  1,  2,  0,  0,  0,  1,  0,  0],
       [26,  0,  1,  1,  0,  0,  0,  0,  0,  0],
       [15,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [83,  0,  1,  3,  0,  2,  2,  2,  2,  0],
       [ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [11,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [14,  0,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 8,  0,  0,  1,  0,  0,  0,  0,  0,  0],
       [31,  0,  1,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

In [101]:
# Show both matrices together; the column counts should match.
X_train_bot, X_test_bot

(<3900x2001 sparse matrix of type '<class 'numpy.intc'>'
 	with 59102 stored elements in Compressed Sparse Row format>,
 <1672x2001 sparse matrix of type '<class 'numpy.intc'>'
 	with 24657 stored elements in Compressed Sparse Row format>)

In [3]:
# WRONG ON PURPOSE (kept for comparison): TF-IDF is *fit* on the test matrix,
# so the IDF weights come from the test set instead of the training set.
# The original cell referenced `X_bot`, which is never defined in this
# notebook (NameError on a fresh kernel). The cells further down pair the rows
# of this matrix with X_test / y_test, so the test bag-of-words is the input.
X_tfidf_wrong = pipe['tfidf'].fit_transform(X_test_bot)

At minute 8 of [Pre-processing our test data](https://www.youtube.com/watch?v=XWUi7RivDJY&list=PLTJTBoU5HOCR5Vkah2Z-AU76ZYsZjGFK6&index=11), the problem is laid out:

- The test data contains new n-grams... this is bad, we need the same n-grams from the training data (same features)
- Each column has to be in the same order and carry the same meaning (represent the same "word"). In R this is done with:


```
# Ensure the test dfm has the same n-grams as the training dfm.
#
# NOTE - In production we should expect that new text messages will 
#        contain n-grams that did not exist in the original training
#        data. As such, we need to strip those n-grams out.
#
test.tokens.dfm <- dfm_select(test.tokens.dfm, pattern = train.tokens.dfm,
                              selection = "keep")
test.tokens.matrix <- as.matrix(test.tokens.dfm)
test.tokens.dfm
```

- While I did select 2000 n-grams above, they are a different set of 2000 n-grams from those in the training data. I should, potentially, choose a much larger vocabulary size to accommodate the fact that the training and test data have diverging vocabularies, in an attempt to retain as much of the training vocabulary as I can in the test data.

### SVD


In [7]:
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip

def perform_SVD(X, n_components=300):
    """Truncated SVD of ``X.T``; returns its right singular vectors.

    Parameters
    ----------
    X : scipy sparse matrix or numpy.ndarray, shape (n_rows, n_cols)
        Matrix to decompose. Non-floating input is upcast to float64.
    n_components : int, default 300
        Number of singular triplets requested from ``svds`` (passed as ``k``).

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_components)
        ``V`` from ``X.T ~= U @ diag(S) @ V.T``: one row per row of ``X``,
        with the column order reversed relative to what ``svds`` returns.
    """
    # svds needs floating-point data. The sparse-only `asfptype()` helper has
    # been removed from recent SciPy releases (and never existed on dense
    # arrays), so the upcast is done explicitly.
    if np.issubdtype(X.dtype, np.inexact):
        X_array = X
    else:
        X_array = X.astype(np.float64)

    # The singular values themselves are not needed here.
    U, _, VT = svds(X_array.T, k=n_components)

    # Reverse the component order returned by svds, then let svd_flip
    # normalize the signs of the singular vectors.
    U, VT = svd_flip(U[:, ::-1], VT[::-1])

    # return V
    return VT.T

# SVD with 800 components
# Reduce the (deliberately wrong) TF-IDF matrix to 800 SVD components.
X_tfidf_svd = perform_SVD(X_tfidf_wrong, n_components=800)

## Cosine Similarities

In [8]:
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows of the SVD matrix.
X_tfidf_svd_allcos = cosine_similarity(X_tfidf_svd)

# Messages alongside their labels (line marked "change" by the author).
test_df = pd.DataFrame({'sms': X_test, 'target': y_test})

# Index labels of the spam rows (line marked "change" by the author).
spam_ix = test_df[test_df['target'] == 'spam'].index

# For every row, the mean of its similarities to the spam rows.
mean_spam_sims = [
    np.mean(X_tfidf_svd_allcos[row, spam_ix])
    for row in range(X_tfidf_svd_allcos.shape[0])
]

# Put the mean spam similarity in front of the SVD features as a first column.
X_tfidf_svd_spamcos = sp.hstack((csr_matrix(mean_spam_sims).T, X_tfidf_svd))