# Pre-process Test Set

---


### Setup

In [1]:
import os
import time
import json

import numpy as np
import pandas as pd

from datetime import datetime

# Stamp the notebook with the date of the current run.
dt_object = datetime.fromtimestamp(time.time())
# str(datetime) reads "YYYY-MM-DD HH:MM:SS[.ffffff]": drop the optional
# fractional seconds, then separate the calendar date from the clock time.
day, T = str(dt_object).partition('.')[0].split(' ')
print(f'Revised on: {day}')

Revised on: 2021-02-15


### Load Test Data

In [2]:
import urlextract
from nltk.stem import WordNetLemmatizer

def load_data(data, raw_dir=None):
    """Load the first column of ``<raw_dir>/<data>.csv`` as a 1-D NumPy array.

    Parameters
    ----------
    data : str
        File name without the ``.csv`` extension (e.g. ``"X_test"``).
    raw_dir : str or path-like, optional
        Directory that holds the raw CSV files. When omitted, the files are
        read from ``data/1_raw`` relative to the current working directory.

    Returns
    -------
    numpy.ndarray
        The values of the first CSV column. The array is an independent,
        writeable copy, so callers can modify it without touching pandas'
        internal buffers.
    """
    if raw_dir is None:
        raw_dir = os.path.join("data", "1_raw")
    csv_path = os.path.join(raw_dir, data + ".csv")
    out_dfm = pd.read_csv(csv_path)
    # Series.ravel() is deprecated in recent pandas (a Series is already 1-D);
    # to_numpy() is the supported conversion. np.array() then forces a copy.
    out_arr = np.array(out_dfm.iloc[:, 0].to_numpy())
    return out_arr

# Raw test messages and their string labels ("# change": these two names
# are what differs from the training version of this notebook).
X_test, y_test = (load_data(name) for name in ("X_test", "y_test")) # change

# Numeric target built on a copy so y_test keeps the original strings:
# 'ham' -> 0, 'spam' -> 1, then the whole array is cast to int.
y = y_test.copy() # change
y[y == 'ham'] = 0
y[y == 'spam'] = 1
y = y.astype('int')

# Contractions lookup table consumed by the custom clean-up code.
with open("contractions_map.json") as f:
    contractions_map = json.loads(f.read())

### BoW and Tfidf

INTENTIONAL ERROR FOR COMPARISON?

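- the test data differs from the training data, and every step below is re-fit on it
- the IDF weights are therefore different from the ones learned on the training data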

In [3]:
import custom.clean_preprocess as cp
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Three-step text pipeline (per the step names): n-gram counter ->
# fixed-size count vectors (vocabulary_size=2000) -> tf-idf with sublinear tf.
pipe = Pipeline(steps=[
    ('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
    ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
])

# Each step is fitted on the TEST messages themselves, one step at a time,
# so the intermediate matrices stay available for inspection.
X_counter = pipe.named_steps['counter'].fit_transform(X_test) # change
X_bot = pipe.named_steps['bot'].fit_transform(X_counter)

# Kept on purpose as the "wrong" variant: the tf-idf weights are learned
# from the test set instead of being reused from training.
X_tfidf_wrong = pipe.named_steps['tfidf'].fit_transform(X_bot) # WRONGEDY WRONG WRONG

In [4]:
# Shape of the vectorised test counts as (rows, columns)
X_bot.shape

(1672, 2001)

In [5]:
# Quick look at the values; todense() materialises every entry of the matrix
X_bot.todense()

matrix([[26,  0,  1, ...,  0,  0,  0],
        [24,  0,  1, ...,  0,  0,  0],
        [15,  0,  0, ...,  0,  0,  0],
        ...,
        [10,  0,  0, ...,  0,  0,  0],
        [57,  3,  1, ...,  0,  0,  0],
        [27,  1,  1, ...,  0,  0,  0]], dtype=int32)

At minute 8 of [Pre-processing our test data](https://www.youtube.com/watch?v=XWUi7RivDJY&list=PLTJTBoU5HOCR5Vkah2Z-AU76ZYsZjGFK6&index=11), the problem is explained:

- The test data contains new n-grams... this is bad, we need the same n-grams from the training data (same features)
- Each column has to be in the same order and have the same meaning (i.e. represent the same "word") as in the training data. In R:


```
# Ensure the test dfm has the same n-grams as the training dfm.
#
# NOTE - In production we should expect that new text messages will 
#        contain n-grams that did not exist in the original training
#        data. As such, we need to strip those n-grams out.
#
test.tokens.dfm <- dfm_select(test.tokens.dfm, pattern = train.tokens.dfm,
                              selection = "keep")
test.tokens.matrix <- as.matrix(test.tokens.dfm)
test.tokens.dfm
```

- Although I selected 2000 n-grams above, they are a different set of 2000 n-grams from those in the training data. I should probably choose a much larger vocabulary to accommodate the fact that the training and test vocabularies diverge, in an attempt to retain as much of the training vocabulary as possible in the test data.

### SVD


In [7]:
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip

def perform_SVD(X, n_components=300):
    """Truncated SVD of ``X.T``; return its right singular vectors ``V``.

    Parameters
    ----------
    X : scipy sparse matrix/array or numpy.ndarray, shape (n_rows, n_cols)
        Input matrix. Non-floating dtypes are upcast before factorisation.
    n_components : int, default 300
        Number of singular triplets requested from ``svds`` (its ``k``).

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_components)
        ``V`` from ``X.T ~= U @ diag(Sigma) @ V.T``, one row per row of ``X``,
        columns ordered by decreasing singular value. The columns are NOT
        scaled by the singular values.
    """
    # svds needs floating-point input. asfptype() only exists on the legacy
    # spmatrix classes, so use astype(), which sparse matrices, sparse arrays
    # and ndarrays all provide. result_type(..., float32) reproduces the
    # "smallest float that holds the dtype" upcast and leaves floats as-is.
    X_array = X.astype(np.result_type(X.dtype, np.float32), copy=False)
    U, Sigma, VT = svds(X_array.T, k=n_components)

    # Order the triplets by decreasing singular value explicitly instead of
    # assuming svds returned them in ascending order.
    order = np.argsort(Sigma)[::-1]
    # svd_flip fixes the sign ambiguity of the singular vectors.
    U, VT = svd_flip(U[:, order], VT[order])

    # return V
    return VT.T

# Project the (deliberately mis-fitted) tf-idf matrix onto 800 SVD components
X_tfidf_svd = perform_SVD(X=X_tfidf_wrong, n_components=800) # YUP. WRONG.

## Cosine Similarities

In [8]:
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all messages in the SVD-reduced space
X_tfidf_svd_allcos = cosine_similarity(X_tfidf_svd)

test_df = pd.DataFrame({'sms': X_test, 'target': y_test}) # change

# get spam indexes
spam_ix = test_df.loc[test_df['target'] == 'spam'].index # change

# Average similarity of every message to the spam messages, computed with a
# single vectorised reduction over the spam columns instead of a Python loop.
# NOTE(review): spam_ix comes from the test labels, so this feature uses the
# target of the very rows it describes - confirm this is intended.
mean_spam_sims = X_tfidf_svd_allcos[:, spam_ix].mean(axis=1)

# Prepend the mean-spam-similarity as the first column of the SVD features
X_tfidf_svd_spamcos = sp.hstack((csr_matrix(mean_spam_sims).T, X_tfidf_svd))

In [9]:
# One column more than the SVD features: the prepended spam-similarity column
X_tfidf_svd_spamcos.shape

(1672, 801)

In [10]:
# Sparse repr: shows the storage format and the number of stored elements
X_tfidf_svd_spamcos

<1672x801 sparse matrix of type '<class 'numpy.float64'>'
	with 1339272 stored elements in COOrdinate format>

In [11]:
# Dense view of the final feature matrix (materialises every entry)
X_tfidf_svd_spamcos.todense()

matrix([[-5.80042398e-04,  2.23569820e-02, -8.40956775e-03, ...,
          2.01629513e-02,  1.30223037e-02,  1.68701983e-02],
        [-2.14483682e-04,  2.38006692e-02, -1.45188925e-02, ...,
         -2.65676338e-02,  2.42388434e-02,  5.53829668e-03],
        [-7.11830359e-04,  5.21375410e-02, -1.36876042e-02, ...,
          4.96163833e-02,  1.41231183e-04, -2.17165222e-03],
        ...,
        [-2.28196181e-04,  3.00118353e-02, -7.88647507e-03, ...,
          1.74274889e-02, -1.95212903e-02, -1.43835931e-03],
        [ 8.01057158e-03,  2.27854786e-02,  2.94052142e-02, ...,
          2.39558645e-02,  3.57047719e-02, -4.26120041e-03],
        [ 4.97053695e-04,  2.13937945e-02,  9.51652762e-05, ...,
          9.07231013e-03, -1.31506711e-02,  4.29393375e-02]])