# Pre-process Test Set

---


### Setup

In [1]:
import os
import time
import json

import numpy as np
import pandas as pd

from datetime import datetime

# Stamp the notebook with the date of the current run.
dt_object = datetime.fromtimestamp(time.time())
# Space-separated ISO text truncated to whole seconds: "<date> <time>".
day, T = dt_object.isoformat(sep=' ', timespec='seconds').split(' ')
print(f'Revised on: {day}')

Revised on: 2021-02-17


### Load Data

In [2]:
import urlextract
from nltk.stem import WordNetLemmatizer

def load_data(data, raw_path=None):
    """Read the first column of ``<raw_path>/<data>.csv`` into a 1-D NumPy array.

    Parameters
    ----------
    data : str
        Base name of the CSV file, without the ``.csv`` extension.
    raw_path : str or path-like, optional
        Directory that holds the CSV file. When omitted, ``data/1_raw``
        (relative to the current working directory) is used.

    Returns
    -------
    numpy.ndarray
        A copy of the values found in the first column of the file.
    """
    if raw_path is None:
        raw_path = os.path.join("data", "1_raw")
    out_dfm = pd.read_csv(os.path.join(raw_path, data + ".csv"))
    # Series.ravel() is deprecated in recent pandas releases; to_numpy() is the
    # supported way to get the underlying values. copy=True keeps the returned
    # array independent of the DataFrame, as np.array(...) did before.
    return out_dfm.iloc[:, 0].to_numpy(copy=True)

# Read the four raw splits, in the same order as before, in one unpacking.
X_train, y_train, X_test, y_test = (
    load_data(split_name)
    for split_name in ("X_train", "y_train", "X_test", "y_test")
)

In [3]:
def make_int(y_array):
    """Return an integer copy of a label array: 'ham' becomes 0, 'spam' becomes 1.

    The input array is left untouched; the replacement is done on a copy,
    which is then cast with ``astype('int')``.
    """
    encoded = y_array.copy()
    # Apply the replacements one after another, 'ham' first and 'spam' second.
    for label, code in (('ham', 0), ('spam', 1)):
        encoded[encoded == label] = code
    return encoded.astype('int')

# Encode the string labels of both splits as integers (ham -> 0, spam -> 1).
y_test_int = make_int(y_test)
y_train_int = make_int(y_train)

# Load the contractions map for custom cleanup. The encoding is given
# explicitly so the JSON file is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

### BoW and Tfidf


In [4]:
import custom.clean_preprocess as cp
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Three named steps; the inline notes mark which pieces are meant to be
# learned from the training data.
pipe = Pipeline(
    steps=[
        ('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
        ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),  # train vocab
        ('tfidf', TfidfTransformer(sublinear_tf=True)),  # train IDF
    ]
)

In [5]:
# Run the counter step on the training messages, then fit the 'bot' step on
# those counts. X_train_transformer is whatever that fit() call returns
# (presumably the fitted step itself - confirm in custom.clean_preprocess).
X_train_counter = pipe.named_steps['counter'].fit_transform(X_train)
X_train_transformer = pipe.named_steps['bot'].fit(X_train_counter)

In [6]:
# Keep the first 11 entries yielded when iterating over the fitted vocabulary.
first_11_vocab = list(X_train_transformer.vocabulary_)[:11]
print(first_11_vocab)

['NUM', 'i', 'you', 'u', 'me', 'not', 'my', 'your', 'am', 'have', 'call']


In [7]:
# Train bag of trigrams: vectorize the training n-gram counts with the
# transformer that was fitted on those same counts.
X_train_bot = X_train_transformer.transform(X_train_counter)

In [8]:
# Test counter: reuse the counter step already fitted on the training data.
# Only transform() is called here so that nothing is re-fitted on the test set.
X_test_counter = pipe['counter'].transform(X_test)

In [9]:
# Test bag of trigrams: vectorize the test counts with the transformer fitted
# on the training counts, so the test matrix keeps the training vocabulary.
X_test_bot = X_train_transformer.transform(X_test_counter)

In [10]:
# Display both bag-of-trigram matrices side by side to compare their shapes.
X_train_bot, X_test_bot

(<3900x2001 sparse matrix of type '<class 'numpy.intc'>'
 	with 59102 stored elements in Compressed Sparse Row format>,
 <1672x2001 sparse matrix of type '<class 'numpy.intc'>'
 	with 24657 stored elements in Compressed Sparse Row format>)

In [11]:
# Train BoT preview: first 6 rows and first 12 columns as a dense table.
# Column 0 is labelled 'unknown'; the rest use the first 11 vocabulary entries.
pd.DataFrame(
    data=X_train_bot[:6, :12].toarray(),
    columns=['unknown', *first_11_vocab],
)

Unnamed: 0,unknown,NUM,i,you,u,me,not,my,your,am,have,call
0,16,0,0,0,0,0,0,0,0,0,0,0
1,60,7,0,0,1,1,1,0,0,0,0,1
2,24,0,0,1,0,0,0,0,1,0,0,0
3,13,0,0,0,0,0,0,0,0,0,0,0
4,21,1,0,0,0,0,0,0,0,0,0,0
5,23,0,0,0,0,0,0,1,1,0,0,0


In [12]:
# Raw text of training message 1, to compare against its row in the table above.
print(X_train[1])

Not heard from U4 a while. Call 4 rude chat private line 01223585334 to cum. Wan 2C pics of me gettin shagged then text PIX to 8552. 2End send STOP 8552 SAM xxx


In [13]:
# N-gram counts that the counter step produced for training message 1.
print(X_train_counter[1])

Counter({'NUM': 7, 'not': 1, 'heard': 1, 'u': 1, 'while': 1, 'call': 1, 'rude': 1, 'chat': 1, 'private': 1, 'line': 1, 'cum': 1, 'wan': 1, 'c': 1, 'pic': 1, 'me': 1, 'gettin': 1, 'shagged': 1, 'then': 1, 'text': 1, 'pix': 1, 'end': 1, 'send': 1, 'stop': 1, 'sam': 1, 'xxx': 1, 'not_heard': 1, 'heard_from': 1, 'from_u': 1, 'u_NUM': 1, 'NUM_a': 1, 'a_while': 1, 'while_call': 1, 'call_NUM': 1, 'NUM_rude': 1, 'rude_chat': 1, 'chat_private': 1, 'private_line': 1, 'line_NUM': 1, 'NUM_to': 1, 'to_cum': 1, 'cum_wan': 1, 'wan_NUM': 1, 'NUM_c': 1, 'c_pics': 1, 'pics_of': 1, 'of_me': 1, 'me_gettin': 1, 'gettin_shagged': 1, 'shagged_then': 1, 'then_text': 1, 'text_pix': 1, 'pix_to': 1, 'to_NUM': 1, 'NUM_NUM': 1, 'NUM_end': 1, 'end_send': 1, 'send_stop': 1, 'stop_NUM': 1, 'NUM_sam': 1, 'sam_xxx': 1, 'not_heard_from': 1, 'heard_from_u': 1, 'from_u_NUM': 1, 'u_NUM_a': 1, 'NUM_a_while': 1, 'a_while_call': 1, 'while_call_NUM': 1, 'call_NUM_rude': 1, 'NUM_rude_chat': 1, 'rude_chat_private': 1, 'chat_priv

In [14]:
# Test BoT preview: first 6 rows and first 12 columns as a dense table.
# Column 0 is labelled 'unknown'; the rest use the first 11 vocabulary entries.
pd.DataFrame(
    data=X_test_bot[:6, :12].toarray(),
    columns=['unknown', *first_11_vocab],
)

Unnamed: 0,unknown,NUM,i,you,u,me,not,my,your,am,have,call
0,30,0,1,2,0,0,0,1,0,0,0,0
1,26,0,1,1,0,0,0,0,0,0,0,0
2,15,0,0,0,0,0,0,0,0,0,0,0
3,83,0,1,3,0,2,2,2,2,0,1,0
4,6,0,0,0,0,0,0,0,0,0,0,0
5,8,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# Raw text of test message 3, to compare against its row in the table above.
print(X_test[3])

Any chance you might have had with me evaporated as soon as you violated my privacy by stealing my phone number from your employer's paperwork. Not cool at all. Please do not contact me again or I will report you to your supervisor.


In [16]:
# N-gram counts that the counter step produced for test message 3.
print(X_test_counter[3])

Counter({'you': 3, 'me': 2, 'my': 2, 'your': 2, 'not': 2, 'any': 1, 'chance': 1, 'might': 1, 'have': 1, 'had': 1, 'evaporated': 1, 'soon': 1, 'violated': 1, 'privacy': 1, 'stealing': 1, 'phone': 1, 'number': 1, 'employer': 1, 'paperwork': 1, 'cool': 1, 'all': 1, 'please': 1, 'do': 1, 'contact': 1, 'again': 1, 'or': 1, 'i': 1, 'report': 1, 'supervisor': 1, 'any_chance': 1, 'chance_you': 1, 'you_might': 1, 'might_have': 1, 'have_had': 1, 'had_with': 1, 'with_me': 1, 'me_evaporated': 1, 'evaporated_as': 1, 'as_soon': 1, 'soon_as': 1, 'as_you': 1, 'you_violated': 1, 'violated_my': 1, 'my_privacy': 1, 'privacy_by': 1, 'by_stealing': 1, 'stealing_my': 1, 'my_phone': 1, 'phone_number': 1, 'number_from': 1, 'from_your': 1, 'your_employers': 1, 'employers_paperwork': 1, 'paperwork_not': 1, 'not_cool': 1, 'cool_at': 1, 'at_all': 1, 'all_please': 1, 'please_do': 1, 'do_not': 1, 'not_contact': 1, 'contact_me': 1, 'me_again': 1, 'again_or': 1, 'or_i': 1, 'i_will': 1, 'will_report': 1, 'report_you':

### OLD

In [3]:
# The original line called fit_transform on `X_bot`, a name that is never
# defined in this notebook, so the cell failed on a fresh kernel.
# Learn the IDF weights from the training bag of trigrams only, then apply
# them to the test bag (which already shares the training vocabulary).
X_train_tfidf = pipe['tfidf'].fit_transform(X_train_bot)
X_test_tfidf = pipe['tfidf'].transform(X_test_bot)

# Legacy name kept because the SVD cell further down still reads it.
X_tfidf_wrong = X_test_tfidf

The problem is explained at minute 8 of [Pre-processing our test data](https://www.youtube.com/watch?v=XWUi7RivDJY&list=PLTJTBoU5HOCR5Vkah2Z-AU76ZYsZjGFK6&index=11):

- The test data contains new n-grams... this is bad, we need the same n-grams from the training data (same features)
- Each column has to be in the same order and have the same meaning (i.e., represent the same "word") as in the training data. In R, this is done as follows:


```
# Ensure the test dfm has the same n-grams as the training dfm.
#
# NOTE - In production we should expect that new text messages will 
#        contain n-grams that did not exist in the original training
#        data. As such, we need to strip those n-grams out.
#
test.tokens.dfm <- dfm_select(test.tokens.dfm, pattern = train.tokens.dfm,
                              selection = "keep")
test.tokens.matrix <- as.matrix(test.tokens.dfm)
test.tokens.dfm
```

- While I did select 2000 n-grams above, they are a different set of 2000 n-grams than those in the training data. I should, potentially, choose a much larger vocabulary to account for the fact that the training and test data have diverging vocabularies, in an attempt to retain as much of the training vocabulary as I can in the test data.

### SVD


In [7]:
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip

def perform_SVD(X, n_components=300):
    """Run a truncated SVD on ``X.T`` and return the transposed ``VT`` factor.

    Parameters
    ----------
    X : sparse matrix
        Input matrix; it must provide ``asfptype()`` and ``.T``.
    n_components : int, default 300
        Number of singular triplets requested from ``svds`` (passed as ``k``).

    Returns
    -------
    The transpose of the ``VT`` factor that ``svds`` returns for ``X.T``,
    after reversing the component order and passing it through ``svd_flip``.
    """
    X_float = X.asfptype()
    left_vecs, _singular_values, right_vecs_t = svds(X_float.T, k=n_components)

    # Reverse the component order of both factors, then let svd_flip settle
    # the signs of the paired vectors.
    left_vecs, right_vecs_t = svd_flip(left_vecs[:, ::-1], right_vecs_t[::-1])

    return right_vecs_t.T

# SVD with 800 components.
# NOTE(review): the input is the matrix the author flags as wrong in the OLD
# section, so this result inherits that problem.
X_tfidf_svd = perform_SVD(X_tfidf_wrong, # YUP. WRONG.
                          n_components=800) 

## Cosine Similarities

In [8]:
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the SVD representation.
X_tfidf_svd_allcos = cosine_similarity(X_tfidf_svd)

test_df = pd.DataFrame({'sms': X_test, 'target': y_test}) # change

# Row labels of the messages whose target is 'spam'.
spam_ix = test_df[test_df['target'] == 'spam'].index # change

# For every row, average its similarity to the spam rows.
mean_spam_sims = [np.mean(sim_row[spam_ix]) for sim_row in X_tfidf_svd_allcos]

# Prepend the mean spam similarity as an extra leading column.
X_tfidf_svd_spamcos = sp.hstack((csr_matrix(mean_spam_sims).T, X_tfidf_svd))