# Model Deployment 2

---

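Testing the prediction pipeline end to end on a single user-supplied message: load the fitted transformers and the trained model, pre-process the message, and predict its class.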


## Setup

In [1]:
import re
import os
import sys
import time
import joblib 

import numpy as np
import pandas as pd
import scipy.sparse as sp
import custom.clean_preprocess as cp

from datetime import datetime
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split 

In [2]:
# Create the deployment directory if it does not exist yet.
# os.makedirs(..., exist_ok=True) also creates the missing parent "data"
# directory and is a no-op when the directory is already there. Unlike the
# previous bare `except:`, real problems (e.g. permission errors, or the path
# existing as a regular file) are raised instead of being silently swallowed.
dep_dir = os.path.join("data", "5_deployment")
os.makedirs(dep_dir, exist_ok=True)

## User input 

In [3]:
# Raw message to classify.
user_input = "You've earned 10 FREE badges - redeem coupon and subscribe NOW!"

# Start the wall-clock timer; the last cell reports the time elapsed from here.
start_time = time.time()

# Wrap the message in a one-element array (a batch of one document).
new_data = np.asarray([user_input])

## Load transformers and model

In [4]:
def _load_artifact(path):
    """Open ``path`` in binary mode and return the object joblib deserializes from it."""
    # NOTE(security): joblib files are pickle-based, so loading one can execute
    # arbitrary code. Only load artifacts produced by a trusted training run.
    with open(path, 'rb') as f:
        return joblib.load(f)


# Fitted bag-of-terms vectorizer (used for the "BoT" step below).
X_train_transformer_PATH = os.path.join(dep_dir, "X_train_transformer.joblib")
X_train_transformer = _load_artifact(X_train_transformer_PATH)

# Fitted tf-idf transformer (used for the "Tfidf" step below).
X_train_fit_PATH = os.path.join(dep_dir, "X_train_fit.joblib")
X_train_fit = _load_artifact(X_train_fit_PATH)

# Fitted SVD object; its `sigma_` and `U_` attributes are read in the "SVD" step.
X_train_svd_transformer_PATH = os.path.join(dep_dir, "X_train_svd_transformer.joblib")
X_train_svd_transformer = _load_artifact(X_train_svd_transformer_PATH)

# SVD-space matrix the new document is compared against via cosine similarity.
X_train_svd_spam_PATH = os.path.join(dep_dir, "X_train_svd_spam.joblib")
X_train_svd_spam = _load_artifact(X_train_svd_spam_PATH)

# Trained classifier used for the final prediction.
XGboost_mod1_PATH = os.path.join(dep_dir, "XGboost_mod1.joblib")
XGboost_mod1 = _load_artifact(XGboost_mod1_PATH)

## Pre-process

In [5]:
# Pre-processing pipeline definition.
# NOTE(review): only the 'counter' step of `pipe` is used below; the 'bot' and
# 'tfidf' steps are never fitted or called here, because the fitted objects
# loaded from disk (X_train_transformer, X_train_fit) are applied instead.
pipe = Pipeline([('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
                 ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
                 ('tfidf', TfidfTransformer(sublinear_tf=True))])

# Step 1 - counter: turn the raw message into n-gram counts.
# NOTE(review): this calls fit_transform on the new data; presumably the
# counter transformer is stateless so fitting here is harmless - confirm.
X_test_counter = pipe['counter'].fit_transform(new_data) 

# Step 2 - BoT: map the counts to a vector using the transformer fitted on the
# training data, so the new document shares the training vocabulary.
X_test_bot = X_train_transformer.transform(X_test_counter)

# Step 3 - Tfidf: apply the tf-idf weights fitted on the training data.
X_test_tfidf = X_train_fit.transform(X_test_bot)

# Step 4 - SVD: project the tf-idf vector with the stored factors, computing
#   X_test_svd = (diag(1 / sigma_) @ U_.T @ X_test_tfidf.T).T
# Presumably this is the LSA "fold-in" of a new document into the training SVD
# space; assumes U_ has one row per tf-idf term and sigma_ is 1-D - TODO confirm.
# NOTE(review): a zero entry in sigma_ would produce inf here.
sigma_inverse = 1 / X_train_svd_transformer.sigma_
U_transpose = X_train_svd_transformer.U_.T
UT_TestTfidfT = (U_transpose @ X_test_tfidf.T)
# reshape(-1,1) turns sigma_inverse into a column so it scales each row (component).
X_test_svd = (sigma_inverse.reshape(-1,1) * UT_TestTfidfT).T

In [6]:
# Peek at the first 10 SVD components of the (single) input document.
X_test_svd[0][:10]

array([ 2.08543146e-02,  2.05733092e-02,  3.25309980e-05, -2.28182664e-02,
        1.50388200e-03,  1.32481380e-02, -8.94341236e-03, -4.42912305e-03,
       -9.71310881e-03,  1.88697220e-02])

In [7]:
# Shape of the loaded comparison matrix (presumably rows = training spam
# documents, columns = SVD components - confirm against the training notebook).
X_train_svd_spam.shape

(747, 800)

In [8]:
# Cosine Similarities
test_similarities = cosine_similarity(sp.vstack((X_test_svd, 
                                                 X_train_svd_spam)))

In [9]:
# Sanity check: square matrix with one row/column per stacked document
# (new document(s) followed by the X_train_svd_spam rows).
test_similarities.shape

(748, 748)

In [10]:
# Columns of the similarity matrix that belong to the X_train_svd_spam rows:
# everything after the leading block of new documents.
first_spam_ix = X_test_svd.shape[0]
n_stacked = test_similarities.shape[0]
spam_cols = range(first_spam_ix, n_stacked)
spam_cols

range(1, 748)

In [11]:
# For each new document, average its cosine similarity to every spam column.
test_mean_spam_sims = [
    np.mean(test_similarities[doc_ix, spam_cols])
    for doc_ix in range(X_test_svd.shape[0])
]

In [12]:
# Mean cosine similarity between the new doc and the rows of X_train_svd_spam.
test_mean_spam_sims

[0.0066333545396542485]

In [13]:
# stack
X_test_processed = sp.hstack((csr_matrix(test_mean_spam_sims).T, 
                              X_test_svd))

## Predict

In [14]:
# Predict the class of the new message with the loaded model.
# NOTE(review): presumably label 1 = spam - confirm against the training notebook.
XGboost_mod1.predict(X_test_processed)

array([1])

---

In [15]:
# Wall-clock time since start_time was set in the user-input cell; this
# includes loading the artifacts from disk as well as pre-processing and prediction.
print(f'Elapsed: {time.time() - start_time:0.3f} sec')

Elapsed: 6.584 sec


---