# Model Deployment 1

---

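This notebook prepares the model for deployment: it fits the pre-processing steps and transformers on all available data (train and test combined), trains the final classifier, and saves each fitted artifact to `data/5_deployment`.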


## Setup

In [1]:
import re
import os
import sys
import time
import joblib 

import numpy as np
import pandas as pd
import scipy.sparse as sp
import custom.clean_preprocess as cp

from datetime import datetime
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split 

In [2]:
# Create the deployment directory that every "SAVE" cell below writes into.
# os.makedirs(..., exist_ok=True) also creates a missing parent ("data"), is
# safe to re-run, and replaces the previous bare `except:` that treated *any*
# os.stat failure (not only "directory missing") as a reason to call os.mkdir.
dep_dir = os.path.join("data","5_deployment")
os.makedirs(dep_dir, exist_ok=True)

## Load raw data

In [3]:
def load_raw(data, raw_dir=os.path.join("data", "1_raw")):
    """Load the first column of ``<raw_dir>/<data>.csv`` as a 1-D NumPy array.

    Parameters
    ----------
    data : str
        File name without the ``.csv`` extension, e.g. ``"X_train"``.
    raw_dir : str, optional
        Directory that holds the raw CSV files. Defaults to ``data/1_raw``,
        the location that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        A copy of the values in the first column; any further columns in the
        CSV are ignored.
    """
    filename = data + ".csv"
    out_dfm = pd.read_csv(os.path.join(raw_dir, filename))
    # Series.ravel() is deprecated in recent pandas releases; to_numpy returns
    # the same 1-D values, and copy=True keeps the result independent of the
    # DataFrame, as np.array(...) did before.
    out_arr = out_dfm.iloc[:, 0].to_numpy(copy=True)
    return out_arr

def make_int(y_array):
    """Return an integer copy of ``y_array`` with 'ham' -> 0 and 'spam' -> 1.

    The input is copied first, so the caller's array is left untouched.
    Entries that are neither label are passed to ``astype('int')`` as they are.
    """
    encoded = y_array.copy()
    for label, code in (('ham', 0), ('spam', 1)):
        encoded[encoded == label] = code
    return encoded.astype('int')

# Read the four raw splits (features first, then labels).
X_train_raw, X_test_raw, y_train_raw, y_test_raw = (
    load_raw(split) for split in ("X_train", "X_test", "y_train", "y_test")
)

# Integer-encoded labels for each split.
y_train, y_test = make_int(y_train_raw), make_int(y_test_raw)

# Train and test are merged: the deployed model is fitted on all data.
X = np.hstack((X_train_raw, X_test_raw))        # messages
y_array = np.hstack((y_train_raw, y_test_raw))  # original string labels
y = np.hstack((y_train, y_test))                # integer labels

## Pre-process

In [4]:
# Pre-processing pipeline. It is used here only as a named container:
# each step is fitted individually below so it can be saved on its own.
preprocess_steps = [
    ('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
    ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
]
pipe = Pipeline(preprocess_steps)

# Step 1: counters for every message in the combined data set.
counter_step = pipe['counter']
X_train_counter = counter_step.fit_transform(X)
# Reference for new data (not executed) - use the pipeline counter:
#X_test_counter = pipe['counter'].fit_transform(new_data)

# Step 2: fit the vectoriser on those same counters.
bot_step = pipe['bot']
X_train_transformer = bot_step.fit(X_train_counter)

# SAVE 1 - fitted vectoriser
X_train_transformer_path = os.path.join(dep_dir, 'X_train_transformer.joblib')
joblib.dump(X_train_transformer, X_train_transformer_path)

['data\\5_deployment\\X_train_transformer.joblib']

In [5]:
# Vectorise the training counters with the fitted transformer.
X_train_bot = X_train_transformer.transform(X_train_counter)
# Reference for new data (not executed) - same transformer:
#X_test_bot = X_train_transformer.transform(X_test_counter)

# Fit TF-IDF on the training vectors; the fitted object holds the idf weights.
tfidf_step = pipe['tfidf']
X_train_fit = tfidf_step.fit(X_train_bot)

# SAVE 2 - fitted TF-IDF transformer
X_train_fit_path = os.path.join(dep_dir, 'X_train_fit.joblib')
joblib.dump(X_train_fit, X_train_fit_path)

['data\\5_deployment\\X_train_fit.joblib']

In [6]:
# Apply the fitted TF-IDF transformer to the training term vectors.
X_train_tfidf = X_train_fit.transform(X_train_bot)
# Reference for new data (not executed): reuse the same fitted transformer.
#X_test_tfidf = X_train_fit.transform(X_test_bot)

In [7]:
import custom.deploy_models as dp

# Instantiate the project-local SVD transformer (custom.deploy_models) with
# its default settings.
X_train_svd_transformer = dp.TruncatedSVD()

# Fit on the TRANSPOSED TF-IDF matrix. The fitted object is what gets saved.
# NOTE(review): presumably the transformer expects terms x documents - confirm
# against dp.TruncatedSVD.
X_train_svd_transformer.fit(X_train_tfidf.T) # save this

# SAVE 3 - fitted SVD transformer
X_train_svd_transformer_path = os.path.join(dep_dir, 'X_train_svd_transformer.joblib')
joblib.dump(X_train_svd_transformer, X_train_svd_transformer_path)

['data\\5_deployment\\X_train_svd_transformer.joblib']

In [8]:
# Pieces needed to project NEW data into the fitted SVD space. In this notebook
# sigma_inverse and U_transpose are used only by the commented-out test-set
# projection below; they are kept as a reference for the deployment code.
sigma_inverse = 1 / X_train_svd_transformer.sigma_
U_transpose = X_train_svd_transformer.U_.T

# Reference for new data (not executed):
#UT_TestTfidfT = (U_transpose @ X_test_tfidf.T)

# Training data in SVD space is read straight from the fitted transformer's V_.
X_train_svd = X_train_svd_transformer.V_
# Reference for new data (not executed): scale each projected row by 1/sigma.
#X_test_svd = (sigma_inverse.reshape(-1,1) * UT_TestTfidfT).T

In [9]:
# Pairwise cosine similarity between all training rows in SVD space.
train_similarities = cosine_similarity(X_train_svd)

# Row positions of the spam messages (labels come from the raw string targets).
train_df = pd.DataFrame({'sms':X, 'target':y_array}) 
train_spam_ix = train_df.loc[train_df['target']=='spam'].index

# Mean similarity of every row to the spam rows. The spam columns are selected
# once and averaged row-wise, replacing a Python loop that re-indexed the
# matrix for each row. The result is kept as a list, as before.
# NOTE(review): for rows that are themselves spam, the mean includes the row's
# own self-similarity; a new message scored at inference time is not part of
# the saved spam set - confirm this train/serve difference is acceptable.
spam_similarities = np.take(train_similarities, np.asarray(train_spam_ix), axis=1)
train_mean_spam_sims = list(spam_similarities.mean(axis=1))

# SAVE 4 - SVD vectors of the spam rows, needed to score new messages
X_train_svd_spam_path = os.path.join(dep_dir, 'X_train_svd_spam.joblib')
joblib.dump(X_train_svd[train_spam_ix], X_train_svd_spam_path)

['data\\5_deployment\\X_train_svd_spam.joblib']

In [10]:
# Final feature matrix: the mean-spam-similarity column first, then the SVD
# components.
mean_spam_sim_column = csr_matrix(train_mean_spam_sims).T
X_train_processed = sp.hstack([mean_spam_sim_column, X_train_svd])

In [11]:
# REFERENCE ONLY - nothing in this cell is executed.
# It sketches how new (test) data would get its mean-spam-similarity feature at
# inference time. It relies on X_test_svd, which is itself only defined in a
# commented-out line of an earlier cell.

# get test similarities using X_train_svd spam sims
#test_similarities = cosine_similarity(sp.vstack((X_test_svd, 
#                                                 X_train_svd[train_spam_ix]))) # will need this

# get spam cols for spam similarities
#spam_cols = range(X_test_svd.shape[0], test_similarities.shape[0])

# mean spam sims
#test_mean_spam_sims = []
#for ix in range(X_test_svd.shape[0]):
#    mean_spam_sim = np.mean(test_similarities[ix, spam_cols])
#    test_mean_spam_sims.append(mean_spam_sim)

# stack onto svd
#X_test_processed = sp.hstack((csr_matrix(test_mean_spam_sims).T, X_test_svd))

## Train model on all data

In [12]:
# Classifier configuration: only the seed, the evaluation metric and the
# label-encoder flag are set explicitly here.
xgb_params = dict(
    seed=42,
    eval_metric='error',
    use_label_encoder=False,
)
clf = XGBClassifier(**xgb_params)

In [13]:
# Fit the classifier on the full processed feature matrix (mean spam similarity
# + SVD components) with the integer labels of the combined train + test data.
clf.fit(X_train_processed, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='error',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, seed=42, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [14]:
# SAVE MODEL - persist the fitted classifier next to the saved transformers.
# Note: XGboost_mod1 holds the output *path*, not the model object.
XGboost_mod1 = os.path.join(dep_dir, 'XGboost_mod1.joblib')
joblib.dump(clf, XGboost_mod1)

['data\\5_deployment\\XGboost_mod1.joblib']

---

---