# Pipeline Test
You can waste a lot of time deploying to AWS SageMaker without first checking that your sklearn pipeline is set up properly, so we'll test the pipeline here.

In [1]:
import pandas as pd
import numpy as np

# Both splits share one schema: no header row, columns named 'target' (float)
# and 'text' (str). The train file is read first, then the test file.
train_df, test_df = (
    pd.read_csv(csv_path,
                header=None,
                names=['target', 'text'],
                dtype={'target': np.float64, 'text': str})
    for csv_path in ('srt_train.csv', 'srt_test.csv')
)

In [2]:
import re
import string

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises the ``text`` column of a frame.

    Each document is lowercased, has ASCII punctuation replaced by spaces, is
    reduced to whitespace, the letters ``a-z`` and the standalone numbers
    ``508`` and ``1973``, and is finally stripped of stopwords and of tokens
    shorter than two characters. The surviving tokens are re-joined with single
    spaces, so downstream steps can tokenise with a plain ``str.split``.
    """

    # Maps every ASCII punctuation character to a space. Built once for the
    # class instead of once per document.
    _PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def __init__(self):
        # Characters/tokens that survive cleaning: whitespace, lowercase ASCII
        # letters, and the whole-word numbers 508 and 1973.
        self.control_regex = re.compile(r'[\s]|[a-z]|\b508\b|\b1973\b')
        # A token must contain at least two word characters to be kept.
        self.token_pattern = re.compile(r'(?u)\b\w\w+\b')
        self.stopwords = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
                          'there', 'about', 'once', 'during', 'out', 'very', 'having',
                          'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its',
                          'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
                          'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the',
                          'themselves', 'until', 'below', 'are', 'we', 'these', 'your',
                          'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more',
                          'himself', 'this', 'down', 'should', 'our', 'their', 'while',
                          'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no',
                          'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been',
                          'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that',
                          'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now',
                          'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too',
                          'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom',
                          't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing',
                          'it', 'how', 'further', 'was', 'here', 'than'}

    def fit(self, X, y=None):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    def _preprocessing(self, doc):
        """Clean a single document and return it as a space-separated string.

        Parameters
        ----------
        doc : str
            Raw document text. Any non-string value (for example the float NaN
            pandas uses for a missing cell) is treated as an empty document
            instead of raising ``AttributeError``.

        Returns
        -------
        str
            The kept tokens joined by single spaces; ``""`` if none survive.
        """
        if not isinstance(doc, str):
            return ""

        # Collapse every run of whitespace to a single space, then lowercase.
        doc_lowered = " ".join(doc.split()).lower()
        # Punctuation becomes a space so that e.g. "a-b" splits into two tokens.
        doc_lowered = doc_lowered.translate(self._PUNCT_TO_SPACE)
        # Drop every character the control regex does not match, then tokenise.
        tokens = "".join(self.control_regex.findall(doc_lowered)).split()

        processed_text = []
        for token in tokens:
            if token in self.stopwords:
                continue
            m = self.token_pattern.search(token)
            if m:
                # \w never matches whitespace, so the match needs no stripping.
                processed_text.append(m.group())

        return " ".join(processed_text)

    def transform(self, X, y=None):
        """Apply the cleaning to every entry of ``X['text']``.

        Parameters
        ----------
        X : DataFrame-like
            Must support ``X['text']`` returning an object with ``.apply``
            (presumably a pandas DataFrame with a ``text`` column).
        y : ignored

        Returns
        -------
        The result of ``X['text'].apply(...)``: one cleaned string per entry.
        """
        return X['text'].apply(self._preprocessing)

In [3]:
# Text branch: clean -> TF-IDF over unigrams and bigrams -> 100-component SVD.
text_transformer = Pipeline(steps=[
        ('cleaner', TextPreprocessor()),
        # Whitespace tokenisation is passed as `tokenizer`, not as a callable
        # `analyzer`: with a callable analyzer scikit-learn ignores
        # `ngram_range`, so the requested bigrams were never generated.
        # `token_pattern=None` avoids the "token_pattern will not be used"
        # warning; `lowercase=False` because the cleaner already lowercases.
        ('vectorizer', TfidfVectorizer(tokenizer=str.split,
                                       token_pattern=None,
                                       lowercase=False,
                                       ngram_range=(1,2),
                                       sublinear_tf=True)),
        # The default solver is randomized; fix the seed so reruns reproduce.
        ('select', TruncatedSVD(n_components=100, n_iter=2, random_state=42))])

# Only the 'text' column is routed through the text branch.
preprocessor = ColumnTransformer(transformers=[('txt', text_transformer, ['text'])])

In [4]:
# Learn the preprocessing (vocabulary, IDF weights, SVD components) from the
# training split and project that same split in one call.
print("Fitting preprocessor on training data and transforming")
X_train = preprocessor.fit_transform(X=train_df)

Fitting preprocessor on training data...


In [5]:
# Reuse the preprocessor already fitted on the training data. Calling
# fit_transform here would refit the vocabulary, IDF weights and SVD basis on
# the test split, putting X_test in a different feature space from X_train and
# leaving `preprocessor` fitted on test data.
print("Transforming test data with the fitted preprocessor")
X_test = preprocessor.transform(test_df)

Fitting preprocessor on training data and transforming
