# Create Custom Transformer

In [6]:
import nltk
# Fetch the NLTK resources used later in this notebook: 'punkt' backs
# sent_tokenize/word_tokenize, 'wordnet' backs WordNetLemmatizer and
# 'averaged_perceptron_tagger' backs pos_tag.
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator,TransformerMixin

In [8]:
# Pattern used to detect http/https URLs in message text.
# Written as a raw string so that the `\(` and `\)` escapes are passed to the
# regex engine verbatim instead of being flagged by Python as invalid string
# escape sequences; the resulting pattern text is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

### Implement the StartingVerbExtractor class

In [9]:

class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer producing one boolean feature per document.

    The feature is True when any sentence of the document starts with a verb
    (POS tag 'VB' or 'VBP') or with the retweet marker 'RT'.
    Relies on the notebook-level ``tokenize`` function being defined.
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` starts with a verb or 'RT'.

        Args:
            text: a single document (string).

        Returns:
            bool: True as soon as one sentence qualifies, otherwise False
            (including when ``text`` contains no sentences or no tokens).
        """
        # tokenize by sentences
        for sentence in sent_tokenize(text):
            # tokenize each sentence into words and tag part of speech
            pos_tags = pos_tag(tokenize(sentence))

            # a sentence may yield no tokens at all; nothing to inspect then
            if not pos_tags:
                continue

            # index pos_tags to get the first word and part of speech tag
            first_word, first_tag = pos_tags[0]

            # tokenize() lowercases its tokens, so the retweet marker has to
            # be compared case-insensitively to ever match
            if first_tag in ('VB', 'VBP') or first_word.upper() == 'RT':
                return True

        # only give up after every sentence has been checked
        return False

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every value in ``X``.

        Returns:
            pandas.DataFrame with a single column of booleans, one row per
            input document.
        """
        # apply starting_verb function to all values in X
        X_tagged = pd.Series(X).apply(self.starting_verb)

        return pd.DataFrame(X_tagged)


### First execute the cell further below that defines `load_data` and `tokenize`, then the cell below

In [60]:
# NOTE: load_data() is defined in a later cell of this notebook, so that cell
# must already have been executed before this one runs.
X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y)

# fit() on this transformer only returns self, so transform() is called directly.
starting_verb = StartingVerbExtractor()
starting_verb.transform(X_train)

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
1797,False
1798,False
1799,False
1800,False


I'll create **a custom transformer** that simply converts all text to lowercase. I am not setting anything in the `__init__` method, so I could actually remove it. I can leave the `fit` method as is and focus on the `transform` method. I can lowercase all the values in X by applying a lambda function that calls `lower` on each value. I have to wrap X in a pandas Series to be able to use the `apply` function.

In [10]:
class CaseNormalizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that lowercases every entry of its input."""

    def __init__(self):
        # No hyperparameters to configure.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return an array holding the lowercase form of each element of X."""
        def to_lower(value):
            return value.lower()

        lowered = pd.Series(X).apply(to_lower)
        return lowered.values
    

In [11]:
# Instance reused by the demonstration cell that follows.
case_normalizer = CaseNormalizer()

In [12]:
# Quick check on a single mixed-case sentence: the result should be all lowercase.
X = np.array(['Implementing a Custom Transformer from SCIKIT-LEARN'])
case_normalizer.transform(X)

array(['implementing a custom transformer from scikit-learn'],
      dtype=object)

Another way to **create custom transformers** is to use the **FunctionTransformer** from scikit-learn's preprocessing module. It lets you wrap an existing function so that it becomes a transformer. This provides less flexibility, but is much simpler. You can learn more at these links: [here](http://scikit-learn.org/stable/modules/preprocessing.html#custom-transformers) and [here](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html#sklearn.preprocessing.FunctionTransformer).


### Run program to test


### Build the pipeline to have this structure:
- Pipeline
    - feature union
        - text pipeline
            - lower case transformer
            - count vectorizer
            - TFIDF transformer
        - starting verb extractor
    - classifier

In [16]:
def load_data(filepath='../dataset/corporate_messaging.csv'):
    """Load the corporate messaging data set and return texts and labels.

    Only rows whose 'category:confidence' equals 1 and whose 'category' is
    not 'Exclude' are kept.

    Args:
        filepath: location of the CSV file (read with latin-1 encoding).
            Defaults to the path used throughout this notebook.

    Returns:
        tuple (X, y): array of the 'text' column values and array of the
        matching 'category' column values.
    """
    df = pd.read_csv(filepath, encoding='latin-1')

    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    messages = df.loc[keep]

    # Explicit column indexing avoids clashes with DataFrame attributes.
    X = messages['text'].values
    y = messages['category'].values
    return X, y


def tokenize(text):
    """Turn raw text into a list of normalized tokens.

    Steps: replace every URL with the literal "urlplaceholder", split into
    word tokens, then for each token apply noun lemmatization, lowercasing
    and stripping, verb lemmatization and finally Porter stemming.

    Args:
        text: the string to tokenize.

    Returns:
        list of str: the cleaned tokens, in their original order.
    """
    # Substitute all URL matches in a single pass. Replacing the matches one
    # by one with str.replace can corrupt a longer URL whenever a shorter
    # matched URL happens to be its prefix.
    text = re.sub(url_regex, "urlplaceholder", text)

    # Build the normalizers once instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    clean_tokens = []
    for tok in word_tokenize(text):
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        # The output of the noun lemmatization step is fed into the verb
        # lemmatization step; chaining the procedures this way is common.
        clean_tok = lemmatizer.lemmatize(clean_tok, pos='v')
        # Lemmatization first, then stemming.
        clean_tok = stemmer.stem(clean_tok)

        clean_tokens.append(clean_tok)

    return clean_tokens


def model_pipeline():
    """Assemble the classification pipeline.

    Structure: a feature union of (a) a text pipeline made of the lowercase
    transformer, a count vectorizer using ``tokenize`` and a TF-IDF
    transformer, and (b) the starting-verb extractor; followed by a random
    forest classifier.

    Returns:
        sklearn.pipeline.Pipeline: the unfitted pipeline.
    """
    text_steps = [
        ('lowercase', CaseNormalizer()),
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ]

    feature_branches = [
        ('text-pipline', Pipeline(text_steps)),
        ('starting_verb', StartingVerbExtractor()),
    ]

    return Pipeline([
        ('features', FeatureUnion(feature_branches)),
        ('clf', RandomForestClassifier()),
    ])


def display_results(y_test, y_pred):
    """Print the labels, confusion matrix and accuracy of a set of predictions.

    Args:
        y_test: true labels.
        y_pred: predicted labels, aligned with ``y_test``.
    """
    # Take the labels from both the true and the predicted values; using the
    # predictions alone would drop any class the model never predicted from
    # the confusion matrix.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    # Coerce to arrays so the element-wise comparison also works for lists.
    accuracy = (np.asarray(y_pred) == np.asarray(y_test)).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    """Load the data, train the pipeline on a split and report test results."""
    messages, categories = load_data()
    split = train_test_split(messages, categories)
    X_train, X_test, y_train, y_test = split

    classifier = model_pipeline()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    display_results(y_test, predictions)


main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 99   0  26]
 [  5  24   6]
 [ 13   1 427]]
Accuracy: 0.9151414309484193
