# Grid Search
Let's incorporate grid search into your modeling process. 


In [27]:
import nltk
nltk.download(['punkt','wordnet','averaged_perceptron_tagger'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [28]:
import re
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [29]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [30]:
def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).strip()
        #I passed in the output from the previous noun lemmatization step. This way of chaining procedures is very common.
        clean_tok = lemmatizer.lemmatize(clean_tok, pos='v')
        #It is common to apply both, lemmatization first, and then stemming.
        clean_tok =PorterStemmer().stem(clean_tok)
        
        clean_tokens.append(clean_tok)

    return clean_tokens


In [31]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        # tokenize by sentences
        sentence_list = sent_tokenize(text)
        
        for sentence in sentence_list:
            # tokenize each sentence into words and tag part of speech
            pos_tags = pos_tag(tokenize(sentence))

            # index pos_tags to get the first word and part of speech tag
            first_word, first_tag = pos_tags[0]
            
            # return true if the first word is an appropriate verb or RT for retweet
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True

            return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        # apply starting_verb function to all values in X
        X_tagged = pd.Series(X).apply(self.starting_verb)
        
        return pd.DataFrame(X_tagged)
        


In [32]:
class CaseNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return pd.Series(X).apply(lambda x: x.lower()).values

In [33]:
case_normalizer = CaseNormalizer()
X = np.array(['Implementing a Custom Transformer from SCIKIT-LEARN'])
case_normalizer.transform(X)

array(['implementing a custom transformer from scikit-learn'],
      dtype=object)

### View parameters in pipeline
Before modifying your build_model method to include grid search, view the parameters in your pipeline here.

In [34]:
pipeline = Pipeline([ 
                    ('features',FeatureUnion([
                                              ('text-pipline',Pipeline([
                                                                        ('lowercase', CaseNormalizer()),
                                                                        ('vect', CountVectorizer(tokenizer=tokenize)),
                                                                        ('tfidf', TfidfTransformer())
                                                                         ])),
                                                ('starting_verb', StartingVerbExtractor())
                            
                                                ])),
                         ('clf', RandomForestClassifier())

                       ])
    

In [35]:
pipeline.get_params()

{'memory': None,
 'steps': [('features', FeatureUnion(n_jobs=1,
          transformer_list=[('text-pipline', Pipeline(memory=None,
        steps=[('lowercase', CaseNormalizer()), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_feature... smooth_idf=True, sublinear_tf=False, use_idf=True))])), ('starting_verb', StartingVerbExtractor())],
          transformer_weights=None)),
  ('clf',
   RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=None, verbose=0,
               warm_start=False))],
 'features': Featur

### Modify your `build_model` function to return a GridSearchCV object.
Try to grid search some parameters in your data transformation steps as well as those for your classifier! Browse the parameters you can search above.

In [39]:
def build_model():
    
    pipeline = Pipeline([ 
                    ('features',FeatureUnion([
                                              ('text-pipline',Pipeline([
                                                                        ('lowercase', CaseNormalizer()),
                                                                        ('vect', CountVectorizer(tokenizer=tokenize)),
                                                                        ('tfidf', TfidfTransformer())
                                                                         ])),
                                                ('starting_verb', StartingVerbExtractor())
                            
                                                ])),
                         ('clf', RandomForestClassifier())

                       ])

    # specify parameters for grid search
    parameters = {
                    #'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
                    #'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
                    #'features__text_pipeline__vect__max_features': (None, 5000, 10000),
                    #'features__text_pipeline__tfidf__use_idf': (True, False),

                    'clf__n_estimators': [50, 100, 200],
                    'clf__min_samples_split': [2, 3, 4],
                    'features__transformer_weights': (
                                                     {'text_pipeline': 1, 'starting_verb': 0.5},
                                                     {'text_pipeline': 0.5, 'starting_verb': 1},
                                                     {'text_pipeline': 0.8, 'starting_verb': 1},
                                                    )
                 }

    # create grid search object
    cv = GridSearchCV(pipeline, parameters)
    
    return cv

### Run program to test
Running grid search can take a while, especially if you are searching over a lot of parameters! If you want to reduce it to a few minutes, try commenting out some of your parameters to grid search over just 1 or 2 parameters with a small number of values each. Once you know that works, feel free to add more parameters and see how well your final model can perform! You can try this out in the next page.

In [40]:
def load_data():
    df = pd.read_csv('../dataset/corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y


def display_results(cv, y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    print("\nBest Parameters:", cv.best_params_)


def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = build_model()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    display_results(model, y_test, y_pred)


main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 81   0  28]
 [  0  24   4]
 [  5   0 459]]
Accuracy: 0.9384359400998337

Best Parameters: {'clf__min_samples_split': 2, 'clf__n_estimators': 50, 'features__transformer_weights': {'text_pipeline': 1, 'starting_verb': 0.5}}
