In [None]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger') # for pos tags
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [None]:
def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [None]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            for phrase in re.split(';|,|-|\|', sentence):
                pos_tags = nltk.pos_tag(tokenize(sentence))
                first_word, first_tag = pos_tags[0]
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    return True
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [6]:
vectorizer = CountVectorizer(tokenizer=tokenize, ngram_range=(1,1))
rf = RandomForestClassifier(n_estimators=100)

pipeline = Pipeline([
    ('features', FeatureUnion(
        transformer_list=[

            ('text_pipeline', Pipeline([
                ('vectorizer', vectorizer),
                ('tfidf', TfidfTransformer())
            ])),
            
            ('starting_verb', StartingVerbExtractor())
        ],

#         transformer_weights={
#             'text_pipeline': 0.5,
#             'verb_feature': 1.0,         
#         },
    )),
    ('rf', rf)
])

In [7]:
def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

In [8]:
X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

labels = np.unique(y)
mat = confusion_matrix(y_test, y_pred, labels=labels)
acc = pipeline.score(X_test, y_test)

print("Confusion Matrix:\n", mat)
print("Labels:", labels)
print("Accuracy:", acc)

Confusion Matrix:
 [[ 97   0  24]
 [  1  26   4]
 [  2   1 446]]
Labels: ['Action' 'Dialogue' 'Information']
Accuracy: 0.946755407654


In [9]:
parameters = {
    'rf__n_estimators': [50, 100, 200],
    'rf__min_samples_split': [2, 3, 4]
    'features__text_pipeline__tfidf__use_idf': (True, False),
    'features__transformer_weights': (
        {'text_pipeline': 1, 'verb_feature': 0.5},
        {'text_pipeline': 0.5, 'verb_feature': 1},
        {'text_pipeline': 0.8, 'verb_feature': 1},
    ),
    'vect__max_n': (1, 2),
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000)
}

cv = GridSearchCV(pipeline, param_grid=parameters)

cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [10]:
report = classification_report(y_test, y_pred )

In [11]:
print(report)

             precision    recall  f1-score   support

     Action       0.96      0.79      0.86       121
   Dialogue       0.96      0.84      0.90        31
Information       0.94      0.99      0.96       449

avg / total       0.94      0.94      0.94       601



In [12]:
labels = np.unique(y)
mat = confusion_matrix(y_test, y_pred, labels=labels)
acc = cv.score(X_test, y_test)

print("Confusion Matrix:\n", mat)
print("Labels:", labels)
print("Accuracy:", acc)

Confusion Matrix:
 [[ 95   0  26]
 [  1  26   4]
 [  3   1 445]]
Labels: ['Action' 'Dialogue' 'Information']
Accuracy: 0.941763727121


In [13]:
cv.best_params_

{'features__transformer_weights': {'text_pipeline': 0.8, 'verb_feature': 1}}