In [43]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, precision_score

In [71]:
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [72]:
def tokenize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    clean_tokens = []
    for tok in tokens:
        clean_tokens.append(lemmatizer.lemmatize(tok).lower().strip())
    return clean_tokens

def score(model, X_test, y_test):
    score = model.score(X_test, y_test)
    return score

In [73]:
exp = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [74]:
class TextLinkExtractor(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        features = np.recarray(shape=(len(posts),),
                               dtype=[('text', object), ('links', list)])
        for i, text in enumerate(posts):
            
            link_matches = re.findall(exp, text)
            for link in link_matches:
                text = text.replace(link, "")
            
            features['text'][i] = text
            features['links'][i] = link_matches

        return features

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class LinkCountExtractor(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, link_matches):
        return [{'contains_link': len(link_matches) > 0}
#                  'num_sentences': text.count('.')}
                for text in link_matches]

In [83]:
vectorizer = CountVectorizer(tokenizer=tokenize, ngram_range=(1,1))
rf = RandomForestClassifier(n_estimators=100)

model = Pipeline([
    ('textlink', TextLinkExtractor()),

    ('features', FeatureUnion(
        transformer_list=[

            ('text_pipeline', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('vectorizer', vectorizer),
                ('tfidf', TfidfTransformer())
            ])),

            ('links_pipeline', Pipeline([
                ('selector', ItemSelector(key='links')),
                ('stats', LinkCountExtractor()),
                ('vect', DictVectorizer()),
            ])),

        ],

#         transformer_weights={
#             'subject': 0.8,
#             'body_bow': 0.5,
#             'body_stats': 1.0,
#         },
    )),

    ('rf', rf)
])

In [84]:
df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
X = df.text.values
y = df.category.values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [85]:
labels = np.unique(y)

In [86]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mat = confusion_matrix(y_test, y_pred, labels=labels)
acc = score(model, X_test, y_test)

print("Value Counts:")
print(df.category.value_counts())
print("\nConfusion Matrix:\n", mat)
print("Labels:", labels)
print("Accuracy:", acc)

Value Counts:
Information    1823
Action          456
Dialogue        124
Name: category, dtype: int64

Confusion Matrix:
 [[ 81   0  23]
 [  0  27   7]
 [  3   1 459]]
Labels: ['Action' 'Dialogue' 'Information']
Accuracy: 0.943427620632


In [87]:
incorrect = y_pred != y_test
for x, yt, yp in zip(X_test[incorrect], y_test[incorrect], y_pred[incorrect]):
    print('Correct: {}  Prediction: {}'.format(yt, yp))
    print(x, '\n')

Correct: Action  Prediction: Information
Can the U.S. help break the cycle of global poverty and disease?  Discuss with Tamara Evans at Think Science Now http://bit.ly/8YCX2m 

Correct: Action  Prediction: Information
What is the bond between duct tape and #healthcare? See what @vrulon has to say http://t.co/b76CPJcT 

Correct: Action  Prediction: Information
Gen Y: 65% seek financial education &amp  guidance says latest research. View our infographic for more details. http://t.co/Lq2539U7W1 

Correct: Information  Prediction: Action
For 80 years @ApolloTheater has rocked #Harlem with #legends - proud to be a sponsor. Here's to many more! #Apollo80 http://t.co/q1VABZ9NWa 

Correct: Action  Prediction: Information
#HPV causes most cases of #cervicalcancer. During #CervicalHealthMonth, learn the facts: http://t.co/mFIH4yJCws 

Correct: Action  Prediction: Information
MONDAY'S RIDDLE: when was #NestlÌ©`s first soluble cocoa beverage, #Nesquik developed? Take a guess! http://t.co/VO3a1p4bK