In [91]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, precision_score

In [92]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger') # for pos tags
# nltk.download('universal_tagset')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [93]:
def tokenize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    clean_tokens = []
    for tok in tokens:
        clean_tokens.append(lemmatizer.lemmatize(tok).lower().strip())
    return clean_tokens

def score(model, X_test, y_test):
    score = model.score(X_test, y_test)
    return score

In [94]:
exp = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [95]:
class TextLinkExtractor(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        features = np.recarray(shape=(len(posts),),
                               dtype=[('text', object), ('links', list)])
        for i, text in enumerate(posts):
            
            link_matches = re.findall(exp, text)
            for link in link_matches:
                text = text.replace(link, "")
            
            features['text'][i] = text
            features['links'][i] = link_matches

        return features

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class LinkCountExtractor(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, link_matches):
        return [{'contains_link': len(link_matches) > 0}
#                  'num_sentences': text.count('.')}
                for text in link_matches]

In [96]:
class VerbPositionExtractor(BaseEstimator, TransformerMixin):

    def start_verb(self, text):
        sent_text = nltk.sent_tokenize(text)
        start_verbs = 0
        for sentence in sent_text:
            for phrase in re.split(';|,|-|\|', sentence):
                first_word, first_tag = nltk.pos_tag(tokenize(sentence))[0]
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    start_verbs += 1
    #                 print(first_word, first_tag)
    #                 print(sentence)
        if start_verbs > 0:
#             print(sentence)
            return True
        return False
    
#     def count_present_verbs(self, text):
#         sent_text = nltk.sent_tokenize(text)
#         present_verbs = 0
#         for sentence in sent_text:
#             for word, tag in nltk.pos_tag(tokenize(sentence)):
#                 if tag in ['VBP', 'VB']:
#                     present_verbs += 1
#                     print(word, tag)
#                     print(sentence)
#         if present_verbs > 0:
#             return True
#         return False
    
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.start_verb)
#         X_tagged = pd.Series(X).apply(self.count_present_verbs)
#         print(X_tagged)
        return pd.DataFrame(X_tagged)

In [97]:
# from collections import Counter
# class PosTagMatrix(BaseEstimator, TransformerMixin):

#     #helper function to tokenize and count parts of speech
#     def pos_func(self, sentence):
#         return Counter(tag for word,tag in nltk.pos_tag(tokenize(sentence), tagset='universal'))

#     def fit(self, X, y = None):
#         X_tagged = pd.Series(X).apply(self.pos_func).apply(pd.Series)
#         self.pos_tags = list(X_tagged.columns)
#         self.pos_tags = [tag for tag in self.pos_tags if tag not in ['.', 'X']]
#         return self

#     #all the work is done here
#     def transform(self, X):
#         if self.pos_tags:
#             X_tagged = pd.Series(X).apply(self.pos_func).apply(pd.Series)
#             X_tagged = X_tagged.loc[:, self.pos_tags].fillna(0)
#             X_tagged['n_tokens'] = X_tagged.apply(sum, axis=1)
#         else:
#             X_tagged = pd.Series(X).apply(self.pos_func).apply(pd.Series).fillna(0)
#             X_tagged['n_tokens'] = X_tagged.apply(sum, axis=1)
#         print(X_tagged.loc[0, :])
#         return X_tagged

In [98]:
vectorizer = CountVectorizer(tokenizer=tokenize, ngram_range=(1,1))
rf = RandomForestClassifier(n_estimators=100)

model = Pipeline([
#     ('textlink', TextLinkExtractor()),

    ('features', FeatureUnion(
        transformer_list=[

            ('text_pipeline', Pipeline([
#                 ('selector', ItemSelector(key='text')),
                ('vectorizer', vectorizer),
                ('tfidf', TfidfTransformer())
            ])),
            
#             ('pos_feature', Pipeline([
#                 ('selector', ItemSelector(key='text')),
#                 ('pos_tags', PosTagMatrix()),
#             ])),
            
            ('verb_feature', Pipeline([
#                 ('selector', ItemSelector(key='text')),
                ('verb_position', VerbPositionExtractor()),
            ])),

#             ('links_pipeline', Pipeline([
#                 ('selector', ItemSelector(key='links')),
#                 ('stats', LinkCountExtractor()),
#                 ('vect', DictVectorizer()),
#             ])),

        ],

#         transformer_weights={
#             'text_pipeline': 0.8,
# #             'pos_feature': 0.5,
#             'verb_feature': 1.0,
# #             'links_pipeline': 7.0,            
#         },
    )),

    ('rf', rf)
])

In [99]:
df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
X = df.text.values
y = df.category.values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [100]:
labels = np.unique(y)

In [101]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mat = confusion_matrix(y_test, y_pred, labels=labels)
acc = score(model, X_test, y_test)

# y_pred = model.predict(X_test[1])
# mat = confusion_matrix(y_test[1], y_pred, labels=labels)
# acc = score(model, X_test[1], y_test[1])

print("Value Counts:")
print(df.category.value_counts())
print("\nConfusion Matrix:\n", mat)
print("Labels:", labels)
print("Accuracy:", acc)

Value Counts:
Information    1823
Action          456
Dialogue        124
Name: category, dtype: int64

Confusion Matrix:
 [[104   0  23]
 [  2  28   1]
 [  3   0 440]]
Labels: ['Action' 'Dialogue' 'Information']
Accuracy: 0.951747088186


In [102]:
    def start_verb(text):
        sent_text = nltk.sent_tokenize(text)
        start_verbs = 0
        for sentence in sent_text:
            for phrase in re.split(';|,|-|\|', sentence):
                first_word, first_tag = nltk.pos_tag(tokenize(sentence))[0]
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    start_verbs += 1
#                     print(first_word, first_tag)
#                     print(text)
#                 else:
#                     print(first_word, first_tag)
#                     print(phrase)
        if start_verbs > 0:
            return True
#         print(text, '\n')
        return False

In [103]:
info = y == 'Information'
action = y == 'Action'
dial = y == 'Dialogue'

In [104]:
def start_verb(text):
    sent_text = nltk.sent_tokenize(text)
    start_verbs = 0
    for sentence in sent_text:
        for phrase in re.split(';|,|-|\|', sentence):
            first_word, first_tag = nltk.pos_tag(tokenize(sentence))[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                start_verbs += 1
    if start_verbs > 0:
        return True
    return False

start_info = pd.Series(X[info]).apply(start_verb)
print(start_info.sum()/len(start_info), len(start_info))

action_info = pd.Series(X[action]).apply(start_verb)
print(action_info.sum()/len(action_info), len(action_info))

dial_info = pd.Series(X[dial]).apply(start_verb)
print(dial_info.sum()/len(dial_info), len(dial_info))

0.00603400987383 1823
0.438596491228 456
0.0322580645161 124


In [105]:
def has_link(text):
    link_matches = re.findall(exp, text)
    if len(link_matches) > 0:
        return True
    return False
        
start_info = pd.Series(X[info]).apply(has_link)
print(start_info.sum()/len(start_info), len(start_info))

action_info = pd.Series(X[action]).apply(has_link)
print(action_info.sum()/len(action_info), len(action_info))

dial_info = pd.Series(X[dial]).apply(has_link)
print(dial_info.sum()/len(dial_info), len(dial_info))

0.72901810203 1823
0.923245614035 456
0.362903225806 124


In [106]:
incorrect = (y_pred != y_test) & (y_test == 'Action')
for x, yt, yp in zip(X_test[incorrect], y_test[incorrect], y_pred[incorrect]):
    print('Correct: {}  Prediction: {}'.format(yt, yp))
    print(x, '\n')

Correct: Action  Prediction: Information
2 Banks You Can Buy Right Now:  http://t.co/lzatdIFeRA #Citigroup #BRK 

Correct: Action  Prediction: Information
The foundation of our strategy is our commitment to values &amp  integrity. Learn about our ethics &amp  compliance program: http://t.co/ZF2GIIu66K 

Correct: Action  Prediction: Information
How do banks communicate online? @WIMAccelerator co-founder, @jkhoey hosts soc. financial biz conversation. Register: http://t.co/EczHnSP8J7 

Correct: Action  Prediction: Information
A retrospective study on the lifetime risk of developing COPD was recently released. Weigh in here http://t.co/gEF922m0 

Correct: Action  Prediction: Information
Oct is ADHD Awareness Month - did you know #ADHD affects 5M children in US? Visit     http://t.co/ieoxcyuPvH for caregiver articles &amp  videos 

Correct: Action  Prediction: Information
#BarclaysJBS are sponsoring WetWheels charity @jerseyboatshow come and show your support http://t.co/yvir0lqutf #Barcla

In [107]:
correct = (y_pred == y_test) & (y_test == 'Information')
for x, yt, yp in zip(X_test[correct], y_test[correct], y_pred[correct]):
    print('Correct: {}  Prediction: {}'.format(yt, yp))
    print(x, '\n')

Correct: Information  Prediction: Information
Isle of Man News: Barclays Wealth launches student of the year awards http://su.pr/7yGDT5 

Correct: Information  Prediction: Information
#ICYMI: #Nestle to acquire aesthetic dermatology assets for NestlÌ© Skin Health. http://t.co/EI3UwzD0ec 

Correct: Information  Prediction: Information
Barbara Cannon is talking about whether humans have 'brown fat' - a type of adipose tissue that generates body heat #NINS2013 

Correct: Information  Prediction: Information
From our blog: RT @DownTo_Earth_ Today on down to Earth, an exclusivity: the weather forecast for September, 2085 http://t.co/3XUtrEyp Û_ 

Correct: Information  Prediction: Information
Fed Governor Raskin Says Bank Reputations Harmed by Bailouts:  http://t.co/laNsGncMMp #Citigroup #BRK 

Correct: Information  Prediction: Information
This monthÛªs White Paper explores Barclays new Entrepreneurs Index which maps business activity and wealth creation. http://t.co/8G5wgEcB 

Correct: In