In [66]:
import nltk
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, precision_score

In [36]:
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')

In [18]:
df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]

In [19]:
X = df.text.values
y = df.category.values

In [20]:
np.unique(y)

array(['Action', 'Dialogue', 'Information'], dtype=object)

In [31]:
def tokenize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    clean_tokens = []
    for tok in tokens:
        clean_tokens.append(lemmatizer.lemmatize(tok).lower().strip())
    return clean_tokens

def score(model, X_test, y_test):
    score = model.score(X_test, y_test)
    return score

In [70]:
vectorizer = CountVectorizer(tokenizer=tokenize, ngram_range=(1,1))
X_train, X_test, y_train, y_test = train_test_split(X, y)

rf = RandomForestClassifier(n_estimators=100)
model = Pipeline([('vectorizer', vectorizer), ('tfidf', TfidfTransformer()), ('rf', rf)])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mat = confusion_matrix(y_test, y_pred)
acc = score(model, X_test, y_test)

print("Accuracy:", acc)

Accuracy: 0.936772046589


In [51]:
X[9:14]

array([ 'You no longer have one workforce. You have five. Want the employee benefits plan to match? Our video tells you how. http://t.co/Mxydes7LHG',
       'UK entrepreneurial activity in 2013 at a glance - Barclays and @BGF_team Entrepreneurs Index: http://t.co/8F6YStpAqS http://t.co/Uqg4l09RC9',
       'Emma Turner, Head of Client Philanthropy provides a short insight into giving. Catch her latest blog here. #charity http://t.co/5YaLwoeoSm',
       '...Visit us @NAPFnews 5-7 March for more workforce insights. #NAPF #pension http://t.co/eR9REnMww6 http://t.co/HCbfiJBJkB',
       'Will the chill in #emergingmarkets last a little longer as developed markets remain broadly positive? http://t.co/uiv1zY0eC0'], dtype=object)

In [52]:
y[9:14]

array(['Action', 'Information', 'Action', 'Action', 'Information'], dtype=object)

In [47]:
df.category.value_counts()

Information    2129
Action          724
Dialogue        226
Exclude          39
Name: category, dtype: int64