# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [1]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [3]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    print("Loaded data {}\n".format(df.shape))
    X = df.text.values
    y = df.category.values
    return X, y

def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### Step 1: Load data and perform a train test split

In [4]:
# load data
X, y = load_data()
print("X {}:\n{}\n".format(X.shape, X[:3]))
print("y {}:\n{}\n".format(y.shape, y[:3]))

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print("X_train {}:\n{}\n".format(X_train.shape, X_train[:3]))
print("y_train {}:\n{}\n".format(y_train.shape, y_train[:3]))
print("\nX_test {}:\n{}\n".format(X_test.shape, X_test[:3]))
print("y_test {}:\n{}\n".format(y_test.shape, y_test[:3]))

Loaded data (2403, 11)

X (2403,):
[ 'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG'
 'Barclays announces result of Rights Issue http://t.co/LbIqqh3wwG'
 'Barclays publishes its prospectus for its å£5.8bn Rights Issue: http://t.co/YZk24iE8G6']

y (2403,):
['Information' 'Information' 'Information']

X_train (1802,):
[ 'President and CEO of NestlÌ© Health Science, Luis Cantarell, now talking about pioneering science-based nutritional solutions #NestleIR'
 'Recent studies indicate that 45% of residents of elderly homes are malnourished: http://t.co/AouKkLdt'
 'Emma Turner recalls her first \x89Û÷giving\x89Ûª encounter as a child as she unveils the newly released #philanthropy guide. http://t.co/2txBDrgbpw']

y_train (1802,):
['Information' 'Information' 'Information']


X_test (601,):
[ 'Today is Int\x89Ûªl Day of #ruralwomen. Find out how we help to empower female farmers in Pakistan http://t.c

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [5]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
clf = RandomForestClassifier()

# Fit and/or transform each to the data
tfidf_values_train = tfidf.fit_transform(vect.fit_transform(X_train))
print("tfidf_values_train ({}):\n{}\n".format(tfidf_values_train.shape, tfidf_values_train[0]))
clf.fit(tfidf_values_train, y_train)

tfidf_values_train ((1802, 5509)):
  (0, 3920)	0.23006639872
  (0, 520)	0.122309121586
  (0, 1036)	0.19262820842
  (0, 3582)	0.09776976122
  (0, 3478)	0.188682041678
  (0, 2460)	0.127103328743
  (0, 4385)	0.213633621136
  (0, 45)	0.204372306337
  (0, 3109)	0.300032402047
  (0, 975)	0.300032402047
  (0, 3545)	0.210995685157
  (0, 4797)	0.270786281669
  (0, 351)	0.172079472183
  (0, 3820)	0.329278522424
  (0, 4386)	0.329278522424
  (0, 3558)	0.270786281669
  (0, 4565)	0.241540161292
  (0, 1)	0.0635831227114
  (0, 3477)	0.257349689246



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Step 3: Predict on test data
* Transform (**no fitting**) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [6]:
# Transform test data
tfidf_values_test = tfidf.transform(vect.transform(X_test))
print("tfidf_values_test {}:\n{}\n".format(tfidf_values_test.shape, tfidf_values_test[0]))

# Predict test labels
y_pred = clf.predict(tfidf_values_test)

tfidf_values_test (601, 5509):
  (0, 5232)	0.188858902546
  (0, 5086)	0.144641525689
  (0, 4902)	0.263363636986
  (0, 4900)	0.108775429829
  (0, 3640)	0.246759125525
  (0, 3582)	0.131322553549
  (0, 2788)	0.159844058693
  (0, 2650)	0.126274127821
  (0, 2573)	0.237638690205
  (0, 2497)	0.243270444099
  (0, 2091)	0.288807072708
  (0, 2059)	0.402998030193
  (0, 2028)	0.357040037724
  (0, 1811)	0.402998030193
  (0, 1502)	0.255495334438
  (0, 60)	0.136606869857
  (0, 1)	0.0854036865066



### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [7]:
labels = np.unique(y)
confusion_mat = confusion_matrix(y_test, y_pred, labels= labels)
accuracy = (y_pred == y_test).mean()

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 85   0  25]
 [  1  26   5]
 [  9   2 448]]
Accuracy: 0.930116472546


# Final Step: Refactor
Organize these steps into the following functions.

In [8]:
def display_results(y_test, y_pred):
    # insert step 4 here
    labels = np.unique(y)
    confusion_mat = confusion_matrix(y_test, y_pred, labels= labels)
    accuracy = (y_pred == y_test).mean()
    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    # insert steps 1 through 3 here
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier()
    tfidf_values_train = tfidf.fit_transform(vect.fit_transform(X_train))
    clf.fit(tfidf_values_train, y_train)
    
    tfidf_values_test = tfidf.transform(vect.transform(X_test))
    y_pred = clf.predict(tfidf_values_test)
    
    display_results(y_test, y_pred)

In [9]:
# run program
main()

Loaded data (2403, 11)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 88   1  31]
 [  0  23   8]
 [  5   2 443]]
Accuracy: 0.921797004992
