# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [43]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /Users/d5mit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/d5mit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.metrics import confusion_matrix

In [45]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

def display_results(y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


### Step 1: Load data and perform a train test split

In [46]:
# load data
X, y = load_data()

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)



### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [52]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)

# initialize tf-idf transformer object
tfidf = TfidfTransformer(smooth_idf=False)

# Fit a classifier to these tfidf values.
clf = RandomForestClassifier()

# Transform test data

print(X_train)
X_train_counts = vect.fit_transform(X_train)
print(X_train_counts)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
print(X_train_tfidf)
clf.fit(X_train_tfidf, y_train)


['Merck Fdn proud to support #mhealth texts/apps study improving diabetes outcomes. Learn more: http://t.co/tX8jSXa8qQ  via @Health_Affairs'
 'Z Bhutta  Many developing countries are facing problem of increasing life expectancy but not necessarily improved quality of life  #NINS2013'
 "Citi's strategy remains the right one: Corbat:  http://t.co/pCda2ioJua #Citigroup #BRK"
 ...
 '\x89ÛÏMerck has built a reputation for discovering and developing breakthrough therapeutics.\x89Û\x9d \x89ÛÒ R. Perlmutter #MerckBiz14'
 'See how we work with coffee cooperatives to help improve #sustainability of production in Colombia #CSV2011 http://youtu.be/DEy4-vCYPP8'
 'RT @atelierstat: Brazilian #Internet users spend 36% of their time on the internet on#SocialNetworks http://t.co/WLnHlHES5T  cc @comScore']
  (0, 3059)	1
  (0, 1941)	1
  (0, 3761)	1
  (0, 4604)	1
  (0, 4465)	1
  (0, 1)	1
  (0, 3077)	1
  (0, 4546)	1
  (0, 4437)	1
  (0, 2504)	1
  (0, 1495)	1
  (0, 3429)	1
  (0, 49)	1
  (0, 2815)	1
  (0, 3154

RandomForestClassifier()

In [53]:
vect.vocabulary_

{'merck': 3059,
 'fdn': 1941,
 'proud': 3761,
 'to': 4604,
 'support': 4465,
 '#': 1,
 'mhealth': 3077,
 'texts/apps': 4546,
 'study': 4437,
 'improving': 2504,
 'diabetes': 1495,
 'outcome': 3429,
 '.': 49,
 'learn': 2815,
 'more': 3154,
 ':': 313,
 'urlplaceholder': 4768,
 'via': 4814,
 '@': 316,
 'health_affairs': 2332,
 'z': 5093,
 'bhutta': 750,
 'many': 2985,
 'developing': 1489,
 'country': 1313,
 'are': 565,
 'facing': 1894,
 'problem': 3715,
 'of': 3366,
 'increasing': 2517,
 'life': 2859,
 'expectancy': 1857,
 'but': 897,
 'not': 3319,
 'necessarily': 3242,
 'improved': 2502,
 'quality': 3798,
 'nins2013': 3300,
 'citi': 1076,
 "'s": 22,
 'strategy': 4415,
 'remains': 3936,
 'the': 4556,
 'right': 4012,
 'one': 3386,
 'corbat': 1291,
 'citigroup': 1087,
 'brk': 869,
 'a': 323,
 'live': 2890,
 'webcast': 4924,
 'from': 2081,
 '2014': 158,
 'nestle': 3262,
 'investor': 2604,
 'seminar': 4167,
 'take': 4496,
 'place': 3590,
 'today': 4606,
 'at': 605,
 '4.40pm': 220,
 'cet': 100

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [49]:
# Predict test labels
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)
y_pred = clf.predict(X_test_tfidf)

# display results
display_results(y_test, y_pred)


Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[103   0  34]
 [  1  40  11]
 [  3   0 601]]
Accuracy: 0.9382093316519546


### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [40]:
labels = np.unique(y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = (y_pred == y_test).mean()

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[102   0  35]
 [  3  39  10]
 [  5   1 598]]
Accuracy: 0.9319041614123581


# Final Step: Refactor
Organize these steps into the following functions.

In [41]:
def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier()

    # train classifier
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)
    clf.fit(X_train_tfidf, y_train)

    # predict on test data
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)
    y_pred = clf.predict(X_test_tfidf)

    # display results
    display_results(y_test, y_pred)


main()


Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 75   1  29]
 [  1  19   5]
 [  2   0 469]]
Accuracy: 0.9367720465890182


In [42]:
# run program
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 75   1  34]
 [  0  32   0]
 [  6   2 451]]
Accuracy: 0.9284525790349417
