# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [1]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### Step 1: Load data and perform a train test split

In [25]:
# load data
X, y = load_data()

# perform train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [28]:
X_test.shape

(601,)

In [29]:
X_train.shape

(1802,)

In [56]:
# Instantiate transformers and classifier

vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
clf =  RandomForestClassifier()

In [57]:
X_train_tfidf.shape

(1802, 5567)

In [58]:
# Fit and/or transform each to the data
vect_train = vect.fit_transform(X_train)
tfidf_train= tfidf.fit_transform(vect_train)
clf.fit(tfidf_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [59]:
tfidf_train.shape

(1802, 5567)

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [60]:
# Transform test data
vect_test = vect.transform(X_test)
tfidf_test = tfidf.transform(vect_test)
# Predict test labels
y_pred = clf.predict(tfidf_test)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [61]:
import numpy as np

In [62]:
from sklearn.metrics import accuracy_score

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 86   0  35]
 [  1  23   4]
 [  3   0 449]]
Accuracy: 0.928452579035


# Final Step: Refactor
Organize these steps into the following functions.

In [67]:
def display_results():
    # insert step 4 here
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test,y_pred,labels=labels)
    accuracy = accuracy_score(y_test,y_pred)

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    # insert steps 1 through 3 here
    
    # load data
    X, y = load_data()

    # perform train test split
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf =  RandomForestClassifier()
    # Fit and/or transform each to the data

    vect_train = vect.fit_transform(X_train)
    tfidf_train= tfidf.fit_transform(vect_train)
    clf.fit(tfidf_train,y_train)

    # Transform test data
    vect_test = vect.transform(X_test)
    tfidf_test = tfidf.transform(vect_test)
    # Predict test labels
    y_pred = clf.predict(tfidf_test)
    
    ## Display results
    display_results()

In [68]:
# run program
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 86   0  35]
 [  1  23   4]
 [  3   0 449]]
Accuracy: 0.928452579035
