# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [1]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/patrickdaigle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patrickdaigle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

def tokenize(text: str) -> list:
    """
    Function to clean text
    
    - Replaces URLs with "urlplaceholder"
    - Tokenizes
    - Lemmatizes
    - Removes extra whitespace
    - Transforms to lowercase
    
    :param text (str): string data
    
    :return clean_tokens (lst): list of cleaned tokens
    """
    detected_urls = re.findall(url_regex, text)    
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(token).lower().strip() for token in tokens]
    clean_tokens = [lemmatizer.lemmatize(token, pos='v') for token in clean_tokens]

    return clean_tokens

### Step 1: Load data and perform a train test split

In [4]:
# load data
X, y = load_data()
# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [5]:
type(y_train)

numpy.ndarray

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [6]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
# clf = AdaBoostClassifier()
clf = RandomForestClassifier(random_state=0)

In [7]:
# Fit and/or transform each to the data
X_train_count = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_count)

In [8]:
clf.fit(X_train_tfidf, y_train)

RandomForestClassifier(random_state=0)

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [9]:
# Transform test data
X_test_count = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_count)
# Predict test labels
y_pred = clf.predict(X_test_tfidf)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [10]:
%%time
np.array(list(set(y_train)), dtype='object')

CPU times: user 59 µs, sys: 0 ns, total: 59 µs
Wall time: 62 µs


array(['Action', 'Information', 'Dialogue'], dtype=object)

In [11]:
%%time
np.unique(y_train)

CPU times: user 1.01 ms, sys: 77 µs, total: 1.09 ms
Wall time: 1.19 ms


array(['Action', 'Dialogue', 'Information'], dtype=object)

In [12]:
labels = np.array(list(set(y_train)), dtype='object')
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = accuracy_score(y_test, y_pred)
# accuracy = clf.score(X_test_tfidf, y_test)

display(pd.DataFrame(confusion_mat,
            columns=[lab + "_true" for lab in labels],
            index=[lab + "_pred" for lab in labels]))
print("Accuracy:", round(accuracy, 4))

Unnamed: 0,Action_true,Information_true,Dialogue_true
Action_pred,102,35,0
Information_pred,4,542,1
Dialogue_pred,1,6,30


Accuracy: 0.9348


# Final Step: Refactor
Organize these steps into the following functions.

In [13]:
import nltk
nltk.download(['punkt', 'wordnet'])
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score


def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    
    return X, y


def tokenize(text: str) -> list:
    """
    Function to clean text
    
    - Replaces URLs with "urlplaceholder"
    - Tokenizes
    - Lemmatizes
    - Removes extra whitespace
    - Transforms to lowercase
    
    :param text (str): string data
    
    :return clean_tokens (lst): list of cleaned tokens
    """
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    detected_urls = re.findall(url_regex, text)    
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(token).lower().strip() for token in tokens]
    clean_tokens = [lemmatizer.lemmatize(token, pos='v') for token in clean_tokens]

    return clean_tokens


def transformer(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
    #2
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    
    X_train_count = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_count)
    #3
    X_test_count = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_count)
    
    return X_train_tfidf, y_train, X_test_tfidf, y_test


def trainer(X_train_tfidf,
            y_train,
            X_test_tfidf,
            clf):
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    
    return y_pred


def display_results(y_test, y_pred):
    labels = np.array(list(set(y_test)), dtype='object')
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = accuracy_score(y_test, y_pred)
    
    display(pd.DataFrame(confusion_mat,
            columns=[lab + "_true" for lab in labels],
            index=[lab + "_pred" for lab in labels]))
    print("Accuracy:", round(accuracy, 4))
    
    return None


def main(clf):
    #1
    X, y = load_data()
    #2, 3
    X_train_tfidf, y_train, X_test_tfidf, y_test = transformer(X, y)
    y_pred = trainer(X_train_tfidf, y_train, X_test_tfidf, clf)
    #4
    display_results(y_test, y_pred)
    
    return None

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/patrickdaigle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patrickdaigle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# run program
main(clf=AdaBoostClassifier(random_state=0))

Unnamed: 0,Action_true,Information_true,Dialogue_true
Action_pred,40,96,1
Information_pred,56,490,1
Dialogue_pred,1,5,31


Accuracy: 0.7781


In [15]:
from sklearn.ensemble import RandomForestClassifier
main(clf = RandomForestClassifier(random_state=0))

Unnamed: 0,Action_true,Information_true,Dialogue_true
Action_pred,102,35,0
Information_pred,4,542,1
Dialogue_pred,1,6,30


Accuracy: 0.9348


In [16]:
import workflow

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/patrickdaigle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patrickdaigle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
#TODO: Validate issue with module
# from sklearn.ensemble import RandomForestClassifier
# workflow.main(clf=RandomForestClassifier(0))