# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [19]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
import re
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix

In [21]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('../dataset/corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### Step 1: Load data and perform a train test split

In [22]:
# load data
X, y = load_data()

# perform train test split
X_train, X_test, y_train ,  y_test = train_test_split(X,y)

In [23]:
print(X_train.shape, y_train.shape , X_test.shape, y_test.shape)

(1802,) (1802,) (601,) (601,)


### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [24]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
clf =  RandomForestClassifier()

# Fit and/or transform each to the data

X_train_count = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_count)
print(X_train_tfidf.toarray().shape,y_train.shape)
clf.fit(X_train_tfidf, y_train)




(1802, 5607) (1802,)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Convert sparse matrix to numpy array to view


In [26]:
X_train_count.toarray()[0].shape, X_train_count.toarray()[0]

((5607,), array([0, 1, 0, ..., 0, 0, 0], dtype=int64))

In [27]:
vect.vocabulary_

{'what': 5374,
 'is': 2834,
 'the': 4936,
 'bond': 856,
 'between': 782,
 'duct': 1736,
 'tape': 4889,
 'and': 540,
 '#': 1,
 'healthcare': 2501,
 '?': 341,
 'see': 4478,
 '@': 342,
 'vrulon': 5268,
 'ha': 2448,
 'to': 4989,
 'say': 4431,
 'urlplaceholder': 5175,
 'avieweiss': 671,
 'we`ve': 5328,
 'voluntarily': 5257,
 'recalled': 4167,
 'limited': 3098,
 'amount': 530,
 'of': 3648,
 'nesquik': 3527,
 'chocolate': 1124,
 'powder': 3941,
 'sold': 4631,
 'only': 3674,
 'in': 2693,
 'u': 5109,
 '.': 60,
 'for': 2183,
 'info': 2731,
 'danone': 1505,
 'water': 5318,
 'toured': 5018,
 'seaside': 4462,
 'town': 5024,
 'france': 2217,
 '3rd': 236,
 'yr': 5519,
 'this': 4962,
 'summer': 4818,
 '&': 4,
 'amp': 531,
 '1st': 148,
 'time': 4982,
 'brazil': 892,
 'spain': 4658,
 ',': 47,
 'promote': 4049,
 'healthy': 2506,
 'hydration': 2640,
 'barclays': 708,
 'wealth': 5330,
 'tell': 4917,
 'client': 1193,
 'add': 411,
 'developed': 1607,
 'market': 3238,
 'stock': 4755,
 'how': 2609,
 'easy': 17

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [28]:
# Transform test data
X_test_count = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_count)

# Predict test labels
y_pred = clf.predict(X_test_tfidf)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [29]:
labels = np.unique(y_test)
confusion_mat = confusion_matrix(y_test, y_pred,labels=labels)
accuracy = (y_test==y_pred).mean()

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 84   1  31]
 [  2  26   2]
 [ 18   2 435]]
Accuracy: 0.9068219633943427


# Final Step: Refactor
Organize these steps into the following functions.

In [30]:
def display_results(y_test, y_pred):
    # insert step 4 here 
    labels = np.unique(y_test)
    confusion_mat = confusion_matrix(y_test, y_pred,labels=labels)
    accuracy = (y_test==y_pred).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)



def main():
    # load data
    X, y = load_data()

    # perform train test split
    X_train, X_test, y_train ,  y_test = train_test_split(X,y)
    
    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf =  RandomForestClassifier()

    # Fit and/or transform each to the data

    X_train_count = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_count)
    clf.fit(X_train_tfidf, y_train)
    
    # Transform test data
    X_test_count = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_count)

    # Predict test labels
    y_pred = clf.predict(X_test_tfidf)
    
    display_results(y_test, y_pred)


In [31]:
# run program
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 78   0  39]
 [  3  20   7]
 [ 10   0 444]]
Accuracy: 0.9018302828618968
