# Machine Learning Workflow
Work through the steps below to build the machine learning workflow for this classifier.

In [None]:
import nltk
# Resources used by word_tokenize and WordNetLemmatizer. Newer NLTK releases
# (>= 3.8.2) load the tokenizer models from 'punkt_tab' rather than 'punkt',
# so request both to keep the notebook working across versions.
nltk.download(['punkt', 'punkt_tab', 'wordnet'])

In [2]:
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Pattern matching http/https URLs. Written as a raw string so the regex
# escapes for the parentheses are not parsed as (invalid) Python string escapes.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data(filepath='corporate_messaging.csv'):
    """Load the corporate messaging dataset and return texts and labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        filepath: Path of the CSV file, read with latin-1 encoding. Defaults
            to ``'corporate_messaging.csv'`` so existing ``load_data()`` calls
            behave as before.

    Returns:
        Tuple ``(X, y)``: the values of the ``text`` column and the values of
        the ``category`` column for the retained rows.
    """
    df = pd.read_csv(filepath, encoding='latin-1')
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    df = df[keep]
    X = df['text'].values
    y = df['category'].values
    return X, y

def tokenize(text):
    """Split a message into normalized tokens.

    Every URL matched by ``url_regex`` is replaced with the literal
    ``urlplaceholder``. The text is then tokenized with ``word_tokenize`` and
    each token is lemmatized, lower-cased and stripped of surrounding
    whitespace.

    Args:
        text: Raw message string.

    Returns:
        List of cleaned token strings, in their original order.
    """
    # Substitute each match in place. Replacing by matched *string* (one
    # str.replace per URL) corrupts a longer URL that begins with a shorter
    # URL found earlier in the text.
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]

### Step 1: Load data and perform a train test split

In [4]:
# load data
X, y = load_data()

# perform train test split (fixed seed so a fresh-kernel run reproduces the same split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# shape of the test split produced by train_test_split
X_test.shape

(601,)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [7]:
# IPython help lookup: displays the signature and docstring of TfidfTransformer
TfidfTransformer?

Init signature:
TfidfTransformer(
    *,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)
Docstring:
Transform a count matrix to a normalized tf or tf-idf representation

Tf means term-frequency while tf-idf means term-frequency times inverse
document-frequency. This is a common term weighting scheme in information
retrieval, that has also found good use in document classification.

The goal of using tf-idf instead of the raw frequencies of occurrence of a
token in a given document is to scale down the impact of tokens that occur
very frequently in a given corpus and that are hence empirically less
informative than features that occur in a small fraction of the training
corpus.

In [6]:
# Stage 1: bag-of-words counts, tokenized with the custom `tokenize` function
vect = CountVectorizer(tokenizer=tokenize)
X_train_vect = vect.fit_transform(X_train)

# Stage 2: re-weight the raw counts as tf-idf values
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_vect)

# Stage 3: fit the classifier on the tf-idf features
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

LogisticRegression()

In [9]:
# distinct labels the fitted classifier predicts on the training features
set(clf.predict(X_train_tfidf))

{'Action', 'Dialogue', 'Information'}

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [56]:
# Transform test data with the already-fitted vectorizer and tf-idf transformer
# (transform only -- not fit_transform, which would refit them on the test data)
X_test_vect = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_vect)

# Predict test labels: use the test features so y_pred lines up with y_test
y_pred = clf.predict(X_test_tfidf)

In [1]:
# y_pred is a NumPy array, which has no .unique() method; use np.unique instead
np.unique(y_pred)

NameError: name 'y_pred' is not defined

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [58]:
# Use every class present in either the true or the predicted labels, so a
# class the model never predicts still gets its own row/column in the matrix.
labels = np.union1d(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = (y_pred == y_test).mean()

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

NameError: name 'np' is not defined

# Final Step: Refactor
Organize these steps into the following functions.

In [66]:
def display_results(y_pred, y_test):
    """Print the label set, confusion matrix and accuracy of predictions.

    Args:
        y_pred: Predicted labels, one per sample.
        y_test: True labels, in the same order and of the same length as
            ``y_pred``.
    """
    # Coerce to arrays so the element-wise comparison below also works when
    # plain lists are passed.
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)

    # Union of true and predicted classes: taking labels from y_pred alone
    # would drop any true class the model never predicted from the matrix.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main(random_state=42):
    """Run the whole workflow: load, split, fit, predict and report.

    Args:
        random_state: Seed forwarded to ``train_test_split`` so repeated runs
            use the same train/test split. Pass ``None`` to get a different
            split on every run.
    """
    # load data
    X, y = load_data()

    # perform train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = LogisticRegression()

    # Fit and/or transform each to the data
    X_train_vect = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_vect)
    clf.fit(X_train_tfidf, y_train)

    # Transform test data with the fitted transformers (transform only, no refit)
    X_test_vect = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_vect)

    # Predict test labels
    y_pred = clf.predict(X_test_tfidf)

    # display results
    display_results(y_pred, y_test)

In [67]:
# run program: executes the full workflow and prints the evaluation results
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 75   0  42]
 [  0  20   6]
 [  1   0 457]]
Accuracy: 0.9184692179700499
