# Scikit Learn & Keras Text Classifier Pipeline with Word2Vec Featurization

### This notebook shows how to train pipelines for word embeddigns and scikit classifier

The pipeline takes as input a dataframe containing tweets in a text column, a word embedding file, and trains a scikit learn model. The modeling pipeline is then saved to a .zip file. This file can be loaded later and used for prediction and evaluation of new data sets.

In [1]:
import os

working_dir = r"D:\Sentiment140_Classification"

## Set training and test files
training_tweet_filename = os.path.join(working_dir, 'training_text.csv')
training_label_filename = os.path.join(working_dir, 'training_label.csv')
test_tweet_filename = os.path.join(working_dir, 'testing_text.csv')
test_label_filename = os.path.join(working_dir, 'testing_label.csv')

## Set word2vec file, parameter file for model, and model pipeline file
w2v_embeddings_filename = os.path.join(working_dir, 'w2vec.txt')
params_file_path = os.path.join(working_dir, "params.tsv")
model_file = os.path.join(working_dir, 'sk_model.zip') 

In [2]:
from tatk.feature_extraction.word2vec_vectorizer import Word2VecVectorizer

from __future__ import absolute_import
from __future__ import division
import tatk
import collections
import math
import sys
import random
import numpy as np
from six.moves import urllib
from six.moves import xrange  
from timeit import default_timer as timer
import pandas as pd
import re
import io
from nltk.tokenize import TweetTokenizer
import num2words

import math
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tatk.pipelines.text_classification.text_classifier import TextClassifier
from tatk.pipelines.text_classification.keras_text_classifier import KerasTextClassifier
from tatk.feature_extraction.callable_vectorizer import CallableVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV 

!pip show azureml-tatk

%reload_ext autoreload
%autoreload 2
%matplotlib inline

Name: azureml-tatk
Version: 0.1.18108.8a1
Summary: Microsoft Azure Machine Learning Package for Text Analytics
Home-page: https://microsoft.sharepoint.com/teams/TextAnalyticsPackagePreview
Author: Microsoft Corporation
Author-email: amltap@microsoft.com
License: UNKNOWN
Location: c:\users\remoteuser\appdata\local\amlworkbench\python\lib\site-packages
Requires: azure-storage, qgrid, sklearn-crfsuite, docker, validators, h5py, pytest, ruamel.yaml, ipywidgets, requests, unidecode, numpy, nose, nltk, gensim, pdfminer.six, lxml, scikit-learn, ipython, scipy, keras, azure-ml-api-sdk, pandas, bqplot, matplotlib, pyspark
Required-by: 


## Load and preprocess tweet data

### Define data loading the preprocessing functions for tweets

In [3]:
# Data processing
# In the following code, we replace Emails, URLS, emoticons etc with special labels
pos_emoticons=["(^.^)","(^-^)","(^_^)","(^_~)","(^3^)","(^o^)","(~_^)","*)",":)",":*",":-*",":]",":^)",":}",
               ":>",":3",":b",":-b",":c)",":D",":-D",":O",":-O",":o)",":p",":-p",":P",":-P",":Þ",":-Þ",":X",
               ":-X",";)",";-)",";]",";D","^)","^.~","_)m"," ~.^","<=8","<3","<333","=)","=///=","=]","=^_^=",
               "=<_<=","=>.<="," =>.>="," =3","=D","=p","0-0","0w0","8D","8O","B)","C:","d'-'","d(>w<)b",":-)",
               "d^_^b","qB-)","X3","xD","XD","XP","ʘ‿ʘ","❤","💜","💚","💕","💙","💛","💓","💝","💖","💞",
               "💘","💗","😗","😘","😙","😚","😻","😀","😁","😃","☺","😄","😆","😇","😉","😊","😋","😍",
               "😎","😏","😛","😜","😝","😮","😸","😹","😺","😻","😼","👍"]

neg_emoticons=["--!--","(,_,)","(-.-)","(._.)","(;.;)9","(>.<)","(>_<)","(>_>)","(¬_¬)","(X_X)",":&",":(",":'(",
               ":-(",":-/",":-@[1]",":[",":\\",":{",":<",":-9",":c",":S",";(",";*(",";_;","^>_>^","^o)","_|_",
               "`_´","</3","<=3","=/","=\\",">:(",">:-(","💔","☹️","😌","😒","😓","😔","😕","😖","😞","😟",
               "😠","😡","😢","😣","😤","😥","😦","😧","😨","😩","😪","😫","😬","😭","😯","😰","😱","😲",
               "😳","😴","😷","😾","😿","🙀","💀","👎"]

emoticonsDict = {}
for i,each in enumerate(pos_emoticons):
    emoticonsDict[each]=' POS_EMOTICON_'+num2words.num2words(i).upper()+' '
    
for i,each in enumerate(neg_emoticons):
    emoticonsDict[each]=' NEG_EMOTICON_'+num2words.num2words(i).upper()+' '
    
# use these three lines to do the replacement
rep = dict((re.escape(k), v) for k, v in emoticonsDict.items())
emoticonsPattern = re.compile("|".join(rep.keys()))

# Read in files
def read_tweets(filename):
    """Read the raw tweet data from a file. Replace Emails etc with special tokens """
    with open(filename, 'r') as f:
        all_lines=f.readlines()
        padded_lines=[]
        for line in all_lines:
            line = emoticonsPattern.sub(lambda m: rep[re.escape(m.group(0))], line.lower().strip())
            line=re.sub(r'(.)\1{2,}', r'\1\1',line)
            words_tokens=[token for token in TweetTokenizer().tokenize(line)]                    
            line= ' '.join(token for token in words_tokens )         
            padded_lines.append(line)
    return padded_lines

### Read in tweet training data and training labels, and combine into one data-frame

In [4]:
## Read in tweet file. Labels are 4 and 0 (4 being positive and 0 being negative)
tweets = read_tweets(training_tweet_filename)

with open(training_label_filename, 'r') as f:
    all_lines=f.readlines()
    tweet_labels = []
    for line in all_lines:
        line = line.strip()
        label = 0
        if line == '4':
            label = 1
        #else:
        #    label = 'negative'
        tweet_labels.append(label)

df = pd.DataFrame({'raw_tweets':tweets, 'labels':tweet_labels}, columns=['raw_tweets','labels'])
display(df[:3])

Unnamed: 0,raw_tweets,labels
0,damn fixtated on @kokupuff lovely thighs / hip...,1
1,god bless firefox's ' restore previous session...,1
2,@sherrieshepherd http://twitpic.com/6vn4a - da...,1


In [5]:
## Count the labels in training set
df['labels'].value_counts()

1    640257
0    639743
Name: labels, dtype: int64

### Use CallableVectorizer to pre-process tweets in a given dataframe
#### This takes raw tweets and further processes them to generate processed tweets which are used to run training / testing

In [6]:
# Define functions wihch are to be used to pre-process tweets
def to_lower_case(x):
    return x.lower()

def emailsReplace(x):
    return x.replace(r'[\w\.-]+@[\w\.-]+', ' EMAIL ')

def numsReplace(x):
    return x.replace(r'[\w\.-]+@[\w\.-]+', ' NUM ')

def userMentionsReplace(x):
    return x.replace(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)', ' USER ')

def urlReplace(x):
    return x.replace(r'r(f|ht)(tp)(s?)(://)(.*)[.|/][^ ]+', ' URL ')

def punctuationReplace(x):
    return x.replace(r'(?<=\w)[^\s\w](?![^\s\w])', ' PUN ')

def atReplace(x):
    return x.replace(r'@', ' AT ')

# Chain functions into a list
featFuncs=[to_lower_case, emailsReplace, numsReplace, userMentionsReplace, urlReplace, punctuationReplace, atReplace]

# Create a transformer specifying functions, 
callable = CallableVectorizer(input_col="raw_tweets", output_col="tweets", feat_list=featFuncs, preprocessor = True)
processed_df = callable.tatk_fit_transform(df)

processed_df.head(3)


CallableVectorizer::tatk_fit_transform ==> start
CallableVectorizer::tatk_fit_transform ==> end 	 Time taken: 0.1 mins


Unnamed: 0,raw_tweets,labels,tweets
0,damn fixtated on @kokupuff lovely thighs / hip...,1,damn fixtated on AT kokupuff lovely thighs / ...
1,god bless firefox's ' restore previous session...,1,god bless firefox's ' restore previous session...
2,@sherrieshepherd http://twitpic.com/6vn4a - da...,1,AT sherrieshepherd http://twitpic.com/6vn4a -...


In [7]:
processed_df[['tweets','labels']].head(3)

Unnamed: 0,tweets,labels
0,damn fixtated on AT kokupuff lovely thighs / ...,1
1,god bless firefox's ' restore previous session...,1
2,AT sherrieshepherd http://twitpic.com/6vn4a -...,1


### Split data into training and validation

In [8]:
train, valid = train_test_split(df, test_size=0.33)
train.head(3)

Unnamed: 0,raw_tweets,labels,tweets
758097,reading . probably will be for awhile .. god i...,0,reading . probably will be for awhile .. god i...
972688,pandora won't work on my phone .. sad day,0,pandora won't work on my phone .. sad day
277285,@ceurok i know right .. i finally got food ! !,1,AT ceurok i know right .. i finally got food ! !


## Scikit learn text classification pipeline

### Define scikit learn pipeline

In [9]:
from sklearn.model_selection import ParameterGrid
#params = {'tol': [0.0001, 0.001], 'max_iter': [5, 10]}

## Define classifier from scikit learn
log_reg_learner =  LogisticRegression(penalty='l2', dual=False, tol=0.0001, 
                            C=1.0, fit_intercept=True, intercept_scaling=1, 
                            class_weight=None, random_state=None, 
                            solver='lbfgs', max_iter=10, 
                            verbose=1, warm_start=True, n_jobs=3) 

# Train the model on tweets
text_classifier = TextClassifier(embedding_file_path = w2v_embeddings_filename,
                                estimator=log_reg_learner, 
                                extract_word_ngrams=False,
                                text_cols = ["tweets"], 
                                label_cols = ["labels"])
#text_classifier = SklearnTextClassifier(scikit_estimator=log_reg_learner, 
#                                            input_col="review", label_col ="sentiment",
#                                            prediction_col ="prediction",
#                                            extract_word_ngrams=True,
#                                            extract_char_ngrams=True)
#pipe_gs = GridSearchCV(text_classifier, param_grid=params, scoring="roc_auc", cv=3)

## Export parameters of the model
text_classifier.export_params(params_file_path)

TextClassifier::create_pipeline ==> start
:: number of jobs for the pipeline : 6
0	tweets_nltk_preprocessor
1	tweets_word2vec
2	assembler
3	learner
TextClassifier::create_pipeline ==> end


### Fit classifier on training data

In [10]:
text_classifier.fit(train)
#pipe_gs.fit(train)

TextClassifier::fit ==> start
schema: col=raw_tweets:TX:0 col=labels:I8:1 col=tweets:TX:2 header+
NltkPreprocessor::tatk_fit_transform ==> start
NltkPreprocessor::tatk_fit_transform ==> end 	 Time taken: 0.34 mins
Word2VecVectorizer::tatk_fit_transform ==> start
Word2VecVectorizer::tatk_fit_transform ==> end 	 Time taken: 1.61 mins
VectorAssembler::transform ==> start, num of input records=857600
(857600, 50)
all_features::
(857600, 50)
Time taken: 0.01 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_fit ==> start


[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    9.0s finished


LogisticRegression::tatk_fit ==> end 	 Time taken: 0.16 mins
Time taken: 2.13 mins
TextClassifier::fit ==> end


TextClassifier(add_index_col=False, callable_proprocessors_list=None,
        cat_cols=None, char_hashing_original=False, col_prefix='tmp_00_',
        decompose_n_grams=False, detect_phrases=False,
        dictionary_categories=None, dictionary_file_path=None,
        embedding_file_path='D:\\Sentiment140_Classification\\w2vec.txt',
        embedding_file_path_fasttext=None,
        estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=1, warm_start=True),
        estimator_vectorizers_list=None, extract_char_ngrams=False,
        extract_word_ngrams=False, label_cols=['labels'],
        numeric_cols=None, pos_tagger_vectorizer=False,
        preprocessor_dictionary_file_path=None, regex_replcaement='',
        replace_regex_pattern=None, scale_numeric_cols=False,
        text_callable_list=No

### Save the scikit learn training pipleine

In [11]:
text_classifier.save(model_file)

BaseTextModel::save ==> start
TatkPipeline::save ==> start
copy embedding file from  D:\Sentiment140_Classification\w2vec.txt
Time taken: 0.0 mins
TatkPipeline::save ==> end
Time taken: 0.55 mins
BaseTextModel::save ==> end


### Load and the training pipleine, predict and evaluate the accuracy on held-out validation set

In [12]:
text_classifier_reloaded = TextClassifier.load(model_file)

BaseTextModel::load ==> start
TatkPipeline::load ==> start
Word2VecVectorizer: Word2Vec model loaded from D:\Sentiment140_Classification\sk_model.zip 2018-04-19 20.56.26\sk_model\pipeline\tweets_word2vec\embedding_table.txt
Time taken: 0.07 mins
TatkPipeline::load ==> end
Time taken: 0.13 mins
BaseTextModel::load ==> end


In [13]:
predicted_df = text_classifier_reloaded.predict(valid)
display(predicted_df[:3])

TextClassifier::predict ==> start
NltkPreprocessor::tatk_transform ==> start
NltkPreprocessor::tatk_transform ==> end 	 Time taken: 0.17 mins
Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.8 mins
VectorAssembler::transform ==> start, num of input records=422400
(422400, 50)
all_features::
(422400, 50)
Time taken: 0.01 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_predict ==> start
LogisticRegression::tatk_predict ==> end 	 Time taken: 0.01 mins
Time taken: 0.98 mins
TextClassifier::predict ==> end


Unnamed: 0,raw_tweets,labels,tweets,prediction
1177509,@melissavaldez * __ * two more little hours an...,1,AT melissavaldez * __ * two more little hours...,0
836185,@ashleylynnek hahaha they have them in the mtv...,1,AT ashleylynnek hahaha they have them in the ...,1
1265665,is needing a part time job .. any ideas was th...,0,is needing a part time job .. any ideas was th...,0


In [14]:
text_classifier_reloaded.evaluate(valid)

TextClassifier::evaluate ==> start
schema: col=raw_tweets:TX:0 col=labels:I8:1 col=tweets:TX:2 col=prediction:I8:3 header+
NltkPreprocessor::tatk_transform ==> start
NltkPreprocessor::tatk_transform ==> end 	 Time taken: 0.17 mins
Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.8 mins
VectorAssembler::transform ==> start, num of input records=422400
(422400, 50)
all_features::
(422400, 50)
Time taken: 0.01 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_predict ==> start
LogisticRegression::tatk_predict ==> end 	 Time taken: 0.01 mins
[[157597  53002]
 [ 51195 160606]]
macro_f1 = 0.7533089777810893
Time taken: 0.99 mins
TextClassifier::evaluate ==> end


(array([[157597,  53002],
        [ 51195, 160606]], dtype=int64), 0.7533089777810893)

## Keras classification pipeline

### Define keras classifier

In [16]:
from tatk.pipelines.text_classification.keras_embedding_text_classifier import KerasEmbeddingTextClassifier
keras_text_classifier = KerasEmbeddingTextClassifier(embedding_file_path=w2v_embeddings_filename, 
                                                     input_col="tweets", 
                                                     label_col="labels",
                                                     model_type="CNN",
                                                     binary_format=False, 
                                                     callbacks=True)

KerasEmbeddingTextClassifier::create_pipeline ==> start
Word2VecVectorizer::load_embeddings ==> start
Time taken: 0.07 mins
Word2VecVectorizer::load_embeddings ==> end
num_words=69901
:: number of jobs for the pipeline : 6
0	nltk_preprocessor
1	vectorizer
2	learner
KerasEmbeddingTextClassifier::create_pipeline ==> end


In [22]:
# Set parameters
keras_text_classifier.set_step_params_by_name("learner", n_epochs = 10)
keras_text_classifier.set_step_params_by_name("learner", batch_size = 250)
keras_text_classifier.set_step_params_by_name("learner", validation_split = 0.2)

keras_text_classifier.set_step_params_by_name("learner", model__kernel_size=[3,4,5])
keras_text_classifier.set_step_params_by_name("learner", model__num_filters=25)
keras_text_classifier.set_step_params_by_name("learner", model__dropout_rate  = 0.5)
keras_text_classifier.set_step_params_by_name("learner", model__hidden_dims  = 100)

keras_text_classifier.set_step_params_by_name("vectorizer", get_from_path=False)

# Get parameters
keras_text_classifier.get_step_params_by_name("learner")

{'batch_size': 250,
 'callbacks_list': ['tensorboard', 'checkpoint', 'early'],
 'cuda_devices': '0',
 'feature_cols': ['features'],
 'input_padding_value': 69900,
 'label_col': 'labels',
 'log_path': 'C:\\Users\\remoteuser\\tatk\\resources\\logs',
 'max_len': None,
 'model__class_type': 'single-label',
 'model__dropout_rate': 0.5,
 'model__hidden_dims': 100,
 'model__init_wordvecs': array([[ 0.51494402, -0.01342   ,  1.57105696, ...,  0.256001  ,
         -0.079393  ,  0.787498  ],
        [-0.86031699,  0.164866  , -0.25757501, ...,  0.48005101,
          0.073384  , -0.211935  ],
        [ 0.935996  , -0.119759  ,  1.55779195, ...,  0.28700501,
         -0.13183101,  0.87609899],
        ...,
        [-0.26659   , -0.080407  , -0.104858  , ..., -0.021256  ,
          0.237636  ,  0.022621  ],
        [ 0.00239503,  0.03211888, -0.17313817, ...,  0.11466235,
         -0.34803781,  0.03757122],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0. 

### Fit Keras classifier 

In [23]:
keras_text_classifier.fit(train)

KerasEmbeddingTextClassifier::fit ==> start
schema: col=raw_tweets:TX:0 col=labels:I8:1 col=tweets:TX:2 header+
NltkPreprocessor::tatk_fit_transform ==> start
NltkPreprocessor::tatk_fit_transform ==> end 	 Time taken: 0.32 mins
Word2VecVectorizer::tatk_fit_transform ==> start
Word2VecVectorizer::tatk_fit_transform ==> end 	 Time taken: 0.22 mins
KerasEmbeddingTextClassifierLearner::tatk_fit ==> start
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 50)     3495050     input_2[0][0]                    
__________________________________________________________________________________________________
co





Epoch 00001: val_loss improved from inf to 0.46333, saving model to C:\Users\remoteuser\tatk\resources\logs\checkpoints
Epoch 2/10




Epoch 00002: val_loss improved from 0.46333 to 0.45891, saving model to C:\Users\remoteuser\tatk\resources\logs\checkpoints
Epoch 3/10




Epoch 00003: val_loss did not improve
Epoch 4/10




Epoch 00004: val_loss improved from 0.45891 to 0.45282, saving model to C:\Users\remoteuser\tatk\resources\logs\checkpoints
Epoch 5/10




Epoch 00005: val_loss improved from 0.45282 to 0.44892, saving model to C:\Users\remoteuser\tatk\resources\logs\checkpoints
Epoch 6/10




Epoch 00006: val_loss improved from 0.44892 to 0.44463, saving model to C:\Users\remoteuser\tatk\resources\logs\checkpoints
Epoch 7/10




Epoch 00007: val_loss improved from 0.44463 to 0.43867, saving model to C:\Users\remoteuser\tatk\resources\logs\checkpoints
Epoch 8/10




Epoch 00008: val_loss improved from 0.43867 to 0.43741, saving model to C:\Users\remoteuser\tatk\resources\logs\checkpoints
Epoch 9/10




Epoch 00009: val_loss did not improve
Epoch 10/10




Epoch 00010: val_loss improved from 0.43741 to 0.43567, saving model to C:\Users\remoteuser\tatk\resources\logs\checkpoints
KerasEmbeddingTextClassifierLearner::tatk_fit ==> end 	 Time taken: 8.89 mins
Time taken: 9.43 mins
KerasEmbeddingTextClassifier::fit ==> end


KerasEmbeddingTextClassifier(binary_format=False, callbacks=True,
               class_type='single-label', cuda_devices='0',
               embedding_file_path='D:\\Sentiment140_Classification\\w2vec.txt',
               input_col='tweets', label_col='labels', limit=None,
               log_dir='C:\\Users\\remoteuser\\tatk\\resources\\logs',
               model_type='CNN', n_labels=None,
               prediction_col='prediction',
               probabilities_col='probabilities', regex=None,
               trainable_embedding=False)

### Save keras model

In [24]:
keras_text_classifier.save(model_file)

BaseTextModel::save ==> start
TatkPipeline::save ==> start
Time taken: 0.04 mins
TatkPipeline::save ==> end
Time taken: 0.68 mins
BaseTextModel::save ==> end


### Load model and evaluate performance on a set of training data

In [25]:
keras_text_classifier_reloaded = KerasTextClassifier.load(model_file)

BaseTextModel::load ==> start
TatkPipeline::load ==> start
Word '<UNK>' is already in vocabulary.
Word '<<ZERO>>' is already in vocabulary.
Word2VecVectorizer: Word2Vec model loaded from D:\Sentiment140_Classification\sk_model.zip 2018-04-19 21.19.11\sk_model\pipeline\vectorizer\embedding_table.txt
Time taken: 0.11 mins
TatkPipeline::load ==> end
Time taken: 0.17 mins
BaseTextModel::load ==> end


In [26]:
predictions = keras_text_classifier_reloaded.evaluate(valid)

predictions

KerasEmbeddingTextClassifier::evaluate ==> start
schema: col=raw_tweets:TX:0 col=labels:I8:1 col=tweets:TX:2 col=prediction:I8:3 header+
KerasEmbeddingTextClassifier::predict ==> start
NltkPreprocessor::tatk_transform ==> start
NltkPreprocessor::tatk_transform ==> end 	 Time taken: 0.16 mins
Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.12 mins
KerasEmbeddingTextClassifierLearner::tatk_predict ==> start
KerasEmbeddingTextClassifierLearner::tatk_predict ==> end 	 Time taken: 0.5 mins
Time taken: 0.78 mins
KerasEmbeddingTextClassifier::predict ==> end
[[169911  40688]
 [ 39409 172392]]
macro_f1 score = 0.81
hamming loss = 0.19
Time taken: 0.79 mins
KerasEmbeddingTextClassifier::evaluate ==> end


{'confusion_matrix': array([[169911,  40688],
        [ 39409, 172392]], dtype=int64),
 'hamming_loss': 0.18962357954545456,
 'macro_f1': 0.810369878416989}