# Scikit Learn & Keras Text Classifier Pipeline with Word2Vec Featurization

### This notebook shows how to train pipelines for word embeddigns and scikit classifier

The pipeline takes as input a dataframe containing tweets in a text column, a word embedding file, and trains a scikit learn model. The modeling pipeline is then saved to a .zip file. This file can be loaded later and used for prediction and evaluation of new data sets.

In [1]:
import os

working_dir = r"D:\Sentiment140_Classification"

## Set training and test files
training_tweet_filename = os.path.join(working_dir, 'training_text.csv')
training_label_filename = os.path.join(working_dir, 'training_label.csv')
test_tweet_filename = os.path.join(working_dir, 'testing_text.csv')
test_label_filename = os.path.join(working_dir, 'testing_label.csv')

## Set word2vec file, parameter file for model, and model pipeline file
w2v_embeddings_filename = os.path.join(working_dir, 'w2vec.txt')
params_file_path = os.path.join(working_dir, "params.tsv")
model_file = os.path.join(working_dir, 'sk_model.zip') 

In [2]:
from tatk.feature_extraction.word2vec_vectorizer import Word2VecVectorizer

from __future__ import absolute_import
from __future__ import division
import tatk
import collections
import math
import sys
import random
import numpy as np
from six.moves import urllib
from six.moves import xrange  
from timeit import default_timer as timer
import pandas as pd
import re
import io
from nltk.tokenize import TweetTokenizer
import num2words

import math
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tatk.pipelines.text_classification.text_classifier import TextClassifier
from tatk.pipelines.text_classification.keras_text_classifier import KerasTextClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV 

!pip show azureml-tatk

%reload_ext autoreload
%autoreload 2
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Name: azureml-tatk
Version: 0.0.687318
Summary: Microsoft Azure Machine Learning Text Analytics Toolkit
Home-page: https://msdata.visualstudio.com/DefaultCollection/AlgorithmsAndDataScience/_git/TATK
Author: Microsoft Corporation
Author-email: azml-tatk@microsoft.com
License: UNKNOWN
Location: c:\users\remoteuser\appdata\local\amlworkbench\python\lib\site-packages
Requires: lxml, validators, matplotlib, scipy, azure-ml-api-sdk, unidecode, nose, docker, pytest, tensorflow-gpu, requests, bqplot, h5py, ipython, sklearn-crfsuite, scikit-learn, nltk, gensim, azure-storage, keras, pandas, numpy, pyspark, pdfminer.six


## Load and preprocess tweet data

### Define data loading the preprocessing functions for tweets

In [3]:
# Data processing
# In the following code, we replace Emails, URLS, emoticons etc with special labels
pos_emoticons=["(^.^)","(^-^)","(^_^)","(^_~)","(^3^)","(^o^)","(~_^)","*)",":)",":*",":-*",":]",":^)",":}",
               ":>",":3",":b",":-b",":c)",":D",":-D",":O",":-O",":o)",":p",":-p",":P",":-P",":Þ",":-Þ",":X",
               ":-X",";)",";-)",";]",";D","^)","^.~","_)m"," ~.^","<=8","<3","<333","=)","=///=","=]","=^_^=",
               "=<_<=","=>.<="," =>.>="," =3","=D","=p","0-0","0w0","8D","8O","B)","C:","d'-'","d(>w<)b",":-)",
               "d^_^b","qB-)","X3","xD","XD","XP","ʘ‿ʘ","❤","💜","💚","💕","💙","💛","💓","💝","💖","💞",
               "💘","💗","😗","😘","😙","😚","😻","😀","😁","😃","☺","😄","😆","😇","😉","😊","😋","😍",
               "😎","😏","😛","😜","😝","😮","😸","😹","😺","😻","😼","👍"]

neg_emoticons=["--!--","(,_,)","(-.-)","(._.)","(;.;)9","(>.<)","(>_<)","(>_>)","(¬_¬)","(X_X)",":&",":(",":'(",
               ":-(",":-/",":-@[1]",":[",":\\",":{",":<",":-9",":c",":S",";(",";*(",";_;","^>_>^","^o)","_|_",
               "`_´","</3","<=3","=/","=\\",">:(",">:-(","💔","☹️","😌","😒","😓","😔","😕","😖","😞","😟",
               "😠","😡","😢","😣","😤","😥","😦","😧","😨","😩","😪","😫","😬","😭","😯","😰","😱","😲",
               "😳","😴","😷","😾","😿","🙀","💀","👎"]

# Emails
emailsRegex=re.compile(r'[\w\.-]+@[\w\.-]+')

# Mentions
userMentionsRegex=re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)')

#Urls
urlsRegex=re.compile('r(f|ht)(tp)(s?)(://)(.*)[.|/][^ ]+') # It may not be handling all the cases like t.co without http

#Numerics
numsRegex=re.compile(r"\b\d+\b")

punctuationNotEmoticonsRegex=re.compile(r'(?<=\w)[^\s\w](?![^\s\w])')

emoticonsDict = {}
for i,each in enumerate(pos_emoticons):
    emoticonsDict[each]=' POS_EMOTICON_'+num2words.num2words(i).upper()+' '
    
for i,each in enumerate(neg_emoticons):
    emoticonsDict[each]=' NEG_EMOTICON_'+num2words.num2words(i).upper()+' '
    
# use these three lines to do the replacement
rep = dict((re.escape(k), v) for k, v in emoticonsDict.items())
emoticonsPattern = re.compile("|".join(rep.keys()))


##########################
def read_tweets(filename):
    """Read the raw tweet data from a file. Replace Emails etc with special tokens """
    with open(filename, 'r') as f:
        all_lines=f.readlines()
        padded_lines=[]
        for line in all_lines:
            line = emoticonsPattern.sub(lambda m: rep[re.escape(m.group(0))], line.lower().strip())
            line = userMentionsRegex.sub(' USER ', line )
            line = emailsRegex.sub(' EMAIL ', line )
            line=urlsRegex.sub(' URL ', line)
            line=numsRegex.sub(' NUM ',line)
            line=punctuationNotEmoticonsRegex.sub(' PUN ',line)
            line=re.sub(r'(.)\1{2,}', r'\1\1',line)
            words_tokens=[token for token in TweetTokenizer().tokenize(line)]                    
            line= ' '.join(token for token in words_tokens )         
            padded_lines.append(line)
    return padded_lines

### Read in tweet training data and training labels, and combine into one data-frame

In [5]:
## Read in tweet file. Labels are 4 and 0 (4 being positive and 0 being negative)
tweets = read_tweets(training_tweet_filename)

with open(training_label_filename, 'r') as f:
    all_lines=f.readlines()
    tweet_labels = []
    for line in all_lines:
        line = line.strip()
        label = 0
        if line == '4':
            label = 1
        #else:
        #    label = 'negative'
        tweet_labels.append(label)

df = pd.DataFrame({'tweets':tweets, 'labels':tweet_labels}, columns=['tweets','labels'])
display(df[:3])

Unnamed: 0,tweets,labels
0,damn fixtated on USER lovely thighs PUN hips o...,1
1,god bless firefox PUN s ' restore previous ses...,1
2,USER http://twitpic PUN com PUN 6vn4a - dang g...,1


In [6]:
## Count the labels in training set
df['labels'].value_counts()

1    640257
0    639743
Name: labels, dtype: int64

### Split data into training and validation

In [7]:
train, valid = train_test_split(df, test_size=0.33)
train.head(3)

Unnamed: 0,tweets,labels
379834,laalallalalaalalla too early good day so far (:,0
313748,""" now that #bgt has finished how will i surviv...",0
158848,cant belive jd woke up early and is still in n...,0


## Scikit learn text classification pipeline

### Define scikit learn pipeline

In [8]:
from sklearn.model_selection import ParameterGrid
params = {'tol': [0.0001, 0.001], 'max_iter': [5, 10]}

## Define classifier from scikit learn
log_reg_learner =  LogisticRegression(penalty='l2', dual=False, tol=0.0001, 
                            C=1.0, fit_intercept=True, intercept_scaling=1, 
                            class_weight=None, random_state=None, 
                            solver='lbfgs', max_iter=10, 
                            verbose=1, warm_start=True, n_jobs=3) 

# Train the model on tweets
text_classifier = TextClassifier(embedding_file_path = w2v_embeddings_filename,
                                estimator=log_reg_learner, 
                                extract_word_ngrams=False,
                                text_cols = ["tweets"], 
                                label_cols = ["labels"])
#text_classifier = SklearnTextClassifier(scikit_estimator=log_reg_learner, 
#                                            input_col="review", label_col ="sentiment",
#                                            prediction_col ="prediction",
#                                            extract_word_ngrams=True,
#                                            extract_char_ngrams=True)
#pipe_gs = GridSearchCV(text_classifier, param_grid=params, scoring="roc_auc", cv=3)

## Export parameters of the model
text_classifier.export_params(params_file_path)

TextClassifier::create_pipeline ==> start
:: number of jobs for the pipeline : 6
0	tweets_nltk_preprocessor
1	tweets_word2vec
2	assembler
3	learner
TextClassifier::create_pipeline ==> end


### Fit classifier on training data

In [9]:
text_classifier.fit(train)
#pipe_gs.fit(train)

TextClassifier::fit ==> start
NltkPreprocessor::tatk_fit_transform ==> start
NltkPreprocessor::tatk_fit_transform ==> end 	 Time taken: 0.24 mins
Word2VecVectorizer::tatk_fit_transform ==> start
Word2VecVectorizer::tatk_fit_transform ==> end 	 Time taken: 1.61 mins
VectorAssembler::transform ==> start, num of input records=857600
(857600, 50)
all_features::
(857600, 50)
Time taken: 0.01 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_fit ==> start


  y = column_or_1d(y, warn=True)


LogisticRegression::tatk_fit ==> end 	 Time taken: 0.15 mins
Time taken: 2.01 mins
TextClassifier::fit ==> end


[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    8.5s finished


TextClassifier(add_index_col=False, cat_cols=None,
        char_hashing_original=False, col_prefix='tmp_00_',
        decompose_n_grams=False, detect_phrases=False,
        dictionary_categories=None, dictionary_file_path=None,
        embedding_file_path='D:\\Sentiment140_Classification\\w2vec.txt',
        embedding_file_path_fasttext=None,
        estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=1, warm_start=True),
        estimator_vectorizers_list=None, extract_char_ngrams=False,
        extract_word_ngrams=False, label_cols=['labels'],
        numeric_cols=None, pos_tagger_vectorizer=False,
        preprocessor_dictionary_file_path=None, regex_replcaement='',
        replace_regex_pattern=None, text_callable_list=None,
        text_cols=['tweets'], text_regex_list=None, weight_col=N

### Save the scikit learn training pipleine

In [10]:
text_classifier.save(model_file)

BaseTextModel::save ==> start
TatkPipeline::save ==> start
Time taken: 0.02 mins
TatkPipeline::save ==> end
Time taken: 2.65 mins
BaseTextModel::save ==> end


### Load and the training pipleine, predict and evaluate the accuracy on held-out validation set

In [11]:
text_classifier_reloaded = TextClassifier.load(model_file)

BaseTextModel::load ==> start
TatkPipeline::load ==> start
Word '<UNK>' is already in vocabulary.
Word2VecVectorizer: Word2Vec model loaded from D:\Sentiment140_Classification\sk_model.zip 2018-03-21 19.07.43\sk_model\pipeline\tweets_word2vec\embedding_table.txt
Time taken: 0.06 mins
TatkPipeline::load ==> end
Time taken: 1.11 mins
BaseTextModel::load ==> end


In [12]:
predicted_df = text_classifier_reloaded.predict(valid)
display(predicted_df[:3])

TextClassifier::predict ==> start
NltkPreprocessor::tatk_transform ==> start
NltkPreprocessor::tatk_transform ==> end 	 Time taken: 0.12 mins
Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.81 mins
VectorAssembler::transform ==> start, num of input records=422400
(422400, 50)
all_features::
(422400, 50)
Time taken: 0.0 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_predict ==> start
LogisticRegression::tatk_predict ==> end 	 Time taken: 0.01 mins
Time taken: 0.95 mins
TextClassifier::predict ==> end


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  dataset[col_name] = [row for row in res]


Unnamed: 0,tweets,labels,prediction
459817,USER they always end up down me,1,0
557411,brewrat PUN s illiterate guy died of bacterial...,0,1
881788,@1omarion please say hi to me and USER she is ...,0,1


In [24]:
text_classifier_reloaded.evaluate(valid)

TextClassifier::evaluate ==> start
NltkPreprocessor::tatk_transform ==> start
NltkPreprocessor::tatk_transform ==> end 	 Time taken: 0.12 mins
Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.78 mins
VectorAssembler::transform ==> start, num of input records=422400
(422400, 50)
all_features::
(422400, 50)
Time taken: 0.0 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_predict ==> start


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  dataset[col_name] = [row for row in res]


LogisticRegression::tatk_predict ==> end 	 Time taken: 0.01 mins
[[154246  56704]
 [ 54207 157243]]
macro_f1 = 0.7374133908594638
Time taken: 0.93 mins
TextClassifier::evaluate ==> end


(array([[154246,  56704],
        [ 54207, 157243]], dtype=int64), 0.7374133908594638)

## Keras classification pipeline

### Define keras classifier

In [25]:
keras_text_classifier = KerasTextClassifier(embedding_file_path=w2v_embeddings_filename, 
                                            input_col="tweets", label_col="labels",
                                            model_type = 'binary')

KerasTextClassifier::create_pipeline ==> start
Word2VecVectorizer::load_embeddings ==> start
Time taken: 0.06 mins
Word2VecVectorizer::load_embeddings ==> end
num_words=49519
:: number of jobs for the pipeline : 6
0	index_generator
1	nltk_preprocessor
2	vectorizer
3	learner
[0]
KerasTextClassifier::create_pipeline ==> end


### Fit Keras classifier 

In [26]:
keras_text_classifier.get_params()

{'binary_format': False,
 'callbacks': False,
 'embedding_file_path': 'D:\\Sentiment140_Classification\\w2vec.txt',
 'input_col': 'tweets',
 'label_col': 'labels',
 'limit': None,
 'model_type': 'binary',
 'num_classes': 1,
 'prediction_col': 'prediction',
 'probabilities_col': 'probabilities'}

In [27]:
keras_text_classifier.fit(train)

KerasTextClassifier::fit ==> start
GenerateIndexTransformer::tatk_fit_transform ==> startNltkPreprocessor::tatk_fit_transform ==> start

GenerateIndexTransformer::tatk_fit_transform ==> end 	 Time taken: 0.0 mins
NltkPreprocessor::tatk_fit_transform ==> end 	 Time taken: 0.21 mins
Word2VecVectorizer::tatk_fit_transform ==> start
Word2VecVectorizer::tatk_fit_transform ==> end 	 Time taken: 0.25 mins
KerasTextClassifierLearner::tatk_fit ==> start
(857600, 2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          2475950   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 250)         37750     
_________________________________________________________________
global_max_

KerasTextClassifier(binary_format=False, callbacks=False,
          embedding_file_path='D:\\Sentiment140_Classification\\w2vec.txt',
          input_col='tweets', label_col='labels', limit=None,
          model_type='binary', num_classes=1, prediction_col='prediction',
          probabilities_col='probabilities')

### Save keras model

In [None]:
keras_text_classifier.save(model_file)

### Load model and evaluate performance on a set of training data

In [None]:
keras_text_classifier_reloaded = KerasTextClassifier.load(model_file)

In [None]:
predictions = keras_text_classifier_reloaded.evaluate(valid)

### Plot confusion matrix

In [None]:
conf_matrix = pd.DataFrame(predictions[0], index = ['pos', 'neg'], columns = ['pos', 'neg'])
plt.figure(figsize = (10,7))
sns.heatmap(conf_matrix / conf_matrix.sum(axis=1), annot=True)