## Step 1: Configure and import modules.

In [2]:
# Show azureml-tatk version
!pip show azureml-tatk

In [3]:
from tatk.utils import load_biomedical_data, download_embedding_model, data_dir, dictionaries_dir, models_dir
from tatk.connectors.blob_storage_data_connector import AzureBlobStorageDataConnector
from tatk.pipelines.feature_extraction.word2vec_model import Word2VecModel
from tatk.feature_extraction.word2vec_vectorizer import Word2VecVectorizer

from __future__ import absolute_import
from __future__ import division
import collections
import math
import os
import sys
import random
import numpy as np
from six.moves import urllib
from six.moves import xrange  
import tensorflow as tf
from timeit import default_timer as timer
import pandas as pd
import re
import io
from nltk.tokenize import TweetTokenizer
import num2words

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# Data processing and batch preparation
# In the following code, we replace Emails, URLS, emoticons etc with special labels

pos_emoticons=["(^.^)","(^-^)","(^_^)","(^_~)","(^3^)","(^o^)","(~_^)","*)",":)",":*",":-*",":]",":^)",":}",
               ":>",":3",":b",":-b",":c)",":D",":-D",":O",":-O",":o)",":p",":-p",":P",":-P",":Þ",":-Þ",":X",
               ":-X",";)",";-)",";]",";D","^)","^.~","_)m"," ~.^","<=8","<3","<333","=)","=///=","=]","=^_^=",
               "=<_<=","=>.<="," =>.>="," =3","=D","=p","0-0","0w0","8D","8O","B)","C:","d'-'","d(>w<)b",":-)",
               "d^_^b","qB-)","X3","xD","XD","XP","ʘ‿ʘ","❤","💜","💚","💕","💙","💛","💓","💝","💖","💞",
               "💘","💗","😗","😘","😙","😚","😻","😀","😁","😃","☺","😄","😆","😇","😉","😊","😋","😍",
               "😎","😏","😛","😜","😝","😮","😸","😹","😺","😻","😼","👍"]

neg_emoticons=["--!--","(,_,)","(-.-)","(._.)","(;.;)9","(>.<)","(>_<)","(>_>)","(¬_¬)","(X_X)",":&",":(",":'(",
               ":-(",":-/",":-@[1]",":[",":\\",":{",":<",":-9",":c",":S",";(",";*(",";_;","^>_>^","^o)","_|_",
               "`_´","</3","<=3","=/","=\\",">:(",">:-(","💔","☹️","😌","😒","😓","😔","😕","😖","😞","😟",
               "😠","😡","😢","😣","😤","😥","😦","😧","😨","😩","😪","😫","😬","😭","😯","😰","😱","😲",
               "😳","😴","😷","😾","😿","🙀","💀","👎"]

# Emails
emailsRegex=re.compile(r'[\w\.-]+@[\w\.-]+')

# Mentions
userMentionsRegex=re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)')

#Urls
urlsRegex=re.compile('r(f|ht)(tp)(s?)(://)(.*)[.|/][^ ]+') # It may not be handling all the cases like t.co without http

#Numerics
numsRegex=re.compile(r"\b\d+\b")

punctuationNotEmoticonsRegex=re.compile(r'(?<=\w)[^\s\w](?![^\s\w])')

emoticonsDict = {}
for i,each in enumerate(pos_emoticons):
    emoticonsDict[each]=' POS_EMOTICON_'+num2words.num2words(i).upper()+' '
    
for i,each in enumerate(neg_emoticons):
    emoticonsDict[each]=' NEG_EMOTICON_'+num2words.num2words(i).upper()+' '
    
# use these three lines to do the replacement
rep = dict((re.escape(k), v) for k, v in emoticonsDict.items())
emoticonsPattern = re.compile("|".join(rep.keys()))

In [5]:
def read_tweets(filename):
    """Read the raw tweet data from a file. Replace Emails etc with special tokens """
    with open(filename, 'r') as f:
        all_lines=f.readlines()
        padded_lines=[]
        for line in all_lines:
            line = emoticonsPattern.sub(lambda m: rep[re.escape(m.group(0))], line.lower().strip())
            line = userMentionsRegex.sub(' USER ', line )
            line = emailsRegex.sub(' EMAIL ', line )
            line=urlsRegex.sub(' URL ', line)
            line=numsRegex.sub(' NUM ',line)
            line=punctuationNotEmoticonsRegex.sub(' PUN ',line)
            line=re.sub(r'(.)\1{2,}', r'\1\1',line)
            words_tokens=[token for token in TweetTokenizer().tokenize(line)]                    
            line= ' '.join(token for token in words_tokens )         
            padded_lines.append(line)
    return padded_lines

In [6]:
training_filename = r"D:\Sentiment140_Classification\training_text.csv"
tweets = read_tweets(training_filename)

df = pd.DataFrame({'tweets':tweets})
display(df[:5])

Unnamed: 0,tweets
0,damn fixtated on USER lovely thighs PUN hips o...
1,god bless firefox PUN s ' restore previous ses...
2,USER http://twitpic PUN com PUN 6vn4a - dang g...
3,USER hey PUN sorry u had to come back to work ...
4,bye mommy PUN we PUN ll miss you PUN


In [7]:
word2vec_model = Word2VecModel(input_col = 'tweets', regex = None, detect_sentences = True)

Word2VecModel::create_pipeline ==> start
input_col=tweets
input_col=NltkPreprocessor1f41071076a64f93b082fd888a6015e7
input_col=UngroupTransformer5d9b2013721e45dea4a222afc1cd23c6
:: number of jobs for the pipeline : 6
0	nltk_preprocessor
1	ungroup_transformer
Word2VecModel::create_pipeline ==> end


In [8]:
Word2VecModel

tatk.pipelines.feature_extraction.word2vec_model.Word2VecModel

In [54]:
print(word2vec_model)

Word2VecModel TATK Pipeline:
0 - nltk_preprocessor(tweets,NltkPreprocessorcdf2b441e3ec469da0dfc5e0e447bd42)
1 - ungroup_transformer(NltkPreprocessorcdf2b441e3ec469da0dfc5e0e447bd42,UngroupTransformer3832ece195f0418b921b5c0988877d3b)
2 - vectorizer(UngroupTransformer3832ece195f0418b921b5c0988877d3b,Word2VecVectorizer0a03e9fbaea945fb99b3482cedf830e2)



### (3.2) Display and Change default pipeline parameters

In [55]:
# Get step indices. 
print("word2vec_model.get_step_id({})".format("nltk_preprocessor"))
print(word2vec_model.get_step_id("nltk_preprocessor"))

print("word2vec_model.get_step_id({})".format("vectorizer"))
print(word2vec_model.get_step_id("vectorizer"))

word2vec_model.get_step_id(nltk_preprocessor)
0
word2vec_model.get_step_id(vectorizer)
2


In [57]:
word2vec_model.get_step_params_by_name('vectorizer')

{'aggregation_func': <function tatk.feature_extraction.word2vec_vectorizer.Word2VecVectorizer.aggregate_mean>,
 'case_sensitive': False,
 'context_window_size': 5,
 'copy_from_path': True,
 'embedding_size': 100,
 'embedding_table': None,
 'get_from_path': True,
 'input_col': 'UngroupTransformer3832ece195f0418b921b5c0988877d3b',
 'min_df': 5,
 'negative_sample_size': 5,
 'num_epochs': 5,
 'num_workers': 4,
 'output_col': 'Word2VecVectorizer0a03e9fbaea945fb99b3482cedf830e2',
 'return_type': 'word_vector',
 'save_overwrite': True,
 'skip_OOV': False,
 'trainable': True,
 'trained_model': None,
 'use_hierarchical_softmax': 0,
 'use_skipgram': 0}

In [58]:
# Change minimum and maximum subwords size.  
#word2vec_model.set_step_params_by_name('vectorizer', min_char_ngrams = 4, max_char_ngrams = 5) 
#word2vec_model.get_step_params_by_name('vectorizer')

### (3.3) Fit the model on the training set

In [59]:
word2vec_model.fit(df)

Word2VecModel::fit ==> start
NltkPreprocessor::tatk_fit_transform ==> start
NltkPreprocessor::tatk_fit_transform ==> end 	 Time taken: 1.52 mins
UngroupTransformer::tatk_fit_transform ==> start
UngroupTransformer::tatk_fit_transform ==> end 	 Time taken: 0.03 mins
Word2VecVectorizer::tatk_fit ==> start
vocabulary size =49252
Word2VecVectorizer::tatk_fit ==> end 	 Time taken: 1.16 mins
Time taken: 2.71 mins
Word2VecModel::fit ==> end


Word2VecModel(detect_sentences=True, input_col='tweets', regex=None)

## Step 4: Save and Load pipeline for additional training

### (4.1) Save and Load the pipeline

In [60]:
models_dir = r'D:\Sentiment140_Classification'
pipeline_path = os.path.join(models_dir, 'word2vec_model')
word2vec_model.save(pipeline_path, create_folders_on_path=True)
word2vec_model2 = Word2VecModel.load(pipeline_path)

BaseTextModel::save ==> start
TatkPipeline::save ==> start
Time taken: 0.01 mins
TatkPipeline::save ==> end
Time taken: 0.01 mins
BaseTextModel::save ==> end
BaseTextModel::load ==> start
TatkPipeline::load ==> start
Word2VecVectorizer: Word2Vec model loaded from D:\Sentiment140_Classification\word2vec_model\pipeline\vectorizer\embedding_model.gen
Time taken: 0.01 mins
TatkPipeline::load ==> end
Time taken: 0.01 mins
BaseTextModel::load ==> end


### (4.2) Perform additional training on new data

## Step 5: Save and Load Embeddings For Lookup

### (5.1) Save the embeddings from the model

In [61]:
# Saved embeddings file is in textual format and is readable if opened with a text editor
embeddings_file_path = os.path.join(models_dir, 'word2vec_embeddings.txt')
word2vec_model2.save_embeddings(embeddings_file_path)

Word2VecVectorizer::save_embeddings ==> start
Time taken: 0.05 mins
Word2VecVectorizer::save_embeddings ==> end


### (5.2) Load the embeddings to memory with include_unk set to True to add OOV treatment

In [62]:
#embeddings_file_path = r'D:\Sentiment140_Classification'
vectorizer = Word2VecVectorizer.load_embeddings(embeddings_file_path, include_unk = True, unk_method = 'rnd', unk_vector = None, unk_word = '<UNK>')

Word2VecVectorizer::load_embeddings ==> start
Time taken: 0.11 mins
Word2VecVectorizer::load_embeddings ==> end


### (5.3) Embedding Lookup: Get word and subword indices.

In [63]:
df_predict = pd.DataFrame({'text' : ["I have fever", "My doctor prescribed me ibuprofen."]})
vectorizer.input_col = 'text'
vectorizer.output_col = 'indices'
vectorizer.return_type = 'word_index'
result = vectorizer.tatk_transform(df_predict)
display(result)

Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.0 mins


Unnamed: 0,text,indices
0,I have fever,"[1, 24, 1188]"
1,My doctor prescribed me ibuprofen.,"[9, 1215, 15983, 21, 49252]"


### (5.4) Embedding Lookup: Get word embeddings.

In [20]:
vectorizer.output_col = 'word_vector'
vectorizer.return_type = 'word_vector'
result = vectorizer.tatk_transform(df_predict)
display(result)

Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.0 mins


Unnamed: 0,text,indices,word_vector
0,I have fever,"[116, 55, 1739]","[[-0.743229985237, -0.0243050009012, -0.760702..."
1,My doctor prescribed me ibuprofen.,"[10065, 5543, 8558, 5103, 25585]","[[-0.0086089996621, -0.321725994349, 0.0310159..."


### (5.5) Embedding Lookup: Get sentence embedding.

In [21]:
vectorizer.output_col = 'sentence_vector'
vectorizer.return_type = 'sentence_vector'
result = vectorizer.tatk_transform(df_predict)
display(result)

Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.0 mins


Unnamed: 0,text,indices,word_vector,sentence_vector
0,I have fever,"[116, 55, 1739]","[[-0.743229985237, -0.0243050009012, -0.760702...","[-0.723917007446, 0.354095672568, -1.114667336..."
1,My doctor prescribed me ibuprofen.,"[10065, 5543, 8558, 5103, 25585]","[[-0.0086089996621, -0.321725994349, 0.0310159...","[0.0336762743894, -0.0964680787281, -0.1395916..."


### (5.6) Embedding Lookup: Get most similar word to a given word.

In [65]:
vectorizer.embedding_table.most_similar('fever')

[('great', 0.7867355346679688),
 ('tough', 0.7338866591453552),
 ('rough', 0.7106154561042786),
 ('nice', 0.6891409158706665),
 ('bad', 0.6804108619689941),
 ('fantastic', 0.658706784248352),
 ('fabulous', 0.6579225659370422),
 ('successful', 0.6407154202461243),
 ('terrible', 0.6375681161880493),
 ('fab', 0.6368352174758911)]