In [None]:
# If you run this in Colab please uncomment and execute the line below
# !pip install wordninja

In [None]:
import inspect

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import wordninja
from gensim.models import FastText, Word2Vec, Doc2Vec
from gensim.models.callbacks import CallbackAny2Vec
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import text_to_word_sequence

keras = tf.keras
plt.style.use('ggplot')

# NLP Concepts 3

## Helper functions

In [None]:
def clean_tokenize(text):
    """
    Turns a raw string into a flat list of cleaned tokens.

    The text is first passed through Keras' `text_to_word_sequence`, and every
    resulting word is then handed to `wordninja.split`; the pieces returned for
    all words are concatenated in order.

    Input: arbitrary text <str>
    Returns: a <list> of tokens <str>
    """

    words = text_to_word_sequence(text)

    # Flatten the per-word lists produced by wordninja into a single token list
    return [piece for word in words for piece in wordninja.split(word)]

In [None]:
def get_avg_embedding(article, model, in_vocab_check = True):
    """
    Averages the word vectors of the tokens of one document.

    Input:
        article: an iterable of tokens <str>
        model: an object exposing word vectors through `model.wv[word]`
        in_vocab_check: when True, tokens missing from the model vocabulary are
            skipped; when False, every token is looked up directly
    Returns: the sum of the looked-up vectors divided by their count
    Raises: ZeroDivisionError when no token contributed a vector
    """
    wv = model.wv

    vocab = None
    if in_vocab_check:
        # gensim >= 4 exposes the vocabulary as `key_to_index`;
        # older releases expose it as `vocab`
        vocab = getattr(wv, 'key_to_index', None)
        if vocab is None:
            vocab = wv.vocab

    emb_sum = 0
    n_elems = 0

    for word in article:
        if vocab is not None and word not in vocab:
            continue
        emb_sum += wv[word]
        n_elems += 1

    if n_elems == 0:
        # Same exception type the bare division used to raise, so callers
        # that catch it to skip empty documents keep working
        raise ZeroDivisionError('no token with an embedding to average')

    return emb_sum / n_elems

## Prepare the data

### Read in

In [None]:
# Forward slashes are accepted on Windows as well as on Linux/macOS (e.g. Colab),
# whereas the backslash form only resolves on Windows
data_path = './data/SMSSpamCollection.txt'

In [None]:
# Read the data
data_tuples = []

# Decode explicitly instead of relying on the platform default encoding
# (assumes the file is UTF-8 encoded — TODO confirm for your copy of the dataset)
with open(data_path, 'r', encoding = 'utf-8') as f:
    for line in f:
        # Drop the line terminator and split on the first tab only,
        # so a tab inside the message cannot truncate it
        line_splt = line.rstrip('\n').split('\t', 1)

        if len(line_splt) < 2:
            # Blank or malformed line without a label/content pair
            continue

        data_tuples.append((line_splt[0], line_splt[1]))

In [None]:
# Create a dataframe
# Each (label, content) tuple becomes one row; the columns are named in the next cell
data = pd.DataFrame(data_tuples)

In [None]:
# Update colnames
# Column order follows the tuples built while reading the file: label first, then message text
data.columns = ['label', 'content']

data.head()

In [None]:
# Cast labels to ints (1 = spam, 0 = anything else)
# Accepting the already-encoded value 1 keeps this cell idempotent:
# re-running it no longer turns every label into 0
data['label'] = data.label.apply(lambda x: 1 if x in ('spam', 1) else 0)

In [None]:
# Check that the labels are now integers
data.head()

In [None]:
# (number of messages, number of columns)
data.shape

### Tokenize

In [None]:
# Look at one raw message (index label 5083) before cleaning
data['content'][5083]

In [None]:
# The same message after clean_tokenize, re-joined with spaces for display
' '.join(clean_tokenize(data['content'][5083]))

In [None]:
# Tokenize every message; the result keeps the row order of `data`
tokenized = list(map(clean_tokenize, data.content.values))

## Build an SMS classifier

### Using `gensim` & `keras`

In [None]:
# Define callbacks

class EpochLogger(CallbackAny2Vec):
    """Training callback that announces the start of every 10th epoch."""

    def __init__(self):
        # Epochs are counted starting from 1
        self.epoch = 1

    def on_epoch_begin(self, model):
        """Print a progress line when the current epoch number is a multiple of 10."""
        if self.epoch % 10:
            return
        print(f"Epoch {self.epoch:02d} started...")

    def on_epoch_end(self, model):
        """Advance the epoch counter once an epoch has finished."""
        self.epoch = self.epoch + 1

#### Define Word2Vec model

In [None]:
# Define model params
N_EPOCHS  = 100
EMB_DIM   = 300
WINDOW    = 20
MIN_COUNT = 1
SKIP_GRAM = 1

# gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`.
# Use whichever spelling the installed Word2Vec accepts so the notebook
# runs on both gensim 3.x and 4.x.
_w2v_args   = inspect.signature(Word2Vec.__init__).parameters
_size_key   = 'vector_size' if 'vector_size' in _w2v_args else 'size'
_epochs_key = 'epochs' if 'epochs' in _w2v_args else 'iter'

gensim_params = {
    _size_key: EMB_DIM,
    'sg': SKIP_GRAM,
    _epochs_key: N_EPOCHS,
    'window': WINDOW,
    'min_count': MIN_COUNT,
}

#### Train Word2Vec model

In [None]:
# Train the model
# The tokenized messages are the training corpus; EpochLogger prints progress every 10th epoch
model = Word2Vec(tokenized, **gensim_params, callbacks = [EpochLogger()])

#### Build document representations

In [None]:
labels = data.label.values

labels_clean = []
doc_vecs = []

# Build one averaged vector per message, keeping labels aligned with the vectors
for label, row in zip(labels, tokenized):
    try:
        doc_vec = get_avg_embedding(row, model)
    except ZeroDivisionError:
        # Nothing to average for this message -> drop the message and its label together
        continue

    doc_vecs.append(doc_vec)
    labels_clean.append(label)

In [None]:
# Number of messages that received a document vector (skipped messages are not counted)
len(doc_vecs)

#### Scale the vectors & prepare labels

In [None]:
# Standardize the document vectors feature-wise
scaler = StandardScaler()
X_scaled = scaler.fit_transform(doc_vecs)

# Labels of the kept messages only, cast to float
y_train = np.array(labels_clean).astype('float')

#### Visualize doc embeddings and labels

In [None]:
# Dimensionality reduction
# Project the scaled document vectors onto two principal components for plotting
pca = PCA(n_components = 2)
X_train_2d = pca.fit_transform(X_scaled)

In [None]:
# Plot
plt.figure(figsize = (15, 8))
# Coordinates are passed by keyword: recent seaborn releases no longer accept
# x / y positionally, while the keyword form works on old and new versions alike
sns.scatterplot(x = X_train_2d[:, 0], 
                y = X_train_2d[:, 1], 
                hue = ['spam' if i == 1 else 'ham' for i in labels_clean], 
                alpha = .2)
plt.title('Two first components of document-level averaged word vectors')
plt.show()


#### Build a classifier

In [None]:
# Feed-forward spam/ham classifier on top of the averaged document vectors.
# The input width is tied to EMB_DIM so it follows the Word2Vec vector size.
clf_01 = keras.Sequential([
    keras.layers.Dense(16, input_shape = (EMB_DIM,), activation = 'selu', kernel_initializer = 'lecun_normal'),
    keras.layers.AlphaDropout(.2),
    keras.layers.Dense(64, activation = 'selu', kernel_initializer = 'lecun_normal'),
    keras.layers.AlphaDropout(.2),
    keras.layers.Dense(32, activation = 'selu', kernel_initializer = 'lecun_normal'),
    keras.layers.AlphaDropout(.2),
    # Softmax turns the two outputs into a probability distribution over the
    # classes, which is what SparseCategoricalCrossentropy expects by default;
    # independent sigmoids do not sum to 1
    keras.layers.Dense(2, activation = 'softmax')
])

clf_01.compile(loss = keras.losses.SparseCategoricalCrossentropy(), 
               optimizer = keras.optimizers.RMSprop(learning_rate=.001),
               metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer overview of clf_01
clf_01.summary()

In [None]:
# Fit on the scaled document vectors for 50 epochs
clf_01.fit(X_scaled, y_train, epochs = 50, verbose = 1)

In [None]:
# Plot model loss
# Per-epoch loss values are read from the history stored on the model
plt.plot(clf_01.history.history['loss'], label = 'Loss', lw = 1)
plt.legend()
plt.xlabel('Epoch')
plt.show()

In [None]:
# Messages for which no document vector could be built were dropped from
# X_scaled / y_train, so rebuild the matching message list with the same rule.
# Zipping against the full `tokenized` list would pair predictions with the
# wrong texts as soon as a single message had been skipped.
messages_kept = []
for row in tokenized:
    try:
        get_avg_embedding(row, model)
    except ZeroDivisionError:
        continue
    messages_kept.append(row)

preds_01 = clf_01.predict(X_scaled).argmax(axis = 1)
assert len(messages_kept) == len(preds_01) == len(y_train), 'messages and predictions are misaligned'

# Print every message whose predicted class differs from its label
for message, pred, label in zip(messages_kept, preds_01, y_train):
    if pred != label:
        print(f'{" ".join(message)[:80]:90}| Label: {int(label)} Pred: {pred}')

### Train pure `keras` model

We can also train embeddings simultaneously with the classifier using the [Keras `Embedding` layer](https://keras.io/api/layers/core_layers/embedding/).

This layer learns embeddings specific to your supervised task. That's different from the unsupervised (or self-supervised) approach used in pure Word2Vec.

Both approaches might have advantages, depending on your task's context. E.g., [this paper](https://arxiv.org/abs/1804.06323) describes when using pre-trained vs learned embeddings can be beneficial.

Please note that the Keras model we train here does not perform document-level averaging of the embeddings. That's another difference compared to the approach presented in the ***Using gensim & keras*** section.

#### Preprocess the data

In [None]:
# Get maxlen 
MAX_LEN = max(len(row) for row in tokenized)

# Get vocab size
# dict.fromkeys de-duplicates in linear time while keeping first-seen order
# (scanning a growing list for every word is quadratic in the vocabulary size)
unique_words = list(dict.fromkeys(word for row in tokenized for word in row))

# Keras' Tokenizer numbers words from 1 (0 is left for padding) and keeps only
# indices strictly below `num_words`; an Embedding layer needs
# `input_dim` = largest index + 1. Reserving the extra slot here keeps the
# rarest word from being silently dropped from the sequences.
VOCAB_SIZE = len(unique_words) + 1

In [None]:
# Display the value of VOCAB_SIZE
VOCAB_SIZE

In [None]:
# Define tokenizer
# The messages are already tokenized, so they are re-joined with spaces once
# and the Tokenizer's own character filtering is switched off
joined_messages = [' '.join(row) for row in tokenized]

tokenizer = keras.preprocessing.text.Tokenizer(num_words = VOCAB_SIZE, filters = '')
tokenizer.fit_on_texts(joined_messages)

# Integer-encode every message, then pad at the end up to MAX_LEN
sequences = tokenizer.texts_to_sequences(joined_messages)
padded    = keras.preprocessing.sequence.pad_sequences(sequences, maxlen = MAX_LEN, padding='post') 

In [None]:
# Prepare y_train
# `padded` is built from every row of `tokenized`, so the labels are taken
# from the full dataframe without filtering
y_train_02 = data.label.values.astype('float')

#### Define the model

In [None]:
# Classifier that learns its own word embeddings from the padded index sequences.
# The embedding width reuses EMB_DIM so both models work with vectors of the same size.
clf_02 = keras.Sequential([
    # NOTE(review): the mask created by mask_zero is presumably not consumed by
    # Flatten / Dense — confirm whether it has any effect here
    keras.layers.Embedding(VOCAB_SIZE, EMB_DIM, mask_zero = True, input_length = MAX_LEN),
    # Concatenate the per-position embeddings into one long vector (no averaging)
    keras.layers.Flatten(),
    keras.layers.Dense(16, activation = 'selu', kernel_initializer = 'lecun_normal'),
    keras.layers.AlphaDropout(.2),
    keras.layers.Dense(64, activation = 'selu', kernel_initializer = 'lecun_normal'),
    keras.layers.AlphaDropout(.2),
    keras.layers.Dense(32, activation = 'selu', kernel_initializer = 'lecun_normal'),
    keras.layers.AlphaDropout(.2),
    # Softmax turns the two outputs into a probability distribution over the
    # classes, which is what SparseCategoricalCrossentropy expects by default;
    # independent sigmoids do not sum to 1
    keras.layers.Dense(2, activation = 'softmax')
])

clf_02.compile(loss = keras.losses.SparseCategoricalCrossentropy(), 
               optimizer = keras.optimizers.RMSprop(learning_rate=.001),
               metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer overview of clf_02
clf_02.summary()

In [None]:
# Fit on the padded index sequences for 15 epochs
clf_02.fit(padded, y_train_02, epochs = 15, verbose = 1)

In [None]:
# Plot model loss
# Both training-loss curves on one axis; clf_01 was fitted for 50 epochs and
# clf_02 for 15, so the curves have different lengths
plt.plot(clf_01.history.history['loss'], label = 'Loss clf_01', lw = 1)
plt.plot(clf_02.history.history['loss'], label = 'Loss clf_02', lw = 1)
plt.legend()
plt.xlabel('Epoch')
plt.show()

In [None]:
# Print every message whose predicted class differs from its label
predicted_02 = clf_02.predict(padded).argmax(axis = 1)

for message, pred, label in zip(tokenized, predicted_02, y_train_02):
    if pred == label:
        continue
    print(f'{" ".join(message)[:80]:90}| Label: {int(label)} Pred: {pred}')