<a href="https://colab.research.google.com/github/AnyaMit/ANLPwTF/blob/main/ANLPwTF_AnyaMit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import os
import io

tf.__version__

In [None]:
# Download the SMS Spam Collection zip archive from the UCI repository.
# The value returned by get_file is handed to the shell `unzip` step below.

path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip",origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",extract=True)

# Unzip the file into the local "data" folder

!unzip $path_to_zip -d data

In [None]:
# Let's see if we read the data correctly

# Use a context manager so the file handle is closed deterministically, and
# name the encoding so parsing does not depend on the platform default.
with io.open('data/SMSSpamCollection', encoding='utf-8') as corpus_file:
  lines = corpus_file.read().strip().split('\n')
lines[0]

In [None]:
## PRE PROCESSING DATA SECTION

In [None]:
# Convert every "label<TAB>message" line into an (is_spam, text) tuple,
# where is_spam is 1 for the label 'spam' and 0 for anything else.
spam_dataset = []
for line in lines:
  # Split on the first tab only, so a tab inside the message body cannot
  # break the two-value unpacking.
  label, text = line.split('\t', 1)
  spam_dataset.append((1 if label.strip() == 'spam' else 0, text.strip()))

# Show a small sample rather than dumping every record into the cell output.
print(spam_dataset[:5])

In [None]:
import pandas as pd

# Two-column frame: 'Spam' holds the first tuple element (0/1 label), 'Message' the text.
df = pd.DataFrame(spam_dataset, columns=['Spam','Message'])

import re

def message_length(x):
  """Return the total number of characters in ``x`` (i.e. ``len(x)``)."""
  n_chars = len(x)
  return n_chars

def num_capitals(x):
  """Count the uppercase ASCII letters (A-Z) in ``x``.

  Only works for English text: accented or non-Latin capitals are not matched.

  Args:
    x: the message text.

  Returns:
    The number of characters in ``x`` that fall in the range A-Z.
  """
  # The class must be exactly A-Z; the previous pattern also contained a
  # lowercase 'z', which inflated the count for words such as "pizza".
  return len(re.findall(r'[A-Z]', x))

def num_punctuation(x):
  """Count the non-word characters in ``x``.

  A non-word character is anything the regex engine does not treat as a
  letter, digit or underscore, so whitespace is included in the count.
  """
  non_word_chars = re.findall(r'\W', x)
  return len(non_word_chars)




In [None]:
# Derive the three hand-crafted numeric features from each message.
df['Capitals'] = df['Message'].apply(num_capitals)
df['Punctuation'] = df['Message'].apply(num_punctuation)
df['Length'] = df['Message'].apply(message_length)
# Last expression of the cell: its summary table is what the notebook displays.
df.describe()

In [None]:
## Split into Test and Train

# Draw an 80% random sample with a fixed seed; the rows not sampled form the test set.
train = df.sample(frac=0.8, random_state = 42)
test = df.drop(train.index)

# Feature matrix: the three engineered columns.
x_train = train[['Length','Capitals','Punctuation']]
# Label is selected with a list of columns (double brackets); code further
# down reads it back as y_train.Spam.
y_train = train[['Spam']]

x_test = test[['Length','Capitals','Punctuation']]
y_test = test[['Spam']]

In [None]:
## Model Normalization

# 1-layer neural network model for evaluation

def make_model(input_dims = 3, num_units = 12):
  """Build and compile a small dense binary classifier.

  Args:
    input_dims: number of input features fed to the hidden layer.
    num_units: number of units in the hidden ReLU layer.

  Returns:
    A ``tf.keras.Sequential`` model (hidden Dense/ReLU layer followed by a
    single sigmoid unit) compiled with binary cross-entropy loss, the Adam
    optimizer and the accuracy metric.
  """
  model = tf.keras.Sequential()

  # Hidden layer first, then the single-unit sigmoid output.
  hidden_layer = tf.keras.layers.Dense(num_units,
                                       input_dim=input_dims,
                                       activation='relu')
  output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
  for layer in (hidden_layer, output_layer):
    model.add(layer)

  model.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])
  return model

In [None]:
# Build the model with make_model's defaults (3 inputs, 12 hidden units).
model = make_model()

# Train for 10 epochs with mini-batches of 10 rows.
model.fit(x_train, y_train, epochs=10, batch_size=10)

In [None]:
model.evaluate(x_test,y_test)

# Sequential.predict_classes is no longer available in current TF 2.x releases.
# For a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5; ravel() gives the 1-D vector confusion_matrix expects.
y_train_pred = (model.predict(x_train) > 0.5).astype('int32').ravel()

# confusion matrix (rows: true labels, columns: predicted labels)

tf.math.confusion_matrix(tf.constant(y_train.Spam),
                         y_train_pred)

#  array([[3733,  134],
#       [ 133,  459]], dtype=int32)>


In [None]:
sentence = 'Go until Jurong point, crazy.. Available onlly in bugis n great world'
sentence.split()

In [None]:
!pip install stanza 
import stanza


In [None]:
en = stanza.download('en')

In [None]:
en = stanza.Pipeline(lang='en', processors='tokenize')

In [None]:
# Run the pipeline held in `en` over the sample sentence.
tokenized = en(sentence)
# NOTE: not the last expression of the cell, so this value is not displayed.
len(tokenized.sentences)

# Print one token per line, with a marker after each detected sentence.
for snt in tokenized.sentences:
  for word in snt.tokens:
    print(word.text)
  print("<End of Sentence>")

In [None]:
en = stanza.Pipeline(lang='en') 
print(en)

def word_counts(x, pipeline=en):
  """Return the number of tokens in ``x``.

  ``x`` is run through ``pipeline`` and the token counts of all resulting
  sentences are added up.
  """
  total = 0
  for sent in pipeline(x).sentences:
    total += len(sent.tokens)
  return total


In [None]:
## This did not work due to the error described in the book. No work around found for the Pytorch issue....

#/usr/local/lib/python3.7/dist-packages/stanfordnlp/models/depparse/model.py:157: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at  /pytorch/aten/src/ATen/native/cuda/LegacyDefinitions.cpp:28.)
# unlabeled_scores.masked_fill_(diag, -float('inf'))

# Add the token count of every message as a fourth feature.
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)

x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
# Single brackets here: y_train / y_test are rebound to a Series, not a DataFrame.
y_train = train['Spam']

x_test = test[['Length', 'Punctuation', 'Capitals', 'Words']]
y_test = test['Spam']

# Fresh model sized for the four input features.
model = make_model(input_dims=4)

In [None]:
model.fit(x_train,  y_train, epochs=10, batch_size =10)

In [None]:
train.loc[train.Spam == 1].describe()


In [None]:
!pip install stopwordsiso

import stopwordsiso as stopwords

stopwords.langs()

In [None]:
sorted(stopwords.stopwords('en'))

In [None]:
en_sw = stopwords.stopwords('en')

def word_counts(x, pipeline=en):
  """Return the number of tokens in ``x`` that are not stop words.

  A token is skipped when its lowercased text is a member of ``en_sw``.
  """
  doc = pipeline(x)
  return sum(
      1
      for sent in doc.sentences
      for tok in sent.tokens
      if tok.text.lower() not in en_sw
  )

In [None]:
## Modeling with stopwords removed

# Recompute the 'Words' column with whichever word_counts is currently defined
# (this overwrites any previous 'Words' values).
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)

x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
# Single brackets: the labels are a Series.
y_train = train['Spam']

x_test = test[['Length', 'Punctuation', 'Capitals', 'Words']]
y_test = test['Spam']

# Fresh, untrained model sized for the four input features.
model = make_model(input_dims=4)

In [None]:
## POS Tagging

en = stanza.Pipeline(lang='en')
print(en)

txt = "Yo you around? A friend of mine's lookin."
pos = en(txt)

In [None]:
def print_pos(doc):
  """Render ``doc`` as ``word/UPOS`` pairs, one sentence per line.

  For every token the text and ``upos`` of its first word are joined with a
  slash and followed by a space; each sentence is terminated by a newline.
  Despite the name, the text is returned rather than printed.
  """
  rendered_sentences = []
  for sent in doc.sentences:
    tagged = [tok.words[0].text + "/" + tok.words[0].upos + " "
              for tok in sent.tokens]
    rendered_sentences.append("".join(tagged) + "\n")
  return "".join(rendered_sentences)

In [None]:
print(print_pos(pos))

In [None]:
en_sw = stopwords.stopwords('en')

def word_counts_v3(x, pipeline=en):
  """Count content words and the share of punctuation/symbol tokens in ``x``.

  Args:
    x: the message text.
    pipeline: callable returning a document with ``sentences``; each token
      needs ``text`` and ``words[0].upos``.

  Returns:
    A ``pd.Series`` indexed by ``['Words_NoPunct', 'Punct']``:
    ``Words_NoPunct`` is the number of non-stop-word tokens that are not
    tagged PUNCT or SYM; ``Punct`` is the number of non-stop-word PUNCT/SYM
    tokens divided by the total token count (0.0 when there are no tokens).
  """
  doc = pipeline(x)
  totals = 0.
  count = 0.
  non_word = 0.
  for sentence in doc.sentences:
    # Total token count is the denominator of the punctuation ratio; it was
    # never accumulated before, so the division below always failed.
    totals += len(sentence.tokens)
    for token in sentence.tokens:
      if token.text.lower() not in en_sw:
        # The UPOS tag for punctuation is 'PUNCT' ('PUNKT' never matches).
        if token.words[0].upos not in ['PUNCT','SYM']:
          count += 1.
        else:
          non_word += 1.
  # Guard against empty input, which produces no tokens at all.
  non_word = non_word / totals if totals else 0.
  return pd.Series([count, non_word], index = ['Words_NoPunct', 'Punct'])

In [None]:
## Skipping page 32-34 as there is an error -- submitted issue https://github.com/stanfordnlp/stanfordnlp/issues/8

In [None]:
corpus = [
          "I like fruits. Fruits like bananas",
          "I love bananas but eat an apple",
          "An apple a day keeps the doctor away"
]

In [None]:
!pip install sklearn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the toy corpus and build the document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is the supported accessor.
vectorizer.get_feature_names_out()

In [None]:
X.toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(X.toarray())

In [None]:
query = vectorizer.transform(["apple and bananas"])

cosine_similarity(X, query)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts in X with TF-IDF (IDF smoothing disabled).
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X.toarray())

# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() supplies the column labels instead.
pd.DataFrame(tfidf.toarray(),
             columns = vectorizer.get_feature_names_out())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# NOTE: `tfidf` and `X` are rebound here; earlier cells used the same names
# for the toy-corpus transformer output and count matrix.
# binary=True presumably clips term counts to 0/1 before IDF weighting — confirm in the scikit-learn docs.
tfidf = TfidfVectorizer(binary=True)

# Fit the vocabulary on the training messages only, then reuse it for the test messages.
X = tfidf.fit_transform(train['Message']).astype('float32')
X_test = tfidf.transform(test['Message']).astype('float32')

X.shape

In [None]:
# The number of TF-IDF columns becomes the model's input width.
_, cols = X.shape

model2 = make_model(cols) # to match tf-idf dimensions

# Double brackets: labels are single-column DataFrames, read back later as y_test.Spam.
y_train = train[['Spam']]
y_test = test[['Spam']]

# The sparse matrix is densified with toarray() before being passed to fit.
model2.fit(X.toarray(), y_train, epochs = 10, batch_size = 10)

In [None]:
model2.evaluate(X_test.toarray(), y_test)

In [None]:
# Sequential.predict_classes is no longer available in current TF 2.x releases.
# Threshold the sigmoid probability at 0.5 instead; ravel() yields the 1-D
# prediction vector confusion_matrix expects.
y_test_pred = (model2.predict(X_test.toarray()) > 0.5).astype('int32').ravel()
tf.math.confusion_matrix(tf.constant(y_test.Spam), y_test_pred)

## Much better performance
## array([[958,   2],
## [ 17, 138]], dtype=int32)>

In [None]:
!pip install gensim

In [None]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
model_w2v = api.load("word2vec-google-news-300")

In [None]:
# Only the last expression of a cell is rendered, so print the intermediate
# results explicitly; otherwise they are computed and silently discarded.
print(model_w2v.most_similar("cookies", topn=10))
print(model_w2v.doesnt_match(["USA","Canada","India","Tokyo"]))

king = model_w2v['king']
woman = model_w2v['woman']
man = model_w2v['man']

# Vector arithmetic for the analogy king - man + woman; the nearest
# neighbours of the result are displayed as the cell output.
queen = king - man + woman
model_w2v.similar_by_vector(queen)


#### Chapter 2 --- NLU ----

In [None]:
!pip install tensorflow_datasets
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

In [None]:
", ".join(tfds.list_builders())

In [None]:
imdb_train, ds_info = tfds.load(name="imdb_reviews", split="train", with_info = True, as_supervised=True)
imdb_test = tfds.load(name="imdb_reviews",split="test",as_supervised=True)

In [None]:
print(ds_info)

In [None]:
for example, label in imdb_train.take(1):
  print(example, '\n', label)

In [None]:
tokenizer = tfds.deprecated.text.Tokenizer() #Used this instead of what the book used --- why: https://github.com/tensorflow/tensorflow/issues/45217

In [None]:

# Collect every distinct token in the training reviews and track the length
# (in tokens) of the longest review.
vocabulary_set = set()
MAX_TOKENS = 0

for example, label in imdb_train:
  some_tokens = tokenizer.tokenize(example.numpy())
  MAX_TOKENS = max(MAX_TOKENS, len(some_tokens))
  vocabulary_set.update(some_tokens)

In [None]:
# Build a token-to-id encoder from the collected vocabulary, using the same tokenizer.
imdb_encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set, tokenizer=tokenizer)
vocab_size = imdb_encoder.vocab_size
# Report the vocabulary size and the longest review length in tokens.
print(vocab_size, MAX_TOKENS)


In [None]:
for example, label in imdb_train.take(1):
  print (example)
  encoded = imdb_encoder.encode(example.numpy())
  print(imdb_encoder.decode(encoded))

In [None]:
imdb_encoder.save_to_file("reviews_vocab")
enc = tfds.deprecated.text.TokenTextEncoder.load_from_file("reviews_vocab")
enc.decode(enc.encode("Good case. Excellent value"))

In [None]:
from tensorflow.keras.preprocessing import sequence
def encode_pad_transform(sample):
  """Encode one review with ``imdb_encoder`` and fix its length at 150 ids.

  ``sample`` must expose ``.numpy()``. The encoded ids are passed to
  ``pad_sequences`` with ``padding='post'`` and ``maxlen=150``, and the
  single resulting row is returned as an ``np.int64`` array.
  """
  token_ids = imdb_encoder.encode(sample.numpy())
  padded_batch = sequence.pad_sequences([token_ids], maxlen = 150, padding = 'post')
  first_row = padded_batch[0]
  return np.array(first_row, dtype=np.int64)

def encode_tf_fn(sample, label):
  """Dataset ``map`` adapter around ``encode_pad_transform``.

  Wraps the Python encoder in ``tf.py_function`` (int64 output), then sets
  the static shapes explicitly: ``[None]`` for the encoded ids and ``[]``
  for the label.

  Returns:
    The ``(encoded, label)`` pair.
  """
  padded_ids = tf.py_function(encode_pad_transform, [sample], tf.int64)
  label.set_shape([])
  padded_ids.set_shape([None])
  return padded_ids, label

In [None]:
subset = imdb_train.take(10)
tst = subset.map(encode_tf_fn)
for review, label in tst.take(1):
  print(review,label)
  print(imdb_encoder.decode(review))

In [None]:
# Running on the entire set

encoded_train = imdb_train.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
encoded_test = imdb_test.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

LSTM model with embeddings

In [None]:
# `rnn_units` is only assigned in a later cell, so referencing it here raises
# NameError on a fresh kernel. Use the same width (64) literally for this demo.
tf.keras.layers.LSTM(64)

In [None]:
def build_model_lstm(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble an (uncompiled) Embedding -> LSTM -> sigmoid classifier.

  Args:
    vocab_size: first argument of the Embedding layer (number of token ids).
    embedding_dim: size of each embedding vector.
    rnn_units: number of LSTM units.
    batch_size: fixed batch dimension used in ``batch_input_shape``.

  Returns:
    A ``tf.keras.Sequential`` model; the caller compiles it.
  """
  embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                        mask_zero = True,
                                        batch_input_shape =[batch_size, None])
  recurrent = tf.keras.layers.LSTM(rnn_units)
  classifier = tf.keras.layers.Dense(1,activation='sigmoid')

  return tf.keras.Sequential([embedding, recurrent, classifier])

In [None]:
# Vocabulary size as reported by the encoder
vocab_size = imdb_encoder.vocab_size

# The embedding dimension
embedding_dim = 64

# Number of RNN units
rnn_units = 64

# Batch size (also passed to the model builders as batch_size)
BATCH_SIZE = 100

In [None]:
model = build_model_lstm(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    rnn_units = rnn_units,
    batch_size = BATCH_SIZE)

model.summary()

In [None]:
# Compile with binary cross-entropy and track accuracy, precision and recall.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics = ['accuracy', 'Precision', 'Recall'])

# Group the encoded reviews into batches of BATCH_SIZE before fitting.
encoded_train_batched = encoded_train.batch(BATCH_SIZE)
model.fit(encoded_train_batched, epochs=10)

In [None]:
model.evaluate(encoded_test.batch(BATCH_SIZE))

In [None]:
def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble an (uncompiled) Embedding -> bidirectional LSTM -> sigmoid classifier.

  Args:
    vocab_size: first argument of the Embedding layer (number of token ids).
    embedding_dim: size of each embedding vector.
    rnn_units: number of units of the LSTM wrapped by ``Bidirectional``.
    batch_size: fixed batch dimension used in ``batch_input_shape``.

  Returns:
    A ``tf.keras.Sequential`` model; the caller compiles it.
  """
  embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                        mask_zero = True,
                                        batch_input_shape =[batch_size, None])
  recurrent = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(rnn_units))
  classifier = tf.keras.layers.Dense(1,activation='sigmoid')

  return tf.keras.Sequential([embedding, recurrent, classifier])

In [None]:
bilstm = build_model_bilstm(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    rnn_units = rnn_units,
    batch_size = BATCH_SIZE)

bilstm.summary()

In [None]:
# Compile with binary cross-entropy and track accuracy, precision and recall.
bilstm.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics = ['accuracy', 'Precision', 'Recall'])

# Group the encoded reviews into batches of BATCH_SIZE before fitting.
encoded_train_batched = encoded_train.batch(BATCH_SIZE)

# Trained for 5 epochs here (the plain LSTM above uses 10).
bilstm.fit(encoded_train_batched, epochs=5)

Chapter 3: NER

Chapter 4: Transfer Learning with BERT

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

imdb_train, ds_info = tfds.load(name="imdb_reviews",
                                split="train",
                                with_info=True, as_supervised= True)

imdb_test = tfds.load(name="imdb_reviews", split="test", as_supervised= True)



In [None]:
# Tokenizer with default settings
tokenizer = tfds.deprecated.text.Tokenizer()

# Collect every distinct token in the training reviews and track the length
# (in tokens) of the longest review.
vocabulary_set = set()
MAX_TOKENS = 0

for example, label in imdb_train:
  some_tokens = tokenizer.tokenize(example.numpy())
  MAX_TOKENS = max(MAX_TOKENS, len(some_tokens))
  vocabulary_set.update(some_tokens)


In [None]:
imdb_encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set, lowercase = True, tokenizer=tokenizer)
vocab_size = imdb_encoder.vocab_size
print(vocab_size, MAX_TOKENS)

In [None]:
from tensorflow.keras.preprocessing import sequence

def encode_pad_transform(sample):
  """Encode one review with ``imdb_encoder`` and fix its length at 150 ids.

  ``sample`` must expose ``.numpy()``. The encoded ids are passed to
  ``pad_sequences`` with ``padding='post'`` and ``maxlen=150``, and the
  single resulting row is returned as an ``np.int64`` array.
  """
  token_ids = imdb_encoder.encode(sample.numpy())
  padded_batch = sequence.pad_sequences([token_ids], maxlen = 150, padding = 'post')
  first_row = padded_batch[0]
  return np.array(first_row, dtype=np.int64)

def encode_tf_fn(sample, label):
  """Dataset ``map`` adapter around ``encode_pad_transform``.

  Wraps the Python encoder in ``tf.py_function`` (int64 output), then sets
  the static shapes explicitly: ``[None]`` for the encoded ids and ``[]``
  for the label.

  Returns:
    The ``(encoded, label)`` pair.
  """
  padded_ids = tf.py_function(encode_pad_transform, [sample], tf.int64)
  label.set_shape([])
  padded_ids.set_shape([None])
  return padded_ids, label

In [None]:
# Running on the entire set

encoded_train = imdb_train.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
encoded_test = imdb_test.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
## Download pre-trained embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
# Parse the 50-dimensional GloVe file into a {word: float32 vector} dict.
dict_w2v = {}
# Name the encoding explicitly so that parsing does not depend on the
# platform's default text encoding.
with open('glove.6B.50d.txt', "r", encoding="utf-8") as file:
  for line in file:
    # Each line is "<word> <v1> ... <v50>", separated by whitespace.
    tokens = line.split()
    word = tokens[0]
    vector = np.array(tokens[1:], dtype=np.float32)

    # Keep only rows with exactly 50 components; report anything else.
    if vector.shape[0] == 50:
      dict_w2v[word] = vector
    else:
      print("There was an issue with " + word)

# Lets check the vocab size
print("Dictionary Size:", len(dict_w2v))

In [None]:
embedding_dim = 50
embedding_matrix = np.zeros((imdb_encoder.vocab_size, embedding_dim))

In [None]:
# Copy the GloVe vector of every encoder token into its row of
# embedding_matrix; tokens missing from dict_w2v keep their zero row.
unk_cnt = 0
unk_set = set()
for word in imdb_encoder.tokens:
  embedding_vector = dict_w2v.get(word)

  if embedding_vector is not None:
    # The row index is the id the encoder assigns to this token.
    tkn_id = imdb_encoder.encode(word)[0]
    embedding_matrix[tkn_id] = embedding_vector
  else:
    # Remember tokens without a pre-trained vector.
    unk_cnt += 1
    unk_set.add(word)

# Print how many weren't found
print("Total unknown words:", unk_cnt)

In [None]:
##################### FEATURE EXTRACTION MODEL ##########################

In [None]:
# Length of vocab in chars
vocab_size = imdb_encoder.vocab_size # vocabulary size as reported by the encoder

# Number of RNN units
rnn_units = 64

# Batch size used when batching the encoded datasets
BATCH_SIZE = 100

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size, train_emb=False):
  """Assemble an (uncompiled) two-layer BiLSTM classifier on pre-trained embeddings.

  The Embedding layer is initialised from the module-level
  ``embedding_matrix``.

  Args:
    vocab_size: first argument of the Embedding layer (number of token ids).
    embedding_dim: size of each embedding vector.
    rnn_units: number of units of each LSTM wrapped by ``Bidirectional``.
    batch_size: accepted for signature compatibility; not used in the body.
    train_emb: passed as ``trainable`` to the Embedding layer.

  Returns:
    A ``tf.keras.Sequential`` model; the caller compiles it.
  """
  embedding = Embedding(vocab_size, embedding_dim,
                        mask_zero = True,
                        weights = [embedding_matrix],
                        trainable = train_emb)
  # The first recurrent layer returns the full sequence so a second one can be stacked on it.
  first_bilstm = Bidirectional(LSTM(rnn_units, return_sequences=True,dropout=0.5))
  second_bilstm = Bidirectional(LSTM(rnn_units,dropout=0.25))
  classifier = Dense(1, activation='sigmoid')

  return tf.keras.Sequential([embedding, first_bilstm, second_bilstm, classifier])

In [None]:
model_fe = build_model_bilstm(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    rnn_units = rnn_units,
    batch_size = BATCH_SIZE)

model_fe.summary()

In [None]:
# Compile with binary cross-entropy and track accuracy, precision and recall.
model_fe.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics = ['accuracy', 'Precision', 'Recall'])

# Batch the encoded reviews and prefetch up to 100 batches.
encoded_train_batched = encoded_train.batch(BATCH_SIZE).prefetch(100)

model_fe.fit(encoded_train_batched, epochs=10)

In [None]:
model_fe.evaluate(encoded_test.batch(BATCH_SIZE))

In [None]:
##################### FINE TUNING MODEL ##########################

In [None]:
model_ft = build_model_bilstm(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    rnn_units = rnn_units,
    batch_size = BATCH_SIZE,
    train_emb = True)

model_ft.summary()

In [None]:
# Compile with binary cross-entropy and track accuracy, precision and recall.
model_ft.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics = ['accuracy', 'Precision', 'Recall'])

# Batch the encoded reviews and prefetch up to 100 batches.
encoded_train_batched = encoded_train.batch(BATCH_SIZE).prefetch(100)

model_ft.fit(encoded_train_batched, epochs=10)

In [None]:
!pip install transformers==3.0.2

In [None]:
from transformers import BertTokenizer
bert_name = 'bert-base-cased'
# Load the tokenizer for the checkpoint named in bert_name ('bert-base-cased').
# NOTE: this rebinds `tokenizer`, which earlier cells used for the tfds tokenizer.
# NOTE(review): max_length / pad_to_max_length are passed again on every
# encode_plus call below; whether from_pretrained honors them is unverified.
tokenizer = BertTokenizer.from_pretrained(bert_name,
                                          add_special_tokens=True,
                                          do_lower_case = False,
                                          max_length=150,
                                          pad_to_max_length=True)

In [None]:
tokenizer.encode_plus("Don't be lured", add_special_tokens=True,
                      max_length = 9,
                      pad_to_max_length = True,
                      return_attention_mask = True,
                      return_token_type_ids=True)

In [None]:
tokenizer.encode_plus(" Don't be", " lured", add_special_tokens=True,
                      max_length = 10,
                      pad_to_max_length = True,
                      return_attention_mask = True,
                      return_token_type_ids=True)

In [None]:
def bert_encoder(review):
  """Encode one review with the module-level BERT ``tokenizer``.

  ``review`` must expose ``.numpy()`` returning UTF-8 bytes. The decoded
  text is encoded with ``encode_plus`` using special tokens, a maximum
  length of 150 and padding to that length.

  Returns:
    A tuple ``(input_ids, token_type_ids, attention_mask)`` taken from the
    encoding.
  """
  review_text = review.numpy().decode('utf-8')
  features = tokenizer.encode_plus(review_text,
                                   add_special_tokens=True,
                                   max_length = 150,
                                   pad_to_max_length = True,
                                   return_attention_mask = True,
                                   return_token_type_ids=True)

  input_ids = features['input_ids']
  segment_ids = features['token_type_ids']
  attention_mask = features['attention_mask']
  return input_ids, segment_ids, attention_mask

In [None]:
# Encode every training review into (input_ids, token_type_ids, attention_mask).
bert_train = [bert_encoder(r) for r, l in imdb_train]
# Collect the labels from the same dataset (the name was previously
# misspelled "imbd_train", which raised NameError).
bert_lbl = [l for r, l in imdb_train]
bert_train = np.array(bert_train)
# One-hot encode the binary labels into two columns.
bert_lbl = tf.keras.utils.to_categorical(bert_lbl, num_classes=2)

In [None]:
# create training and validation splits 
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded reviews for validation (fixed seed).
# The function is train_test_split (lowercase), and the call takes a single
# closing parenthesis.
x_train, x_val, y_train, y_val = train_test_split(bert_train,
                                                  bert_lbl,
                                                  test_size=0.2,
                                                  random_state=42)