<a href="https://colab.research.google.com/github/AmanPriyanshu/Natural-Language-Processing/blob/master/Learning_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMPORTS:

In [1]:
%%capture
!pip install tensorflow_text

In [2]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import string
import tensorflow_hub as hub
from absl import logging
import tensorflow_text as text

## DOWNLOADING DATASET:

In [3]:
def data_downloader():
  """Fetch the IMDB review archive and return the path of its `aclImdb` folder.

  The archive is requested through `tf.keras.utils.get_file` with `untar=True`,
  using the current working directory as the cache location (`cache_dir='.'`,
  empty `cache_subdir`).

  Returns:
    The path `<parent>/aclImdb`, where `<parent>` is the directory of the path
    returned by `get_file`.
  """
  archive_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
  archive_path = tf.keras.utils.get_file(
      "aclImdb_v1.tar.gz",
      archive_url,
      untar=True,
      cache_dir='.',
      cache_subdir='',
  )
  return os.path.join(os.path.dirname(archive_path), 'aclImdb')

In [4]:
# Run the download helper and keep the `aclImdb` directory path it returns.
dataset_dir = data_downloader()

In [5]:
# The `train` subfolder of the dataset directory; its `pos` and `neg` children are read below.
train_dir = os.path.join(dataset_dir, 'train')

In [6]:
# One folder per class inside the training directory.
pos_dir = os.path.join(train_dir, 'pos')
neg_dir = os.path.join(train_dir, 'neg')

# Preview the first five listed file names of each class.
print('POSITIVES', os.listdir(pos_dir)[:5], '\n')
print('NEGATIVES', os.listdir(neg_dir)[:5])

POSITIVES ['8703_10.txt', '11223_9.txt', '10826_10.txt', '8748_8.txt', '10833_10.txt'] 

NEGATIVES ['5388_2.txt', '6228_1.txt', '1890_2.txt', '9921_3.txt', '9348_1.txt']


## LOADING DATASET:

In [7]:
def directory_to_array(path, n=300, encoding='utf-8'):
  """Read up to `n` files from a directory and return their contents as strings.

  Args:
    path: Directory whose entries are read.
    n: Maximum number of entries to read. The first `n` names returned by
      `os.listdir` are used; that listing order is arbitrary.
    encoding: Text encoding used to decode every file. Passed explicitly so the
      result does not depend on the platform's default encoding.

  Returns:
    A list with one string per file. The lines of a file are joined with a
    single space, and each line keeps its own trailing newline.
  """
  data = []
  for file_name in os.listdir(path)[:n]:
    # The context manager closes each handle as soon as the file has been read,
    # instead of leaving hundreds of files open until garbage collection.
    with open(os.path.join(path, file_name), encoding=encoding) as f:
      data.append(' '.join(f.readlines()))
  return data

In [8]:
# Read each class folder with the helper's default limit (n=300).
positives, negatives = (directory_to_array(folder) for folder in (pos_dir, neg_dir))

In [9]:
# Sanity check: how many reviews were read for each class.
print({
    'Length of Positives': len(positives),
    'Length of Negatives': len(negatives),
})

{'Length of Positives': 300, 'Length of Negatives': 300}


## CONVERTING DATASETS INTO EMBEDDINGS:

### UNIVERSAL SENTENCE ENCODER:

In [10]:
# TF Hub handle of the Universal Sentence Encoder; `embed` is the loaded module.
universal_sentence_encoder = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(universal_sentence_encoder)
print("module %s loaded" % universal_sentence_encoder)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [11]:
# Encode each class in a single call and convert the result to a NumPy array.
positive_use_embeddings = embed(positives).numpy()
negative_use_embeddings = embed(negatives).numpy()

# Report the shape of each embedding matrix.
print('POSITIVES', positive_use_embeddings.shape)
print('NEGATIVES', negative_use_embeddings.shape)

POSITIVES (300, 512)
NEGATIVES (300, 512)


### BERT EMBEDDINGS:

In [12]:
# TF Hub handles: the BERT encoder, and the preprocessing model that is applied
# to the raw review strings before they are passed to the encoder.
BERT_MODEL = "https://tfhub.dev/google/experts/bert/wiki_books/2"
PREPROCESS_MODEL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1"

In [13]:
# Load both hub modules once; they are reused for the positive and negative sets.
# NOTE(review): the preprocessing model presumably relies on ops registered by the
# `tensorflow_text` import at the top of the notebook — confirm before removing it.
bert_preprocessor = hub.load(PREPROCESS_MODEL)
bert = hub.load(BERT_MODEL)

In [14]:
# Step 1: run the preprocessing model on the raw review strings of each class.
positives_preprocessed = bert_preprocessor(positives)
negatives_preprocessed = bert_preprocessor(negatives)

# Step 2: encode and keep only the `pooled_output` entry, converted to NumPy.
positives_embeddings = bert(positives_preprocessed)['pooled_output'].numpy()
negatives_embeddings = bert(negatives_preprocessed)['pooled_output'].numpy()

# Report the shape of each embedding matrix.
print('POSITIVES', positives_embeddings.shape)
print('NEGATIVES', negatives_embeddings.shape)









POSITIVES (300, 768)




NEGATIVES (300, 768)


## MODELS:

### UNIVERSAL SENTENCE ENCODER:

In [15]:
# Binary classifier on top of the sentence embeddings: four ReLU layers of
# decreasing width followed by a single sigmoid unit. No input shape is declared here.
model = tf.keras.Sequential(
    [tf.keras.layers.Dense(units, activation="relu") for units in (256, 128, 64, 16)]
    + [tf.keras.layers.Dense(1, activation='sigmoid')]
)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [16]:
# Labels follow the row order of `x`: 0 for every positive review, then 1 for
# every negative review. The number of 1-labels is taken from the negative
# embeddings themselves, so `y` stays aligned with `x` even when the two classes
# contain a different number of samples.
y = np.array(
    [0] * positive_use_embeddings.shape[0]
    + [1] * negative_use_embeddings.shape[0]
)
# Stack the two embedding matrices row-wise: positives first, negatives second.
x = np.concatenate([positive_use_embeddings, negative_use_embeddings], axis=0)

print('X:', x.shape)
print('Y:', y.shape)

X: (600, 512)
Y: (600,)


In [17]:
# Train the classifier on the sentence-encoder features for ten shuffled epochs.
model.fit(
    x,
    y,
    epochs=10,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f376de4f748>

### BERT:

In [18]:
# Fresh classifier for the BERT features, with the same layer layout as the one
# used for the sentence-encoder features. This rebinds the name `model`.
model = tf.keras.Sequential([
    *(tf.keras.layers.Dense(width, activation="relu") for width in (256, 128, 64, 16)),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [19]:
# One label per row of `x`: 0 for each positive review, 1 for each negative one.
y = np.array([0] * positives_embeddings.shape[0] + [1] * negatives_embeddings.shape[0])
# Rows of the positive matrix first, then rows of the negative matrix.
x = np.array([*positives_embeddings, *negatives_embeddings])

print('X:', x.shape)
print('Y:', y.shape)

X: (600, 768)
Y: (600,)


In [20]:
# Train the classifier on the BERT features for ten shuffled epochs.
model.fit(
    x,
    y,
    epochs=10,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f376dcf2668>