In [1]:
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.preprocessing.sequence import pad_sequences

2.0.0-rc0


In [2]:
from tensorflow import keras
print(keras.__version__)
from tensorflow.keras.preprocessing.text import Tokenizer

2.2.4-tf


### Word-based encoding

In [4]:
# Two short sentences used as a toy corpus for the tokenizer examples.
sample_text = [
    'This is a chapter on text processing using tf',
    'Text processing requires careful handling',
]

In [6]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(sample_text)

In [7]:
word_dict=tokenizer.word_index

In [8]:
print(word_dict)

{'text': 1, 'processing': 2, 'this': 3, 'is': 4, 'a': 5, 'chapter': 6, 'on': 7, 'using': 8, 'tf': 9, 'requires': 10, 'careful': 11, 'handling': 12}


In [12]:
# The same two sentences plus a third, one-word entry ('tf!').
sample_edit_text = [
    'This is a chapter on text processing using tf',
    'Text processing requires careful handling',
    'tf!',
]

In [14]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(sample_edit_text)

In [15]:
word_dict=tokenizer.word_index

In [16]:
print(word_dict)

{'text': 1, 'processing': 2, 'tf': 3, 'this': 4, 'is': 5, 'a': 6, 'chapter': 7, 'on': 8, 'using': 9, 'requires': 10, 'careful': 11, 'handling': 12}


### Text to sequence

In [19]:
seq=tokenizer.texts_to_sequences(sample_edit_text)

In [20]:
print(seq)

[[4, 5, 6, 7, 8, 1, 2, 9, 3], [1, 2, 10, 11, 12], [3]]


### Padding

In [23]:

# No maxlen is given, so sequences are padded to the length of the longest one;
# padding='post' puts the zeros at the end of each row.
padded_seq = pad_sequences(seq, padding='post')
print(padded_seq)

[[ 4  5  6  7  8  1  2  9  3]
 [ 1  2 10 11 12  0  0  0  0]
 [ 3  0  0  0  0  0  0  0  0]]


## Missing words

## Embeddings 

In [None]:
## Text Classification using Embeddings

In [3]:
import pandas as pd
import numpy as np

In [12]:
# Load the product-review dataset into a DataFrame.
# The file is decoded as ISO-8859-1 rather than pandas' default UTF-8.
df = pd.read_csv(
    'product_reviews_dataset.csv',
    encoding="ISO-8859-1",
)

In [13]:
# confirm the total columns in the dataset
df.columns

Index(['Sentiment', 'Summary'], dtype='object')

In [14]:
#view top 10 rows of dataframe
df.head(10)

Unnamed: 0,Sentiment,Summary
0,1,Good Quality Dog Food
1,0,Not as Advertised
2,1,"""Delight"" says it all"
3,0,Cough Medicine
4,1,Great taffy
5,1,Nice Taffy
6,1,Great! Just as good as the expensive brands!
7,1,"Wonderful, tasty taffy"
8,1,Yay Barley
9,1,Healthy Dog Food


In [15]:
#View the target class frequency 
df.Sentiment.value_counts()

1    486417
0     82037
Name: Sentiment, dtype: int64

In [16]:
# Text cleaning 
import re
def clean_reviews(text):
    """Replace every character that is not an ASCII letter with a space.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        inputs are cleaned as their string representation.

    Returns
    -------
    str
        A string of the same length as ``str(text)`` in which digits,
        punctuation and every other non-letter character have become spaces.
        Runs of spaces are not collapsed and the ends are not stripped.
    """
    # An earlier version followed this with a second substitution meant to
    # drop standalone numbers. It could never match, because no digit survives
    # this substitution, and its non-raw pattern string triggered
    # invalid-escape-sequence warnings. It has been removed; the result is
    # unchanged for every input.
    return re.sub(r"[^a-zA-Z]", " ", str(text))


In [17]:
# Run clean_reviews over every entry of the Summary column and store the
# result back in the same column.
df['Summary'] = df['Summary'].apply(clean_reviews)


In [20]:
df.head(10)

Unnamed: 0,Sentiment,Summary
0,1,Good Quality Dog Food
1,0,Not as Advertised
2,1,Delight says it all
3,0,Cough Medicine
4,1,Great taffy
5,1,Nice Taffy
6,1,Great Just as good as the expensive brands
7,1,Wonderful tasty taffy
8,1,Yay Barley
9,1,Healthy Dog Food


In [21]:
# Model input (review summaries) and target (sentiment labels)
X = df['Summary']
y = df['Sentiment']

In [22]:
# Tokenizer for the reviews: num_words caps the vocabulary used when texts are
# converted to sequences, and words outside it are mapped to the 'xxxxxxx'
# out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=10000,
    oov_token='xxxxxxx',
)

In [23]:
#fit on the input data 
tokenizer.fit_on_texts(X)

In [24]:
X_dict=tokenizer.word_index

In [25]:
len(X_dict)

32763

In [26]:
X_dict.items()



In [27]:
# Convert each review into a list of integer word indices
X_seq = tokenizer.texts_to_sequences(X)

In [29]:
X_seq[:10]

[[4, 67, 28, 30],
 [8, 39, 572],
 [531, 487, 10, 73],
 [1723, 1450],
 [2, 1486],
 [62, 1486],
 [2, 51, 39, 4, 39, 3, 148, 832],
 [70, 35, 1486],
 [1420, 2115],
 [52, 28, 30]]

In [30]:
# Bring every review to exactly 100 token ids, filling shorter ones with
# zeros at the end ('post').
X_padded_seq = pad_sequences(
    X_seq,
    maxlen=100,
    padding='post',
)


In [32]:
X_padded_seq[:3]

array([[  4,  67,  28,  30,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  8,  39, 572,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [34]:
X_padded_seq.shape

(568454, 100)

In [35]:
type(y)

pandas.core.series.Series

In [39]:
# Convert the label Series into a flat 1-D NumPy array
y = np.array(y).flatten()

In [40]:
y.shape

(568454,)

In [41]:
type(y)

numpy.ndarray

In [42]:
# Hyperparameters for the embedding model.
# max_length: same value as the maxlen=100 used when padding the reviews.
max_length = 100
# vocab_size: same value as the num_words=10000 given to the Tokenizer.
vocab_size = 10000
# embedding_dims: length of the vector learned for each word index.
embedding_dims = 50

In [44]:
# Binary sentiment classifier: Embedding -> Flatten -> Dense(6) -> Dense(1).
# The embedding layer is sized from max_length / vocab_size / embedding_dims
# (defined in the cell above) instead of repeating the literals 100 / 10000 / 50,
# so the model cannot drift out of sync with those hyperparameters.
text_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,        # number of distinct token ids
        output_dim=embedding_dims,   # size of each learned word vector
        input_length=max_length,     # tokens per padded review
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    # Single sigmoid unit: output is the predicted probability of class 1.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
text_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
text_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           500000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 30006     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 530,013
Trainable params: 530,013
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Train on the full padded dataset for a fixed number of epochs.
# No validation data is passed, so only training metrics are reported.
num_epochs = 10
text_model.fit(x=X_padded_seq, y=y, epochs=num_epochs)

Train on 568454 samples
Epoch 1/10
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x64fa8aa90>

In [46]:
embeddings = text_model.layers[0]

In [49]:
embeddings.weights

[<tf.Variable 'embedding_2/embeddings:0' shape=(10000, 50) dtype=float32, numpy=
 array([[-1.6631930e-03, -3.1805714e-03, -4.2120423e-03, ...,
          6.7197871e-03, -6.8611807e-05,  5.0362763e-03],
        [ 2.5697786e-02, -3.3429664e-01,  1.4324448e-01, ...,
          2.6591510e-01, -6.1628467e-01,  4.6738818e-01],
        [-1.2153953e+00, -5.7287562e-01,  1.3141894e+00, ...,
          1.6204183e+00, -8.5191649e-01,  9.6747494e-01],
        ...,
        [-4.6929422e-01, -7.9158318e-01,  1.0746287e+00, ...,
          1.3168679e+00, -8.7972450e-01,  7.3542255e-01],
        [-6.2262291e-01, -2.9126891e-01,  2.6975635e-01, ...,
          5.5762780e-01, -4.7142237e-01,  3.8534114e-01],
        [ 3.8236725e-01, -3.2562292e-01,  5.2412951e-01, ...,
          8.0270082e-02, -4.5245317e-01,  2.1783772e-01]], dtype=float32)>]

In [50]:
# get_weights() returns a list of NumPy arrays; the first entry is the
# embedding matrix.
weights = embeddings.get_weights()[0]
print(weights.shape)  # (vocab_size, embedding_dim)

(10000, 50)


In [56]:
# Reverse lookup table: token index -> word (the inverse of X_dict).
index_based_embedding = {index: word for word, index in X_dict.items()}


def decode_review(text):
    """Turn a sequence of token indices back into a space-separated string.

    Any index that has no entry in ``index_based_embedding`` is rendered
    as '?'.
    """
    words = []
    for token_id in text:
        words.append(index_based_embedding.get(token_id, '?'))
    return ' '.join(words)

In [57]:
index_based_embedding[1]

'xxxxxxx'

In [58]:
index_based_embedding[2]

'great'

In [60]:
weights[1]

array([ 0.02569779, -0.33429664,  0.14324448,  0.08739081,  0.52831393,
        0.27268887,  0.07457237,  0.12381076,  0.10957576,  0.06356773,
       -0.5458272 , -0.3850583 , -0.61023813,  0.3267659 , -0.1641999 ,
        0.35547504,  0.16175786, -0.29544404, -0.29933476, -0.4590062 ,
        0.31590942,  0.43237656,  0.32122514,  0.11494219,  0.05063607,
       -0.08631186,  0.42692658,  0.44402826, -0.4839747 ,  0.2801508 ,
       -0.37493172, -0.24629472,  0.11664449,  0.30983022, -0.08926664,
        0.12418804, -0.6622275 , -0.5364327 , -0.03189574, -0.30058974,
       -0.22386044, -0.46651962,  0.3162022 , -0.19460349,  0.10765371,
        0.46291786, -0.15769395,  0.2659151 , -0.61628467,  0.46738818],
      dtype=float32)

In [61]:
# Export the learned embeddings as two aligned TSV files: one row of
# tab-separated vector values per word in 'embedding_vectors_new.tsv', and the
# matching word on the same row of 'metadata_new.tsv' (the layout typically
# loaded into an embedding visualiser such as the TensorFlow Embedding
# Projector -- NOTE(review): confirm the intended consumer).
#
# Builtin open() is used because `io` is never imported in this notebook, and
# the `with` block guarantees both files are closed even if the loop raises.
with open('embedding_vectors_new.tsv', 'w', encoding='utf-8') as vec, \
        open('metadata_new.tsv', 'w', encoding='utf-8') as meta:
    # Start at 1: index 0 is the padding value and has no word in the
    # tokenizer's word index.
    for i in range(1, vocab_size):
        word = index_based_embedding[i]
        embedding_vec_values = weights[i]
        meta.write(word + "\n")
        vec.write('\t'.join([str(x) for x in embedding_vec_values]) + "\n")